Skip to content

Commit

Permalink
Merge pull request #598 from RandomDefaultUser/fix_data_scaling
Browse files Browse the repository at this point in the history
Address inconsistencies in DataScaling infrastructure
  • Loading branch information
RandomDefaultUser authored Nov 25, 2024
2 parents a402f79 + a98830b commit 6cb1d3f
Show file tree
Hide file tree
Showing 20 changed files with 254 additions and 120 deletions.
15 changes: 9 additions & 6 deletions docs/source/basic_usage/trainingmodel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ options to train a simple network with example data, namely
parameters = mala.Parameters()
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.network.layer_activations = ["ReLU"]
Expand All @@ -43,15 +43,18 @@ sub-objects dealing with the individual aspects of the workflow. In the first
two lines, which data scaling MALA should employ. Scaling data greatly
improves the performance of NN based ML models. Options are

* ``None``: No normalization is applied.
* ``None``: No scaling is applied.

* ``standard``: Standardization (Scale to mean 0, standard deviation 1)
* ``standard``: Standardization (Scale to mean 0, standard deviation 1) is
applied to the entire array.

* ``normal``: Min-Max scaling (Scale to be in range 0...1)
* ``minmax``: Min-Max scaling (Scale to be in range 0...1) is applied to the entire array.

* ``feature-wise-standard``: Row Standardization (Scale to mean 0, standard deviation 1)
* ``feature-wise-standard``: Standardization (Scale to mean 0, standard
deviation 1) is applied to each feature dimension individually.

* ``feature-wise-normal``: Row Min-Max scaling (Scale to be in range 0...1)
* ``feature-wise-minmax``: Min-Max scaling (Scale to be in range 0...1) is
applied to each feature dimension individually.

Here, we specify that MALA should standardize the input (=descriptors)
by feature (i.e., each entry of the vector separately on the grid) and
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/ex01_checkpoint_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def initial_setup():
parameters = mala.Parameters()
parameters.data.data_splitting_type = "by_snapshot"
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.network.layer_activations = ["ReLU"]
parameters.running.max_number_epochs = 9
parameters.running.mini_batch_size = 8
Expand Down
14 changes: 11 additions & 3 deletions examples/advanced/ex03_tensor_board.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

parameters = mala.Parameters()
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.targets.ldos_gridsize = 11
parameters.targets.ldos_gridspacing_ev = 2.5
parameters.targets.ldos_gridoffset_ev = -5
Expand All @@ -32,11 +32,19 @@

data_handler = mala.DataHandler(parameters)
data_handler.add_snapshot(
"Be_snapshot0.in.npy", data_path, "Be_snapshot0.out.npy", data_path, "tr",
"Be_snapshot0.in.npy",
data_path,
"Be_snapshot0.out.npy",
data_path,
"tr",
calculation_output_file=os.path.join(data_path, "Be_snapshot0.out"),
)
data_handler.add_snapshot(
"Be_snapshot1.in.npy", data_path, "Be_snapshot1.out.npy", data_path, "va",
"Be_snapshot1.in.npy",
data_path,
"Be_snapshot1.out.npy",
data_path,
"va",
calculation_output_file=os.path.join(data_path, "Be_snapshot1.out"),
)
data_handler.prepare_data()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
def initial_setup():
parameters = mala.Parameters()
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.running.max_number_epochs = 10
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
parameters = mala.Parameters()
# Specify the data scaling.
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.running.max_number_epochs = 5
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def optimize_hyperparameters(hyper_optimizer):

parameters = mala.Parameters()
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.running.max_number_epochs = 10
parameters.running.mini_batch_size = 40
parameters.running.learning_rate = 0.00001
Expand Down
2 changes: 1 addition & 1 deletion examples/basic/ex01_train_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Specify the data scaling. For regular bispectrum and LDOS data,
# these have proven successful.
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
# Specify the used activation function.
parameters.network.layer_activations = ["ReLU"]
# Specify the training parameters.
Expand Down
2 changes: 1 addition & 1 deletion examples/basic/ex04_hyperparameter_optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
####################
parameters = mala.Parameters()
parameters.data.input_rescaling_type = "feature-wise-standard"
parameters.data.output_rescaling_type = "normal"
parameters.data.output_rescaling_type = "minmax"
parameters.running.max_number_epochs = 20
parameters.running.mini_batch_size = 40
parameters.running.optimizer = "Adam"
Expand Down
48 changes: 33 additions & 15 deletions mala/common/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,27 +568,45 @@ class ParametersData(ParametersBase):
Specifies how input quantities are normalized.
Options:
- "None": No normalization is applied.
- "standard": Standardization (Scale to mean 0, standard
deviation 1)
- "normal": Min-Max scaling (Scale to be in range 0...1)
- "feature-wise-standard": Row Standardization (Scale to mean 0,
standard deviation 1)
- "feature-wise-normal": Row Min-Max scaling (Scale to be in range
0...1)
- "None": No scaling is applied.
- "standard": Standardization (Scale to mean 0,
standard deviation 1) is applied to the entire array.
- "minmax": Min-Max scaling (Scale to be in range 0...1) is applied
to the entire array.
- "feature-wise-standard": Standardization (Scale to mean 0,
standard deviation 1) is applied to each feature dimension
individually.
I.e., if your training data has dimensions (d,f), then each
of the f columns with d entries is scaled individually.
- "feature-wise-minmax": Min-Max scaling (Scale to be in range
0...1) is applied to each feature dimension individually.
I.e., if your training data has dimensions (d,f), then each
of the f columns with d entries is scaled individually.
- "normal": (DEPRECATED) Old name for "minmax".
- "feature-wise-normal": (DEPRECATED) Old name for
"feature-wise-minmax"
output_rescaling_type : string
Specifies how output quantities are normalized.
Options:
- "None": No normalization is applied.
- "None": No scaling is applied.
- "standard": Standardization (Scale to mean 0,
standard deviation 1)
- "normal": Min-Max scaling (Scale to be in range 0...1)
- "feature-wise-standard": Row Standardization (Scale to mean 0,
standard deviation 1)
- "feature-wise-normal": Row Min-Max scaling (Scale to be in
range 0...1)
standard deviation 1) is applied to the entire array.
- "minmax": Min-Max scaling (Scale to be in range 0...1) is applied
to the entire array.
- "feature-wise-standard": Standardization (Scale to mean 0,
standard deviation 1) is applied to each feature dimension
individually.
I.e., if your training data has dimensions (d,f), then each
of the f columns with d entries is scaled individually.
- "feature-wise-minmax": Min-Max scaling (Scale to be in range
0...1) is applied to each feature dimension individually.
I.e., if your training data has dimensions (d,f), then each
of the f columns with d entries is scaled individually.
- "normal": (DEPRECATED) Old name for "minmax".
- "feature-wise-normal": (DEPRECATED) Old name for
"feature-wise-minmax"
use_lazy_loading : bool
If True, data is lazily loaded, i.e. only the snapshots that are
Expand Down
23 changes: 10 additions & 13 deletions mala/datahandling/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ def clear_data(self):
self.nr_training_snapshots = 0
self.nr_test_snapshots = 0
self.nr_validation_snapshots = 0
self.input_data_scaler.reset()
self.output_data_scaler.reset()
super(DataHandler, self).clear_data()

# Preparing data
Expand Down Expand Up @@ -303,7 +305,10 @@ def get_snapshot_calculation_output(self, snapshot_number):
######################

def raw_numpy_to_converted_scaled_tensor(
self, numpy_array, data_type, units, convert3Dto1D=False
self,
numpy_array,
data_type,
units,
):
"""
Transform a raw numpy array into a scaled torch tensor.
Expand All @@ -320,9 +325,6 @@ def raw_numpy_to_converted_scaled_tensor(
processed.
units : string
Units of the data that is processed.
convert3Dto1D : bool
If True (default: False), then a (x,y,z,dim) array is transformed
into a (x*y*z,dim) array.
Returns
-------
Expand All @@ -341,12 +343,12 @@ def raw_numpy_to_converted_scaled_tensor(
)

# If desired, the dimensions can be changed.
if convert3Dto1D:
if len(np.shape(numpy_array)) == 4:
if data_type == "in":
data_dimension = self.input_dimension
else:
data_dimension = self.output_dimension
grid_size = np.prod(numpy_array[0:3])
grid_size = np.prod(np.shape(numpy_array)[0:3])
desired_dimensions = [grid_size, data_dimension]
else:
desired_dimensions = None
Expand Down Expand Up @@ -815,7 +817,6 @@ def __parametrize_scalers(self):
# scaling. This should save some performance.

if self.parameters.use_lazy_loading:
self.input_data_scaler.start_incremental_fitting()
# We need to perform the data scaling over the entirety of the
# training data.
for snapshot in self.parameters.snapshot_directories_list:
Expand Down Expand Up @@ -853,9 +854,7 @@ def __parametrize_scalers(self):
[snapshot.grid_size, self.input_dimension]
)
tmp = torch.from_numpy(tmp).float()
self.input_data_scaler.incremental_fit(tmp)

self.input_data_scaler.finish_incremental_fitting()
self.input_data_scaler.partial_fit(tmp)

else:
self.__load_data("training", "inputs")
Expand All @@ -876,7 +875,6 @@ def __parametrize_scalers(self):

if self.parameters.use_lazy_loading:
i = 0
self.output_data_scaler.start_incremental_fitting()
# We need to perform the data scaling over the entirety of the
# training data.
for snapshot in self.parameters.snapshot_directories_list:
Expand Down Expand Up @@ -912,9 +910,8 @@ def __parametrize_scalers(self):
[snapshot.grid_size, self.output_dimension]
)
tmp = torch.from_numpy(tmp).float()
self.output_data_scaler.incremental_fit(tmp)
self.output_data_scaler.partial_fit(tmp)
i += 1
self.output_data_scaler.finish_incremental_fitting()

else:
self.__load_data("training", "outputs")
Expand Down
Loading

0 comments on commit 6cb1d3f

Please sign in to comment.