From 4139713d316cecd65879870c5d4b3aad758d8b50 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Mon, 24 Jun 2024 17:13:56 +0200 Subject: [PATCH 01/10] Unified error calculation --- .../source/advanced_usage/hyperparameters.rst | 2 +- docs/source/advanced_usage/predictions.rst | 3 +- docs/source/advanced_usage/trainingmodel.rst | 16 +- docs/source/basic_usage/hyperparameters.rst | 4 +- docs/source/basic_usage/trainingmodel.rst | 2 +- docs/source/install/installing_lammps.rst | 8 +- docs/source/install/installing_qe.rst | 23 +- examples/advanced/ex01_checkpoint_training.py | 2 +- examples/advanced/ex03_tensor_board.py | 4 +- ..._checkpoint_hyperparameter_optimization.py | 2 +- ...distributed_hyperparameter_optimization.py | 4 +- ...07_advanced_hyperparameter_optimization.py | 4 +- examples/basic/ex01_train_network.py | 2 +- examples/basic/ex02_test_network.py | 6 +- .../basic/ex04_hyperparameter_optimization.py | 2 +- mala/common/parameters.py | 83 +- mala/datahandling/data_shuffler.py | 14 +- mala/network/hyper_opt_naswot.py | 2 +- mala/network/objective_base.py | 8 +- mala/network/runner.py | 398 +++++++++- mala/network/tester.py | 184 +---- mala/network/trainer.py | 749 ++++++------------ test/all_lazy_loading_test.py | 17 +- test/basic_gpu_test.py | 4 +- test/checkpoint_hyperopt_test.py | 2 +- test/checkpoint_training_test.py | 10 +- test/complete_interfaces_test.py | 6 +- test/examples_test.py | 50 +- test/hyperopt_test.py | 14 +- test/shuffling_test.py | 8 +- test/workflow_test.py | 52 +- 31 files changed, 804 insertions(+), 881 deletions(-) diff --git a/docs/source/advanced_usage/hyperparameters.rst b/docs/source/advanced_usage/hyperparameters.rst index 4240250e7..5c0665b44 100644 --- a/docs/source/advanced_usage/hyperparameters.rst +++ b/docs/source/advanced_usage/hyperparameters.rst @@ -114,7 +114,7 @@ a physical validation metric such as .. code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" Advanced optimization algorithms ******************************** diff --git a/docs/source/advanced_usage/predictions.rst b/docs/source/advanced_usage/predictions.rst index 7058f17de..20e82494b 100644 --- a/docs/source/advanced_usage/predictions.rst +++ b/docs/source/advanced_usage/predictions.rst @@ -40,6 +40,8 @@ Likewise, you can adjust the inference temperature via calculator.data_handler.target_calculator.temperature = ... +.. _production_gpu: + Predictions on GPU ******************* @@ -137,4 +139,3 @@ With the exception of the electronic density, which is saved into the ``.cube`` format for visualization with regular electronic structure visualization software, all of these observables can be plotted with Python based visualization libraries such as ``matplotlib``. - diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst index 52e50ec50..290aa15f3 100644 --- a/docs/source/advanced_usage/trainingmodel.rst +++ b/docs/source/advanced_usage/trainingmodel.rst @@ -77,7 +77,7 @@ Specifically, when setting .. code-block:: python - parameters.running.after_before_training_metric = "band_energy" + parameters.running.after_training_metric = "band_energy" the error in the band energy between actual and predicted LDOS will be calculated and printed before and after network training (in meV/atom). 
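A minimal sketch of how the renamed metric is set in a training script (assuming a standard ``Parameters`` object, as used throughout the basic MALA examples):

.. code-block:: python

    import mala

    parameters = mala.Parameters()
    # Optimize on the raw LDOS loss during training ...
    parameters.running.during_training_metric = "ldos"
    # ... but report the physical band energy error (in meV/atom)
    # before and after training.
    parameters.running.after_training_metric = "band_energy"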
@@ -205,21 +205,21 @@ visualization prior to training via # 0: No visualizatuon, 1: loss and learning rate, 2: like 1, # but additionally weights and biases are saved - parameters.running.visualisation = 1 - parameters.running.visualisation_dir = "mala_vis" + parameters.running.logging = 1 + parameters.running.logging_dir = "mala_vis" -where ``visualisation_dir`` specifies some directory in which to save the -MALA visualization data. Afterwards, you can run the training without any +where ``logging_dir`` specifies some directory in which to save the +MALA logging data. Afterwards, you can run the training without any other modifications. Once training is finished (or during training, in case you want to use tensorboard to monitor progress), you can launch tensorboard via .. code-block:: bash - tensorboard --logdir path_to_visualization + tensorboard --logdir path_to_log_directory -The full path for ``path_to_visualization`` can be accessed via -``trainer.full_visualization_path``. +The full path for ``path_to_log_directory`` can be accessed via +``trainer.full_logging_path``. Training in parallel diff --git a/docs/source/basic_usage/hyperparameters.rst b/docs/source/basic_usage/hyperparameters.rst index 11742932d..d10bb440e 100644 --- a/docs/source/basic_usage/hyperparameters.rst +++ b/docs/source/basic_usage/hyperparameters.rst @@ -118,9 +118,9 @@ properties of the ``Parameters`` class: during the optimization. - ``network.layer_sizes`` - ``"int"``, ``"categorical"`` - * - ``"trainingtype"`` + * - ``"optimizer"`` - Optimization algorithm used during the NN optimization. - - ``running.trainingtype`` + - ``running.optimizer`` - ``"categorical"`` * - ``"mini_batch_size"`` - Size of the mini batches used to calculate the gradient during diff --git a/docs/source/basic_usage/trainingmodel.rst b/docs/source/basic_usage/trainingmodel.rst index 3995865e6..e6bc8c967 100644 --- a/docs/source/basic_usage/trainingmodel.rst +++ b/docs/source/basic_usage/trainingmodel.rst @@ -35,7 +35,7 @@ options to train a simple network with example data, namely parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.verbosity = 1 # level of output; 1 is standard, 0 is low, 2 is debug. Here, we can see that the ``Parameters`` object contains multiple diff --git a/docs/source/install/installing_lammps.rst b/docs/source/install/installing_lammps.rst index 50fb41cef..ae3933783 100644 --- a/docs/source/install/installing_lammps.rst +++ b/docs/source/install/installing_lammps.rst @@ -41,18 +41,24 @@ The MALA team recommends to build LAMMPS with ``cmake``. To do so * ``Kokkos_ARCH_GPUARCH=???``: Your GPU architecture (see see `Kokkos instructions `_) * ``CMAKE_CXX_COMPILER=???``: Path to the ``nvcc_wrapper`` executable shipped with the LAMMPS code, should be at ``/your/path/to/lammps/lib/kokkos/bin/nvcc_wrapper`` -* For example, this configures the LAMMPS cmake build with Kokkos support + + For example, this configures the LAMMPS cmake build with Kokkos support for an Intel Haswell CPU and an Nvidia Volta GPU, with MPI support: .. code-block:: bash cmake ../cmake -D PKG_KOKKOS=yes -D BUILD_MPI=yes -D PKG_ML-SNAP=yes -D Kokkos_ENABLE_CUDA=yes -D Kokkos_ARCH_HSW=yes -D Kokkos_ARCH_VOLTA70=yes -D CMAKE_CXX_COMPILER=/path/to/lammps/lib/kokkos/bin/nvcc_wrapper -D BUILD_SHARED_LIBS=yes +.. 
note:: + When using a GPU by setting ``parameters.use_gpu = True``, you *need* to + have a GPU version of ``LAMMPS`` installed. See :ref:`production_gpu` for + details. * Build the library and executable with ``cmake --build .`` (Add ``--parallel=8`` for a faster build) + Installing the Python extension ******************************** diff --git a/docs/source/install/installing_qe.rst b/docs/source/install/installing_qe.rst index 3b426ba48..9ff514c7a 100644 --- a/docs/source/install/installing_qe.rst +++ b/docs/source/install/installing_qe.rst @@ -4,24 +4,25 @@ Installing Quantum ESPRESSO (total energy module) Prerequisites ************* -To run the total energy module, you need a full Quantum ESPRESSO installation, -for which to install the Python bindings. This module has been tested with -version ``7.2.``, the most recent version at the time of this release of MALA. -Newer versions may work (untested), but installation instructions may vary. +To build and run the total energy module, you need a full Quantum ESPRESSO +installation, for which to install the Python bindings. This module has been +tested with version ``7.2.``, the most recent version at the time of this +release of MALA. Newer versions may work (untested), but installation +instructions may vary. Make sure you have an (MPI-aware) F90 compiler such as ``mpif90`` (e.g. Debian-ish machine: ``apt install openmpi-bin``, on an HPC cluster something like ``module load openmpi gcc``). Make sure to use the same compiler for QE and the extension. This should be the default case, but if problems arise you can manually select the compiler via -``--f90exec=`` in ``build_total_energy_energy_module.sh`` +``--f90exec=`` in ``build_total_energy_module.sh`` We assume that QE's ``configure`` script will find your system libs, e.g. use ``-lblas``, ``-llapack`` and ``-lfftw3``. We use those by default in -``build_total_energy_energy_module.sh``. If you have, say, the MKL library, +``build_total_energy_module.sh``. If you have, say, the MKL library, you may see ``configure`` use something like ``-lmkl_intel_lp64 -lmkl_sequential -lmkl_core`` when building QE. In this case you have to modify -``build_total_energy_energy_module.sh`` to use the same libraries! +``build_total_energy_module.sh`` to use the same libraries! Build Quantum ESPRESSO ********************** @@ -35,10 +36,16 @@ Build Quantum ESPRESSO * Change to the ``external_modules/total_energy_module`` directory of the MALA repository +.. note:: + At the moment, building QE using ``cmake`` `doesn't work together with the + build_total_energy_module.sh script + `_. Please use the + ``configure`` + ``make`` build workflow. + Installing the Python extension ******************************** -* Run ``build_total_energy_energy_module.sh /path/to/your/q-e``. +* Run ``build_total_energy_module.sh /path/to/your/q-e``. * If the build is successful, a file named something like ``total_energy.cpython-39m-x86_64-linux-gnu.so`` will be generated. 
This is diff --git a/examples/advanced/ex01_checkpoint_training.py b/examples/advanced/ex01_checkpoint_training.py index 01bb9b486..5222a5232 100644 --- a/examples/advanced/ex01_checkpoint_training.py +++ b/examples/advanced/ex01_checkpoint_training.py @@ -26,7 +26,7 @@ def initial_setup(): parameters.running.max_number_epochs = 9 parameters.running.mini_batch_size = 8 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" # We checkpoint the training every 5 epochs and save the results # as "ex07". diff --git a/examples/advanced/ex03_tensor_board.py b/examples/advanced/ex03_tensor_board.py index b15239495..43a066aaf 100644 --- a/examples/advanced/ex03_tensor_board.py +++ b/examples/advanced/ex03_tensor_board.py @@ -18,7 +18,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # Turn the visualization on and select a folder to save the visualization # files into. @@ -45,6 +45,6 @@ trainer.train_network() printout( 'Run finished, launch tensorboard with "tensorboard --logdir ' - + trainer.full_visualization_path + + trainer.full_logging_path + '"' ) diff --git a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py index cef7c8f4f..99a92fa35 100644 --- a/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py +++ b/examples/advanced/ex05_checkpoint_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def initial_setup(): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 9 parameters.hyperparameters.checkpoints_each_trial = 5 parameters.hyperparameters.checkpoint_name = "ex05_checkpoint" diff --git a/examples/advanced/ex06_distributed_hyperparameter_optimization.py b/examples/advanced/ex06_distributed_hyperparameter_optimization.py index b34f9bb8b..215dd1ab2 100644 --- a/examples/advanced/ex06_distributed_hyperparameter_optimization.py +++ b/examples/advanced/ex06_distributed_hyperparameter_optimization.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 5 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 10 parameters.hyperparameters.checkpoints_each_trial = -1 parameters.hyperparameters.checkpoint_name = "ex06" @@ -44,7 +44,7 @@ parameters.targets.ldos_gridspacing_ev = 2.5 parameters.targets.ldos_gridoffset_ev = -5 parameters.hyperparameters.number_training_per_trial = 3 -parameters.running.after_before_training_metric = "band_energy" +parameters.running.after_training_metric = "band_energy" data_handler = mala.DataHandler(parameters) diff --git a/examples/advanced/ex07_advanced_hyperparameter_optimization.py b/examples/advanced/ex07_advanced_hyperparameter_optimization.py index 8165ef01e..242ffd7dd 100644 --- a/examples/advanced/ex07_advanced_hyperparameter_optimization.py +++ b/examples/advanced/ex07_advanced_hyperparameter_optimization.py @@ -21,7 +21,7 @@ def optimize_hyperparameters(hyper_optimizer): parameters.running.max_number_epochs = 10 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 - 
parameters.running.trainingtype = "Adam" + parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 8 parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -64,7 +64,7 @@ def optimize_hyperparameters(hyper_optimizer): data_handler.output_dimension, ] hyperoptimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) hyperoptimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] diff --git a/examples/basic/ex01_train_network.py b/examples/basic/ex01_train_network.py index 95eb2d51b..1eca8c6b7 100644 --- a/examples/basic/ex01_train_network.py +++ b/examples/basic/ex01_train_network.py @@ -28,7 +28,7 @@ parameters.running.max_number_epochs = 100 parameters.running.mini_batch_size = 40 parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" # These parameters characterize how the LDOS and bispectrum descriptors # were calculated. They are _technically_ not needed to train a simple # network. However, it is useful to define them prior to training. Then, diff --git a/examples/basic/ex02_test_network.py b/examples/basic/ex02_test_network.py index 2e4b8953c..0d90dfe7f 100644 --- a/examples/basic/ex02_test_network.py +++ b/examples/basic/ex02_test_network.py @@ -21,15 +21,15 @@ # It is recommended to enable the "lazy-loading" feature, so that # data is loaded into memory one snapshot at a time during testing - this # helps keep RAM requirement down. Furthermore, you have to decide which -# observables to test (usual choices are "band_energy", "total_energy" and -# "number_of_electrons") and whether you want the results per snapshot +# observables to test (usual choices are "band_energy", "total_energy") +# and whether you want the results per snapshot # (output_format="list") or as an averaged value (output_format="mae") #################### parameters, network, data_handler, tester = mala.Tester.load_run( run_name=model_name, path=model_path ) -tester.observables_to_test = ["band_energy", "number_of_electrons"] +tester.observables_to_test = ["band_energy", "density"] tester.output_format = "list" parameters.data.use_lazy_loading = True diff --git a/examples/basic/ex04_hyperparameter_optimization.py b/examples/basic/ex04_hyperparameter_optimization.py index 4c68179c2..cebb4c42e 100644 --- a/examples/basic/ex04_hyperparameter_optimization.py +++ b/examples/basic/ex04_hyperparameter_optimization.py @@ -22,7 +22,7 @@ parameters.data.output_rescaling_type = "normal" parameters.running.max_number_epochs = 20 parameters.running.mini_batch_size = 40 -parameters.running.trainingtype = "Adam" +parameters.running.optimizer = "Adam" parameters.hyperparameters.n_trials = 20 #################### diff --git a/mala/common/parameters.py b/mala/common/parameters.py index 3627bd40f..c9b1b826c 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -265,11 +265,6 @@ class ParametersNetwork(ParametersBase): Number of hidden layers to be used in lstm or gru or transformer nets Default: None - dropout: float - Dropout rate for transformer net - 0.0 ≤ dropout ≤ 1.0 - Default: 0.0 - num_heads: int Number of heads to be used in Multi head attention network This should be a divisor of input dimension @@ -452,7 +447,7 @@ class ParametersTargets(ParametersBase): Number of points in the energy grid that is used to calculate the (L)DOS. 
- ldos_gridsize : float + ldos_gridsize : int Gridsize of the LDOS. ldos_gridspacing_ev: float @@ -625,9 +620,8 @@ class ParametersRunning(ParametersBase): Attributes ---------- - trainingtype : string - Training type to be used. Supported options at the moment: - + optimizer : string + Optimizer to be used. Supported options at the moment: - SGD: Stochastic gradient descent. - Adam: Adam Optimization Algorithm @@ -640,10 +634,6 @@ class ParametersRunning(ParametersBase): mini_batch_size : int Size of the mini batch for the optimization algorihm. Default: 10. - weight_decay : float - Weight decay for regularization. Always refers to L2 regularization. - Default: 0. - early_stopping_epochs : int Number of epochs the validation accuracy is allowed to not improve by at leastearly_stopping_threshold, before we terminate. If 0, no @@ -696,19 +686,13 @@ class ParametersRunning(ParametersBase): Name used for the checkpoints. Using this, multiple runs can be performed in the same directory. - visualisation : int - If True then Tensorboard is activated for visualisation - case 0: No tensorboard activated - case 1: tensorboard activated with Loss and learning rate - case 2; additonally weights and biases and gradient + logging_dir : string + Name of the folder that logging files will be saved to. - visualisation_dir : string - Name of the folder that visualization files will be saved to. - - visualisation_dir_append_date : bool - If True, then upon creating visualization files, these will be saved - in a subfolder of visualisation_dir labelled with the starting date - of the visualization, to avoid having to change input scripts often. + logging_dir_append_date : bool + If True, then upon creating logging files, these will be saved + in a subfolder of logging_dir labelled with the starting date + of the logging, to avoid having to change input scripts often. inference_data_grid : list List holding the grid to be used for inference in the form of @@ -717,7 +701,7 @@ class ParametersRunning(ParametersBase): use_mixed_precision : bool If True, mixed precision computation (via AMP) will be used. - training_report_frequency : int + training_log_interval : int Determines how often detailed performance info is printed during training (only has an effect if the verbosity is high enough). 
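Taken together, the renamed ``ParametersRunning`` options map onto their old names as sketched below (illustrative values only, drawn from the defaults and docs examples in this patch):

.. code-block:: python

    import mala

    parameters = mala.Parameters()
    parameters.running.optimizer = "Adam"             # formerly trainingtype
    parameters.running.l2_regularization = 0.0        # formerly weight_decay
    parameters.running.logging_dir = "mala_vis"       # formerly visualisation_dir
    parameters.running.training_log_interval = 1000   # formerly training_report_frequency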
@@ -729,36 +713,49 @@ class ParametersRunning(ParametersBase): def __init__(self): super(ParametersRunning, self).__init__() - self.trainingtype = "SGD" - self.learning_rate = 0.5 + self.optimizer = "Adam" + self.learning_rate = 10 ** (-5) + self.learning_rate_embedding = 10 ** (-4) self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 - self.weight_decay = 0 + self.snapshots_per_epoch = -1 + + self.l1_regularization = 0.0 + self.l2_regularization = 0.0 + self.dropout = 0.0 + self.batch_norm = False + self.input_noise = 0.0 + self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 self.learning_rate_scheduler = None self.learning_rate_decay = 0.1 self.learning_rate_patience = 0 + self._during_training_metric = "ldos" + self._after_training_metric = "ldos" + self.use_compression = False self.num_workers = 0 self.use_shuffling_for_samplers = True self.checkpoints_each_epoch = 0 + self.checkpoint_best_so_far = False self.checkpoint_name = "checkpoint_mala" - self.visualisation = 0 - self.visualisation_dir = os.path.join(".", "mala_logging") - self.visualisation_dir_append_date = True - self.during_training_metric = "ldos" - self.after_before_training_metric = "ldos" + self.run_name = "" + self.logging_dir = "./mala_logging" + self.logging_dir_append_date = True + self.logger = "tensorboard" + self.validation_metrics = ["ldos"] + self.validate_on_training_data = False self.inference_data_grid = [0, 0, 0] self.use_mixed_precision = False self.use_graphs = False - self.training_report_frequency = 1000 - self.profiler_range = None # [1000, 2000] + self.training_log_interval = 1000 + self.profiler_range = [1000, 2000] def _update_ddp(self, new_ddp): super(ParametersRunning, self)._update_ddp(new_ddp) self.during_training_metric = self.during_training_metric - self.after_before_training_metric = self.after_before_training_metric + self.after_training_metric = self.after_training_metric @property def during_training_metric(self): @@ -786,7 +783,7 @@ def during_training_metric(self, value): self._during_training_metric = value @property - def after_before_training_metric(self): + def after_training_metric(self): """ Get the metric used during training. @@ -798,17 +795,17 @@ def after_before_training_metric(self): DFT results. Of these, the mean average error in eV/atom will be calculated. """ - return self._after_before_training_metric + return self._after_training_metric - @after_before_training_metric.setter - def after_before_training_metric(self, value): + @after_training_metric.setter + def after_training_metric(self, value): if value != "ldos": if self._configuration["ddp"]: raise Exception( "Currently, MALA can only operate with the " '"ldos" metric for ddp runs.' ) - self._after_before_training_metric = value + self._after_training_metric = value @during_training_metric.setter def during_training_metric(self, value): @@ -1474,7 +1471,7 @@ def save(self, filename, save_format="json"): if member[0][0] != "_": if isinstance(member[1], ParametersBase): # All the subclasses have to provide this function. 
- member[1]: ParametersBase + member[1]: ParametersBase # type: ignore json_dict[member[0]] = member[1].to_json() with open(filename, "w", encoding="utf-8") as f: json.dump(json_dict, f, ensure_ascii=False, indent=4) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 62d6e11a3..e7d7a07cb 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -131,10 +131,12 @@ def __shuffle_numpy( ) # Do the actual shuffling. - target_name_openpmd = os.path.join(target_save_path, - save_name.replace("*", "%T")) - descriptor_name_openpmd = os.path.join(descriptor_save_path, - save_name.replace("*", "%T")) + target_name_openpmd = os.path.join( + target_save_path, save_name.replace("*", "%T") + ) + descriptor_name_openpmd = os.path.join( + descriptor_save_path, save_name.replace("*", "%T") + ) for i in range(0, number_of_new_snapshots): new_descriptors = np.zeros( (int(np.prod(shuffle_dimensions)), self.input_dimension), @@ -363,9 +365,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0): import json # Do the actual shuffling. - name_prefix = os.path.join( - dot.save_path, save_name.replace("*", "%T") - ) + name_prefix = os.path.join(dot.save_path, save_name.replace("*", "%T")) for i in range(my_items_start, my_items_end): # We check above that in the non-numpy case, OpenPMD will work. dot.calculator.grid_dimensions = list(shuffle_dimensions) diff --git a/mala/network/hyper_opt_naswot.py b/mala/network/hyper_opt_naswot.py index ae27f7d13..9a11e1ca0 100644 --- a/mala/network/hyper_opt_naswot.py +++ b/mala/network/hyper_opt_naswot.py @@ -39,7 +39,7 @@ def __init__(self, params, data): self.trial_list = None self.ignored_hyperparameters = [ "learning_rate", - "trainingtype", + "optimizer", "mini_batch_size", "early_stopping_epochs", "learning_rate_patience", diff --git a/mala/network/objective_base.py b/mala/network/objective_base.py index 52d0d9464..2fbf29503 100644 --- a/mala/network/objective_base.py +++ b/mala/network/objective_base.py @@ -231,8 +231,8 @@ def parse_trial_optuna(self, trial: Trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter(trial) + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter(trial) elif "mini_batch_size" == par.name: self.params.running.mini_batch_size = par.get_parameter(trial) @@ -358,8 +358,8 @@ def parse_trial_oat(self, trial): turned_off_layers.append(layer_counter) layer_counter += 1 - elif "trainingtype" == par.name: - self.params.running.trainingtype = par.get_parameter( + elif "optimizer" == par.name: + self.params.running.optimizer = par.get_parameter( trial, factor_idx ) elif "mini_batch_size" == par.name: diff --git a/mala/network/runner.py b/mala/network/runner.py index a5f620071..17ce572b6 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -3,6 +3,8 @@ import os from zipfile import ZipFile, ZIP_STORED +from mala.common.parallelizer import printout + import numpy as np import torch import torch.distributed as dist @@ -10,10 +12,16 @@ import mala from mala.common.parallelizer import get_rank from mala.common.parameters import ParametersRunning +from mala.datahandling.fast_tensor_dataset import FastTensorDataset from mala.network.network import Network from mala.datahandling.data_scaler import DataScaler from mala.datahandling.data_handler import DataHandler from mala import Parameters +from mala.targets.ldos import LDOS +from mala.targets.dos 
import DOS +from mala.targets.density import Density + +from tqdm.auto import tqdm, trange class Runner: @@ -41,6 +49,335 @@ def __init__(self, params, network, data, runner_dict=None): self.data = data self.__prepare_to_run() + def _calculate_errors( + self, actual_outputs, predicted_outputs, metrics, snapshot_number + ): + """ + Calculate the errors between the actual and predicted outputs. + + Parameters + ---------- + actual_outputs : numpy.ndarray + Actual outputs. + + predicted_outputs : numpy.ndarray + Predicted outputs. + + metrics : list + List of metrics to calculate. + + snapshot_number : int + Snapshot number for which the errors are calculated. + + Returns + ------- + errors : dict + Dictionary containing the errors. + """ + + energy_metrics = [metric for metric in metrics if "energy" in metric] + non_energy_metrics = [ + metric for metric in metrics if "energy" not in metric + ] + if len(energy_metrics) > 0: + errors = self._calculate_energy_errors( + actual_outputs, + predicted_outputs, + energy_metrics, + snapshot_number, + ) + else: + errors = {} + for metric in non_energy_metrics: + try: + if metric == "ldos": + error = np.mean((predicted_outputs - actual_outputs) ** 2) + errors[metric] = error + + elif metric == "density": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, Density): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density + errors[metric] = np.mean(np.abs(actual - predicted)) + + elif metric == "density_relative": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, Density): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density + errors[metric] = ( + np.mean(np.abs((actual - predicted) / actual)) * 100 + ) + + elif metric == "dos": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, DOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density_of_states + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density_of_states + + errors[metric] = np.abs(actual - predicted).mean() + + elif metric == "dos_relative": + target_calculator = self.data.target_calculator + if not isinstance( + target_calculator, LDOS + ) and not isinstance(target_calculator, DOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." 
+ ) + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + + # We shift both the actual and predicted DOS by 1.0 to overcome + # numerical issues with the DOS having values equal to zero. + target_calculator.read_from_array(actual_outputs) + actual = target_calculator.density_of_states + 1.0 + + target_calculator.read_from_array(predicted_outputs) + predicted = target_calculator.density_of_states + 1.0 + + errors[metric] = ( + np.ma.masked_invalid( + np.abs( + (actual - predicted) + / (np.abs(actual) + np.abs(predicted)) + ) + ).mean() + * 100 + ) + else: + raise Exception(f"Invalid metric ({metric}) requested.") + except ValueError as e: + printout( + f"Error calculating observable: {metric} for snapshot {snapshot_number}", + min_verbosity=0, + ) + printout(e, min_verbosity=2) + errors[metric] = float("inf") + return errors + + def _calculate_energy_errors( + self, actual_outputs, predicted_outputs, energy_types, snapshot_number + ): + """ + Calculate the errors between the actual and predicted outputs. + + Parameters + ---------- + actual_outputs : numpy.ndarray + Actual outputs. + + predicted_outputs : numpy.ndarray + Predicted outputs. + + energy_types : list + List of energy types to calculate errors. + + snapshot_number : int + Snapshot number for which the errors are calculated. + """ + target_calculator = self.data.target_calculator + output_file = self.data.get_snapshot_calculation_output( + snapshot_number + ) + if not output_file: + raise Exception( + "Output file needed for energy error calculations." + ) + target_calculator.read_additional_calculation_data(output_file) + + errors = {} + fe_dft = target_calculator.fermi_energy_dft + fe_actual = None + fe_predicted = None + try: + fe_actual = target_calculator.get_self_consistent_fermi_energy( + actual_outputs + ) + except ValueError: + errors = { + energy_type: float("inf") for energy_type in energy_types + } + printout( + "CAUTION! LDOS ground truth is so wrong that the " + "estimation of the self consistent Fermi energy fails." + ) + return errors + try: + fe_predicted = target_calculator.get_self_consistent_fermi_energy( + predicted_outputs + ) + except ValueError: + errors = { + energy_type: float("inf") for energy_type in energy_types + } + printout( + "CAUTION! LDOS prediction is so wrong that the " + "estimation of the self consistent Fermi energy fails." + ) + return errors + for energy_type in energy_types: + if energy_type == "fermi_energy": + fe_error = fe_predicted - fe_actual + errors[energy_type] = fe_error + elif energy_type == "fermi_energy_dft": + fe_error_dft = fe_predicted - fe_dft + errors[energy_type] = fe_error_dft + elif energy_type == "band_energy": + if not isinstance(target_calculator, LDOS) and not isinstance( + target_calculator, DOS + ): + raise Exception( + "Cannot calculate the band energy from this observable." 
+ ) + try: + target_calculator.read_from_array(actual_outputs) + be_actual = target_calculator.get_band_energy( + fermi_energy=fe_actual + ) + target_calculator.read_from_array(predicted_outputs) + be_predicted = target_calculator.get_band_energy( + fermi_energy=fe_predicted + ) + be_error = (be_predicted - be_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = be_error + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "band_energy_dft_fe": + try: + target_calculator.read_from_array(predicted_outputs) + be_predicted_dft_fe = target_calculator.get_band_energy( + fermi_energy=fe_dft + ) + be_error_dft_fe = (be_predicted_dft_fe - be_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = be_error_dft_fe + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "band_energy_actual_fe": + try: + target_calculator.read_from_array(predicted_outputs) + be_predicted_actual_fe = target_calculator.get_band_energy( + fermi_energy=fe_actual + ) + be_error_actual_fe = ( + be_predicted_actual_fe - be_actual + ) * (1000 / len(target_calculator.atoms)) + errors[energy_type] = be_error_actual_fe + except ValueError: + errors[energy_type] = float("inf") + + elif energy_type == "total_energy": + if not isinstance(target_calculator, LDOS): + raise Exception( + "Cannot calculate the total energy from this " + "observable." + ) + try: + target_calculator.read_additional_calculation_data( + self.data.get_snapshot_calculation_output( + snapshot_number + ) + ) + target_calculator.read_from_array(actual_outputs) + te_actual = target_calculator.get_total_energy( + fermi_energy=fe_actual + ) + target_calculator.read_from_array(predicted_outputs) + te_predicted = target_calculator.get_total_energy( + fermi_energy=fe_predicted + ) + te_error = (te_predicted - te_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = te_error + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "total_energy_dft_fe": + try: + target_calculator.read_from_array(predicted_outputs) + te_predicted_dft_fe = target_calculator.get_total_energy( + fermi_energy=fe_dft + ) + te_error_dft_fe = (te_predicted_dft_fe - te_actual) * ( + 1000 / len(target_calculator.atoms) + ) + errors[energy_type] = te_error_dft_fe + except ValueError: + errors[energy_type] = float("inf") + elif energy_type == "total_energy_actual_fe": + try: + target_calculator.read_from_array(predicted_outputs) + te_predicted_actual_fe = ( + target_calculator.get_total_energy( + fermi_energy=fe_actual + ) + ) + te_error_actual_fe = ( + te_predicted_actual_fe - te_actual + ) * (1000 / len(target_calculator.atoms)) + errors[energy_type] = te_error_actual_fe + except ValueError: + errors[energy_type] = float("inf") + else: + raise Exception( + f"Invalid energy type ({energy_type}) requested." 
+ ) + return errors + def save_run( self, run_name, @@ -87,7 +424,7 @@ def save_run( params_file = run_name + ".params.json" if save_runner: optimizer_file = run_name + ".optimizer.pth" - + os.makedirs(save_path, exist_ok=True) self.parameters_full.save(os.path.join(save_path, params_file)) if self.parameters_full.use_ddp: self.network.module.save_network( @@ -391,28 +728,51 @@ def _forward_entire_snapshot( from_index += snapshot.grid_size grid_size = to_index - from_index - if self.data.parameters.use_lazy_loading: - data_set.return_outputs_directly = True - actual_outputs = (data_set[from_index:to_index])[1] - else: - actual_outputs = self.data.output_data_scaler.inverse_transform( - (data_set[from_index:to_index])[1], as_numpy=True + if isinstance(data_set, FastTensorDataset): + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) - - predicted_outputs = np.zeros((grid_size, self.data.output_dimension)) - - for i in range(0, number_of_batches_per_snapshot): - inputs, outputs = data_set[ - from_index - + (i * batch_size) : from_index - + ((i + 1) * batch_size) - ] - inputs = inputs.to(self.parameters._configuration["device"]) - predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( - self.data.output_data_scaler.inverse_transform( + actual_outputs = np.zeros((grid_size, self.data.output_dimension)) + + for i in range(len(data_set)): + inputs, outputs = data_set[from_index + i] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( self.network(inputs).to("cpu"), as_numpy=True ) + actual_outputs[ + i * data_set.batch_size : (i + 1) * data_set.batch_size, : + ] = self.data.output_data_scaler.inverse_transform( + torch.tensor(outputs), as_numpy=True + ) + else: + if self.data.parameters.use_lazy_loading: + data_set.return_outputs_directly = True + actual_outputs = (data_set[from_index:to_index])[1] + else: + actual_outputs = ( + self.data.output_data_scaler.inverse_transform( + (data_set[from_index:to_index])[1], as_numpy=True + ) + ) + + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) ) + for i in range(0, number_of_batches_per_snapshot): + inputs, outputs = data_set[ + from_index + + (i * batch_size) : from_index + + ((i + 1) * batch_size) + ] + inputs = inputs.to(self.parameters._configuration["device"]) + predicted_outputs[i * batch_size : (i + 1) * batch_size, :] = ( + self.data.output_data_scaler.inverse_transform( + self.network(inputs).to("cpu"), as_numpy=True + ) + ) # Restricting the actual quantities to physical meaningful values, # i.e. restricting the (L)DOS to positive values. 
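With the error calculation unified in ``Runner._calculate_errors``, ``Tester.test_snapshot`` below reduces to a thin wrapper around it. A usage sketch (run name and path are placeholders; a test snapshot is assumed to have been added to the data handler, as in example ex02):

.. code-block:: python

    import mala

    parameters, network, data_handler, tester = mala.Tester.load_run(
        run_name="my_model", path="./"  # placeholder run name and path
    )
    tester.observables_to_test = ["ldos", "band_energy", "density"]
    results = tester.test_snapshot(0)
    # results is a dict, e.g.:
    # {"ldos": <MSE>, "band_energy": <error in meV/atom>, "density": <MAE>}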
diff --git a/mala/network/tester.py b/mala/network/tester.py index 93e67b935..9a7831f57 100644 --- a/mala/network/tester.py +++ b/mala/network/tester.py @@ -61,7 +61,7 @@ def __init__( self.number_of_batches_per_snapshot = 0 self.observables_to_test = observables_to_test self.output_format = output_format - if self.output_format != "list" and self.output_format == "mae": + if self.output_format != "list" and self.output_format != "mae": raise Exception("Wrong output format for testing selected.") self.target_calculator = data.target_calculator @@ -117,22 +117,12 @@ def test_snapshot(self, snapshot_number, data_type="te"): snapshot_number, data_type=data_type ) - results = {} - for observable in self.observables_to_test: - try: - results[observable] = self.__calculate_observable_error( - snapshot_number, - observable, - predicted_outputs, - actual_outputs, - ) - except ValueError as e: - printout( - f"Error calculating observable: {observable} for snapshot {snapshot_number}", - min_verbosity=0, - ) - printout(e, min_verbosity=2) - results[observable] = np.inf + results = self._calculate_errors( + actual_outputs, + predicted_outputs, + self.observables_to_test, + snapshot_number, + ) return results def predict_targets(self, snapshot_number, data_type="te"): @@ -185,166 +175,6 @@ def predict_targets(self, snapshot_number, data_type="te"): self.parameters.mini_batch_size, ) - def __calculate_observable_error( - self, snapshot_number, observable, predicted_target, actual_target - ): - if observable == "ldos": - return np.mean((predicted_target - actual_target) ** 2) - - elif observable == "band_energy": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the band energy from this observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.band_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.band_energy - return actual - predicted - - elif observable == "band_energy_full": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the band energy from this observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.band_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.band_energy - return [ - actual, - predicted, - target_calculator.band_energy_dft_calculation, - ] - - elif observable == "number_of_electrons": - target_calculator = self.data.target_calculator - if ( - not isinstance(target_calculator, LDOS) - and not isinstance(target_calculator, DOS) - and not isinstance(target_calculator, Density) - ): - raise Exception( - "Cannot calculate the band energy from this observable." 
- ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - actual = target_calculator.get_number_of_electrons(actual_target) - - predicted = target_calculator.get_number_of_electrons( - predicted_target - ) - return actual - predicted - - elif observable == "total_energy": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.total_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.total_energy - return actual - predicted - - elif observable == "total_energy_full": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.total_energy - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.total_energy - return [ - actual, - predicted, - target_calculator.total_energy_dft_calculation, - ] - - elif observable == "density": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, Density - ): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - target_calculator.read_from_array(actual_target) - actual = target_calculator.density - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.density - return np.mean(np.abs((actual - predicted) / actual)) * 100 - - elif observable == "dos": - target_calculator = self.data.target_calculator - if not isinstance(target_calculator, LDOS) and not isinstance( - target_calculator, DOS - ): - raise Exception( - "Cannot calculate the total energy from this " - "observable." - ) - target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - - # We shift both the actual and predicted DOS by 1.0 to overcome - # numerical issues with the DOS having values equal to zero. - target_calculator.read_from_array(actual_target) - actual = target_calculator.density_of_states + 1.0 - - target_calculator.read_from_array(predicted_target) - predicted = target_calculator.density_of_states + 1.0 - - return ( - np.ma.masked_invalid( - np.abs( - (actual - predicted) - / (np.abs(actual) + np.abs(predicted)) - ) - ).mean() - * 100 - ) - def __prepare_to_test(self, snapshot_number): """Prepare the tester class to for test run.""" # We will use the DataSet iterator to iterate over the test data. 
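One behavioural detail of the unification: the ``"density"`` and ``"dos"`` observables now report a plain mean absolute error, while the percentage-style errors the old ``Tester`` computed live on as ``"density_relative"`` and ``"dos_relative"``; energy errors are likewise reported in meV/atom rather than as raw eV differences. The density distinction, sketched on placeholder data:

.. code-block:: python

    import numpy as np

    actual = np.array([0.10, 0.20, 0.30])  # placeholder density values
    predicted = np.array([0.11, 0.19, 0.33])

    # "density": mean absolute error
    mae = np.mean(np.abs(actual - predicted))  # ~0.0167

    # "density_relative": mean absolute percentage error,
    # matching what the old Tester reported for "density"
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100  # ~8.33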
diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 81977c40e..3cbf7cfad 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -21,6 +21,7 @@ from mala.datahandling.multi_lazy_load_data_loader import ( MultiLazyLoadDataLoader, ) +from tqdm.auto import trange, tqdm class Trainer(Runner): @@ -54,8 +55,6 @@ def __init__(self, params, network, data, optimizer_dict=None): self.network = DDP(self.network) torch.cuda.current_stream().wait_stream(s) - self.final_test_loss = float("inf") - self.initial_test_loss = float("inf") self.final_validation_loss = float("inf") self.initial_validation_loss = float("inf") self.optimizer = None @@ -65,36 +64,44 @@ def __init__(self, params, network, data, optimizer_dict=None): self.last_loss = None self.training_data_loaders = [] self.validation_data_loaders = [] - self.test_data_loaders = [] # Samplers for the ddp case. self.train_sampler = None - self.test_sampler = None self.validation_sampler = None self.__prepare_to_train(optimizer_dict) - self.tensor_board = None - self.full_visualization_path = None - if self.parameters.visualisation: - if not os.path.exists(self.parameters.visualisation_dir): - os.makedirs(self.parameters.visualisation_dir) - if self.parameters.visualisation_dir_append_date: + self.logger = None + self.full_logging_path = None + if self.parameters.logger is not None: + os.makedirs(self.parameters.logging_dir, exist_ok=True) + if self.parameters.logging_dir_append_date: date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - self.full_visualization_path = os.path.join( - self.parameters.visualisation_dir, date_time + if len(self.parameters.run_name) > 0: + name = self.parameters.run_name + "_" + date_time + else: + name = date_time + self.full_logging_path = os.path.join( + self.parameters.logging_dir, name ) - os.makedirs(self.full_visualization_path) + os.makedirs(self.full_logging_path, exist_ok=True) else: - self.full_visualization_path = ( - self.parameters.visualisation_dir - ) + self.full_logging_path = self.parameters.logging_dir # Set the path to log files - self.tensor_board = SummaryWriter(self.full_visualization_path) + if self.parameters.logger == "wandb": + import wandb + + self.logger = wandb + elif self.parameters.logger == "tensorboard": + self.logger = SummaryWriter(self.full_logging_path) + else: + raise Exception( + f"Unsupported logger {self.parameters.logger}." 
+ ) printout( - "Writing visualization output to", - self.full_visualization_path, + "Writing logging output to", + self.full_logging_path, min_verbosity=1, ) @@ -256,45 +263,10 @@ def train_network(self): # CALCULATE INITIAL METRICS ############################ - tloss = float("inf") - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.after_before_training_metric, - ) - - if self.data.test_data_sets: - tloss = self.__validate_network( - self.network, - "test", - self.parameters.after_before_training_metric, - ) - - # Collect and average all the losses from all the devices - if self.parameters_full.use_ddp: - vloss = self.__average_validation( - vloss, "average_loss", self.parameters._configuration["device"] - ) - self.initial_validation_loss = vloss - if self.data.test_data_sets: - tloss = self.__average_validation( - tloss, - "average_loss", - self.parameters._configuration["device"], - ) - self.initial_test_loss = tloss - - printout( - "Initial Guess - validation data loss: ", vloss, min_verbosity=1 - ) - if self.data.test_data_sets: - printout( - "Initial Guess - test data loss: ", tloss, min_verbosity=1 - ) + vloss = float("inf") # Save losses for later use. self.initial_validation_loss = vloss - self.initial_test_loss = tloss # Initialize all the counters. checkpoint_counter = 0 @@ -310,12 +282,16 @@ def train_network(self): # PERFORM TRAINING ############################ + total_batch_id = 0 + for epoch in range(self.last_epoch, self.parameters.max_number_epochs): start_time = time.time() # Prepare model for training. self.network.train() + training_loss_sum_logging = 0.0 + # Process each mini batch and save the training loss. training_loss_sum = torch.zeros( 1, device=self.parameters._configuration["device"] @@ -337,7 +313,15 @@ def train_network(self): t0 = time.time() batchid = 0 for loader in self.training_data_loaders: - for inputs, outputs in loader: + t = time.time() + for inputs, outputs in tqdm( + loader, + desc="training", + disable=self.parameters_full.verbosity < 2, + total=len(loader), + ): + dt = time.time() - t + printout(f"load time: {dt}", min_verbosity=3) if self.parameters.profiler_range is not None: if batchid == self.parameters.profiler_range[0]: @@ -348,6 +332,7 @@ def train_network(self): torch.cuda.nvtx.range_push(f"step {batchid}") torch.cuda.nvtx.range_push("data copy in") + t = time.time() inputs = inputs.to( self.parameters._configuration["device"], non_blocking=True, @@ -356,6 +341,8 @@ def train_network(self): self.parameters._configuration["device"], non_blocking=True, ) + dt = time.time() - t + printout(f"data copy in time: {dt}", min_verbosity=3) # data copy in torch.cuda.nvtx.range_pop() @@ -365,11 +352,12 @@ def train_network(self): # step torch.cuda.nvtx.range_pop() training_loss_sum += loss + training_loss_sum_logging += loss.item() if ( batchid != 0 and (batchid + 1) - % self.parameters.training_report_frequency + % self.parameters.training_log_interval == 0 ): torch.cuda.synchronize( @@ -378,10 +366,10 @@ def train_network(self): sample_time = time.time() - tsample avg_sample_time = ( sample_time - / self.parameters.training_report_frequency + / self.parameters.training_log_interval ) avg_sample_tput = ( - self.parameters.training_report_frequency + self.parameters.training_log_interval * inputs.shape[0] / sample_time ) @@ -389,18 +377,45 @@ def train_network(self): f"batch {batchid + 1}, " # /{total_samples}, " f"train avg time: {avg_sample_time} " f"train avg throughput: {avg_sample_tput}", - min_verbosity=2, 
+ min_verbosity=3, ) tsample = time.time() + + # summary_writer tensor board + if self.parameters.logger == "tensorboard": + training_loss_mean = ( + training_loss_sum_logging + / self.parameters.training_log_interval + ) + self.logger.add_scalars( + "ldos", + {"during_training": training_loss_mean}, + total_batch_id, + ) + self.logger.close() + training_loss_sum_logging = 0.0 + if self.parameters.logger == "wandb": + training_loss_mean = ( + training_loss_sum_logging + / self.parameters.training_log_interval + ) + self.logger.log( + { + "ldos_during_training": training_loss_mean + }, + step=total_batch_id, + ) + training_loss_sum_logging = 0.0 + batchid += 1 + total_batch_id += 1 + t = time.time() torch.cuda.synchronize( self.parameters._configuration["device"] ) t1 = time.time() printout(f"training time: {t1 - t0}", min_verbosity=2) - training_loss = training_loss_sum.item() / batchid - # Calculate the validation loss. and output it. torch.cuda.synchronize( self.parameters._configuration["device"] @@ -419,14 +434,20 @@ def train_network(self): self.network, inputs, outputs ) batchid += 1 - training_loss = training_loss_sum.item() / batchid - - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.during_training_metric, + dataset_fractions = ["validation"] + if self.parameters.validate_on_training_data: + dataset_fractions.append("train") + errors = self._validate_network( + dataset_fractions, self.parameters.validation_metrics ) - + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + errors[dataset_fraction][metric] = np.mean( + errors[dataset_fraction][metric] + ) + vloss = errors["validation"][ + self.parameters.during_training_metric + ] if self.parameters_full.use_ddp: vloss = self.__average_validation( vloss, @@ -434,41 +455,37 @@ def train_network(self): self.parameters._configuration["device"], ) if self.parameters_full.verbosity > 1: - printout( - "Epoch {0}: validation data loss: {1}, " - "training data loss: {2}".format( - epoch, vloss, training_loss - ), - min_verbosity=2, - ) + printout("Errors:", errors, min_verbosity=2) else: printout( - "Epoch {0}: validation data loss: {1}".format( - epoch, vloss - ), + f"Epoch {epoch}: validation data loss: {vloss:.3e}", min_verbosity=1, ) - # summary_writer tensor board - if self.parameters.visualisation: - self.tensor_board.add_scalars( - "Loss", - {"validation": vloss, "training": training_loss}, - epoch, - ) - self.tensor_board.add_scalar( - "Learning rate", self.parameters.learning_rate, epoch - ) - if self.parameters.visualisation == 2: - for name, param in self.network.named_parameters(): - self.tensor_board.add_histogram(name, param, epoch) - self.tensor_board.add_histogram( - f"{name}.grad", param.grad, epoch + if self.parameters.logger == "tensorboard": + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + self.logger.add_scalars( + metric, + { + dataset_fraction: errors[dataset_fraction][ + metric + ] + }, + total_batch_id, + ) + self.logger.close() + if self.parameters.logger == "wandb": + for dataset_fraction in dataset_fractions: + for metric in errors[dataset_fraction]: + self.logger.log( + { + f"{dataset_fraction}_{metric}": errors[ + dataset_fraction + ][metric] + }, + step=total_batch_id, ) - - # method to make sure that all pending events have been written - # to disk - self.tensor_board.close() if self.parameters._configuration["gpu"]: torch.cuda.synchronize( @@ -541,49 +558,141 @@ def train_network(self): 
############################ # CALCULATE FINAL METRICS ############################ - - if ( - self.parameters.after_before_training_metric - != self.parameters.during_training_metric - ): - vloss = self.__validate_network( - self.network, - "validation", - self.parameters.after_before_training_metric, + if self.parameters.after_training_metric in errors["validation"]: + self.final_validation_loss = errors["validation"][ + self.parameters.after_training_metric + ] + else: + final_errors = self._validate_network( + ["validation"], [self.parameters.after_training_metric] ) + vloss = np.mean( + final_errors["validation"][ + self.parameters.after_training_metric + ] + ) + if self.parameters_full.use_ddp: vloss = self.__average_validation( vloss, "average_loss", self.parameters._configuration["device"], ) - - # Calculate final loss. - self.final_validation_loss = vloss - printout("Final validation data loss: ", vloss, min_verbosity=0) - - tloss = float("inf") - if len(self.data.test_data_sets) > 0: - tloss = self.__validate_network( - self.network, - "test", - self.parameters.after_before_training_metric, - ) - if self.parameters_full.use_ddp: - tloss = self.__average_validation( - tloss, - "average_loss", - self.parameters._configuration["device"], - ) - printout("Final test data loss: ", tloss, min_verbosity=0) - self.final_test_loss = tloss + self.final_validation_loss = vloss # Clean-up for pre-fetching lazy loading. if self.data.parameters.use_lazy_loading_prefetch: self.training_data_loaders.cleanup() self.validation_data_loaders.cleanup() - if len(self.data.test_data_sets) > 0: - self.test_data_loaders.cleanup() + + def _validate_network(self, data_set_fractions, metrics): + # """Validate a network, using train or validation data.""" + self.network.eval() + errors = {} + for data_set_type in data_set_fractions: + if data_set_type == "train": + data_loaders = self.training_data_loaders + data_sets = self.data.training_data_sets + number_of_snapshots = self.data.nr_training_snapshots + offset_snapshots = 0 + + elif data_set_type == "validation": + data_loaders = self.validation_data_loaders + data_sets = self.data.validation_data_sets + number_of_snapshots = self.data.nr_validation_snapshots + offset_snapshots = self.data.nr_training_snapshots + + elif data_set_type == "test": + raise Exception( + "You should not look at test set results during training" + ) + else: + raise Exception( + f"Dataset type ({data_set_type}) not recognized." 
+ ) + + errors[data_set_type] = {} + for metric in metrics: + errors[data_set_type][metric] = [] + + if isinstance(data_loaders, MultiLazyLoadDataLoader): + loader_id = 0 + for loader in data_loaders: + grid_size = self.data.parameters.snapshot_directories_list[ + loader_id + offset_snapshots + ].grid_size + + actual_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + predicted_outputs = np.zeros( + (grid_size, self.data.output_dimension) + ) + last_start = 0 + + for x, y in loader: + + x = x.to(self.parameters._configuration["device"]) + length = int(x.size()[0]) + predicted_outputs[ + last_start : last_start + length, : + ] = self.data.output_data_scaler.inverse_transform( + self.network(x).to("cpu"), as_numpy=True + ) + actual_outputs[last_start : last_start + length, :] = ( + self.data.output_data_scaler.inverse_transform( + y, as_numpy=True + ) + ) + + last_start += length + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + loader_id + offset_snapshots, + ) + loader_id += 1 + else: + with torch.no_grad(): + for snapshot_number in trange( + offset_snapshots, + number_of_snapshots + offset_snapshots, + desc="Validation", + disable=self.parameters_full.verbosity < 2, + ): + # Get optimal batch size and number of batches per snapshotss + grid_size = ( + self.data.parameters.snapshot_directories_list[ + snapshot_number + ].grid_size + ) + + optimal_batch_size = ( + self._correct_batch_size_for_testing( + grid_size, self.parameters.mini_batch_size + ) + ) + number_of_batches_per_snapshot = int( + grid_size / optimal_batch_size + ) + + actual_outputs, predicted_outputs = ( + self._forward_entire_snapshot( + snapshot_number, + data_sets[0], + data_set_type[0:2], + number_of_batches_per_snapshot, + optimal_batch_size, + ) + ) + errors[data_set_type] = self._calculate_errors( + actual_outputs, + predicted_outputs, + metrics, + snapshot_number, + ) + return errors def __prepare_to_train(self, optimizer_dict): """Prepare everything for training.""" @@ -612,32 +721,30 @@ def __prepare_to_train(self, optimizer_dict): ) # Choose an optimizer to use. - if self.parameters.trainingtype == "SGD": + if self.parameters.optimizer == "SGD": self.optimizer = optim.SGD( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, ) - elif self.parameters.trainingtype == "Adam": + elif self.parameters.optimizer == "Adam": self.optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, ) - elif self.parameters.trainingtype == "FusedAdam": + elif self.parameters.optimizer == "FusedAdam": if version.parse(torch.__version__) >= version.parse("1.13.0"): self.optimizer = optim.Adam( self.network.parameters(), lr=self.parameters.learning_rate, - weight_decay=self.parameters.weight_decay, + weight_decay=self.parameters.l2_regularization, fused=True, ) else: - raise Exception( - "Training method requires at least torch 1.13.0." - ) + raise Exception("Optimizer requires " "at least torch 1.13.0.") else: - raise Exception("Unsupported training method.") + raise Exception("Unsupported optimizer.") # Load data from pytorch file. 
if optimizer_dict is not None: @@ -677,16 +784,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - self.test_sampler = ( - torch.utils.data.distributed.DistributedSampler( - self.data.test_data_sets[0], - num_replicas=dist.get_world_size(), - rank=dist.get_rank(), - shuffle=False, - ) - ) - # Instantiate the learning rate scheduler, if necessary. if self.parameters.learning_rate_scheduler == "ReduceLROnPlateau": self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( @@ -774,21 +871,6 @@ def __prepare_to_train(self, optimizer_dict): ) ) - if self.data.test_data_sets: - if isinstance(self.data.test_data_sets[0], LazyLoadDatasetSingle): - self.test_data_loaders = MultiLazyLoadDataLoader( - self.data.test_data_sets, **kwargs - ) - else: - self.test_data_loaders.append( - DataLoader( - self.data.test_data_sets[0], - batch_size=self.parameters.mini_batch_size * 1, - sampler=self.test_sampler, - **kwargs, - ) - ) - def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" if self.parameters._configuration["gpu"]: @@ -870,7 +952,10 @@ def __process_mini_batch(self, network, input_data, target_data): enabled=self.parameters.use_mixed_precision ): torch.cuda.nvtx.range_push("forward") + t = time.time() prediction = network(input_data) + dt = time.time() - t + printout(f"forward time: {dt}", min_verbosity=3) # forward torch.cuda.nvtx.range_pop() @@ -881,6 +966,8 @@ def __process_mini_batch(self, network, input_data, target_data): ) else: loss = network.calculate_loss(prediction, target_data) + dt = time.time() - t + printout(f"loss time: {dt}", min_verbosity=3) # loss torch.cuda.nvtx.range_pop() @@ -889,12 +976,15 @@ def __process_mini_batch(self, network, input_data, target_data): else: loss.backward() + t = time.time() torch.cuda.nvtx.range_push("optimizer") if self.gradscaler: self.gradscaler.step(self.optimizer) self.gradscaler.update() else: self.optimizer.step() + dt = time.time() - t + printout(f"optimizer time: {dt}", min_verbosity=3) torch.cuda.nvtx.range_pop() # optimizer if self.train_graph: @@ -912,327 +1002,6 @@ def __process_mini_batch(self, network, input_data, target_data): self.optimizer.zero_grad() return loss - def __validate_network(self, network, data_set_type, validation_type): - """Validate a network, using test or validation data.""" - if data_set_type == "test": - data_loaders = self.test_data_loaders - data_sets = self.data.test_data_sets - number_of_snapshots = self.data.nr_test_snapshots - offset_snapshots = ( - self.data.nr_validation_snapshots - + self.data.nr_training_snapshots - ) - - elif data_set_type == "validation": - data_loaders = self.validation_data_loaders - data_sets = self.data.validation_data_sets - number_of_snapshots = self.data.nr_validation_snapshots - offset_snapshots = self.data.nr_training_snapshots - - else: - raise Exception( - "Please select test or validation when using this function." 
- ) - network.eval() - if validation_type == "ldos": - validation_loss_sum = torch.zeros( - 1, device=self.parameters._configuration["device"] - ) - with torch.no_grad(): - if self.parameters._configuration["gpu"]: - report_freq = self.parameters.training_report_frequency - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - tsample = time.time() - batchid = 0 - for loader in data_loaders: - for x, y in loader: - x = x.to( - self.parameters._configuration["device"], - non_blocking=True, - ) - y = y.to( - self.parameters._configuration["device"], - non_blocking=True, - ) - - if ( - self.parameters.use_graphs - and self.validation_graph is None - ): - printout( - "Capturing CUDA graph for validation.", - min_verbosity=2, - ) - s = torch.cuda.Stream( - self.parameters._configuration["device"] - ) - s.wait_stream( - torch.cuda.current_stream( - self.parameters._configuration[ - "device" - ] - ) - ) - # Warmup for graphs - with torch.cuda.stream(s): - for _ in range(20): - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - prediction = network(x) - if self.parameters_full.use_ddp: - loss = network.module.calculate_loss( - prediction, y - ) - else: - loss = network.calculate_loss( - prediction, y - ) - torch.cuda.current_stream( - self.parameters._configuration["device"] - ).wait_stream(s) - - # Create static entry point tensors to graph - self.static_input_validation = ( - torch.empty_like(x) - ) - self.static_target_validation = ( - torch.empty_like(y) - ) - - # Capture graph - self.validation_graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self.validation_graph): - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - self.static_prediction_validation = ( - network( - self.static_input_validation - ) - ) - if self.parameters_full.use_ddp: - self.static_loss_validation = network.module.calculate_loss( - self.static_prediction_validation, - self.static_target_validation, - ) - else: - self.static_loss_validation = network.calculate_loss( - self.static_prediction_validation, - self.static_target_validation, - ) - - if self.validation_graph: - self.static_input_validation.copy_(x) - self.static_target_validation.copy_(y) - self.validation_graph.replay() - validation_loss_sum += ( - self.static_loss_validation - ) - else: - with torch.cuda.amp.autocast( - enabled=self.parameters.use_mixed_precision - ): - prediction = network(x) - if self.parameters_full.use_ddp: - loss = network.module.calculate_loss( - prediction, y - ) - else: - loss = network.calculate_loss( - prediction, y - ) - validation_loss_sum += loss - if ( - batchid != 0 - and (batchid + 1) % report_freq == 0 - ): - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - sample_time = time.time() - tsample - avg_sample_time = sample_time / report_freq - avg_sample_tput = ( - report_freq * x.shape[0] / sample_time - ) - printout( - f"batch {batchid + 1}, " # /{total_samples}, " - f"validation avg time: {avg_sample_time} " - f"validation avg throughput: {avg_sample_tput}", - min_verbosity=2, - ) - tsample = time.time() - batchid += 1 - torch.cuda.synchronize( - self.parameters._configuration["device"] - ) - else: - batchid = 0 - for loader in data_loaders: - for x, y in loader: - x = x.to(self.parameters._configuration["device"]) - y = y.to(self.parameters._configuration["device"]) - prediction = network(x) - if self.parameters_full.use_ddp: - validation_loss_sum += ( - network.module.calculate_loss( - prediction, y - ).item() - ) - 
else: - validation_loss_sum += network.calculate_loss( - prediction, y - ).item() - batchid += 1 - - validation_loss = validation_loss_sum.item() / batchid - return validation_loss - elif ( - validation_type == "band_energy" - or validation_type == "total_energy" - ): - errors = [] - if isinstance( - self.validation_data_loaders, MultiLazyLoadDataLoader - ): - loader_id = 0 - for loader in data_loaders: - grid_size = self.data.parameters.snapshot_directories_list[ - loader_id + offset_snapshots - ].grid_size - - actual_outputs = np.zeros( - (grid_size, self.data.output_dimension) - ) - predicted_outputs = np.zeros( - (grid_size, self.data.output_dimension) - ) - last_start = 0 - - for x, y in loader: - - x = x.to(self.parameters._configuration["device"]) - length = int(x.size()[0]) - predicted_outputs[ - last_start : last_start + length, : - ] = self.data.output_data_scaler.inverse_transform( - self.network(x).to("cpu"), as_numpy=True - ) - actual_outputs[last_start : last_start + length, :] = ( - self.data.output_data_scaler.inverse_transform( - y, as_numpy=True - ) - ) - - last_start += length - errors.append( - self._calculate_energy_errors( - actual_outputs, - predicted_outputs, - validation_type, - loader_id + offset_snapshots, - ) - ) - loader_id += 1 - - else: - for snapshot_number in range( - offset_snapshots, number_of_snapshots + offset_snapshots - ): - # Get optimal batch size and number of batches per snapshotss - grid_size = self.data.parameters.snapshot_directories_list[ - snapshot_number - ].grid_size - - optimal_batch_size = self._correct_batch_size_for_testing( - grid_size, self.parameters.mini_batch_size - ) - number_of_batches_per_snapshot = int( - grid_size / optimal_batch_size - ) - - actual_outputs, predicted_outputs = ( - self._forward_entire_snapshot( - snapshot_number, - data_sets[0], - data_set_type[0:2], - number_of_batches_per_snapshot, - optimal_batch_size, - ) - ) - - errors.append( - self._calculate_energy_errors( - actual_outputs, - predicted_outputs, - validation_type, - snapshot_number, - ) - ) - return np.mean(errors) - else: - raise Exception("Selected validation method not supported.") - - def _calculate_energy_errors( - self, actual_outputs, predicted_outputs, energy_type, snapshot_number - ): - self.data.target_calculator.read_additional_calculation_data( - self.data.get_snapshot_calculation_output(snapshot_number) - ) - if energy_type == "band_energy": - try: - fe_actual = self.data.target_calculator.get_self_consistent_fermi_energy( - actual_outputs - ) - be_actual = self.data.target_calculator.get_band_energy( - actual_outputs, fermi_energy=fe_actual - ) - - fe_predicted = self.data.target_calculator.get_self_consistent_fermi_energy( - predicted_outputs - ) - be_predicted = self.data.target_calculator.get_band_energy( - predicted_outputs, fermi_energy=fe_predicted - ) - return np.abs(be_predicted - be_actual) * ( - 1000 / len(self.data.target_calculator.atoms) - ) - except ValueError: - # If the training went badly, it might be that the above - # code results in an error, due to the LDOS being so wrong - # that the estimation of the self consistent Fermi energy - # fails. 
- return float("inf") - elif energy_type == "total_energy": - try: - fe_actual = self.data.target_calculator.get_self_consistent_fermi_energy( - actual_outputs - ) - be_actual = self.data.target_calculator.get_total_energy( - ldos_data=actual_outputs, fermi_energy=fe_actual - ) - - fe_predicted = self.data.target_calculator.get_self_consistent_fermi_energy( - predicted_outputs - ) - be_predicted = self.data.target_calculator.get_total_energy( - ldos_data=predicted_outputs, fermi_energy=fe_predicted - ) - return np.abs(be_predicted - be_actual) * ( - 1000 / len(self.data.target_calculator.atoms) - ) - except ValueError: - # If the training went badly, it might be that the above - # code results in an error, due to the LDOS being so wrong - # that the estimation of the self consistent Fermi energy - # fails. - return float("inf") - - else: - raise Exception("Invalid energy type requested.") - def __create_training_checkpoint(self): """ Create a checkpoint during training. @@ -1265,8 +1034,14 @@ def __create_training_checkpoint(self): torch.save( save_dict, optimizer_name, _use_new_zipfile_serialization=False ) - - self.save_run(self.parameters.checkpoint_name, save_runner=True) + if self.parameters.run_name != "": + self.save_run( + self.parameters.checkpoint_name, + save_runner=True, + save_path=self.parameters.run_name, + ) + else: + self.save_run(self.parameters.checkpoint_name, save_runner=True) @staticmethod def __average_validation(val, name, device="cpu"): diff --git a/test/all_lazy_loading_test.py b/test/all_lazy_loading_test.py index 065cbb86e..351c98292 100644 --- a/test/all_lazy_loading_test.py +++ b/test/all_lazy_loading_test.py @@ -38,7 +38,7 @@ def test_scaling(self): test_parameters.running.max_number_epochs = 3 test_parameters.running.mini_batch_size = 512 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Lazy loading test." test_parameters.network.nn_type = "feed-forward" test_parameters.running.use_gpu = True @@ -157,10 +157,7 @@ def test_scaling(self): test_parameters, test_network, data_handler ) test_trainer.train_network() - training_tester.append( - test_trainer.final_test_loss - - test_trainer.initial_test_loss - ) + training_tester.append(test_trainer.final_validation_loss) elif scalingtype == "feature-wise-standard": # The lazy-loading STD equation (and to a smaller amount the @@ -269,7 +266,7 @@ def test_performance_horovod(self): test_parameters.network.layer_activations = ["LeakyReLU"] test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 500 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.comment = "Horovod / lazy loading benchmark." 
test_parameters.network.nn_type = "feed-forward" test_parameters.manual_seed = 2021 @@ -352,8 +349,8 @@ def test_performance_horovod(self): [ hvdstring, llstring, - test_trainer.initial_test_loss, - test_trainer.final_test_loss, + test_trainer.initial_validation_loss, + test_trainer.final_validation_loss, time.time() - start_time, ] ) @@ -400,8 +397,8 @@ def _train_lazy_loading(prefetching): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" - test_parameters.verbosity = 2 + test_parameters.running.optimizer = "Adam" + test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True test_parameters.data.use_lazy_loading_prefetch = prefetching diff --git a/test/basic_gpu_test.py b/test/basic_gpu_test.py index dcd588ad1..514a70f21 100644 --- a/test/basic_gpu_test.py +++ b/test/basic_gpu_test.py @@ -91,7 +91,7 @@ def __run(use_gpu): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.manual_seed = 1002 test_parameters.running.use_shuffling_for_samplers = False test_parameters.use_gpu = use_gpu @@ -150,4 +150,4 @@ def __run(use_gpu): starttime = time.time() test_trainer.train_network() - return test_trainer.final_test_loss, time.time() - starttime + return test_trainer.final_validation_loss, time.time() - starttime diff --git a/test/checkpoint_hyperopt_test.py b/test/checkpoint_hyperopt_test.py index 28889c2df..a1909f21b 100644 --- a/test/checkpoint_hyperopt_test.py +++ b/test/checkpoint_hyperopt_test.py @@ -67,7 +67,7 @@ def __original_setup(n_trials): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" # Specify the number of trials, the hyperparameter optimizer should run # and the type of hyperparameter. diff --git a/test/checkpoint_training_test.py b/test/checkpoint_training_test.py index 4c56ed8eb..3bc5e83e3 100644 --- a/test/checkpoint_training_test.py +++ b/test/checkpoint_training_test.py @@ -20,7 +20,7 @@ def test_general(self): # First run the entire test. trainer = self.__original_setup(test_checkpoint_name, 40) trainer.train_network() - original_final_test_loss = trainer.final_test_loss + original_final_validation_loss = trainer.final_validation_loss # Now do the same, but cut at epoch 22 and see if it recovers the # correct result. 
@@ -28,9 +28,11 @@ def test_general(self): trainer.train_network() trainer = self.__resume_checkpoint(test_checkpoint_name, 40) trainer.train_network() - new_final_test_loss = trainer.final_test_loss + new_final_validation_loss = trainer.final_validation_loss assert np.isclose( - original_final_test_loss, new_final_test_loss, atol=accuracy + original_final_validation_loss, + new_final_validation_loss, + atol=accuracy, ) def test_learning_rate(self): @@ -144,7 +146,7 @@ def __original_setup( test_parameters.running.max_number_epochs = maxepochs test_parameters.running.mini_batch_size = 38 test_parameters.running.learning_rate = learning_rate - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.running.learning_rate_scheduler = ( learning_rate_scheduler ) diff --git a/test/complete_interfaces_test.py b/test/complete_interfaces_test.py index d793da77f..8aa7da85d 100644 --- a/test/complete_interfaces_test.py +++ b/test/complete_interfaces_test.py @@ -114,7 +114,7 @@ def test_ase_calculator(self): test_parameters.running.max_number_epochs = 100 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.targets.target_type = "LDOS" test_parameters.targets.ldos_gridsize = 11 test_parameters.targets.ldos_gridspacing_ev = 2.5 @@ -123,9 +123,7 @@ def test_ase_calculator(self): test_parameters.descriptors.descriptor_type = "Bispectrum" test_parameters.descriptors.bispectrum_twojmax = 10 test_parameters.descriptors.bispectrum_cutoff = 4.67637 - test_parameters.targets.pseudopotential_path = os.path.join( - data_repo_path, "Be2" - ) + test_parameters.targets.pseudopotential_path = data_path #################### # DATA diff --git a/test/examples_test.py b/test/examples_test.py index b5aa9143a..4a83dd538 100644 --- a/test/examples_test.py +++ b/test/examples_test.py @@ -6,6 +6,7 @@ import pytest + @pytest.mark.examples class TestExamples: dir_path = os.path.dirname(__file__) @@ -13,96 +14,85 @@ class TestExamples: def test_basic_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex01_train_network.py" + self.dir_path + "/../examples/basic/ex01_train_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex02(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex02_test_network.py" + self.dir_path + "/../examples/basic/ex02_test_network.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex03_preprocess_data.py" + self.dir_path + "/../examples/basic/ex03_preprocess_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex04(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex04_hyperparameter_optimization.py" + self.dir_path + + "/../examples/basic/ex04_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex05_run_predictions.py" + self.dir_path + "/../examples/basic/ex05_run_predictions.py" ) @pytest.mark.order(after="test_basic_ex01") def test_basic_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/basic/ex06_ase_calculator.py" + self.dir_path + 
"/../examples/basic/ex06_ase_calculator.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex01(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex01_checkpoint_training.py" + self.dir_path + "/../examples/advanced/ex01_checkpoint_training.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex02(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex02_shuffle_data.py" + self.dir_path + "/../examples/advanced/ex02_shuffle_data.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex03(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex03_tensor_board.py" + self.dir_path + "/../examples/advanced/ex03_tensor_board.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex04(self, tmp_path): os.chdir(tmp_path) - runpy.run_path( - self.dir_path + - "/../examples/advanced/ex04_acsd.py" - ) + runpy.run_path(self.dir_path + "/../examples/advanced/ex04_acsd.py") @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex05(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex05_checkpoint_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex06(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex06_distributed_hyperparameter_optimization.py" ) @pytest.mark.skipif( @@ -113,14 +103,14 @@ def test_advanced_ex06(self, tmp_path): def test_advanced_ex07(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" + self.dir_path + + "/../examples/advanced/ex07_advanced_hyperparameter_optimization.py" ) @pytest.mark.order(after="test_basic_ex01") def test_advanced_ex08(self, tmp_path): os.chdir(tmp_path) runpy.run_path( - self.dir_path + - "/../examples/advanced/ex08_visualize_observables.py" + self.dir_path + + "/../examples/advanced/ex08_visualize_observables.py" ) diff --git a/test/hyperopt_test.py b/test/hyperopt_test.py index bb003082a..77b0b9896 100644 --- a/test/hyperopt_test.py +++ b/test/hyperopt_test.py @@ -42,7 +42,7 @@ def test_hyperopt(self): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" @@ -133,7 +133,7 @@ def test_distributed_hyperopt(self): test_parameters.running.max_number_epochs = 5 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 20 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" @@ -242,7 +242,7 @@ def test_naswot_eigenvalues(self): test_parameters.running.max_number_epochs = 10 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" 
test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = "naswot" @@ -310,7 +310,7 @@ def __optimize_hyperparameters(hyper_optimizer): test_parameters.running.max_number_epochs = 20 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 8 test_parameters.hyperparameters.hyper_opt_method = hyper_optimizer @@ -352,7 +352,7 @@ def __optimize_hyperparameters(hyper_optimizer): # If we do a NASWOT run currently we can provide an input # array of trials. test_hp_optimizer.add_hyperparameter( - "categorical", "trainingtype", choices=["Adam", "SGD"] + "categorical", "optimizer", choices=["Adam", "SGD"] ) test_hp_optimizer.add_hyperparameter( "categorical", "layer_activation_00", choices=["ReLU", "Sigmoid"] @@ -375,7 +375,7 @@ def __optimize_hyperparameters(hyper_optimizer): ) test_trainer.train_network() test_parameters.show() - return test_trainer.final_test_loss + return test_trainer.final_validation_loss def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): @@ -391,7 +391,7 @@ def test_hyperopt_optuna_requeue_zombie_trials(self, tmp_path): test_parameters.running.max_number_epochs = 2 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.hyperparameters.n_trials = 2 test_parameters.hyperparameters.hyper_opt_method = "optuna" test_parameters.hyperparameters.study_name = "test_ho" diff --git a/test/shuffling_test.py b/test/shuffling_test.py index e637c7d2b..72d28d6ef 100644 --- a/test/shuffling_test.py +++ b/test/shuffling_test.py @@ -124,7 +124,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -168,7 +168,7 @@ def test_training(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True data_shuffler = mala.DataShuffler(test_parameters) @@ -220,7 +220,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True @@ -266,7 +266,7 @@ def test_training_openpmd(self): test_parameters.running.max_number_epochs = 50 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.verbosity = 1 test_parameters.data.use_lazy_loading = True diff --git a/test/workflow_test.py b/test/workflow_test.py index fa7dee018..8cc33faf6 100644 --- a/test/workflow_test.py +++ b/test/workflow_test.py @@ -29,28 +29,19 @@ def test_network_training(self): """Test whether MALA can train a NN.""" test_trainer = 
self.__simple_training() - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_openpmd(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_openpmd_data=True) - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_network_training_fast_dataset(self): """Test whether MALA can train a NN.""" test_trainer = self.__simple_training(use_fast_tensor_dataset=True) - assert ( - desired_loss_improvement_factor * test_trainer.initial_test_loss - > test_trainer.final_test_loss - ) + assert test_trainer.final_validation_loss < np.inf def test_preprocessing(self): """ @@ -191,16 +182,8 @@ def test_postprocessing_from_dos(self): self_consistent_fermi_energy = dos.get_self_consistent_fermi_energy( dos_data ) - number_of_electrons = dos.get_number_of_electrons( - dos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = dos.get_band_energy(dos_data) - assert np.isclose( - number_of_electrons, - dos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, dos.band_energy_dft_calculation, @@ -232,18 +215,10 @@ def test_postprocessing(self): self_consistent_fermi_energy = ldos.get_self_consistent_fermi_energy( ldos_data ) - number_of_electrons = ldos.get_number_of_electrons( - ldos_data, fermi_energy=self_consistent_fermi_energy - ) band_energy = ldos.get_band_energy( ldos_data, fermi_energy=self_consistent_fermi_energy ) - assert np.isclose( - number_of_electrons, - ldos.number_of_electrons_exact, - atol=accuracy_electrons, - ) assert np.isclose( band_energy, ldos.band_energy_dft_calculation, @@ -403,13 +378,12 @@ def test_training_with_postprocessing_data_repo(self): data_handler.prepare_data(reparametrize_scaler=False) # Instantiate and use a Tester object. - tester.observables_to_test = ["band_energy", "number_of_electrons"] + tester.observables_to_test = ["band_energy"] errors = tester.test_snapshot(0) # Check whether the prediction is accurate enough. - assert np.isclose(errors["band_energy"], 0, atol=accuracy_predictions) assert np.isclose( - errors["number_of_electrons"], 0, atol=accuracy_predictions + errors["band_energy"], 0, atol=accuracy_predictions * 1000 ) @pytest.mark.skipif( @@ -460,9 +434,6 @@ def test_predictions(self): band_energy_tester_class = ldos_calculator.get_band_energy( predicted_ldos ) - nr_electrons_tester_class = ldos_calculator.get_number_of_electrons( - predicted_ldos - ) #################### # Now, use the predictor class to make the same prediction. 
@@ -478,12 +449,6 @@ def test_predictions(self): ldos_calculator.read_additional_calculation_data( os.path.join(data_path, "Be_snapshot3.out"), "espresso-out" ) - - nr_electrons_predictor_class = ( - data_handler.target_calculator.get_number_of_electrons( - predicted_ldos - ) - ) band_energy_predictor_class = ( data_handler.target_calculator.get_band_energy(predicted_ldos) ) @@ -493,11 +458,6 @@ def test_predictions(self): band_energy_tester_class, atol=accuracy_strict, ) - assert np.isclose( - nr_electrons_predictor_class, - nr_electrons_tester_class, - atol=accuracy_strict, - ) @pytest.mark.skipif( importlib.util.find_spec("total_energy") is None @@ -568,7 +528,7 @@ def __simple_training( test_parameters.running.max_number_epochs = 400 test_parameters.running.mini_batch_size = 40 test_parameters.running.learning_rate = 0.00001 - test_parameters.running.trainingtype = "Adam" + test_parameters.running.optimizer = "Adam" test_parameters.data.use_fast_tensor_data_set = use_fast_tensor_dataset # Load data. From 4a3f56d69ec69d97d2f2a8c6080d3f1298e44b10 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Mon, 22 Jul 2024 12:30:28 +0200 Subject: [PATCH 02/10] Fix error saving --- mala/network/trainer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 3cbf7cfad..58a462463 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -646,12 +646,16 @@ def _validate_network(self, data_set_fractions, metrics): ) last_start += length - errors[data_set_type] = self._calculate_errors( + calculated_errors = self._calculate_errors( actual_outputs, predicted_outputs, metrics, loader_id + offset_snapshots, ) + for metric in metrics: + errors[data_set_type][metric].append( + calculated_errors[metric] + ) loader_id += 1 else: with torch.no_grad(): @@ -686,12 +690,16 @@ def _validate_network(self, data_set_fractions, metrics): optimal_batch_size, ) ) - errors[data_set_type] = self._calculate_errors( + calculated_errors = self._calculate_errors( actual_outputs, predicted_outputs, metrics, - snapshot_number, + loader_id + offset_snapshots, ) + for metric in metrics: + errors[data_set_type][metric].append( + calculated_errors[metric] + ) return errors def __prepare_to_train(self, optimizer_dict): From a9925b2d382526adf5f05ce895c816c3157138e1 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Mon, 22 Jul 2024 12:41:15 +0200 Subject: [PATCH 03/10] Updated .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index ca9313d8e..e237a43a3 100644 --- a/.gitignore +++ b/.gitignore @@ -158,6 +158,13 @@ cython_debug/ # JupyterNotebooks .ipynb_checkpoints */.ipynb_checkpoints/* +*.ipynb + +# Lightning +lightning_logs/ + +# wandb +wandb/ # SQLite *.db From 049d51d08921a1294ac6a8a30bb2a16532cada1a Mon Sep 17 00:00:00 2001 From: nerkulec Date: Mon, 22 Jul 2024 13:11:08 +0200 Subject: [PATCH 04/10] Fix UnboundLocal error --- mala/network/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 58a462463..1d5adf5d2 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -694,7 +694,7 @@ def _validate_network(self, data_set_fractions, metrics): actual_outputs, predicted_outputs, metrics, - loader_id + offset_snapshots, + snapshot_number, ) for metric in metrics: errors[data_set_type][metric].append( From f392ee744f269b382d4ca04b898003c7e62705f8 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Fri, 26 
Jul 2024 10:40:25 +0200
Subject: [PATCH 05/10] Minuscule error in docstring

---
 mala/network/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mala/network/runner.py b/mala/network/runner.py
index 17ce572b6..04d629da0 100644
--- a/mala/network/runner.py
+++ b/mala/network/runner.py
@@ -74,7 +74,6 @@ def _calculate_errors(
         errors : dict
             Dictionary containing the errors.
         """
-
         energy_metrics = [metric for metric in metrics if "energy" in metric]
         non_energy_metrics = [
             metric for metric in metrics if "energy" not in metric

From 5239d4caf5d9c4a92579c79de9360f108367c597 Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Wed, 21 Aug 2024 13:10:13 +0200
Subject: [PATCH 06/10] Added tqdm

---
 docs/source/conf.py | 1 +
 requirements.txt    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index d5a8c8b4e..1225852c5 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,6 +80,7 @@
     "asap3",
     "openpmd_io",
     "skspatial",
+    "tqdm",
 ]

 myst_heading_anchors = 3
diff --git a/requirements.txt b/requirements.txt
index b784a6c69..7a6be370e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ pandas
 tensorboard
 openpmd-api
 scikit-spatial
+tqdm

From 515c165ed16ef35d116557a88b46691be5fad155 Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Wed, 21 Aug 2024 13:12:40 +0200
Subject: [PATCH 07/10] Remove energy calculations with DFT fermi energy

---
 mala/network/runner.py | 39 +--------------------------------------
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/mala/network/runner.py b/mala/network/runner.py
index 04d629da0..9f91bf989 100644
--- a/mala/network/runner.py
+++ b/mala/network/runner.py
@@ -99,7 +99,7 @@ def _calculate_errors(
                 target_calculator, LDOS
             ) and not isinstance(target_calculator, Density):
                 raise Exception(
-                    "Cannot calculate the total energy from this "
+                    "Cannot calculate the density from this "
                     "observable."
) target_calculator.read_additional_calculation_data( @@ -294,31 +294,6 @@ def _calculate_energy_errors( errors[energy_type] = be_error except ValueError: errors[energy_type] = float("inf") - elif energy_type == "band_energy_dft_fe": - try: - target_calculator.read_from_array(predicted_outputs) - be_predicted_dft_fe = target_calculator.get_band_energy( - fermi_energy=fe_dft - ) - be_error_dft_fe = (be_predicted_dft_fe - be_actual) * ( - 1000 / len(target_calculator.atoms) - ) - errors[energy_type] = be_error_dft_fe - except ValueError: - errors[energy_type] = float("inf") - elif energy_type == "band_energy_actual_fe": - try: - target_calculator.read_from_array(predicted_outputs) - be_predicted_actual_fe = target_calculator.get_band_energy( - fermi_energy=fe_actual - ) - be_error_actual_fe = ( - be_predicted_actual_fe - be_actual - ) * (1000 / len(target_calculator.atoms)) - errors[energy_type] = be_error_actual_fe - except ValueError: - errors[energy_type] = float("inf") - elif energy_type == "total_energy": if not isinstance(target_calculator, LDOS): raise Exception( @@ -345,18 +320,6 @@ def _calculate_energy_errors( errors[energy_type] = te_error except ValueError: errors[energy_type] = float("inf") - elif energy_type == "total_energy_dft_fe": - try: - target_calculator.read_from_array(predicted_outputs) - te_predicted_dft_fe = target_calculator.get_total_energy( - fermi_energy=fe_dft - ) - te_error_dft_fe = (te_predicted_dft_fe - te_actual) * ( - 1000 / len(target_calculator.atoms) - ) - errors[energy_type] = te_error_dft_fe - except ValueError: - errors[energy_type] = float("inf") elif energy_type == "total_energy_actual_fe": try: target_calculator.read_from_array(predicted_outputs) From 7b093b5d0dbe9071140bd28d61bba3b4339dfa5e Mon Sep 17 00:00:00 2001 From: nerkulec Date: Wed, 21 Aug 2024 13:51:01 +0200 Subject: [PATCH 08/10] Fixed exceptions and added missing band_energy_actual_fe calculation --- mala/network/runner.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/mala/network/runner.py b/mala/network/runner.py index 9f91bf989..1a4837b99 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -121,7 +121,7 @@ def _calculate_errors( target_calculator, LDOS ) and not isinstance(target_calculator, Density): raise Exception( - "Cannot calculate the total energy from this " + "Cannot calculate the density from this " "observable." ) target_calculator.read_additional_calculation_data( @@ -145,7 +145,7 @@ def _calculate_errors( target_calculator, LDOS ) and not isinstance(target_calculator, DOS): raise Exception( - "Cannot calculate the total energy from this " + "Cannot calculate the DOS from this " "observable." ) target_calculator.read_additional_calculation_data( @@ -168,7 +168,7 @@ def _calculate_errors( target_calculator, LDOS ) and not isinstance(target_calculator, DOS): raise Exception( - "Cannot calculate the total energy from this " + "Cannot calculate the relative DOS from this " "observable." 
                )
            target_calculator.read_additional_calculation_data(
@@ -269,9 +269,6 @@ def _calculate_energy_errors(
             if energy_type == "fermi_energy":
                 fe_error = fe_predicted - fe_actual
                 errors[energy_type] = fe_error
-            elif energy_type == "fermi_energy_dft":
-                fe_error_dft = fe_predicted - fe_dft
-                errors[energy_type] = fe_error_dft
             elif energy_type == "band_energy":
                 if not isinstance(target_calculator, LDOS) and not isinstance(
                     target_calculator, DOS
                 ):
@@ -294,6 +291,26 @@ def _calculate_energy_errors(
                     errors[energy_type] = be_error
                 except ValueError:
                     errors[energy_type] = float("inf")
+            elif energy_type == "band_energy_actual_fe":
+                if not isinstance(target_calculator, LDOS) and not isinstance(
+                    target_calculator, DOS
+                ):
+                    raise Exception(
+                        "Cannot calculate the band energy from this "
+                        "observable."
+                    )
+                try:
+                    # Compute the actual band energy here as well, so this
+                    # metric does not rely on "band_energy" also having
+                    # been requested.
+                    target_calculator.read_from_array(actual_outputs)
+                    be_actual = target_calculator.get_band_energy(
+                        fermi_energy=fe_actual
+                    )
+                    target_calculator.read_from_array(predicted_outputs)
+                    be_predicted_actual_fe = (
+                        target_calculator.get_band_energy(
+                            fermi_energy=fe_actual
+                        )
+                    )
+                    be_error_actual_fe = (
+                        be_predicted_actual_fe - be_actual
+                    ) * (1000 / len(target_calculator.atoms))
+                    errors[energy_type] = be_error_actual_fe
+                except ValueError:
+                    errors[energy_type] = float("inf")
             elif energy_type == "total_energy":
                 if not isinstance(target_calculator, LDOS):
                     raise Exception(
@@ -321,6 +338,11 @@ def _calculate_energy_errors(
                 except ValueError:
                     errors[energy_type] = float("inf")
             elif energy_type == "total_energy_actual_fe":
+                if not isinstance(target_calculator, LDOS):
+                    raise Exception(
+                        "Cannot calculate the total energy from this "
+                        "observable."
+                    )
                 try:
                     target_calculator.read_from_array(predicted_outputs)
                     te_predicted_actual_fe = (

From 29fab9a55ff7434826fbf7f93f0947e201c4aeee Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Wed, 21 Aug 2024 14:01:38 +0200
Subject: [PATCH 09/10] Remove unused fe_dft

---
 mala/network/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mala/network/runner.py b/mala/network/runner.py
index 1a4837b99..2c78163e3 100644
--- a/mala/network/runner.py
+++ b/mala/network/runner.py
@@ -236,7 +236,6 @@ def _calculate_energy_errors(
         target_calculator.read_additional_calculation_data(output_file)

         errors = {}
-        fe_dft = target_calculator.fermi_energy_dft
         fe_actual = None
         fe_predicted = None
         try:

From 24f9f62b5ad8523d99c4a3d618dc089bd7499782 Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Wed, 21 Aug 2024 14:43:01 +0200
Subject: [PATCH 10/10] Get energy targets and predictions

---
 mala/network/runner.py | 160 +++++++++++++++++++++++++++++++++++++++++
 mala/network/tester.py |  32 ++++++++-
 2 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/mala/network/runner.py b/mala/network/runner.py
index 2c78163e3..fb5a99321 100644
--- a/mala/network/runner.py
+++ b/mala/network/runner.py
@@ -361,6 +361,166 @@ def _calculate_energy_errors(
         )
         return errors

+    def _calculate_energy_targets_and_predictions(
+        self, actual_outputs, predicted_outputs, energy_types, snapshot_number
+    ):
+        """
+        Calculate the energies corresponding to actual and predicted outputs.
+
+        Parameters
+        ----------
+        actual_outputs : numpy.ndarray
+            Actual outputs.
+
+        predicted_outputs : numpy.ndarray
+            Predicted outputs.
+
+        energy_types : list
+            List of energy types to calculate.
+
+        snapshot_number : int
+            Snapshot number for which the energies are calculated.
+
+        Returns
+        -------
+        targets : dict
+            Actual energies for the selected energy types.
+
+        predictions : dict
+            Predicted energies for the selected energy types.
+        """
+        target_calculator = self.data.target_calculator
+        output_file = self.data.get_snapshot_calculation_output(
+            snapshot_number
+        )
+        if not output_file:
+            raise Exception(
+                "Output file needed for energy calculations."
+            )
+        target_calculator.read_additional_calculation_data(output_file)
+
+        targets = {}
+        predictions = {}
+        fe_actual = None
+        fe_predicted = None
+        try:
+            fe_actual = target_calculator.get_self_consistent_fermi_energy(
+                actual_outputs
+            )
+        except ValueError:
+            targets = {
+                energy_type: np.nan for energy_type in energy_types
+            }
+            predictions = {
+                energy_type: np.nan for energy_type in energy_types
+            }
+            printout(
+                "CAUTION! LDOS ground truth is so wrong that the "
+                "estimation of the self consistent Fermi energy fails."
+            )
+            return targets, predictions
+        try:
+            fe_predicted = target_calculator.get_self_consistent_fermi_energy(
+                predicted_outputs
+            )
+        except ValueError:
+            targets = {
+                energy_type: np.nan for energy_type in energy_types
+            }
+            predictions = {
+                energy_type: np.nan for energy_type in energy_types
+            }
+            printout(
+                "CAUTION! LDOS prediction is so wrong that the "
+                "estimation of the self consistent Fermi energy fails."
+            )
+            return targets, predictions
+        for energy_type in energy_types:
+            if energy_type == "fermi_energy":
+                targets[energy_type] = fe_actual
+                predictions[energy_type] = fe_predicted
+            elif energy_type == "band_energy":
+                if not isinstance(target_calculator, LDOS) and not isinstance(
+                    target_calculator, DOS
+                ):
+                    raise Exception(
+                        "Cannot calculate the band energy from this "
+                        "observable."
+                    )
+                try:
+                    target_calculator.read_from_array(actual_outputs)
+                    be_actual = target_calculator.get_band_energy(
+                        fermi_energy=fe_actual
+                    )
+                    target_calculator.read_from_array(predicted_outputs)
+                    be_predicted = target_calculator.get_band_energy(
+                        fermi_energy=fe_predicted
+                    )
+                    targets[energy_type] = (
+                        be_actual * 1000 / len(target_calculator.atoms)
+                    )
+                    predictions[energy_type] = (
+                        be_predicted * 1000 / len(target_calculator.atoms)
+                    )
+                except ValueError:
+                    targets[energy_type] = np.nan
+                    predictions[energy_type] = np.nan
+            elif energy_type == "band_energy_actual_fe":
+                if not isinstance(target_calculator, LDOS) and not isinstance(
+                    target_calculator, DOS
+                ):
+                    raise Exception(
+                        "Cannot calculate the band energy from this "
+                        "observable."
+                    )
+                try:
+                    # Compute the actual band energy here as well, so this
+                    # metric does not rely on "band_energy" also having
+                    # been requested.
+                    target_calculator.read_from_array(actual_outputs)
+                    be_actual = target_calculator.get_band_energy(
+                        fermi_energy=fe_actual
+                    )
+                    target_calculator.read_from_array(predicted_outputs)
+                    be_predicted_actual_fe = (
+                        target_calculator.get_band_energy(
+                            fermi_energy=fe_actual
+                        )
+                    )
+                    targets[energy_type] = (
+                        be_actual * 1000 / len(target_calculator.atoms)
+                    )
+                    predictions[energy_type] = (
+                        be_predicted_actual_fe
+                        * 1000
+                        / len(target_calculator.atoms)
+                    )
+                except ValueError:
+                    targets[energy_type] = np.nan
+                    predictions[energy_type] = np.nan
+            elif energy_type == "total_energy":
+                if not isinstance(target_calculator, LDOS):
+                    raise Exception(
+                        "Cannot calculate the total energy from this "
+                        "observable."
+                    )
+                try:
+                    target_calculator.read_additional_calculation_data(
+                        self.data.get_snapshot_calculation_output(
+                            snapshot_number
+                        )
+                    )
+                    target_calculator.read_from_array(actual_outputs)
+                    te_actual = target_calculator.get_total_energy(
+                        fermi_energy=fe_actual
+                    )
+                    target_calculator.read_from_array(predicted_outputs)
+                    te_predicted = target_calculator.get_total_energy(
+                        fermi_energy=fe_predicted
+                    )
+                    targets[energy_type] = (
+                        te_actual * 1000 / len(target_calculator.atoms)
+                    )
+                    predictions[energy_type] = (
+                        te_predicted * 1000 / len(target_calculator.atoms)
+                    )
+                except ValueError:
+                    targets[energy_type] = np.nan
+                    predictions[energy_type] = np.nan
+            elif energy_type == "total_energy_actual_fe":
+                if not isinstance(target_calculator, LDOS):
+                    raise Exception(
+                        "Cannot calculate the total energy from this "
+                        "observable."
+                    )
+                try:
+                    # Compute the actual total energy here as well, so this
+                    # metric does not rely on "total_energy" also having
+                    # been requested.
+                    target_calculator.read_from_array(actual_outputs)
+                    te_actual = target_calculator.get_total_energy(
+                        fermi_energy=fe_actual
+                    )
+                    target_calculator.read_from_array(predicted_outputs)
+                    te_predicted_actual_fe = (
+                        target_calculator.get_total_energy(
+                            fermi_energy=fe_actual
+                        )
+                    )
+
+                    targets[energy_type] = (
+                        te_actual * 1000 / len(target_calculator.atoms)
+                    )
+                    predictions[energy_type] = (
+                        te_predicted_actual_fe
+                        * 1000
+                        / len(target_calculator.atoms)
+                    )
+                except ValueError:
+                    targets[energy_type] = np.nan
+                    predictions[energy_type] = np.nan
+            else:
+                raise Exception(
+                    f"Invalid energy type ({energy_type}) requested."
+                )
+        return targets, predictions
+
     def save_run(
         self,
         run_name,
diff --git a/mala/network/tester.py b/mala/network/tester.py
index 9a7831f57..1d80efedb 100644
--- a/mala/network/tester.py
+++ b/mala/network/tester.py
@@ -124,10 +124,40 @@ def test_snapshot(self, snapshot_number, data_type="te"):
                 snapshot_number,
             )
         return results
+
+    def get_energy_targets_and_predictions(
+        self, snapshot_number, data_type="te"
+    ):
+        """
+        Get the energy targets and predictions for a single snapshot.
+
+        Parameters
+        ----------
+        snapshot_number : int
+            Snapshot to test.
+
+        data_type : str
+            'tr', 'va', or 'te' indicating the partition to be tested.
+
+        Returns
+        -------
+        targets : dict
+            Actual energies for the selected energy observables.
+
+        predictions : dict
+            Predicted energies for the selected energy observables.
+        """
+        actual_outputs, predicted_outputs = self.predict_targets(
+            snapshot_number, data_type=data_type
+        )
+
+        energy_metrics = [
+            metric
+            for metric in self.observables_to_test
+            if "energy" in metric
+        ]
+        targets, predictions = self._calculate_energy_targets_and_predictions(
+            actual_outputs,
+            predicted_outputs,
+            energy_metrics,
+            snapshot_number,
+        )
+        return targets, predictions

     def predict_targets(self, snapshot_number, data_type="te"):
         """
-        Get actual and predicted output for a snapshot.
+        Get actual and predicted outputs for a snapshot.

         Parameters
         ----------