From 930c6e0c518640c7856536dbb20655aa76b9f0fc Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Mon, 18 Nov 2024 11:13:24 +0100
Subject: [PATCH 1/5] Included missing parameters' docstrings

---
 mala/common/parameters.py | 46 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 797dae210..37a0673e1 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -225,7 +225,6 @@ class ParametersNetwork(ParametersBase):
     ----------
     nn_type : string
         Type of the neural network that will be used. Currently supported are
-
         - "feed_forward" (default)
         - "transformer"
         - "lstm"
@@ -279,12 +278,12 @@ def __init__(self):
         self.layer_activations = ["Sigmoid"]
         self.loss_function_type = "mse"
 
-        # for LSTM/Gru + Transformer
-        self.num_hidden_layers = 1
-
         # for LSTM/Gru
         self.no_hidden_state = False
         self.bidirection = False
+
+        # for LSTM/Gru + Transformer
+        self.num_hidden_layers = 1
 
         # for transformer net
         self.dropout = 0.1
@@ -556,11 +555,6 @@ class ParametersData(ParametersBase):
 
     Attributes
     ----------
-    descriptors_contain_xyz : bool
-        Legacy option. If True, it is assumed that the first three entries of
-        the descriptor vector are the xyz coordinates and they are cut from the
-        descriptor vector. If False, no such cutting is peformed.
-
     snapshot_directories_list : list
         A list of all added snapshots.
 
@@ -699,6 +693,9 @@ class ParametersRunning(ParametersBase):
     checkpoint_name : string
         Name used for the checkpoints. Using this, multiple runs can be
        performed in the same directory.
+
+    run_name : string
+        Name of the run used for logging.
 
     logging_dir : string
         Name of the folder that logging files will be saved to.
@@ -707,6 +704,31 @@ class ParametersRunning(ParametersBase):
         If True, then upon creating logging files, these will be saved in a
         subfolder of logging_dir labelled with the starting date of the
         logging, to avoid having to change input scripts often.
+
+    logger : string
+        Name of the logger to be used. Currently supported are:
+        - "tensorboard": Tensorboard logger.
+        - "wandb": Weights and Biases logger.
+
+    validation_metrics : list
+        List of metrics to be used for validation. Default is ["ldos"].
+        Possible options are:
+        - "ldos": Loss on the LDOS.
+        - "band_energy": Band energy.
+        - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+        - "total_energy": Total energy.
+        - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+        - "fermi_energy": Fermi energy.
+        - "density": Electron density.
+        - "density_relative": Electron density (MAPE).
+        - "dos": Density of states.
+        - "dos_relative": Density of states (MAPE).
+
+    validate_on_training_data : bool
+        Whether to validate on the training data as well. Default is False.
+
+    validate_every_n_epochs : int
+        Determines how often validation is performed. Default is 1.
 
     inference_data_grid : list
         List holding the grid to be used for inference in the form of
@@ -728,12 +750,11 @@ class ParametersRunning(ParametersBase):
     def __init__(self):
         super(ParametersRunning, self).__init__()
         self.optimizer = "Adam"
-        self.learning_rate = 10 ** (-5)
+        self.learning_rate = 0.5
         self.learning_rate_embedding = 10 ** (-4)
         self.max_number_epochs = 100
         self.verbosity = True
         self.mini_batch_size = 10
-        self.snapshots_per_epoch = -1
         self.l1_regularization = 0.0
         self.l2_regularization = 0.0
 
@@ -752,7 +773,6 @@ def __init__(self):
         self.num_workers = 0
         self.use_shuffling_for_samplers = True
         self.checkpoints_each_epoch = 0
-        self.checkpoint_best_so_far = False
         self.checkpoint_name = "checkpoint_mala"
         self.run_name = ""
         self.logging_dir = "./mala_logging"
@@ -883,7 +903,7 @@ class ParametersHyperparameterOptimization(ParametersBase):
         that _xxx is only so that optuna will differentiate between variables.
         No reordering is performed by MALA; the order depends on the order
         in the list. _xxx can be essentially
-        anything.
+        anything.use_graphs
 
         Users normally don't have to fill this list by hand, the
         hyperparamer optimizer provide interfaces for this task.

From 6040f115312d182b55a8ac5c1b1b338601747338 Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Mon, 18 Nov 2024 11:20:32 +0100
Subject: [PATCH 2/5] Fix mistake

---
 mala/common/parameters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 37a0673e1..8c3a8ecb0 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -903,7 +903,7 @@ class ParametersHyperparameterOptimization(ParametersBase):
         that _xxx is only so that optuna will differentiate between variables.
         No reordering is performed by MALA; the order depends on the order
         in the list. _xxx can be essentially
-        anything.use_graphs
+        anything.
 
         Users normally don't have to fill this list by hand, the
         hyperparamer optimizer provide interfaces for this task.

From cab6b1257df26191f8f4c5cdeeecf046075e69ec Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Tue, 19 Nov 2024 16:24:52 +0100
Subject: [PATCH 3/5] Added documentation

---
 docs/source/advanced_usage/trainingmodel.rst | 59 +++++++++++++++++---
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst
index 290aa15f3..569bf9d80 100644
--- a/docs/source/advanced_usage/trainingmodel.rst
+++ b/docs/source/advanced_usage/trainingmodel.rst
@@ -194,22 +194,64 @@ keyword, you can fine-tune the number of new snapshots being created.
 By default, the same number of snapshots as had been provided will be
 created (if possible).
 
-Using tensorboard
+Logging metrics during training
 ******************
 
-Training routines in MALA can be visualized via tensorboard, as also shown
-in the file ``advanced/ex03_tensor_board``. Simply enable tensorboard
-visualization prior to training via
+Training progress in MALA can be visualized via tensorboard or wandb, as also shown
+in the file ``advanced/ex03_tensor_board``. Simply select a logger prior to training as
 
 .. code-block:: python
 
-    # 0: No visualizatuon, 1: loss and learning rate, 2: like 1,
-    # but additionally weights and biases are saved
-    parameters.running.logging = 1
+    parameters.running.logger = "tensorboard"
     parameters.running.logging_dir = "mala_vis"
 
+or
+
+.. code-block:: python
+
+    import wandb
+    wandb.init(
+        project="mala_training",
+        entity="your_wandb_entity"
+    )
+    parameters.running.logger = "wandb"
+    parameters.running.logging_dir = "mala_vis"
+
 where ``logging_dir`` specifies some directory in which to save the
-MALA logging data. Afterwards, you can run the training without any
+MALA logging data. You can also select which metrics to record via
+
+.. code-block:: python
+
+    parameters.running.validation_metrics = ["ldos", "dos", "density", "total_energy"]
+
+Full list of available metrics:
+    - "ldos": MSE of the LDOS.
+    - "band_energy": Band energy.
+    - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
+    - "total_energy": Total energy.
+    - "total_energy_actual_fe": Total energy computed with ground truth Fermi energy.
+    - "fermi_energy": Fermi energy.
+    - "density": Electron density.
+    - "density_relative": Electron density (Mean Absolute Percentage Error).
+    - "dos": Density of states.
+    - "dos_relative": Density of states (Mean Absolute Percentage Error).
+
+To save time and resources, you can specify the validation interval via
+
+.. code-block:: python
+
+    parameters.running.validate_every_n_epochs = 10
+
+If you want to monitor the degree to which the model overfits to the training data,
+you can use the option
+
+.. code-block:: python
+
+    parameters.running.validate_on_training_data = True
+
+MALA will evaluate the validation metrics on the training set as well as the validation set.
+
+Afterwards, you can run the training without any
 other modifications. Once training is finished (or during training, in case
 you want to use tensorboard to monitor progress), you can launch tensorboard
 via
@@ -221,6 +263,7 @@ via
 
 The full path for ``path_to_log_directory`` can be accessed via
 ``trainer.full_logging_path``.
+If you're using wandb, you can monitor the training progress on the wandb website.
 
 Training in parallel
 ********************

From ae2e0edd3fbbd2f4ebdb4c420a77ebe465d1691d Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Tue, 19 Nov 2024 17:17:28 +0100
Subject: [PATCH 4/5] Fixing minor issues

---
 docs/source/advanced_usage/trainingmodel.rst | 2 +-
 mala/common/parameters.py                    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst
index 569bf9d80..9b118d86b 100644
--- a/docs/source/advanced_usage/trainingmodel.rst
+++ b/docs/source/advanced_usage/trainingmodel.rst
@@ -195,7 +195,7 @@ By default, the same number of snapshots as had been provided will be
 created (if possible).
 
 Logging metrics during training
-******************
+*******************************
 
 Training progress in MALA can be visualized via tensorboard or wandb, as also shown
 in the file ``advanced/ex03_tensor_board``. Simply select a logger prior to training as

diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 8c3a8ecb0..520f3a4d5 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -713,7 +713,7 @@ class ParametersRunning(ParametersBase):
     validation_metrics : list
         List of metrics to be used for validation. Default is ["ldos"].
         Possible options are:
-        - "ldos": Loss on the LDOS.
+        - "ldos": MSE of the LDOS.
         - "band_energy": Band energy.
         - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
         - "total_energy": Total energy.
@@ -743,8 +743,8 @@ class ParametersRunning(ParametersBase):
 
     profiler_range : list
         List with two entries determining with which batch/iteration number
-        the CUDA profiler will start and stop profiling. Please note that 
-        this option only holds significance if the nsys profiler is used. 
+        the CUDA profiler will start and stop profiling. Please note that
+        this option only holds significance if the nsys profiler is used.
     """
 
     def __init__(self):

From 85146db51400c01b59e3c0ab4ab222b943a4d2b7 Mon Sep 17 00:00:00 2001
From: nerkulec
Date: Fri, 22 Nov 2024 16:19:50 +0100
Subject: [PATCH 5/5] Fix typo and indentation

---
 mala/common/parameters.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 520f3a4d5..dbc074736 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -687,7 +687,7 @@ class ParametersRunning(ParametersBase):
         a "by snapshot" basis.
 
     checkpoints_each_epoch : int
-        If not 0, checkpoint files will be saved after eac
+        If not 0, checkpoint files will be saved after each
         checkpoints_each_epoch epoch.
 
     checkpoint_name : string
@@ -706,13 +706,16 @@ class ParametersRunning(ParametersBase):
         of the logging, to avoid having to change input scripts often.
 
     logger : string
-        Name of the logger to be used. Currently supported are:
+        Name of the logger to be used.
+        Currently supported are:
+
         - "tensorboard": Tensorboard logger.
         - "wandb": Weights and Biases logger.
 
     validation_metrics : list
         List of metrics to be used for validation. Default is ["ldos"].
         Possible options are:
+
         - "ldos": MSE of the LDOS.
         - "band_energy": Band energy.
         - "band_energy_actual_fe": Band energy computed with ground truth Fermi energy.
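
Taken together, the options documented in this patch series can be combined in a single
training setup. The following is a minimal sketch, assuming the standard ``mala.Parameters``
entry point; the metric choices, interval values, and run name are illustrative, and only
attributes named in the patches above are set:

.. code-block:: python

    import mala

    parameters = mala.Parameters()

    # Select a logger and a directory for its output (see PATCH 3).
    parameters.running.logger = "tensorboard"
    parameters.running.logging_dir = "./mala_logging"

    # Metrics evaluated during validation; the default is ["ldos"].
    parameters.running.validation_metrics = ["ldos", "band_energy"]

    # Validate only every 5 epochs, and also on the training data,
    # to monitor overfitting (see validate_on_training_data above).
    parameters.running.validate_every_n_epochs = 5
    parameters.running.validate_on_training_data = True

    # Checkpointing and run naming, as documented in ParametersRunning.
    parameters.running.checkpoints_each_epoch = 10
    parameters.running.checkpoint_name = "checkpoint_mala"
    parameters.running.run_name = "docs_example_run"  # hypothetical name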