Merge branch 'f/dvc_live' into 'main'

Add extended results logging infrastructure See merge request es/ai/hannah/hannah!366
ekut-es · Jan 8, 2024 · 329f311 · 329f311
2 parents a0b6888 + 8961eb5
commit 329f311
Show file tree

Hide file tree

Showing 26 changed files with 2,898 additions and 3,052 deletions.
diff --git a/README.md b/README.md
@@ -135,10 +135,10 @@ Training is invoked by
 
     hannah-train
 
-If available the first GPU of the system will be used by default. Selecting another GPU is possible using the argument trainer.`gpus=[number]`
+If available, the first GPU of the system will be used by default. Selecting another GPU is possible using the argument `trainer.devices=[number]`
 e.g. for GPU 2 use:
 
-    hannah-train trainer.gpus=[2]
+    hannah-train trainer.devices=[2]
 
 Trained models are saved under `trained_models/<experiment_id>/<model_name>`.
 
@@ -190,7 +190,7 @@ Training of emergency siren detection dataset is invoked by:
 
 # Parallel Launchers
 
-To launch multiple optimizations in parallel you can use a hydra launcher. 
+To launch multiple optimizations in parallel you can use a hydra launcher.
 
 Joblib launcher is installed by default:
 
@@ -225,7 +225,6 @@ best of your abilities.
 
 # Automatic Mirroring
 
-This project automatically mirrors its *main* branch and all branches prefixed with *pub/* to its public github repository. 
-
-These branches are configured as protected branches by default. 
+This project automatically mirrors its *main* branch and all branches prefixed with *pub/* to its public GitHub repository.
 
+These branches are configured as protected branches by default.
diff --git a/doc/configuration/configuration.md b/doc/configuration/configuration.md
@@ -1,8 +1,8 @@
 <!--
-Copyright (c) 2022 University of Tübingen.
+Copyright (c) 2023 Hannah contributors.
 
 This file is part of hannah.
-See https://atreus.informatik.uni-tuebingen.de/ties/ai/hannah/hannah for further info.
+See https://github.com/ekut-es/hannah for further info.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -221,7 +221,7 @@ Capsules the options to the lightning trainer. Currently it sets the following d
 Default options are:
 
 
-`gpus`
+`devices`
 : 1
 
 `auto_select_gpus`

diff --git a/doc/configuration/multi_gpu.md b/doc/configuration/multi_gpu.md
@@ -1,8 +1,8 @@
 <!--
-Copyright (c) 2022 University of Tübingen.
+Copyright (c) 2023 Hannah contributors.
 
 This file is part of hannah.
-See https://atreus.informatik.uni-tuebingen.de/ties/ai/hannah/hannah for further info.
+See https://github.com/ekut-es/hannah for further info.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -22,13 +22,13 @@ Hannah supports multi GPU-Training using the lightning distributed APIs:
 
 We provide preset trainer configs for distributed data parallel training:
 
-```hannah-train trainer=ddp trainer.gpus=[0,1]```
+```hannah-train trainer=ddp trainer.devices=[0,1]```
 
 
 And for sharded training using fairscale:
 
 
-```hannah-train trainer=sharded trainer.gpus=[0,1]```
+```hannah-train trainer=sharded trainer.devices=[0,1]```
 
 
-Sharded training distributes some of the model parameters across multiple GPUs and allows fitting bigger models in the same amount of GPU memory.
+Sharded training distributes some of the model parameters across multiple devices and allows fitting bigger models in the same amount of GPU memory.
diff --git a/experiments/kvasir_ae/config.yaml b/experiments/kvasir_ae/config.yaml
@@ -18,11 +18,11 @@
 ##
 defaults:
     - base_config
-    - override dataset: kvasir_unlabeled   # Dataset configuration name
+    - override dataset: kvasir_capsule   # Dataset configuration name
     - override features: identity        # Feature extractor configuration name (use identity for vision datasets)
-    - override model: timm_mobilenetv3_small_075      # Neural network name (for now timm_resnet50 or timm_efficientnet_lite1)
-    - override scheduler: 1cycle         # learning rate scheduler config name
-    - override optimizer: adamw          # Optimizer config name
+    - override model: timm_resnet18      # Neural network name (for now timm_resnet50 or timm_efficientnet_lite1)
+    - override scheduler: cosine_warm         # learning rate scheduler config name
+    - override optimizer: sgd          # Optimizer config name
     - override normalizer: null          # Feature normalizer (used for quantized neural networks)
     - override module: image_classifier  # Lightning module config for the training loop (image classifier for image classification tasks)
     - _self_
@@ -35,4 +35,15 @@ trainer:
   max_epochs: 50
 
 scheduler:
-  max_lr: 0.01
+  T_0: 10
+
+optimizer:
+  lr: 0.1
+  momentum: 0.9
+  weight_decay: 0.0001
+
+
+model:
+  pretrained: false
+  decoder: true
+  classifier: false
diff --git a/hannah/callbacks/prediction_logger.py b/hannah/callbacks/prediction_logger.py
@@ -0,0 +1,56 @@
+#
+# Copyright (c) 2024 Hannah contributors.
+#
+# This file is part of hannah.
+# See https://github.com/ekut-es/hannah for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from pytorch_lightning import Callback
+from torch import is_tensor
+
+
+class PredictionLogger(Callback):
+    """Callback that buffers the step outputs of validation and test batches.
+
+    Outputs are collected per stage in ``val_values`` / ``test_values`` and
+    handed to ``_write_values`` at the end of the corresponding epoch, after
+    which the buffer of that stage is reset.
+    """
+
+    def setup(self, trainer, pl_module, stage):
+        """Create the empty per-stage output buffers."""
+        self.val_values = []
+        self.test_values = []
+
+    def on_validation_epoch_end(self, trainer, pl_module):
+        """Write the buffered validation outputs and reset the buffer."""
+        # The validation buffer must be written here; the test buffer is
+        # handled exclusively by on_test_epoch_end.
+        self._write_values("val", self.val_values)
+        self.val_values = []
+
+    def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0
+    ):
+        """Record the outputs of one validation batch."""
+        self._log("val", self.val_values, batch, outputs, batch_idx, dataloader_idx)
+
+    def on_test_epoch_end(self, trainer, pl_module):
+        """Write the buffered test outputs and reset the buffer."""
+        self._write_values("test", self.test_values)
+        self.test_values = []
+
+    def on_test_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0
+    ):
+        """Record the outputs of one test batch."""
+        self._log("test", self.test_values, batch, outputs, batch_idx, dataloader_idx)
+
+    def _log(self, stage, values, batch, outputs, batch_idx, dataloader_idx):
+        """Append the outputs of one batch to ``values``.
+
+        For dict outputs, every tensor entry is detached, moved to the CPU and
+        converted to a numpy array before it is stored, so the buffer does not
+        keep device tensors alive for the whole epoch. Non-tensor entries and
+        non-dict outputs are stored unchanged.
+        """
+        if isinstance(outputs, dict):
+            # Build a new dict so the outputs object owned by the training
+            # loop is not mutated.
+            outputs = {
+                key: value.detach().cpu().numpy() if is_tensor(value) else value
+                for key, value in outputs.items()
+            }
+
+        values.append(outputs)
+
+    def _write_values(self, stage, values):
+        """Persist the collected ``values`` of ``stage``; currently a no-op."""
+        pass
diff --git a/hannah/conf/objectdetection_eval.yaml b/hannah/conf/objectdetection_eval.yaml
@@ -21,7 +21,7 @@ checkpoints: [trained_models/fasterrcnn_brs_new/faster-rcnn-resnet50/best.ckpt]
 noise: []
 methods: ["original", "full_augmented", "real_rain", "dawn_rain", "dawn_snow", "dawn_fog"]
 output_dir: eval
-gpus: [1]
+devices: [1]
 
 augmentation:
   - augmented_pct: 50

diff --git a/hannah/conf/trainer/cross_validation.yaml b/hannah/conf/trainer/cross_validation.yaml
@@ -17,7 +17,8 @@
 ## limitations under the License.
 ##
 _target_: hannah.trainer.CrossValidationTrainer
-gpus: 1
+accelerator: auto
+devices: 1
 limit_train_batches: 1.0
 limit_val_batches: 1.0
 limit_test_batches: 1.0

diff --git a/hannah/conf/trainer/default.yaml b/hannah/conf/trainer/default.yaml
@@ -17,7 +17,8 @@
 ## limitations under the License.
 ##
 _target_: pytorch_lightning.trainer.Trainer
-gpus: 1
+accelerator: auto
+devices: 1
 limit_train_batches: 1.0
 limit_val_batches: 1.0
 limit_test_batches: 1.0
@@ -28,8 +29,8 @@ overfit_batches: 0.0
 benchmark: False
 deterministic: "warn"
 gradient_clip_val: 0
-auto_scale_batch_size: null
 accumulate_grad_batches: 1
 plugins: null
-strategy: null
+strategy: auto
 reload_dataloaders_every_n_epochs: 0
+precision: 32
diff --git a/hannah/models/timm.py b/hannah/models/timm.py
@@ -160,7 +160,15 @@ def __init__(self, latent_shape, input_shape):
                 ),
                 nn.BatchNorm2d(out_channels),
                 nn.LeakyReLU(2.0),
-                # nn.Upsample(scale_factor=2.0),
+                nn.Conv2d(
+                    out_channels,
+                    out_channels,
+                    3,
+                    stride=1,
+                    padding=(1, 1),
+                ),
+                nn.BatchNorm2d(out_channels),
+                nn.LeakyReLU(2.0),
             )
 
             dim_x *= 2.0
@@ -170,9 +178,9 @@ def __init__(self, latent_shape, input_shape):
             upscale.append(stage)
 
         stage = nn.Sequential(
+            nn.Upsample(size=(input_x, input_y)),
             nn.Conv2d(channels, input_channels, 3, padding=(1, 1)),
             nn.BatchNorm2d(input_channels),
-            nn.Upsample(size=(input_x, input_y)),
             nn.Tanh(),
         )
         upscale.append(stage)

diff --git a/hannah/modules/angle_classifier.py b/hannah/modules/angle_classifier.py
@@ -58,7 +58,6 @@ def setup(self, stage):
         self.initialized = True
 
         if self.hparams.dataset is not None:
-
             # trainset needed to set values in hparams
             self.train_set, self.dev_set, self.test_set = self.get_split()
 
@@ -69,14 +68,10 @@ def setup(self, stage):
         self.example_input_array = self.get_example_input_array()
         dummy_input = self.example_input_array.to(device)
         logging.info("Example input array shape: %s", str(dummy_input.shape))
-        if platform.machine() == "ppc64le":
-            dummy_input = dummy_input.to("cuda:" + str(self.gpus[0]))
 
         # Instantiate features
         self.features = instantiate(self.hparams.features)
         self.features.to(device)
-        if platform.machine() == "ppc64le":
-            self.features.to("cuda:" + str(self.gpus[0]))
 
         features = self._extract_features(dummy_input)
         self.example_feature_array = features.to(self.device)
@@ -146,7 +141,6 @@ def training_step(self, batch, batch_idx):
 
     # VALIDATION CODE
     def validation_step(self, batch, batch_idx):
-
         # dataloader provides these four entries per batch
         x, x_length, y, y_length = batch
 
@@ -160,7 +154,6 @@ def validation_step(self, batch, batch_idx):
 
     # TEST CODE
     def test_step(self, batch, batch_idx):
-
         # dataloader provides these four entries per batch
         x, x_length, y, y_length = batch