From 10e05b0b1870ff7b6c70b0b4ba619799bd597b49 Mon Sep 17 00:00:00 2001
From: Julia Werner <julia-helga.werner@uni-tuebingen.de>
Date: Thu, 22 Feb 2024 11:03:17 +0000
Subject: [PATCH] Dresden Capsule Dataset Integration

---
 experiments/dresden_capsule/config.yaml   |  2 +-
 hannah/conf/dataset/dresden_capsule.yaml  | 14 ++++-
 hannah/datasets/vision/dresden_capsule.py | 63 ++++++++++++++++++-----
 3 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/experiments/dresden_capsule/config.yaml b/experiments/dresden_capsule/config.yaml
index e585a43a..2541fd76 100644
--- a/experiments/dresden_capsule/config.yaml
+++ b/experiments/dresden_capsule/config.yaml
@@ -31,7 +31,7 @@ dataset:
   data_folder: ${oc.env:HANNAH_DATA_FOLDER,${hydra:runtime.cwd}/../../datasets/}
 
 module:
-  batch_size: 32
+  batch_size: 128
 
 trainer:
   max_epochs: 15
diff --git a/hannah/conf/dataset/dresden_capsule.yaml b/hannah/conf/dataset/dresden_capsule.yaml
index 16dee941..b285146f 100644
--- a/hannah/conf/dataset/dresden_capsule.yaml
+++ b/hannah/conf/dataset/dresden_capsule.yaml
@@ -25,5 +25,17 @@ dataset: dresden_capsule
 sampler: random
 weighted_loss: false
 
-task: section
+task: sections # Check splits folder for other task options.
 split: split_0
+
+downsampling:
+  enabled: true
+  ratio:
+    binary: 1 # number of normal samples kept per kept anomaly sample (normal : anomaly)
+    # Fraction of each class's samples to keep, listed in label (column) order.
+    sections: [1, 1, 0.05, 0.01, 0.005]
+    technical_multilabel_bubbles_dirt: [1, 1]
+    technical_multiclass_view: [1, 1, 1]
+  anomalies_fraction: 0.3 # fraction of anomaly samples to keep; only used by the binary task
+
+seed: 1234
\ No newline at end of file
diff --git a/hannah/datasets/vision/dresden_capsule.py b/hannah/datasets/vision/dresden_capsule.py
index 71e29ae3..8d40042a 100644
--- a/hannah/datasets/vision/dresden_capsule.py
+++ b/hannah/datasets/vision/dresden_capsule.py
@@ -25,29 +25,65 @@
 
 import logging
 import pathlib
-import shutil
 
 import numpy as np
 import pandas as pd
-import torchvision
 from albumentations.pytorch import ToTensorV2
 import albumentations as A
 import tqdm
-
+from sklearn.utils import resample
 from .base import ImageDatasetBase
 
 logger = logging.getLogger(__name__)
 
 
 def prepare_data(study_folder: pathlib.Path, data: pd.DataFrame):
-    label_names = list(data.columns)[:-1]
 
+    label_names = list(data.columns)[:-1]  # every column except the last one
     files = [study_folder / image for image in data["path"].to_list()]
-    labels = np.argmax(data.iloc[:, :-1].values, axis=1)
+    # Turn each row's label columns into an index into label_names.
+    if len(label_names) > 1: # several label columns (section / technical tasks): index of the row-wise max
+        labels = np.argmax(data.iloc[:, :-1].values, axis=1)
+    
+    else: # single label column: treated as the binary task (normal vs. one anomaly)
+        label_names.insert(0, 'normal')  # 'normal' takes index 0, the anomaly name moves to index 1
+        labels = np.max(data.iloc[:, :-1].values, axis=1)  # assumes integer 0/1 values - TODO confirm
+    
     labels = [label_names[x] for x in labels]
 
     return files, labels, label_names
 
+def downsampling(X: list, y: list, labels: list, config: dict):
+    """Subsample X and y class by class, without replacement.
+
+    For the 'sections' and 'technical_*' tasks, class labels[i] keeps the fraction
+    config.downsampling.ratio[task][i] of its samples. Any other task is treated as
+    binary: anomalies_fraction of the labels[1] samples are kept, plus ratio.binary
+    times as many labels[0] samples (capped at the number available).
+
+    config is read by attribute (config.task, config.seed, config.downsampling).
+    Returns (X, y) as numpy arrays restricted to the kept indices, in original order.
+    """
+    task, seed, cfg = config.task, config.seed, config.downsampling
+    y = np.array(y)  # convert once, up front, so the final indexing never sees a list
+    class_idx = [np.where(y == name)[0] for name in labels]
+    if task in ('sections', 'technical_multiclass_view', 'technical_multilabel_bubbles_dirt'):
+        counts = [int(cfg.ratio[task][i] * len(idx)) for i, idx in enumerate(class_idx)]
+    else:  # Assuming binary task
+        n_anomaly = int(len(class_idx[1]) * cfg.anomalies_fraction)
+        counts = [n_anomaly, int(n_anomaly * cfg.ratio.binary)]
+        class_idx = [class_idx[1], class_idx[0]]  # anomaly first, matching counts
+    kept = [np.empty(0, dtype=int)]
+    for idx, n in zip(class_idx, counts):
+        n = min(n, len(idx))  # replace=False cannot draw more than the class holds
+        if n > 0:  # skip empty draws instead of calling resample with n_samples=0
+            # replace=False: resample defaults to bootstrapping, which duplicates rows.
+            kept.append(resample(idx, n_samples=n, replace=False, random_state=seed))
+    ordered_idx_resampled = np.sort(np.concatenate(kept))
+    X, y = np.array(X)[ordered_idx_resampled], y[ordered_idx_resampled]
+
+    return X, y
+
 
 class DresdenCapsuleDataset(ImageDatasetBase):
     @classmethod
@@ -56,9 +92,9 @@ def prepare(cls, config):
 
     @classmethod
     def splits(cls, config):
-        data_folder = pathlib.Path(config["data_folder"]) / "dresden_capsule"
+        data_folder = pathlib.Path(config["data_folder"]) / "dresden-capsule"
         study_folder = data_folder / "images"
-        split_folder = data_folder / "splits" / config.task
+        split_folder = data_folder / "splits_tuebingen" / config.task
 
         test_data = pd.read_csv(split_folder / "test.csv")
         val_data = pd.read_csv(split_folder / config.split / "val.csv")
@@ -68,13 +104,16 @@ def splits(cls, config):
         X_val, y_val, labels = prepare_data(study_folder, val_data)
         X_test, y_test, labels = prepare_data(study_folder, test_data)
 
+        # Resampling
+        if config.downsampling.enabled:
+            X_train, y_train = downsampling(X_train, y_train, labels, config)
+            X_val, y_val = downsampling(X_val, y_val, labels, config)
+
         transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()])
+        test_transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()])
         train_set = cls(X_train, y_train, labels, transform=transform)
-        val_set = cls(X_val, y_val, labels)
-        test_set = cls(X_test, y_test, labels)
-
-        # RANDOM, RANDOM_PER_STUDY Splits
-        # preprocessing,
+        val_set = cls(X_val, y_val, labels, transform=test_transform)
+        test_set = cls(X_test, y_test, labels, transform=test_transform)
 
         return (
             train_set,