From 10e05b0b1870ff7b6c70b0b4ba619799bd597b49 Mon Sep 17 00:00:00 2001 From: Julia Werner Date: Thu, 22 Feb 2024 11:03:17 +0000 Subject: [PATCH] Dresden Capsule Dataset Integration --- experiments/dresden_capsule/config.yaml | 2 +- hannah/conf/dataset/dresden_capsule.yaml | 14 ++++- hannah/datasets/vision/dresden_capsule.py | 63 ++++++++++++++++++----- 3 files changed, 65 insertions(+), 14 deletions(-) diff --git a/experiments/dresden_capsule/config.yaml b/experiments/dresden_capsule/config.yaml index e585a43a..2541fd76 100644 --- a/experiments/dresden_capsule/config.yaml +++ b/experiments/dresden_capsule/config.yaml @@ -31,7 +31,7 @@ dataset: data_folder: ${oc.env:HANNAH_DATA_FOLDER,${hydra:runtime.cwd}/../../datasets/} module: - batch_size: 32 + batch_size: 128 trainer: max_epochs: 15 diff --git a/hannah/conf/dataset/dresden_capsule.yaml b/hannah/conf/dataset/dresden_capsule.yaml index 16dee941..b285146f 100644 --- a/hannah/conf/dataset/dresden_capsule.yaml +++ b/hannah/conf/dataset/dresden_capsule.yaml @@ -25,5 +25,17 @@ dataset: dresden_capsule sampler: random weighted_loss: false -task: section +task: sections # Check splits folder for other task options. split: split_0 + +downsampling: + enabled: true + ratio: + binary: 1 # ratio normal : anomaly + # Proportions of each class that should be used. + sections: [1, 1, 0.05, 0.01, 0.005] + technical_multilabel_bubbles_dirt: [1, 1] + technical_multiclass_view: [1, 1, 1] + anomalies_fraction: 0.3 # only relevant for binary task + +seed: 1234 \ No newline at end of file diff --git a/hannah/datasets/vision/dresden_capsule.py b/hannah/datasets/vision/dresden_capsule.py index 71e29ae3..8d40042a 100644 --- a/hannah/datasets/vision/dresden_capsule.py +++ b/hannah/datasets/vision/dresden_capsule.py @@ -25,29 +25,65 @@ import logging import pathlib -import shutil import numpy as np import pandas as pd -import torchvision from albumentations.pytorch import ToTensorV2 import albumentations as A import tqdm - +from sklearn.utils import resample from .base import ImageDatasetBase logger = logging.getLogger(__name__) def prepare_data(study_folder: pathlib.Path, data: pd.DataFrame): - label_names = list(data.columns)[:-1] + label_names = list(data.columns)[:-1] files = [study_folder / image for image in data["path"].to_list()] - labels = np.argmax(data.iloc[:, :-1].values, axis=1) + + if len(label_names) > 1: # True for section and technical tasks + labels = np.argmax(data.iloc[:, :-1].values, axis=1) + + else: # Assuming binary task (some anomaly and normal) + label_names.insert(0, 'normal') + labels = np.max(data.iloc[:, :-1].values, axis=1) + labels = [label_names[x] for x in labels] return files, labels, label_names +def downsampling(X: list, y: list, labels: list, config: dict): + + task = config.task + seed = config.seed + + if task == 'sections' or task == 'technical_multiclass_view' or task == 'technical_multilabel_bubbles_dirt': + idx_resampled = np.empty(0, dtype=int) + for i in range(len(labels)): + y = np.array(y) + idx_y = np.where(y == labels[i])[0] # get only one class + n_samples = int(config.downsampling.ratio[task][i]*len(idx_y)) + idx_resampled_temp = resample(idx_y, n_samples=n_samples, random_state=seed) # downsample class to n samples + idx_resampled = np.concatenate([idx_resampled, idx_resampled_temp]) + + else: # Assuming binary task + ratio = config.downsampling.ratio.binary + y = np.array(y) + normal_idx = np.where(y == labels[0])[0] + anomaly_idx = np.where(y == labels[1])[0] + n_samples = int(len(anomaly_idx)*config.downsampling.anomalies_fraction) + idx_resampled_anomaly = resample(anomaly_idx, n_samples=n_samples, random_state=seed) + idx_resampled_normal = resample(normal_idx, n_samples=int(n_samples*ratio), random_state=seed) + idx_resampled = np.concatenate([idx_resampled_anomaly, idx_resampled_normal]) + + ordered_idx_resampled = np.sort(idx_resampled) + y = y[ordered_idx_resampled] + X = np.array(X)[ordered_idx_resampled] + assert len(X) == len(y) + + return X, y + class DresdenCapsuleDataset(ImageDatasetBase): @classmethod @@ -56,9 +92,9 @@ def prepare(cls, config): @classmethod def splits(cls, config): - data_folder = pathlib.Path(config["data_folder"]) / "dresden_capsule" + data_folder = pathlib.Path(config["data_folder"]) / "dresden-capsule" study_folder = data_folder / "images" - split_folder = data_folder / "splits" / config.task + split_folder = data_folder / "splits_tuebingen" / config.task test_data = pd.read_csv(split_folder / "test.csv") val_data = pd.read_csv(split_folder / config.split / "val.csv") @@ -68,13 +104,16 @@ def splits(cls, config): X_val, y_val, labels = prepare_data(study_folder, val_data) X_test, y_test, labels = prepare_data(study_folder, test_data) + # Resampling + if config.downsampling.enabled: + X_train, y_train = downsampling(X_train, y_train, labels, config) + X_val, y_val = downsampling(X_val, y_val, labels, config) + transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()]) + test_transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()]) train_set = cls(X_train, y_train, labels, transform=transform) - val_set = cls(X_val, y_val, labels) - test_set = cls(X_test, y_test, labels) - - # RANDOM, RANDOM_PER_STUDY Splits - # preprocessing, + val_set = cls(X_val, y_val, labels, transform=test_transform) + test_set = cls(X_test, y_test, labels, transform=test_transform) return ( train_set,