Commit 24b8809: init

sumny committed May 24, 2024 (0 parents)

Showing 363 changed files with 10,226 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/black.yml
@@ -0,0 +1,12 @@
name: Lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - uses: psf/black@stable
27 changes: 27 additions & 0 deletions .github/workflows/unittests.yml
@@ -0,0 +1,27 @@
name: Unittests
on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[test]
          pip install pymoo==0.6.1.1
          pip install gpytorch==1.11
          pip install HEBO==0.3.5 --no-deps

      - name: Run pytest
        run: |
          pytest -sv tests/
17 changes: 17 additions & 0 deletions .gitignore
@@ -0,0 +1,17 @@
.idea/
job_name_mappings/*
job_name_mappings_analysis/*
venv/
__pycache__
logs/*
logs_analysis/*
attic/*
run_*
*egg-info
*.csv
*.zip
storage_structure
.coverage
submit_*
reanalyse_existing/*
TODO
118 changes: 118 additions & 0 deletions README.md
@@ -0,0 +1,118 @@
# reshuffling

Set up a Python 3.10.6 virtual environment (venv), then install:

```bash
pip install -e .
pip install "gpytorch>=1.4.0"
pip install "pymoo>=0.6.0"
pip install HEBO==0.3.5 --no-deps
```
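
A quick, optional sanity check (not part of the repository) that the pinned libraries import with the expected versions:

```python
# Optional sanity check; gpytorch and pymoo expose __version__.
# HEBO is skipped here because it was installed with --no-deps.
import gpytorch
import pymoo

print("gpytorch:", gpytorch.__version__)
print("pymoo:", pymoo.__version__)
```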

Experiments:

* create experiment scripts and run them (`run_experiments.sh`), e.g., via Slurm submit scripts
* this will create folders and result files in `results/`

Below are example commands to generate experiments (e.g., for CatBoost).
See `main.py` and the main logic in `reshufflebench`.
Code in `analyze/` is used to analyze experiment results.
Code in `visualize/` is used to visualize the analyzed experiment results.

Random Holdout

`python create_experiments.py --classifier=catboost --default=False --optimizer=random --valid_type=holdout`

Random 5x 5-fold CV

`python create_experiments.py --classifier=catboost --default=False --optimizer=random --valid_type=cv`

5-fold CV and {1, 2, 3, 4, 5}-fold Holdout can further be simulated from the 5x 5-fold CV results (see the sketch below).
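
A minimal sketch of that simulation idea, purely for illustration: the `(n_repeats, n_folds)` score layout and the choice of which folds to reuse are assumptions, not the repository's exact logic in `analyze/`.

```python
import numpy as np

# Hypothetical per-fold validation scores of one configuration from a
# 5x 5-fold CV run, stored as (n_repeats, n_folds) = (5, 5).
# This layout is an assumption for illustration only.
rng = np.random.default_rng(0)
scores = rng.normal(loc=0.25, scale=0.02, size=(5, 5))

# 5-fold CV: average the five folds of a single repeat.
cv_5fold = scores[0].mean()

# M-fold holdout, M in {1, ..., 5}: reuse one holdout split per repeat,
# e.g. the first fold of each of the first M repeats, and average.
M = 3
holdout_3fold = scores[:M, 0].mean()

print(cv_5fold, holdout_3fold)
```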

HEBO Holdout

`python create_experiments.py --classifier=catboost --default=False --optimizer=hebo --valid_type=holdout`

HEBO 5-fold CV

`python create_experiments.py --classifier=catboost --default=False --optimizer=hebo --valid_type=cv --n_repeats=1`

HEBO 5x 5-fold CV

`python create_experiments.py --classifier=catboost --default=False --optimizer=hebo --valid_type=cv --n_repeats=5`

HEBO 5-fold Holdout

`python create_experiments.py --classifier=catboost --default=False --optimizer=hebo --valid_type=repeatedholdout`

Analysis:

* create analysis scripts and run them (`run_analysis.sh`), e.g., via Slurm submit scripts
* this will create folders in `csvs/raw/`

Random Holdout

`python create_analyses.py --optimizer=random --valid_type=holdout --type=post_naive --max_workers=1 --reshuffle=Both --check_files=False`

Random 5-fold CV

`python create_analyses.py --optimizer=random --valid_type=cv --n_repeats=1 --type=post_naive --max_workers=10 --reshuffle=Both --check_files=False`

Random 5x 5-fold CV

`python create_analyses.py --optimizer=random --valid_type=cv_repeated --n_repeats=5 --type=post_naive --max_workers=10 --reshuffle=Both --check_files=False`

Random {5, 4, 3, 2, 1}-fold Holdout (simulated from the 5x 5-fold CV results)

`python create_analyses.py --optimizer=random --valid_type=repeatedholdout --n_repeats=5 --type=post_naive_simulate_repeatedholdout --max_workers=10 --reshuffle=Both --check_files=False`

`python create_analyses.py --optimizer=random --valid_type=repeatedholdout --n_repeats=4 --type=post_naive_simulate_repeatedholdout --max_workers=10 --reshuffle=Both --check_files=False`

`python create_analyses.py --optimizer=random --valid_type=repeatedholdout --n_repeats=3 --type=post_naive_simulate_repeatedholdout --max_workers=10 --reshuffle=Both --check_files=False`

`python create_analyses.py --optimizer=random --valid_type=repeatedholdout --n_repeats=2 --type=post_naive_simulate_repeatedholdout --max_workers=10 --reshuffle=Both --check_files=False`

`python create_analyses.py --optimizer=random --valid_type=repeatedholdout --n_repeats=1 --type=post_naive_simulate_repeatedholdout --max_workers=10 --reshuffle=Both --check_files=False`

HEBO Holdout

`python create_analyses.py --optimizer=hebo --valid_type=holdout --type=post_naive --max_workers=1 --reshuffle=Both --check_files=False`

HEBO 5-fold CV

`python create_analyses.py --optimizer=hebo --valid_type=cv --n_repeats=1 --type=post_naive --max_workers=1 --reshuffle=Both --check_files=False`

HEBO 5x 5-fold CV

`python create_analyses.py --optimizer=hebo --valid_type=cv_repeated --n_repeats=5 --type=post_naive --max_workers=1 --reshuffle=Both --check_files=False`

HEBO 5-fold Holdout

`python create_analyses.py --optimizer=hebo --valid_type=repeatedholdout --n_repeats=5 --type=post_naive --max_workers=1 --reshuffle=Both --check_files=False`

Collect:

* collect analyzed results
* this will create result files in `csvs/`

`python collect_results.py --valid_type=holdout`

`python collect_results.py --valid_type=cv`

`python collect_results.py --valid_type=cv_repeated`

`python collect_results.py --valid_type=repeatedholdout`

Afterwards, analyses of these result files can be performed via the scripts in `visualize/`:
* `analyze_random_search.R` for random search
* `analyze_BO.R` for HEBO vs. random search
* `analyze_random_search_repeatedholdout.R` for random search M-fold holdout ablation

Figures were created using `R 4.3.3` and `ggplot2 3.5.0`.
Running these scripts (from the main directory, i.e. from here via `source("visualize/analyze_random_search.R")`) will generate figures and folders in `plots/`.

To recreate the figures, you can obtain the raw results via the following link: `https://www.dropbox.com/scl/fi/r0flng59st1tnw8d1dwuj/results.zip?rlkey=ee59lczjlil6b3gi08kvvz1nl&st=pufjvckp&dl=0`; unzip the archive and place the contained csvs in `csvs/`.
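
A minimal sketch of fetching and unpacking the archive into `csvs/` (using `dl=1` for a direct download is an assumption about the Dropbox link; downloading manually through the browser works just as well):

```python
import io
import urllib.request
import zipfile

# Dropbox link from above; dl=1 requests a direct download (assumption).
URL = (
    "https://www.dropbox.com/scl/fi/r0flng59st1tnw8d1dwuj/results.zip"
    "?rlkey=ee59lczjlil6b3gi08kvvz1nl&st=pufjvckp&dl=1"
)

with urllib.request.urlopen(URL) as response:
    archive = zipfile.ZipFile(io.BytesIO(response.read()))

# Unpack the contained csvs into csvs/ as described above.
archive.extractall("csvs/")
```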

Simulations:

Please see `simulations/README.md`.
Empty file added analyze/__init__.py
96 changes: 96 additions & 0 deletions analyze/post_selector.py
@@ -0,0 +1,96 @@
import os
from abc import ABC, abstractmethod
from typing import List, Optional

import numpy as np


class PostSelector(ABC):
    """
    Abstract class for a post selector.
    """

    def __init__(
        self,
        id: str,
        result_analyzer: "ResultAnalyzer",
        supported_valid_types: list = [
            "holdout",
            "cv",
            "cv_repeated",
            "repeatedholdout",
        ],
        supported_reshufflings: list = [True, False],
        resolution_sparse: bool = False,
        additional_iterationwise_results: Optional[List[str]] = None,
        bootstrap_results_path: str = os.path.abspath("../bootstrap_results"),
    ):
        self.id = id
        self.result_analyzer = result_analyzer
        self.supported_valid_types = supported_valid_types
        self.supported_reshufflings = supported_reshufflings
        self.resolution_sparse = resolution_sparse
        self.additional_iterationwise_results = additional_iterationwise_results
        self.bootstrap_results_path = bootstrap_results_path

    def select(self, iteration: int, metric: str, **kwargs) -> int:
        """
        Function to select a configuration.
        Performs some checks and then calls the _select function.
        """
        if self.result_analyzer.valid_type not in self.supported_valid_types:
            raise ValueError(
                f"Valid type {self.result_analyzer.valid_type} not supported by post selector {self.id}"
            )
        if self.result_analyzer.reshuffle not in self.supported_reshufflings:
            raise ValueError(
                f"Reshuffling {self.result_analyzer.reshuffle} not supported by post selector {self.id}"
            )

        return self._select(iteration=iteration, metric=metric, **kwargs)

    @abstractmethod
    def _select(self, iteration: int, metric: str, **kwargs) -> int:
        """
        Function to select a configuration.
        """
        pass

    def reset(self):
        """
        Reset the post selector.
        Calls the _reset function.
        """
        self._reset()

    @abstractmethod
    def _reset(self):
        """
        Reset the post selector.
        """
        pass


class PostSelectorNaive(PostSelector):
    """
    Selects the configuration with the lowest validation score.
    """

    def __init__(self, result_analyzer: "ResultAnalyzer"):
        super().__init__(id="naive", result_analyzer=result_analyzer)

    def _select(self, iteration: int, metric: str, **kwargs) -> int:
        """
        Selects the configuration with the lowest validation score.
        """
        valid = self.result_analyzer.results_raw[metric]["valid"].values[
            : iteration + 1
        ]
        selected = np.argmin(valid)
        return selected

    def _reset(self):
        """
        Reset the post selector.
        """
        pass
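
For orientation, a small usage sketch of `PostSelectorNaive`. The `_FakeAnalyzer` stub and its `results_raw` layout are assumptions for illustration only; the real `ResultAnalyzer` is referenced above only as a forward type hint.

```python
import pandas as pd

# Hypothetical stand-in exposing only the attributes PostSelectorNaive touches;
# an assumption for illustration, not the repository's ResultAnalyzer.
class _FakeAnalyzer:
    valid_type = "holdout"
    reshuffle = False
    results_raw = {
        "logloss": pd.DataFrame({"valid": [0.40, 0.35, 0.38, 0.33, 0.41]})
    }

selector = PostSelectorNaive(result_analyzer=_FakeAnalyzer())
# Index of the lowest validation score among the first four evaluations -> 3.
print(selector.select(iteration=3, metric="logloss"))
selector.reset()
```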