-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
98 changed files
with
14,200 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
## Basics | ||
|
||
# Editor | ||
.DS_Store | ||
.vscode | ||
|
||
# Python | ||
*.pyc | ||
.mypy_cache | ||
__pycache__ | ||
*.iml | ||
.[v]env | ||
.ipynb_checkpoints | ||
dist | ||
build | ||
project.egg-info | ||
|
||
# AutoGluon | ||
**AutogluonModels | ||
|
||
## Not necessary in Docker | ||
.git | ||
.github | ||
data | ||
garf | ||
infrastructure | ||
models | ||
notebooks | ||
paper | ||
processed | ||
results | ||
scripts/deploy_experiments.py | ||
|
||
.flake8 | ||
**.gitignore | ||
Makefile | ||
README.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Editor | ||
.DS_Store | ||
.vscode | ||
|
||
|
||
# Python | ||
*.pyc | ||
.mypy_cache | ||
.ruff_cache | ||
__pycache__ | ||
*.iml | ||
.[v]env | ||
.ipynb_checkpoints | ||
dist | ||
build | ||
project.egg-info | ||
|
||
# AutoGluon | ||
**AutogluonModels | ||
|
||
# Latex | ||
*.aux | ||
*.log | ||
*.out | ||
*.bbl | ||
*.blg | ||
|
||
*.synctex.gz | ||
*.synctex.gz(busy) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Helm chart location used by the deployment tooling.
CHART_PATH=infrastructure/helm/conformal-data-cleaning | ||
|
||
# Guard target: abort any docker-related target unless DOCKER_IMAGE_NAME is set.
check-docker-prerequisites: | ||
ifndef DOCKER_IMAGE_NAME | ||
$(error DOCKER_IMAGE_NAME is not set) | ||
endif | ||
|
||
# Build the Python wheel, stage it for the garf image, then build both docker
# images (the main experiment image and the Garf baseline image).
build-docker: check-docker-prerequisites | ||
poetry build | ||
rm -rf garf/dist | ||
mv dist garf/dist | ||
|
||
docker build -t ${DOCKER_IMAGE_NAME}:final -f infrastructure/docker/Dockerfile . | ||
docker build -t ${DOCKER_IMAGE_NAME}:garf -f infrastructure/docker/Dockerfile.garf garf | ||
|
||
# Push both previously built images to the registry.
push-docker: check-docker-prerequisites | ||
docker push ${DOCKER_IMAGE_NAME}:final | ||
docker push ${DOCKER_IMAGE_NAME}:garf | ||
|
||
# Convenience target: build, then push.
docker: build-docker push-docker | ||
|
||
# Delete any previously installed experiment releases; the `-` prefix makes
# make ignore errors, so this succeeds even when no release exists yet.
helm-delete: | ||
# ignoring error as long as it does not exist | ||
-helm delete $(shell helm list --filter conformal-data-cleaning --short) | ||
-helm delete $(shell helm list --filter conformal-data-cleaning-garf --short) | ||
|
||
# Deploy the experiments via the Python deployment script.
helm-install: | ||
cd scripts && python deploy_experiments.py | ||
|
||
# Full pipeline: build and push the images, then install the helm releases.
deploy-all: docker helm-install |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# Conformal Data Cleaning | ||
|
||
This repository contains source code for the experiments conducted in the AISTATS 2024 paper `From Data Imputation to Data Cleaning - Automated Cleaning of Tabular Data Improves Downstream Predictive Performance`. | ||
|
||
|
||
## Run Experiments | ||
|
||
First of all, use [`load_corrupt_and_test_datasets.ipynb`](./notebooks/load_corrupt_and_test_datasets.ipynb) to download and corrupt the datasets and setup the expected structure of the [`data`](./data/) directory. | ||
|
||
[`run_experiment.py`](./scripts/run_experiment.py) implements a simple CLI script (`run-experiment`), which makes it easy to run experiments. | ||
|
||
**Conformal Data Cleaning:** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
experiment \ | ||
--confidence_level \ | ||
"0.999" | ||
``` | ||
|
||
**ML Baseline:** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
baseline \ | ||
--method \ | ||
"AutoGluon" \ | ||
--method_hyperparameter \ | ||
"0.999" | ||
``` | ||
|
||
**PyOD Baseline (not included in the paper):** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
baseline \ | ||
--method \ | ||
"PyodECOD" \ | ||
--method_hyperparameter \ | ||
"0.3" | ||
``` | ||
|
||
For Garf, please use [main.py](./garf/main.py). | ||
```bash | ||
python \ | ||
main.py \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" | ||
``` | ||
|
||
|
||
## Run our Experimental Setup | ||
|
||
We ran our experiments on Kubernetes using Helm. Please check out the [helm charts](./infrastructure/helm/) and change the `image` and `imagePullSecrets` settings in the `values.yaml` files according to your setup. | ||
In addition, some read-write-many volumes are necessary to store the experiment results. Please check out the [`infrastructure/k8s`](./infrastructure/k8s/) directory (and don't forget to set up the data directory as described above). | ||
|
||
Using `make docker` builds and pushes the necessary docker images and `make helm-install` uses [`deploy_experiments.py`](./scripts/deploy_experiments.py) to start our experimental setup. | ||
|
||
|
||
## Evaluation | ||
|
||
[`notebooks/evaluation`](./notebooks/evaluation/) contains notebooks we use for evaluating the results and [`5_plotting.ipynb`](./notebooks/evaluation/5_plotting.ipynb) outputs the plots shown in the paper. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import os | ||
from logging import Formatter, StreamHandler, getLogger | ||
|
||
|
||
def setup_logger(name: str) -> None:
    """Set up a common logging format for the named logger.

    The log level is taken from the ``LOG_LEVEL`` environment variable
    (defaulting to ``INFO``).

    Args:
        name (str): `name` of the logger to set up
    """
    level = os.getenv("LOG_LEVEL", "INFO")
    logger = getLogger(name)
    logger.setLevel(level)
    # Only attach a handler if none is present yet: calling this function
    # repeatedly for the same logger would otherwise stack duplicate
    # StreamHandlers and emit every record multiple times.
    if not logger.handlers:
        handler = StreamHandler()
        formatter = Formatter("%(asctime)s - %(levelname)s - %(name)s: %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)


setup_logger(__name__)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .. import setup_logger

# Attach the shared logging configuration (defined in the parent package)
# to this subpackage's logger.
setup_logger(__name__)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from __future__ import annotations | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any, Optional | ||
|
||
import pandas as pd | ||
from pandas.api.types import is_numeric_dtype | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
from ..utils import is_categorical, set_seed | ||
|
||
|
||
class CleanerError(Exception):
    """Raised when a cleaner receives invalid arguments or fails internally."""
|
||
|
||
class BaseCleaner(ABC):
    """Abstract base class for data cleaners.

    Subclasses implement ``_fit_method``, ``_remove_outliers_method``, and
    ``_impute_method``; this class provides the public ``fit``,
    ``remove_outliers``, ``impute``, and ``transform`` workflow around them.
    """

    # Per-column replacement values produced during outlier removal.
    # Presumably populated by the subclass's `_remove_outliers_method`
    # — TODO(review): confirm against the concrete cleaners.
    _outlier_predictions: dict

    def __init__(self, seed: Optional[int] = None):
        """Store the seed and seed the relevant random number generators."""
        self._seed = seed
        set_seed(self._seed)

    def _guess_dtypes(self, data: pd.DataFrame) -> None:
        """Partition ``data``'s columns into categorical and numerical ones.

        Raises:
            CleanerError: if any column is neither categorical nor numerical.
        """
        self._categorical_columns = [c for c in data.columns if is_categorical(data[c])]
        self._numerical_columns = [
            c for c in data.columns if is_numeric_dtype(data[c]) and c not in self._categorical_columns
        ]

        if len(data.columns) != (len(self._categorical_columns) + len(self._numerical_columns)):
            # Raise the package-specific error type (not a bare Exception) so
            # callers can handle all cleaner failures uniformly.
            raise CleanerError(
                f"There are {len(data.columns)} columns but found "
                f"{len(self._categorical_columns)} categorical and "
                f"{len(self._numerical_columns)} numerical columns.",
            )

    def fit(self, data: pd.DataFrame, target_columns: Optional[list] = None, **kwargs: dict[str, Any]) -> BaseCleaner:
        """Validate the target columns, infer dtypes, and fit the cleaner.

        Args:
            data: training data.
            target_columns: columns to clean; defaults to all columns of ``data``.
            **kwargs: forwarded to the subclass's ``_fit_method``.

        Returns:
            The fitted cleaner (whatever ``_fit_method`` returns).

        Raises:
            CleanerError: if ``target_columns`` is not a list or contains
                columns that are not present in ``data``.
        """
        if target_columns is None:
            target_columns = data.columns.to_list()

        # isinstance instead of `type(...) == list`; the original message also
        # embedded a run of source indentation via a backslash-continued
        # f-string and misspelled the parameter name.
        if not isinstance(target_columns, list):
            raise CleanerError(
                f"Parameter 'target_columns' needs to be of type list but is '{type(target_columns)}'",
            )

        if any(column not in data.columns for column in target_columns):
            raise CleanerError(f"All target columns ('{target_columns}') must be in: {', '.join(data.columns)}")

        self.target_columns_ = target_columns

        self._guess_dtypes(data)
        return self._fit_method(data=data.copy(), **kwargs)

    def remove_outliers(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """Detect and remove outliers (set to NaN) in the target columns.

        Returns:
            Tuple of (data with outliers removed, boolean outlier mask,
            optional prediction sets from the subclass).
        """
        check_is_fitted(self, ["predictors_", "target_columns_"])

        missing_mask = data[self.target_columns_].isna()
        data_without_outliers, prediction_sets = self._remove_outliers_method(data=data.copy(), **kwargs)

        # Cells that are NaN now but were not NaN before are the outliers.
        missing_mask_outliers_removed = data_without_outliers[self.target_columns_].isna()
        outlier_mask = missing_mask_outliers_removed & ~missing_mask

        return data_without_outliers, outlier_mask, prediction_sets

    def impute(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Impute missing values in the target columns.

        Returns:
            Tuple of (imputed data, boolean mask of originally-missing cells).
        """
        check_is_fitted(self, ["predictors_", "target_columns_"])

        missing_mask = data[self.target_columns_].isna()
        imputed_data = self._impute_method(data=data.copy(), **kwargs)

        return imputed_data, missing_mask

    def transform(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """Remove outliers, then impute, returning the fully cleaned data.

        Returns:
            Tuple of (cleaned data, mask of all changed cells,
            optional prediction sets).
        """
        data_without_outliers, outlier_mask, prediction_sets = self.remove_outliers(data, **kwargs)

        if kwargs.get("reuse_intermediate", True):
            # Reuse the predictions made during outlier removal instead of
            # re-imputing the outlier cells from scratch.
            for column in self.target_columns_:
                data_without_outliers.loc[outlier_mask.loc[:, column], column] = self._outlier_predictions[column]

        cleaned_data, imputed_mask = self.impute(data_without_outliers, **kwargs)
        cleaned_mask = imputed_mask | outlier_mask

        # `_outlier_predictions` may never have been populated (e.g. when
        # `reuse_intermediate` is False); guard so transform does not raise
        # AttributeError on cleanup.
        if hasattr(self, "_outlier_predictions"):
            delattr(self, "_outlier_predictions")

        return cleaned_data, cleaned_mask, prediction_sets

    @abstractmethod
    def _fit_method(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> BaseCleaner:
        """Subclass hook: fit the underlying predictors on ``data``."""

    @abstractmethod
    def _remove_outliers_method(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Subclass hook: return (data with outliers set to NaN, prediction sets)."""

    @abstractmethod
    def _impute_method(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> pd.DataFrame:
        """Subclass hook: return ``data`` with missing values imputed."""
Oops, something went wrong.