-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
98 changed files
with
14,200 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
## Basics | ||
|
||
# Editor | ||
.DS_Store | ||
.vscode | ||
|
||
# Python | ||
*.pyc | ||
.mypy_cache | ||
__pycache__ | ||
*.iml | ||
.[v]env | ||
.ipynb_checkpoints | ||
dist | ||
build | ||
project.egg-info | ||
|
||
# AutoGluon | ||
**AutogluonModels | ||
|
||
## Not necessary in Docker | ||
.git | ||
.github | ||
data | ||
garf | ||
infrastructure | ||
models | ||
notebooks | ||
paper | ||
processed | ||
results | ||
scripts/deploy_experiments.py | ||
|
||
.flake8 | ||
**.gitignore | ||
Makefile | ||
README.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Editor | ||
.DS_Store | ||
.vscode | ||
|
||
|
||
# Python | ||
*.pyc | ||
.mypy_cache | ||
.ruff_cache | ||
__pycache__ | ||
*.iml | ||
.[v]env | ||
.ipynb_checkpoints | ||
dist | ||
build | ||
project.egg-info | ||
|
||
# AutoGluon | ||
**AutogluonModels | ||
|
||
# Latex | ||
*.aux | ||
*.log | ||
*.out | ||
*.bbl | ||
*.blg | ||
|
||
*.synctex.gz | ||
*.synctex.gz(busy) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Helm chart location used by the deployment tooling.
CHART_PATH=infrastructure/helm/conformal-data-cleaning | ||
|
||
# Guard target: abort any docker-related target unless DOCKER_IMAGE_NAME is set.
check-docker-prerequisites: | ||
ifndef DOCKER_IMAGE_NAME | ||
$(error DOCKER_IMAGE_NAME is not set) | ||
endif | ||
|
||
# Build the Python wheel, stage it for the garf image, then build both docker
# images (the main experiment image and the Garf baseline image).
build-docker: check-docker-prerequisites | ||
poetry build | ||
rm -rf garf/dist | ||
mv dist garf/dist | ||
|
||
docker build -t ${DOCKER_IMAGE_NAME}:final -f infrastructure/docker/Dockerfile . | ||
docker build -t ${DOCKER_IMAGE_NAME}:garf -f infrastructure/docker/Dockerfile.garf garf | ||
|
||
# Push both previously built images to the registry.
push-docker: check-docker-prerequisites | ||
docker push ${DOCKER_IMAGE_NAME}:final | ||
docker push ${DOCKER_IMAGE_NAME}:garf | ||
|
||
# Convenience target: build, then push.
docker: build-docker push-docker | ||
|
||
# Delete any previously installed experiment releases; the `-` prefix makes
# make ignore errors, so this succeeds even when no release exists yet.
helm-delete: | ||
# ignoring error as long as it does not exist | ||
-helm delete $(shell helm list --filter conformal-data-cleaning --short) | ||
-helm delete $(shell helm list --filter conformal-data-cleaning-garf --short) | ||
|
||
# Deploy the experiments via the Python deployment script.
helm-install: | ||
cd scripts && python deploy_experiments.py | ||
|
||
# Full pipeline: build and push the images, then install the helm releases.
deploy-all: docker helm-install |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# Conformal Data Cleaning | ||
|
||
This repository contains source code for the experiments conducted in the AISTATS 2024 paper `From Data Imputation to Data Cleaning - Automated Cleaning of Tabular Data Improves Downstream Predictive Performance`. | ||
|
||
|
||
## Run Experiments | ||
|
||
First of all, use [`load_corrupt_and_test_datasets.ipynb`](./notebooks/load_corrupt_and_test_datasets.ipynb) to download and corrupt the datasets and setup the expected structure of the [`data`](./data/) directory. | ||
|
||
[`run_experiment.py`](./scripts/run_experiment.py) implements a simple CLI script (`run-experiment`), which makes it easy to run experiments. | ||
|
||
**Conformal Data Cleaning:** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
experiment \ | ||
--confidence_level \ | ||
"0.999" | ||
``` | ||
|
||
**ML Baseline:** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
baseline \ | ||
--method \ | ||
"AutoGluon" \ | ||
--method_hyperparameter \ | ||
"0.999" | ||
``` | ||
|
||
**PyOD Baseline (not included in the paper):** | ||
```bash | ||
run-experiment \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" \ | ||
--how_many_hpo_trials \ | ||
"50" \ | ||
baseline \ | ||
--method \ | ||
"PyodECOD" \ | ||
--method_hyperparameter \ | ||
"0.3" | ||
``` | ||
|
||
For Garf, please use [main.py](./garf/main.py). | ||
```bash | ||
python \ | ||
main.py \ | ||
--task_id \ | ||
"42493" \ | ||
--error_fractions \ | ||
"0.01" \ | ||
"0.05" \ | ||
"0.1" \ | ||
"0.3" \ | ||
"0.5" \ | ||
--num_repetitions \ | ||
"3" \ | ||
--results_path \ | ||
"/conformal-data-cleaning/results/final-experiments" \ | ||
--models_path \ | ||
"/conformal-data-cleaning/models/final-experiments" | ||
``` | ||
|
||
|
||
## Run our Experimental Setup | ||
|
||
We ran our experiments on Kubernetes using Helm. Please check out the [helm charts](./infrastructure/helm/) and change the `image` and `imagePullSecrets` settings in the `values.yaml` files according to your setup. | ||
In addition, some read-write-many volumes are necessary to store the experiment results. Please check out the [`infrastructure/k8s`](./infrastructure/k8s/) directory (and don't forget to set up the data directory as described above). | ||
|
||
Using `make docker` builds and pushes the necessary docker images and `make helm-install` uses [`deploy_experiments.py`](./scripts/deploy_experiments.py) to start our experimental setup. | ||
|
||
|
||
## Evaluation | ||
|
||
[`notebooks/evaluation`](./notebooks/evaluation/) contains notebooks we use for evaluating the results and [`5_plotting.ipynb`](./notebooks/evaluation/5_plotting.ipynb) outputs the plots shown in the paper. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import os | ||
from logging import Formatter, StreamHandler, getLogger | ||
|
||
|
||
def setup_logger(name: str) -> None:
    """Set up a common logging format for the named logger.

    The log level is taken from the ``LOG_LEVEL`` environment variable
    (defaulting to ``INFO``).

    Args:
        name (str): `name` of the logger to set up
    """
    level = os.getenv("LOG_LEVEL", "INFO")
    logger = getLogger(name)
    logger.setLevel(level)
    # Only attach a handler if none is present yet: calling this function
    # repeatedly for the same logger would otherwise stack duplicate
    # StreamHandlers and emit every record multiple times.
    if not logger.handlers:
        handler = StreamHandler()
        formatter = Formatter("%(asctime)s - %(levelname)s - %(name)s: %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)


setup_logger(__name__)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .. import setup_logger

# Attach the shared logging configuration (defined in the parent package)
# to this subpackage's logger.
setup_logger(__name__)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from __future__ import annotations | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any, Optional | ||
|
||
import pandas as pd | ||
from pandas.api.types import is_numeric_dtype | ||
from sklearn.utils.validation import check_is_fitted | ||
|
||
from ..utils import is_categorical, set_seed | ||
|
||
|
||
class CleanerError(Exception):
    """Raised when a cleaner receives invalid arguments or fails internally."""
|
||
|
||
class BaseCleaner(ABC):
    """Abstract base class for data cleaners.

    Subclasses implement ``_fit_method``, ``_remove_outliers_method``, and
    ``_impute_method``; this class provides the public ``fit``,
    ``remove_outliers``, ``impute``, and ``transform`` workflow around them.
    """

    # Per-column replacement values produced during outlier removal.
    # Presumably populated by the subclass's `_remove_outliers_method`
    # — TODO(review): confirm against the concrete cleaners.
    _outlier_predictions: dict

    def __init__(self, seed: Optional[int] = None):
        """Store the seed and seed the relevant random number generators."""
        self._seed = seed
        set_seed(self._seed)

    def _guess_dtypes(self, data: pd.DataFrame) -> None:
        """Partition ``data``'s columns into categorical and numerical ones.

        Raises:
            CleanerError: if any column is neither categorical nor numerical.
        """
        self._categorical_columns = [c for c in data.columns if is_categorical(data[c])]
        self._numerical_columns = [
            c for c in data.columns if is_numeric_dtype(data[c]) and c not in self._categorical_columns
        ]

        if len(data.columns) != (len(self._categorical_columns) + len(self._numerical_columns)):
            # Raise the package-specific error type (not a bare Exception) so
            # callers can handle all cleaner failures uniformly.
            raise CleanerError(
                f"There are {len(data.columns)} columns but found "
                f"{len(self._categorical_columns)} categorical and "
                f"{len(self._numerical_columns)} numerical columns.",
            )

    def fit(self, data: pd.DataFrame, target_columns: Optional[list] = None, **kwargs: dict[str, Any]) -> BaseCleaner:
        """Validate the target columns, infer dtypes, and fit the cleaner.

        Args:
            data: training data.
            target_columns: columns to clean; defaults to all columns of ``data``.
            **kwargs: forwarded to the subclass's ``_fit_method``.

        Returns:
            The fitted cleaner (whatever ``_fit_method`` returns).

        Raises:
            CleanerError: if ``target_columns`` is not a list or contains
                columns that are not present in ``data``.
        """
        if target_columns is None:
            target_columns = data.columns.to_list()

        # isinstance instead of `type(...) == list`; the original message also
        # embedded a run of source indentation via a backslash-continued
        # f-string and misspelled the parameter name.
        if not isinstance(target_columns, list):
            raise CleanerError(
                f"Parameter 'target_columns' needs to be of type list but is '{type(target_columns)}'",
            )

        if any(column not in data.columns for column in target_columns):
            raise CleanerError(f"All target columns ('{target_columns}') must be in: {', '.join(data.columns)}")

        self.target_columns_ = target_columns

        self._guess_dtypes(data)
        return self._fit_method(data=data.copy(), **kwargs)

    def remove_outliers(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """Detect and remove outliers (set to NaN) in the target columns.

        Returns:
            Tuple of (data with outliers removed, boolean outlier mask,
            optional prediction sets from the subclass).
        """
        check_is_fitted(self, ["predictors_", "target_columns_"])

        missing_mask = data[self.target_columns_].isna()
        data_without_outliers, prediction_sets = self._remove_outliers_method(data=data.copy(), **kwargs)

        # Cells that are NaN now but were not NaN before are the outliers.
        missing_mask_outliers_removed = data_without_outliers[self.target_columns_].isna()
        outlier_mask = missing_mask_outliers_removed & ~missing_mask

        return data_without_outliers, outlier_mask, prediction_sets

    def impute(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Impute missing values in the target columns.

        Returns:
            Tuple of (imputed data, boolean mask of originally-missing cells).
        """
        check_is_fitted(self, ["predictors_", "target_columns_"])

        missing_mask = data[self.target_columns_].isna()
        imputed_data = self._impute_method(data=data.copy(), **kwargs)

        return imputed_data, missing_mask

    def transform(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
        """Remove outliers, then impute, returning the fully cleaned data.

        Returns:
            Tuple of (cleaned data, mask of all changed cells,
            optional prediction sets).
        """
        data_without_outliers, outlier_mask, prediction_sets = self.remove_outliers(data, **kwargs)

        if kwargs.get("reuse_intermediate", True):
            # Reuse the predictions made during outlier removal instead of
            # re-imputing the outlier cells from scratch.
            for column in self.target_columns_:
                data_without_outliers.loc[outlier_mask.loc[:, column], column] = self._outlier_predictions[column]

        cleaned_data, imputed_mask = self.impute(data_without_outliers, **kwargs)
        cleaned_mask = imputed_mask | outlier_mask

        # `_outlier_predictions` may never have been populated (e.g. when
        # `reuse_intermediate` is False); guard so transform does not raise
        # AttributeError on cleanup.
        if hasattr(self, "_outlier_predictions"):
            delattr(self, "_outlier_predictions")

        return cleaned_data, cleaned_mask, prediction_sets

    @abstractmethod
    def _fit_method(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> BaseCleaner:
        """Subclass hook: fit the underlying predictors on ``data``."""

    @abstractmethod
    def _remove_outliers_method(
        self,
        data: pd.DataFrame,
        **kwargs: dict[str, Any],
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Subclass hook: return (data with outliers set to NaN, prediction sets)."""

    @abstractmethod
    def _impute_method(self, data: pd.DataFrame, **kwargs: dict[str, Any]) -> pd.DataFrame:
        """Subclass hook: return ``data`` with missing values imputed."""
Oops, something went wrong.