From c84f292a11280ecb64623f171f5382f7cfcab6bf Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 2 Dec 2024 08:50:54 +0000 Subject: [PATCH 1/4] some plotting improvements --- causaltune/score/scoring.py | 17 +- notebooks/RunExperiments/experiment_runner.py | 280 +++++++++++------- 2 files changed, 186 insertions(+), 111 deletions(-) diff --git a/causaltune/score/scoring.py b/causaltune/score/scoring.py index e79150e..5547876 100644 --- a/causaltune/score/scoring.py +++ b/causaltune/score/scoring.py @@ -51,13 +51,13 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List else: metrics = [ "erupt", - "norm_erupt", - "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one + # "norm_erupt", + # "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one "policy_risk", # NEW "qini", "auc", # "r_scorer", - "energy_distance", # is broken without propensity weighting + # "energy_distance", # is broken without propensity weighting "psw_energy_distance", "frobenius_norm", # NEW "codec", # NEW @@ -68,6 +68,17 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List return metrics +def metrics_to_minimize(): + return [ + "energy_distance", + "psw_energy_distance", + "codec", + "frobenius_norm", + "psw_frobenius_norm", + "policy_risk", + ] + + class Scorer: def __init__( self, diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index b66d105..8961d01 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -1,11 +1,15 @@ import os import sys import pickle +import glob +import copy +import argparse + import numpy as np import matplotlib import matplotlib.pyplot as plt -import copy -import argparse +import pandas as pd + from sklearn.model_selection import train_test_split from datetime import datetime import warnings @@ -20,7 +24,11 @@ from causaltune import CausalTune # noqa: E402 from causaltune.datasets import load_dataset # noqa: E402 from causaltune.models.passthrough import passthrough_model # noqa: E402 -from causaltune.search.params import SimpleParamService +from causaltune.search.params import SimpleParamService # noqa: E402 +from causaltune.score.scoring import ( + metrics_to_minimize, + supported_metrics, +) # noqa: E402 def parse_arguments(): @@ -41,6 +49,13 @@ def parse_arguments(): parser.add_argument( "--outcome_model", type=str, default="nested", help="Outcome model type" ) + parser.add_argument( + "--timestamp_in_dirname", + type=bool, + default="False", + help="Include timestampl in out_dir name?", + ) + parser.add_argument("--test_size", type=float, default=0.33, help="Test set size") parser.add_argument( "--time_budget", type=int, default=None, help="Time budget for optimization" @@ -93,10 +108,6 @@ def get_estimator_list(dataset_name): def run_experiment(args): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - out_dir = f"EXPERIMENT_RESULTS_{timestamp}_{args.identifier}" - os.makedirs(out_dir, exist_ok=True) - # Process datasets data_sets = {} for dataset in args.datasets: @@ -110,6 +121,16 @@ def run_experiment(args): file_path = f"RunDatasets/{size}/{name}.pkl" data_sets[f"{size} {name}"] = load_dataset(file_path) + if args.timestamp_in_dirname: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = f"EXPERIMENT_RESULTS_{timestamp}_{args.identifier}" + else: + out_dir = f"EXPERIMENT_RESULTS_{args.identifier}" + + os.makedirs(out_dir, 
exist_ok=True) + out_dir = os.path.join(out_dir, size) + os.makedirs(out_dir, exist_ok=True) + print(f"Loaded datasets: {list(data_sets.keys())}") # Set time budgets properly @@ -128,6 +149,9 @@ def run_experiment(args): args.time_budget = None # Use only components_time_budget for dataset_name, cd in data_sets.items(): + case = dataset_name.split("_")[-1] + os.makedirs(f"{out_dir}/{case}", exist_ok=True) + for i_run in range(1, args.n_runs + 1): cd_i = copy.deepcopy(cd) train_df, test_df = train_test_split(cd_i.data, test_size=args.test_size) @@ -135,41 +159,47 @@ def run_experiment(args): cd_i.data = train_df for metric in args.metrics: - if "KC" in dataset_name and "KCKP" not in dataset_name: - propensity_model = "auto" - elif "KCKP" in dataset_name: - propensity_model = passthrough_model( - cd.propensity_modifiers, include_control=False + try: + fn = make_filename(metric, dataset_name, i_run) + out_fn = os.path.join(out_dir, case, fn) + if os.path.isfile(out_fn): + print(f"File {out_fn} exists, skipping...") + continue + + if "KC" in dataset_name and "KCKP" not in dataset_name: + propensity_model = "auto" + elif "KCKP" in dataset_name: + propensity_model = passthrough_model( + cd.propensity_modifiers, include_control=False + ) + else: + propensity_model = "dummy" + + ct = CausalTune( + metric=metric, + estimator_list=get_estimator_list(dataset_name), + num_samples=args.num_samples, + components_time_budget=args.components_time_budget, # Use this instead + verbose=1, + components_verbose=1, + store_all_estimators=True, + propensity_model=propensity_model, + outcome_model=args.outcome_model, ) - else: - propensity_model = "dummy" - - ct = CausalTune( - metric=metric, - estimator_list=get_estimator_list(dataset_name), - num_samples=args.num_samples, - components_time_budget=args.components_time_budget, # Use this instead - verbose=1, - components_verbose=1, - store_all_estimators=True, - propensity_model=propensity_model, - outcome_model=args.outcome_model, - ) - ct.fit( - data=cd_i, - treatment="treatment", - outcome="outcome", - ) + ct.fit( + data=cd_i, + treatment="treatment", + outcome="outcome", + ) - # Compute scores and save results - results = compute_scores(ct, metric, test_df) + # Compute scores and save results + results = compute_scores(ct, metric, test_df) - with open( - f"{out_dir}/{metric}_run_{i_run}_{dataset_name.replace(' ', '_')}.pkl", - "wb", - ) as f: - pickle.dump(results, f) + with open(out_fn, "wb") as f: + pickle.dump(results, f) + except Exception as e: + print(f"Error processing {dataset_name}_{metric}_{i_run}: {e}") return out_dir @@ -180,49 +210,46 @@ def compute_scores(ct, metric, test_df): all_scores = [] for trial in ct.results.trials: - estimator_name = trial.last_result["estimator_name"] - if trial.last_result["estimator"]: - estimator = trial.last_result["estimator"] - scores = {} - for ds_name, df in datasets.items(): - scores[ds_name] = {} - est_scores = ct.scorer.make_scores( - estimator, - df, - metrics_to_report=ct.metrics_to_report, - ) - est_scores["estimator_name"] = estimator_name + try: + estimator_name = trial.last_result["estimator_name"] + if "estimator" in trial.last_result and trial.last_result["estimator"]: + estimator = trial.last_result["estimator"] + scores = {} + for ds_name, df in datasets.items(): + scores[ds_name] = {} + est_scores = ct.scorer.make_scores( + estimator, + df, + metrics_to_report=ct.metrics_to_report, + ) + est_scores["estimator_name"] = estimator_name - scores[ds_name]["CATE_estimate"] = np.squeeze( - 
estimator.estimator.effect(df) - ) - scores[ds_name]["CATE_groundtruth"] = np.squeeze(df["true_effect"]) - est_scores["MSE"] = np.mean( - ( - scores[ds_name]["CATE_estimate"] - - scores[ds_name]["CATE_groundtruth"] + scores[ds_name]["CATE_estimate"] = np.squeeze( + estimator.estimator.effect(df) + ) + scores[ds_name]["CATE_groundtruth"] = np.squeeze(df["true_effect"]) + est_scores["MSE"] = np.mean( + ( + scores[ds_name]["CATE_estimate"] + - scores[ds_name]["CATE_groundtruth"] + ) + ** 2 ) - ** 2 + scores[ds_name]["scores"] = est_scores + scores["optimization_score"] = trial.last_result.get( + "optimization_score" ) - scores[ds_name]["scores"] = est_scores - scores["optimization_score"] = trial.last_result.get("optimization_score") - estimator_scores[estimator_name].append(copy.deepcopy(scores)) + estimator_scores[estimator_name].append(copy.deepcopy(scores)) # Will use this in the nex all_scores.append(scores) + except Exception as e: + print(f"Error processing trial: {e}") for k in estimator_scores.keys(): estimator_scores[k] = sorted( estimator_scores[k], key=lambda x: x["validation"]["scores"][metric], - reverse=metric - not in [ - "energy_distance", - "psw_energy_distance", - "codec", - "frobenius_norm", - "psw_frobenius_norm", - "policy_risk", - ], + reverse=metric not in metrics_to_minimize(), ) return { @@ -235,12 +262,47 @@ def compute_scores(ct, metric, test_df): } -def generate_plots(out_dir, metrics, datasets, n_runs): +def extract_metrics_datasets(out_dir: str): + metrics = set() + datasets = set() + + for file in glob.glob(f"{out_dir}/*.pkl"): + parts = os.path.basename(file).split("-") + metrics.add(parts[0]) + datasets.add(parts[-1].replace(".pkl", "").replace("_", " ")) + + return sorted(list(metrics)), sorted(list(datasets)) + + +def make_filename(metric, dataset, i_run): + return f"{metric}-run-{i_run}-{dataset.replace(' ', '_')}.pkl" + + +def get_all_test_scores(out_dir, dataset_name): + size, ds_type, case = dataset_name.split(" ") + all_scores = [] + for file in glob.glob(f"{out_dir}/*_{ds_type}_{case}.pkl"): + with open(file, "rb") as f: + results = pickle.load(f) + for x in results["all_scores"]: + all_scores.append( + { + k: v + for k, v in x["test"]["scores"].items() + if k not in ["values"] + } + ) + out = pd.DataFrame(all_scores) + return out + + +def generate_plots(out_dir): + metrics, datasets = extract_metrics_datasets(out_dir) # Define names for metrics and experiments metric_names = { "psw_frobenius_norm": "Propensity Weighted Frobenius Norm", "frobenius_norm": "Frobenius Norm", - "prob_erupt": "Probabilistic Erupt", + "erupt": "ERUPT", "codec": "CODEC", "policy_risk": "Policy Risk", "energy_distance": "Energy Distance", @@ -272,7 +334,7 @@ def plot_grid(title): for j, dataset in enumerate(datasets): ax = axs[i, j] - filename = f"{metric}_run_1_{dataset.replace(' ', '_')}.pkl" + filename = make_filename(metric, dataset, 1) filepath = os.path.join(out_dir, filename) if os.path.exists(filepath): @@ -334,47 +396,46 @@ def plot_grid(title): plt.close() def plot_mse_grid(title): + df = get_all_test_scores(out_dir, datasets[0]) + est_names = sorted(df["estimator_name"].unique()) + problem = "iv" if "IV" in datasets[0] else "backdoor" + all_metrics = [ + c for c in df.columns if c in supported_metrics(problem, False, False) + ] + fig, axs = plt.subplots( - len(metrics), len(datasets), figsize=(20, 5 * len(metrics)), dpi=300 + len(all_metrics), len(datasets), figsize=(20, 5 * len(all_metrics)), dpi=300 ) - if len(metrics) == 1 and len(datasets) == 1: + if 
len(all_metrics) == 1 and len(datasets) == 1: axs = np.array([[axs]]) - elif len(metrics) == 1 or len(datasets) == 1: + elif len(all_metrics) == 1 or len(datasets) == 1: axs = axs.reshape(-1, 1) if len(datasets) == 1 else axs.reshape(1, -1) legend_elements = [] + for j, dataset in enumerate(datasets): + df = get_all_test_scores(out_dir, dataset) - for i, metric in enumerate(metrics): - for j, dataset in enumerate(datasets): + for i, metric in enumerate(all_metrics): ax = axs[i, j] + this_df = df[["estimator_name", metric, "MSE"]].dropna() + this_df = this_df[~np.isinf(this_df[metric].values)] + if len(this_df): + for idx, est_name in enumerate(est_names): + df_slice = this_df[this_df["estimator_name"] == est_name] + if "Dummy" not in est_name and len(df_slice): - filename = f"{metric}_run_1_{dataset.replace(' ', '_')}.pkl" - filepath = os.path.join(out_dir, filename) - - if os.path.exists(filepath): - with open(filepath, "rb") as f: - results = pickle.load(f) - - for idx, (est_name, scr) in enumerate( - results["scores_per_estimator"].items() - ): - if "Dummy" not in est_name and len(scr): - CATE_gt = scr[0]["test"]["CATE_groundtruth"] - CATE_est = scr[0]["test"]["CATE_estimate"] - CATE_gt = np.array(CATE_gt).flatten() - CATE_est = np.array(CATE_est).flatten() - mse = np.mean((CATE_gt - CATE_est) ** 2) - score = scr[0]["test"][metric] marker = markers[idx % len(markers)] - + # TODO: throw x-axis outliers away ax.scatter( - mse, - score, + df_slice["MSE"], + df_slice[metric], color=colors[idx], s=50, marker=marker, linewidths=0.5, ) + if metric not in metrics_to_minimize(): + ax.invert_yaxis() trimmed_est_name = est_name.split(".")[-1] if i == 0 and j == 0: @@ -392,9 +453,9 @@ def plot_mse_grid(title): ax.set_xscale("log") ax.grid(True) - ax.set_title( - f"{results['best_estimator'].split('.')[-1]}", fontsize=8 - ) + # ax.set_title( + # f"{results['best_estimator'].split('.')[-1]}", fontsize=8 + # ) else: ax.text(0.5, 0.5, "No data", ha="center", va="center") @@ -432,10 +493,13 @@ def plot_mse_grid(title): if __name__ == "__main__": args = parse_arguments() - # args.identifier = "Egor_test" - # args.metrics = ["psw_energy_distance"] - # args.num_samples = 40 - # args.outcome_model = "auto" # or use "nested" for the old-style nested model - runs = run_experiment(args) + args.identifier = "Egor_test" + args.metrics = supported_metrics("backdoor", False, False) + # run_experiment assumes we don't mix large and small datasets in the same call + args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] + args.num_samples = 100 + args.timestamp_in_dirname = False + args.outcome_model = "auto" # or use "nested" for the old-style nested model + out_dir = run_experiment(args) # Plots should be supplied with the directory name pattern where the files live, load the rest from there - generate_plots(runs, args.metrics, args.datasets, args.n_runs) + generate_plots(os.path.join(out_dir, "RCT")) From 1c22dddc2691269e0866a31d6d5ca5d322725b46 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 2 Dec 2024 08:53:02 +0000 Subject: [PATCH 2/4] Comment out Egor's tweaks --- notebooks/RunExperiments/experiment_runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index 8961d01..a770091 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -493,13 +493,13 @@ def plot_mse_grid(title): if __name__ == "__main__": args = 
parse_arguments() - args.identifier = "Egor_test" - args.metrics = supported_metrics("backdoor", False, False) - # run_experiment assumes we don't mix large and small datasets in the same call - args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] - args.num_samples = 100 - args.timestamp_in_dirname = False - args.outcome_model = "auto" # or use "nested" for the old-style nested model + # args.identifier = "Egor_test" + # args.metrics = supported_metrics("backdoor", False, False) + # # run_experiment assumes we don't mix large and small datasets in the same call + # args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] + # args.num_samples = 100 + # args.timestamp_in_dirname = False + # args.outcome_model = "auto" # or use "nested" for the old-style nested model out_dir = run_experiment(args) # Plots should be supplied with the directory name pattern where the files live, load the rest from there generate_plots(os.path.join(out_dir, "RCT")) From e51b58c8af53957fb88868627d50e8eeb4ae407e Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Fri, 6 Dec 2024 07:49:48 +0000 Subject: [PATCH 3/4] Minor fixes as part of experiments --- causaltune/optimiser.py | 30 ++---------- causaltune/score/scoring.py | 10 ++-- causaltune/search/component.py | 21 +++++++- notebooks/RunExperiments/experiment_runner.py | 48 +++++++++---------- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/causaltune/optimiser.py b/causaltune/optimiser.py index 337809a..245cee2 100644 --- a/causaltune/optimiser.py +++ b/causaltune/optimiser.py @@ -20,7 +20,7 @@ from joblib import Parallel, delayed from causaltune.search.params import SimpleParamService -from causaltune.score.scoring import Scorer +from causaltune.score.scoring import Scorer, metrics_to_minimize from causaltune.utils import treatment_is_multivalue from causaltune.models.monkey_patches import ( AutoML, @@ -514,19 +514,7 @@ def fit( evaluated_rewards=( [] if len(self.resume_scores) == 0 else self.resume_scores ), - mode=( - "min" - if self.metric - in [ - "energy_distance", - "psw_energy_distance", - "frobenius_norm", - "psw_frobenius_norm", - "codec", - "policy_risk", - ] - else "max" - ), + mode=("min" if self.metric in metrics_to_minimize() else "max"), low_cost_partial_config={}, **self._settings["tuner"], ) @@ -547,19 +535,7 @@ def fit( evaluated_rewards=( [] if len(self.resume_scores) == 0 else self.resume_scores ), - mode=( - "min" - if self.metric - in [ - "energy_distance", - "psw_energy_distance", - "frobenius_norm", - "psw_frobenius_norm", - "codec", - "policy_risk", - ] - else "max" - ), + mode=("min" if self.metric in metrics_to_minimize() else "max"), low_cost_partial_config={}, **self._settings["tuner"], ) diff --git a/causaltune/score/scoring.py b/causaltune/score/scoring.py index 5547876..36aa894 100644 --- a/causaltune/score/scoring.py +++ b/causaltune/score/scoring.py @@ -37,7 +37,7 @@ def const_marginal_effect(self, X): return self.cate_estimate -def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List[str]: +def supported_metrics(problem: str, multivalue: bool, scores_only: bool, constant_ptt: bool=False) -> List[str]: if problem == "iv": metrics = ["energy_distance", "frobenius_norm", "codec"] if not scores_only: @@ -51,13 +51,13 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List else: metrics = [ "erupt", - # "norm_erupt", + "norm_erupt", # "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one "policy_risk", # NEW "qini", "auc", # 
"r_scorer", - # "energy_distance", # is broken without propensity weighting + "energy_distance", # should only be used in iv problems "psw_energy_distance", "frobenius_norm", # NEW "codec", # NEW @@ -101,6 +101,10 @@ def __init__( self.identified_estimand = causal_model.identify_effect( proceed_when_unidentifiable=True ) + if "Dummy" in propensity_model.__class__.__name__: + self.constant_ptt = True + else: + self.constant_ptt = False if problem == "backdoor": print( diff --git a/causaltune/search/component.py b/causaltune/search/component.py index f3920aa..542a494 100644 --- a/causaltune/search/component.py +++ b/causaltune/search/component.py @@ -61,7 +61,7 @@ def joint_config(data_size: Tuple[int, int], estimator_list=None): cfg, init_params, low_cost_init_params = flaml_config_to_tune_config( cls.search_space(data_size=data_size, task=task) ) - + cfg, init_params = tweak_config(cfg, init_params, name) # Test if the estimator instantiates fine try: cls(task=task, **init_params) @@ -76,6 +76,25 @@ def joint_config(data_size: Tuple[int, int], estimator_list=None): return tune.choice(joint_cfg), joint_init_params, joint_low_cost_init_params +def tweak_config(cfg: dict, init_params: dict, estimator_name: str): + """ + Tweak built-in FLAML search spaces to limit the number of estimators + :param cfg: + :param estimator_name: + :return: + """ + out = copy.deepcopy(cfg) + if "xgboost" in estimator_name or estimator_name in [ + "random_forest", + "extra_trees", + "lgbm", + "catboost", + ]: + out["n_estimators"] = tune.lograndint(4, 1000) + init_params["n_estimators"] = 100 + return out, init_params + + def model_from_cfg(cfg: dict): cfg = copy.deepcopy(cfg) model_name = cfg.pop("estimator_name") diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index a770091..b8eec36 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -4,6 +4,7 @@ import glob import copy import argparse +from typing import List import numpy as np import matplotlib @@ -26,8 +27,8 @@ from causaltune.models.passthrough import passthrough_model # noqa: E402 from causaltune.search.params import SimpleParamService # noqa: E402 from causaltune.score.scoring import ( - metrics_to_minimize, - supported_metrics, + metrics_to_minimize, # noqa: E402 + supported_metrics, # noqa: E402 ) # noqa: E402 @@ -86,25 +87,6 @@ def get_estimator_list(dataset_name): estimator_list = cfg.estimator_names_from_patterns(problem, "all", 1001) return [est for est in estimator_list if "Dummy" not in est] - # return [ - # "iv.econml.iv.dr.LinearDRIV", - # "iv.econml.iv.dml.DMLIV", - # "iv.econml.iv.dr.SparseLinearDRIV", - # "iv.econml.iv.dr.LinearIntentToTreatDRIV", - # ] - # else: - # return [ - # # "Dummy", # Let's exclude this until the FLAML PR for attr_cost is in - # "SparseLinearDML", - # "ForestDRLearner", - # "TransformedOutcome", - # "CausalForestDML", - # ".LinearDML", - # "DomainAdaptationLearner", - # "SLearner", - # "XLearner", - # "TLearner", - # ] def run_experiment(args): @@ -159,6 +141,9 @@ def run_experiment(args): cd_i.data = train_df for metric in args.metrics: + if metric == "ate": # this is not something to optimize + continue + print(f"Optimzing {metric} for {dataset_name} (run {i_run})") try: fn = make_filename(metric, dataset_name, i_run) out_fn = os.path.join(out_dir, case, fn) @@ -296,7 +281,12 @@ def get_all_test_scores(out_dir, dataset_name): return out -def generate_plots(out_dir): +def generate_plots(out_dir: 
str, + log_scale: List[str]|None = None, + upper_bounds: dict|None = None, + lower_bounds: dict|None=None): + if log_scale is None: + log_scale = ["energy_distance", "psw_energy_distance", "frobenius_norm"] metrics, datasets = extract_metrics_datasets(out_dir) # Define names for metrics and experiments metric_names = { @@ -414,6 +404,13 @@ def plot_mse_grid(title): legend_elements = [] for j, dataset in enumerate(datasets): df = get_all_test_scores(out_dir, dataset) + for m, value in upper_bounds.items(): + if m in df.columns: + df = df[df[m] < value].copy() + for m, value in lower_bounds.items(): + if m in df.columns: + df = df[df[m] > value].copy() + for i, metric in enumerate(all_metrics): ax = axs[i, j] @@ -452,6 +449,8 @@ def plot_mse_grid(title): ) ax.set_xscale("log") + if metric in log_scale: + ax.set_yscale("log") ax.grid(True) # ax.set_title( # f"{results['best_estimator'].split('.')[-1]}", fontsize=8 @@ -501,5 +500,6 @@ def plot_mse_grid(title): # args.timestamp_in_dirname = False # args.outcome_model = "auto" # or use "nested" for the old-style nested model out_dir = run_experiment(args) - # Plots should be supplied with the directory name pattern where the files live, load the rest from there - generate_plots(os.path.join(out_dir, "RCT")) + # upper_bounds = {"MSE": 1e2, "policy_risk": 0.2} + # lower_bounds = {"erupt": 0.06, "bite": 0.75} + generate_plots(os.path.join(out_dir, "RCT"))#, upper_bounds=upper_bounds, lower_bounds=lower_bounds) From 5627d9a18533b2ec5d9c05e0b2be727d4eaf5a45 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Fri, 6 Dec 2024 10:33:02 +0000 Subject: [PATCH 4/4] fix ERUPT std --- causaltune/score/erupt_core.py | 4 +- notebooks/ERUPT basics.ipynb | 130 +++++++++--------- notebooks/RunExperiments/experiment_runner.py | 18 +-- 3 files changed, 77 insertions(+), 75 deletions(-) diff --git a/causaltune/score/erupt_core.py b/causaltune/score/erupt_core.py index ae79260..6a69a8c 100644 --- a/causaltune/score/erupt_core.py +++ b/causaltune/score/erupt_core.py @@ -50,8 +50,8 @@ def erupt_with_std( ] mean += np.mean(means) std += np.std(means) / np.sqrt(num_splits) # Standard error of the mean - - return mean / resamples, std / resamples + # 1.5 is an empirical factor to make the confidence interval wider + return mean / resamples, 1.5 * std / resamples def erupt( diff --git a/notebooks/ERUPT basics.ipynb b/notebooks/ERUPT basics.ipynb index c4bdb54..94fcb30 100644 --- a/notebooks/ERUPT basics.ipynb +++ b/notebooks/ERUPT basics.ipynb @@ -103,33 +103,33 @@ " \n", " \n", " 0\n", - " 0.452636\n", - " 0\n", - " 1.684484\n", + " 0.898227\n", + " 1\n", + " 1.288637\n", " \n", " \n", " 1\n", - " 0.380215\n", + " 0.462092\n", " 0\n", - " 0.745268\n", + " 0.771976\n", " \n", " \n", " 2\n", - " 0.584036\n", - " 1\n", - " 0.762300\n", + " 0.858974\n", + " 0\n", + " 1.881019\n", " \n", " \n", " 3\n", - " 0.505191\n", - " 0\n", - " 1.425354\n", + " 0.228084\n", + " 1\n", + " 0.357797\n", " \n", " \n", " 4\n", - " 0.384110\n", + " 0.962512\n", " 1\n", - " 1.834628\n", + " 1.066413\n", " \n", " \n", "\n", @@ -137,11 +137,11 @@ ], "text/plain": [ " X T1 Y1\n", - "0 0.452636 0 1.684484\n", - "1 0.380215 0 0.745268\n", - "2 0.584036 1 0.762300\n", - "3 0.505191 0 1.425354\n", - "4 0.384110 1 1.834628" + "0 0.898227 1 1.288637\n", + "1 0.462092 0 0.771976\n", + "2 0.858974 0 1.881019\n", + "3 0.228084 1 0.357797\n", + "4 0.962512 1 1.066413" ] }, "execution_count": 2, @@ -216,53 +216,53 @@ " \n", " \n", " 0\n", - " 0.452636\n", - " 0\n", - " 1.684484\n", - " 0.726318\n", - " 0\n", - 
" 0.273682\n", - " 0.904259\n", + " 0.898227\n", + " 1\n", + " 1.288637\n", + " 0.949114\n", + " 1\n", + " 0.949114\n", + " 2.229118\n", " \n", " \n", " 1\n", - " 0.380215\n", + " 0.462092\n", " 0\n", - " 0.745268\n", - " 0.690108\n", - " 1\n", - " 0.690108\n", - " 1.930383\n", + " 0.771976\n", + " 0.731046\n", + " 0\n", + " 0.268954\n", + " 0.572308\n", " \n", " \n", " 2\n", - " 0.584036\n", - " 1\n", - " 0.762300\n", - " 0.792018\n", + " 0.858974\n", + " 0\n", + " 1.881019\n", + " 0.929487\n", " 1\n", - " 0.792018\n", - " 0.959608\n", + " 0.929487\n", + " 2.601592\n", " \n", " \n", " 3\n", - " 0.505191\n", - " 0\n", - " 1.425354\n", - " 0.752596\n", + " 0.228084\n", + " 1\n", + " 0.357797\n", + " 0.614042\n", " 1\n", - " 0.752596\n", - " 1.017777\n", + " 0.614042\n", + " 0.542638\n", " \n", " \n", " 4\n", - " 0.384110\n", + " 0.962512\n", " 1\n", - " 1.834628\n", - " 0.692055\n", + " 1.066413\n", + " 0.981256\n", " 1\n", - " 0.692055\n", - " 2.374030\n", + " 0.981256\n", + " 2.401383\n", " \n", " \n", "\n", @@ -270,11 +270,11 @@ ], "text/plain": [ " X T1 Y1 p T2 p_of_actual Y2\n", - "0 0.452636 0 1.684484 0.726318 0 0.273682 0.904259\n", - "1 0.380215 0 0.745268 0.690108 1 0.690108 1.930383\n", - "2 0.584036 1 0.762300 0.792018 1 0.792018 0.959608\n", - "3 0.505191 0 1.425354 0.752596 1 0.752596 1.017777\n", - "4 0.384110 1 1.834628 0.692055 1 0.692055 2.374030" + "0 0.898227 1 1.288637 0.949114 1 0.949114 2.229118\n", + "1 0.462092 0 0.771976 0.731046 0 0.268954 0.572308\n", + "2 0.858974 0 1.881019 0.929487 1 0.929487 2.601592\n", + "3 0.228084 1 0.357797 0.614042 1 0.614042 0.542638\n", + "4 0.962512 1 1.066413 0.981256 1 0.981256 2.401383" ] }, "execution_count": 3, @@ -319,10 +319,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average outcome of the actual biased assignment experiment: 1.411675477573636\n", - "Estimated outcome of random assignment: 1.251567372523789\n", - "95% confidence interval for estimated outcome: 1.2311928820519622 1.2719418629956158\n", - "Average outcome of the actual random assignment experiment: 1.2559621877416332\n" + "Average outcome of the actual biased assignment experiment: 1.4064676444383317\n", + "Estimated outcome of random assignment: 1.2594221770638483\n", + "95% confidence interval for estimated outcome: 1.230204391668238 1.2886399624594587\n", + "Average outcome of the actual random assignment experiment: 1.2461659092712785\n" ] } ], @@ -360,17 +360,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average outcome of the actual random assignment experiment: 1.2559621877416332\n", - "Estimated outcome of biased assignment: 1.4147647990746988\n", - "Confidence interval for estimated outcome: 1.398423601541284 1.4311059966081134\n", - "Average outcome of the actual biased assignment experiment: 1.411675477573636\n" + "Average outcome of the actual random assignment experiment: 1.2461659092712785\n", + "Estimated outcome of biased assignment: 1.405112521603215\n", + "95% confidence interval for estimated outcome: 1.3814865905561569 1.428738452650273\n", + "Average outcome of the actual biased assignment experiment: 1.4064676444383317\n" ] } ], "source": [ - "# Conversely, we can take the outcome of the fully random test and use it to estimate what the outcome of the biased assignment would have been\n", + "# Conversely, we can take the outcome of the fully random test and use it \n", + "# to estimate what the outcome of the biased assignment would have been\n", "\n", - "# Let's use data from biased assignment experiment 
to estimate the average effect of fully random assignment\n", "hypothetical_policy = df[\"T2\"]\n", "est, std = erupt_with_std(actual_propensity=0.5*pd.Series(np.ones(len(df))), \n", " actual_treatment=df[\"T1\"],\n", @@ -379,7 +379,7 @@ "\n", "print(\"Average outcome of the actual random assignment experiment:\", df[\"Y1\"].mean())\n", "print(\"Estimated outcome of biased assignment:\", est)\n", - "print(\"Confidence interval for estimated outcome:\", est-2*std, est + 2*std)\n", + "print(\"95% confidence interval for estimated outcome:\", est-2*std, est + 2*std)\n", "print(\"Average outcome of the actual biased assignment experiment:\", df[\"Y2\"].mean())" ] }, @@ -388,7 +388,7 @@ "id": "f724dbc3", "metadata": {}, "source": [ - "As you can see, the actual outcome is well within the confidence interval estimated by ERUPT" + "As you can see, the actual outcome is within the confidence interval estimated by ERUPT" ] }, { diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index b8eec36..033b86a 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -88,7 +88,6 @@ def get_estimator_list(dataset_name): return [est for est in estimator_list if "Dummy" not in est] - def run_experiment(args): # Process datasets data_sets = {} @@ -281,10 +280,12 @@ def get_all_test_scores(out_dir, dataset_name): return out -def generate_plots(out_dir: str, - log_scale: List[str]|None = None, - upper_bounds: dict|None = None, - lower_bounds: dict|None=None): +def generate_plots( + out_dir: str, + log_scale: List[str] | None = None, + upper_bounds: dict | None = None, + lower_bounds: dict | None = None, +): if log_scale is None: log_scale = ["energy_distance", "psw_energy_distance", "frobenius_norm"] metrics, datasets = extract_metrics_datasets(out_dir) @@ -404,14 +405,13 @@ def plot_mse_grid(title): legend_elements = [] for j, dataset in enumerate(datasets): df = get_all_test_scores(out_dir, dataset) - for m, value in upper_bounds.items(): + for m, value in upper_bounds.items(): if m in df.columns: df = df[df[m] < value].copy() for m, value in lower_bounds.items(): if m in df.columns: df = df[df[m] > value].copy() - for i, metric in enumerate(all_metrics): ax = axs[i, j] this_df = df[["estimator_name", metric, "MSE"]].dropna() @@ -502,4 +502,6 @@ def plot_mse_grid(title): out_dir = run_experiment(args) # upper_bounds = {"MSE": 1e2, "policy_risk": 0.2} # lower_bounds = {"erupt": 0.06, "bite": 0.75} - generate_plots(os.path.join(out_dir, "RCT"))#, upper_bounds=upper_bounds, lower_bounds=lower_bounds) + generate_plots( + os.path.join(out_dir, "RCT") + ) # , upper_bounds=upper_bounds, lower_bounds=lower_bounds)
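
A minimal driver sketch (not part of the patch series), assuming experiment_runner.py is importable — e.g. when run from notebooks/RunExperiments/ — and that the pickled datasets under RunDatasets/ have already been generated. The identifier is hypothetical; the metric list, dataset names and bound values mirror the commented-out examples in __main__ above rather than tested defaults.

import os

from causaltune.score.scoring import supported_metrics, metrics_to_minimize
from experiment_runner import parse_arguments, run_experiment, generate_plots

args = parse_arguments()
args.identifier = "my_experiment"  # hypothetical run identifier
args.metrics = supported_metrics("backdoor", False, False)
# run_experiment assumes large and small datasets are not mixed in the same call
args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"]
args.num_samples = 100
args.timestamp_in_dirname = False
args.outcome_model = "auto"  # or "nested" for the old-style nested model

# Metrics returned by metrics_to_minimize() are searched with mode="min";
# all other metrics are maximised (see the optimiser.py change above).
print("Minimised metrics:", metrics_to_minimize())

# Returns e.g. EXPERIMENT_RESULTS_my_experiment/Large; per-case results land in
# a subfolder named after the dataset suffix, here "RCT".
out_dir = run_experiment(args)

# Optional outlier filtering when plotting, using the bounds suggested in the
# commented-out lines of __main__; with the defaults as written, both dicts
# need to be supplied together.
generate_plots(
    os.path.join(out_dir, "RCT"),
    upper_bounds={"MSE": 1e2, "policy_risk": 0.2},
    lower_bounds={"erupt": 0.06, "bite": 0.75},
)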