From c84f292a11280ecb64623f171f5382f7cfcab6bf Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 2 Dec 2024 08:50:54 +0000 Subject: [PATCH 1/4] some plotting improvements --- causaltune/score/scoring.py | 17 +- notebooks/RunExperiments/experiment_runner.py | 280 +++++++++++------- 2 files changed, 186 insertions(+), 111 deletions(-) diff --git a/causaltune/score/scoring.py b/causaltune/score/scoring.py index e79150e..5547876 100644 --- a/causaltune/score/scoring.py +++ b/causaltune/score/scoring.py @@ -51,13 +51,13 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List else: metrics = [ "erupt", - "norm_erupt", - "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one + # "norm_erupt", + # "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one "policy_risk", # NEW "qini", "auc", # "r_scorer", - "energy_distance", # is broken without propensity weighting + # "energy_distance", # is broken without propensity weighting "psw_energy_distance", "frobenius_norm", # NEW "codec", # NEW @@ -68,6 +68,17 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List return metrics +def metrics_to_minimize(): + return [ + "energy_distance", + "psw_energy_distance", + "codec", + "frobenius_norm", + "psw_frobenius_norm", + "policy_risk", + ] + + class Scorer: def __init__( self, diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index b66d105..8961d01 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -1,11 +1,15 @@ import os import sys import pickle +import glob +import copy +import argparse + import numpy as np import matplotlib import matplotlib.pyplot as plt -import copy -import argparse +import pandas as pd + from sklearn.model_selection import train_test_split from datetime import datetime import warnings @@ -20,7 +24,11 @@ from causaltune import CausalTune # noqa: E402 from causaltune.datasets import load_dataset # noqa: E402 from causaltune.models.passthrough import passthrough_model # noqa: E402 -from causaltune.search.params import SimpleParamService +from causaltune.search.params import SimpleParamService # noqa: E402 +from causaltune.score.scoring import ( + metrics_to_minimize, + supported_metrics, +) # noqa: E402 def parse_arguments(): @@ -41,6 +49,13 @@ def parse_arguments(): parser.add_argument( "--outcome_model", type=str, default="nested", help="Outcome model type" ) + parser.add_argument( + "--timestamp_in_dirname", + type=bool, + default="False", + help="Include timestampl in out_dir name?", + ) + parser.add_argument("--test_size", type=float, default=0.33, help="Test set size") parser.add_argument( "--time_budget", type=int, default=None, help="Time budget for optimization" @@ -93,10 +108,6 @@ def get_estimator_list(dataset_name): def run_experiment(args): - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - out_dir = f"EXPERIMENT_RESULTS_{timestamp}_{args.identifier}" - os.makedirs(out_dir, exist_ok=True) - # Process datasets data_sets = {} for dataset in args.datasets: @@ -110,6 +121,16 @@ def run_experiment(args): file_path = f"RunDatasets/{size}/{name}.pkl" data_sets[f"{size} {name}"] = load_dataset(file_path) + if args.timestamp_in_dirname: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + out_dir = f"EXPERIMENT_RESULTS_{timestamp}_{args.identifier}" + else: + out_dir = f"EXPERIMENT_RESULTS_{args.identifier}" + + os.makedirs(out_dir, 
exist_ok=True) + out_dir = os.path.join(out_dir, size) + os.makedirs(out_dir, exist_ok=True) + print(f"Loaded datasets: {list(data_sets.keys())}") # Set time budgets properly @@ -128,6 +149,9 @@ def run_experiment(args): args.time_budget = None # Use only components_time_budget for dataset_name, cd in data_sets.items(): + case = dataset_name.split("_")[-1] + os.makedirs(f"{out_dir}/{case}", exist_ok=True) + for i_run in range(1, args.n_runs + 1): cd_i = copy.deepcopy(cd) train_df, test_df = train_test_split(cd_i.data, test_size=args.test_size) @@ -135,41 +159,47 @@ def run_experiment(args): cd_i.data = train_df for metric in args.metrics: - if "KC" in dataset_name and "KCKP" not in dataset_name: - propensity_model = "auto" - elif "KCKP" in dataset_name: - propensity_model = passthrough_model( - cd.propensity_modifiers, include_control=False + try: + fn = make_filename(metric, dataset_name, i_run) + out_fn = os.path.join(out_dir, case, fn) + if os.path.isfile(out_fn): + print(f"File {out_fn} exists, skipping...") + continue + + if "KC" in dataset_name and "KCKP" not in dataset_name: + propensity_model = "auto" + elif "KCKP" in dataset_name: + propensity_model = passthrough_model( + cd.propensity_modifiers, include_control=False + ) + else: + propensity_model = "dummy" + + ct = CausalTune( + metric=metric, + estimator_list=get_estimator_list(dataset_name), + num_samples=args.num_samples, + components_time_budget=args.components_time_budget, # Use this instead + verbose=1, + components_verbose=1, + store_all_estimators=True, + propensity_model=propensity_model, + outcome_model=args.outcome_model, ) - else: - propensity_model = "dummy" - - ct = CausalTune( - metric=metric, - estimator_list=get_estimator_list(dataset_name), - num_samples=args.num_samples, - components_time_budget=args.components_time_budget, # Use this instead - verbose=1, - components_verbose=1, - store_all_estimators=True, - propensity_model=propensity_model, - outcome_model=args.outcome_model, - ) - ct.fit( - data=cd_i, - treatment="treatment", - outcome="outcome", - ) + ct.fit( + data=cd_i, + treatment="treatment", + outcome="outcome", + ) - # Compute scores and save results - results = compute_scores(ct, metric, test_df) + # Compute scores and save results + results = compute_scores(ct, metric, test_df) - with open( - f"{out_dir}/{metric}_run_{i_run}_{dataset_name.replace(' ', '_')}.pkl", - "wb", - ) as f: - pickle.dump(results, f) + with open(out_fn, "wb") as f: + pickle.dump(results, f) + except Exception as e: + print(f"Error processing {dataset_name}_{metric}_{i_run}: {e}") return out_dir @@ -180,49 +210,46 @@ def compute_scores(ct, metric, test_df): all_scores = [] for trial in ct.results.trials: - estimator_name = trial.last_result["estimator_name"] - if trial.last_result["estimator"]: - estimator = trial.last_result["estimator"] - scores = {} - for ds_name, df in datasets.items(): - scores[ds_name] = {} - est_scores = ct.scorer.make_scores( - estimator, - df, - metrics_to_report=ct.metrics_to_report, - ) - est_scores["estimator_name"] = estimator_name + try: + estimator_name = trial.last_result["estimator_name"] + if "estimator" in trial.last_result and trial.last_result["estimator"]: + estimator = trial.last_result["estimator"] + scores = {} + for ds_name, df in datasets.items(): + scores[ds_name] = {} + est_scores = ct.scorer.make_scores( + estimator, + df, + metrics_to_report=ct.metrics_to_report, + ) + est_scores["estimator_name"] = estimator_name - scores[ds_name]["CATE_estimate"] = np.squeeze( - 
estimator.estimator.effect(df) - ) - scores[ds_name]["CATE_groundtruth"] = np.squeeze(df["true_effect"]) - est_scores["MSE"] = np.mean( - ( - scores[ds_name]["CATE_estimate"] - - scores[ds_name]["CATE_groundtruth"] + scores[ds_name]["CATE_estimate"] = np.squeeze( + estimator.estimator.effect(df) + ) + scores[ds_name]["CATE_groundtruth"] = np.squeeze(df["true_effect"]) + est_scores["MSE"] = np.mean( + ( + scores[ds_name]["CATE_estimate"] + - scores[ds_name]["CATE_groundtruth"] + ) + ** 2 ) - ** 2 + scores[ds_name]["scores"] = est_scores + scores["optimization_score"] = trial.last_result.get( + "optimization_score" ) - scores[ds_name]["scores"] = est_scores - scores["optimization_score"] = trial.last_result.get("optimization_score") - estimator_scores[estimator_name].append(copy.deepcopy(scores)) + estimator_scores[estimator_name].append(copy.deepcopy(scores)) # Will use this in the nex all_scores.append(scores) + except Exception as e: + print(f"Error processing trial: {e}") for k in estimator_scores.keys(): estimator_scores[k] = sorted( estimator_scores[k], key=lambda x: x["validation"]["scores"][metric], - reverse=metric - not in [ - "energy_distance", - "psw_energy_distance", - "codec", - "frobenius_norm", - "psw_frobenius_norm", - "policy_risk", - ], + reverse=metric not in metrics_to_minimize(), ) return { @@ -235,12 +262,47 @@ def compute_scores(ct, metric, test_df): } -def generate_plots(out_dir, metrics, datasets, n_runs): +def extract_metrics_datasets(out_dir: str): + metrics = set() + datasets = set() + + for file in glob.glob(f"{out_dir}/*.pkl"): + parts = os.path.basename(file).split("-") + metrics.add(parts[0]) + datasets.add(parts[-1].replace(".pkl", "").replace("_", " ")) + + return sorted(list(metrics)), sorted(list(datasets)) + + +def make_filename(metric, dataset, i_run): + return f"{metric}-run-{i_run}-{dataset.replace(' ', '_')}.pkl" + + +def get_all_test_scores(out_dir, dataset_name): + size, ds_type, case = dataset_name.split(" ") + all_scores = [] + for file in glob.glob(f"{out_dir}/*_{ds_type}_{case}.pkl"): + with open(file, "rb") as f: + results = pickle.load(f) + for x in results["all_scores"]: + all_scores.append( + { + k: v + for k, v in x["test"]["scores"].items() + if k not in ["values"] + } + ) + out = pd.DataFrame(all_scores) + return out + + +def generate_plots(out_dir): + metrics, datasets = extract_metrics_datasets(out_dir) # Define names for metrics and experiments metric_names = { "psw_frobenius_norm": "Propensity Weighted Frobenius Norm", "frobenius_norm": "Frobenius Norm", - "prob_erupt": "Probabilistic Erupt", + "erupt": "ERUPT", "codec": "CODEC", "policy_risk": "Policy Risk", "energy_distance": "Energy Distance", @@ -272,7 +334,7 @@ def plot_grid(title): for j, dataset in enumerate(datasets): ax = axs[i, j] - filename = f"{metric}_run_1_{dataset.replace(' ', '_')}.pkl" + filename = make_filename(metric, dataset, 1) filepath = os.path.join(out_dir, filename) if os.path.exists(filepath): @@ -334,47 +396,46 @@ def plot_grid(title): plt.close() def plot_mse_grid(title): + df = get_all_test_scores(out_dir, datasets[0]) + est_names = sorted(df["estimator_name"].unique()) + problem = "iv" if "IV" in datasets[0] else "backdoor" + all_metrics = [ + c for c in df.columns if c in supported_metrics(problem, False, False) + ] + fig, axs = plt.subplots( - len(metrics), len(datasets), figsize=(20, 5 * len(metrics)), dpi=300 + len(all_metrics), len(datasets), figsize=(20, 5 * len(all_metrics)), dpi=300 ) - if len(metrics) == 1 and len(datasets) == 1: + if 
len(all_metrics) == 1 and len(datasets) == 1: axs = np.array([[axs]]) - elif len(metrics) == 1 or len(datasets) == 1: + elif len(all_metrics) == 1 or len(datasets) == 1: axs = axs.reshape(-1, 1) if len(datasets) == 1 else axs.reshape(1, -1) legend_elements = [] + for j, dataset in enumerate(datasets): + df = get_all_test_scores(out_dir, dataset) - for i, metric in enumerate(metrics): - for j, dataset in enumerate(datasets): + for i, metric in enumerate(all_metrics): ax = axs[i, j] + this_df = df[["estimator_name", metric, "MSE"]].dropna() + this_df = this_df[~np.isinf(this_df[metric].values)] + if len(this_df): + for idx, est_name in enumerate(est_names): + df_slice = this_df[this_df["estimator_name"] == est_name] + if "Dummy" not in est_name and len(df_slice): - filename = f"{metric}_run_1_{dataset.replace(' ', '_')}.pkl" - filepath = os.path.join(out_dir, filename) - - if os.path.exists(filepath): - with open(filepath, "rb") as f: - results = pickle.load(f) - - for idx, (est_name, scr) in enumerate( - results["scores_per_estimator"].items() - ): - if "Dummy" not in est_name and len(scr): - CATE_gt = scr[0]["test"]["CATE_groundtruth"] - CATE_est = scr[0]["test"]["CATE_estimate"] - CATE_gt = np.array(CATE_gt).flatten() - CATE_est = np.array(CATE_est).flatten() - mse = np.mean((CATE_gt - CATE_est) ** 2) - score = scr[0]["test"][metric] marker = markers[idx % len(markers)] - + # TODO: throw x-axis outliers away ax.scatter( - mse, - score, + df_slice["MSE"], + df_slice[metric], color=colors[idx], s=50, marker=marker, linewidths=0.5, ) + if metric not in metrics_to_minimize(): + ax.invert_yaxis() trimmed_est_name = est_name.split(".")[-1] if i == 0 and j == 0: @@ -392,9 +453,9 @@ def plot_mse_grid(title): ax.set_xscale("log") ax.grid(True) - ax.set_title( - f"{results['best_estimator'].split('.')[-1]}", fontsize=8 - ) + # ax.set_title( + # f"{results['best_estimator'].split('.')[-1]}", fontsize=8 + # ) else: ax.text(0.5, 0.5, "No data", ha="center", va="center") @@ -432,10 +493,13 @@ def plot_mse_grid(title): if __name__ == "__main__": args = parse_arguments() - # args.identifier = "Egor_test" - # args.metrics = ["psw_energy_distance"] - # args.num_samples = 40 - # args.outcome_model = "auto" # or use "nested" for the old-style nested model - runs = run_experiment(args) + args.identifier = "Egor_test" + args.metrics = supported_metrics("backdoor", False, False) + # run_experiment assumes we don't mix large and small datasets in the same call + args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] + args.num_samples = 100 + args.timestamp_in_dirname = False + args.outcome_model = "auto" # or use "nested" for the old-style nested model + out_dir = run_experiment(args) # Plots should be supplied with the directory name pattern where the files live, load the rest from there - generate_plots(runs, args.metrics, args.datasets, args.n_runs) + generate_plots(os.path.join(out_dir, "RCT")) From 1c22dddc2691269e0866a31d6d5ca5d322725b46 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Mon, 2 Dec 2024 08:53:02 +0000 Subject: [PATCH 2/4] Comment out Egor's tweaks --- notebooks/RunExperiments/experiment_runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index 8961d01..a770091 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -493,13 +493,13 @@ def plot_mse_grid(title): if __name__ == "__main__": args = 
parse_arguments() - args.identifier = "Egor_test" - args.metrics = supported_metrics("backdoor", False, False) - # run_experiment assumes we don't mix large and small datasets in the same call - args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] - args.num_samples = 100 - args.timestamp_in_dirname = False - args.outcome_model = "auto" # or use "nested" for the old-style nested model + # args.identifier = "Egor_test" + # args.metrics = supported_metrics("backdoor", False, False) + # # run_experiment assumes we don't mix large and small datasets in the same call + # args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"] + # args.num_samples = 100 + # args.timestamp_in_dirname = False + # args.outcome_model = "auto" # or use "nested" for the old-style nested model out_dir = run_experiment(args) # Plots should be supplied with the directory name pattern where the files live, load the rest from there generate_plots(os.path.join(out_dir, "RCT")) From e51b58c8af53957fb88868627d50e8eeb4ae407e Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Fri, 6 Dec 2024 07:49:48 +0000 Subject: [PATCH 3/4] Minor fixes as part of experiments --- causaltune/optimiser.py | 30 ++---------- causaltune/score/scoring.py | 10 ++-- causaltune/search/component.py | 21 +++++++- notebooks/RunExperiments/experiment_runner.py | 48 +++++++++---------- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/causaltune/optimiser.py b/causaltune/optimiser.py index 337809a..245cee2 100644 --- a/causaltune/optimiser.py +++ b/causaltune/optimiser.py @@ -20,7 +20,7 @@ from joblib import Parallel, delayed from causaltune.search.params import SimpleParamService -from causaltune.score.scoring import Scorer +from causaltune.score.scoring import Scorer, metrics_to_minimize from causaltune.utils import treatment_is_multivalue from causaltune.models.monkey_patches import ( AutoML, @@ -514,19 +514,7 @@ def fit( evaluated_rewards=( [] if len(self.resume_scores) == 0 else self.resume_scores ), - mode=( - "min" - if self.metric - in [ - "energy_distance", - "psw_energy_distance", - "frobenius_norm", - "psw_frobenius_norm", - "codec", - "policy_risk", - ] - else "max" - ), + mode=("min" if self.metric in metrics_to_minimize() else "max"), low_cost_partial_config={}, **self._settings["tuner"], ) @@ -547,19 +535,7 @@ def fit( evaluated_rewards=( [] if len(self.resume_scores) == 0 else self.resume_scores ), - mode=( - "min" - if self.metric - in [ - "energy_distance", - "psw_energy_distance", - "frobenius_norm", - "psw_frobenius_norm", - "codec", - "policy_risk", - ] - else "max" - ), + mode=("min" if self.metric in metrics_to_minimize() else "max"), low_cost_partial_config={}, **self._settings["tuner"], ) diff --git a/causaltune/score/scoring.py b/causaltune/score/scoring.py index 5547876..36aa894 100644 --- a/causaltune/score/scoring.py +++ b/causaltune/score/scoring.py @@ -37,7 +37,7 @@ def const_marginal_effect(self, X): return self.cate_estimate -def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List[str]: +def supported_metrics(problem: str, multivalue: bool, scores_only: bool, constant_ptt: bool=False) -> List[str]: if problem == "iv": metrics = ["energy_distance", "frobenius_norm", "codec"] if not scores_only: @@ -51,13 +51,13 @@ def supported_metrics(problem: str, multivalue: bool, scores_only: bool) -> List else: metrics = [ "erupt", - # "norm_erupt", + "norm_erupt", # "greedy_erupt", # regular erupt was made probabilistic, no need for a separate one "policy_risk", # NEW "qini", "auc", # 
"r_scorer", - # "energy_distance", # is broken without propensity weighting + "energy_distance", # should only be used in iv problems "psw_energy_distance", "frobenius_norm", # NEW "codec", # NEW @@ -101,6 +101,10 @@ def __init__( self.identified_estimand = causal_model.identify_effect( proceed_when_unidentifiable=True ) + if "Dummy" in propensity_model.__class__.__name__: + self.constant_ptt = True + else: + self.constant_ptt = False if problem == "backdoor": print( diff --git a/causaltune/search/component.py b/causaltune/search/component.py index f3920aa..542a494 100644 --- a/causaltune/search/component.py +++ b/causaltune/search/component.py @@ -61,7 +61,7 @@ def joint_config(data_size: Tuple[int, int], estimator_list=None): cfg, init_params, low_cost_init_params = flaml_config_to_tune_config( cls.search_space(data_size=data_size, task=task) ) - + cfg, init_params = tweak_config(cfg, init_params, name) # Test if the estimator instantiates fine try: cls(task=task, **init_params) @@ -76,6 +76,25 @@ def joint_config(data_size: Tuple[int, int], estimator_list=None): return tune.choice(joint_cfg), joint_init_params, joint_low_cost_init_params +def tweak_config(cfg: dict, init_params: dict, estimator_name: str): + """ + Tweak built-in FLAML search spaces to limit the number of estimators + :param cfg: + :param estimator_name: + :return: + """ + out = copy.deepcopy(cfg) + if "xgboost" in estimator_name or estimator_name in [ + "random_forest", + "extra_trees", + "lgbm", + "catboost", + ]: + out["n_estimators"] = tune.lograndint(4, 1000) + init_params["n_estimators"] = 100 + return out, init_params + + def model_from_cfg(cfg: dict): cfg = copy.deepcopy(cfg) model_name = cfg.pop("estimator_name") diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index a770091..b8eec36 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -4,6 +4,7 @@ import glob import copy import argparse +from typing import List import numpy as np import matplotlib @@ -26,8 +27,8 @@ from causaltune.models.passthrough import passthrough_model # noqa: E402 from causaltune.search.params import SimpleParamService # noqa: E402 from causaltune.score.scoring import ( - metrics_to_minimize, - supported_metrics, + metrics_to_minimize, # noqa: E402 + supported_metrics, # noqa: E402 ) # noqa: E402 @@ -86,25 +87,6 @@ def get_estimator_list(dataset_name): estimator_list = cfg.estimator_names_from_patterns(problem, "all", 1001) return [est for est in estimator_list if "Dummy" not in est] - # return [ - # "iv.econml.iv.dr.LinearDRIV", - # "iv.econml.iv.dml.DMLIV", - # "iv.econml.iv.dr.SparseLinearDRIV", - # "iv.econml.iv.dr.LinearIntentToTreatDRIV", - # ] - # else: - # return [ - # # "Dummy", # Let's exclude this until the FLAML PR for attr_cost is in - # "SparseLinearDML", - # "ForestDRLearner", - # "TransformedOutcome", - # "CausalForestDML", - # ".LinearDML", - # "DomainAdaptationLearner", - # "SLearner", - # "XLearner", - # "TLearner", - # ] def run_experiment(args): @@ -159,6 +141,9 @@ def run_experiment(args): cd_i.data = train_df for metric in args.metrics: + if metric == "ate": # this is not something to optimize + continue + print(f"Optimzing {metric} for {dataset_name} (run {i_run})") try: fn = make_filename(metric, dataset_name, i_run) out_fn = os.path.join(out_dir, case, fn) @@ -296,7 +281,12 @@ def get_all_test_scores(out_dir, dataset_name): return out -def generate_plots(out_dir): +def generate_plots(out_dir: 
str, + log_scale: List[str]|None = None, + upper_bounds: dict|None = None, + lower_bounds: dict|None=None): + if log_scale is None: + log_scale = ["energy_distance", "psw_energy_distance", "frobenius_norm"] metrics, datasets = extract_metrics_datasets(out_dir) # Define names for metrics and experiments metric_names = { @@ -414,6 +404,13 @@ def plot_mse_grid(title): legend_elements = [] for j, dataset in enumerate(datasets): df = get_all_test_scores(out_dir, dataset) + for m, value in upper_bounds.items(): + if m in df.columns: + df = df[df[m] < value].copy() + for m, value in lower_bounds.items(): + if m in df.columns: + df = df[df[m] > value].copy() + for i, metric in enumerate(all_metrics): ax = axs[i, j] @@ -452,6 +449,8 @@ def plot_mse_grid(title): ) ax.set_xscale("log") + if metric in log_scale: + ax.set_yscale("log") ax.grid(True) # ax.set_title( # f"{results['best_estimator'].split('.')[-1]}", fontsize=8 @@ -501,5 +500,6 @@ def plot_mse_grid(title): # args.timestamp_in_dirname = False # args.outcome_model = "auto" # or use "nested" for the old-style nested model out_dir = run_experiment(args) - # Plots should be supplied with the directory name pattern where the files live, load the rest from there - generate_plots(os.path.join(out_dir, "RCT")) + # upper_bounds = {"MSE": 1e2, "policy_risk": 0.2} + # lower_bounds = {"erupt": 0.06, "bite": 0.75} + generate_plots(os.path.join(out_dir, "RCT"))#, upper_bounds=upper_bounds, lower_bounds=lower_bounds) From 5627d9a18533b2ec5d9c05e0b2be727d4eaf5a45 Mon Sep 17 00:00:00 2001 From: "Egor.Kraev" Date: Fri, 6 Dec 2024 10:33:02 +0000 Subject: [PATCH 4/4] fix ERUPT std --- causaltune/score/erupt_core.py | 4 +- notebooks/ERUPT basics.ipynb | 130 +++++++++--------- notebooks/RunExperiments/experiment_runner.py | 18 +-- 3 files changed, 77 insertions(+), 75 deletions(-) diff --git a/causaltune/score/erupt_core.py b/causaltune/score/erupt_core.py index ae79260..6a69a8c 100644 --- a/causaltune/score/erupt_core.py +++ b/causaltune/score/erupt_core.py @@ -50,8 +50,8 @@ def erupt_with_std( ] mean += np.mean(means) std += np.std(means) / np.sqrt(num_splits) # Standard error of the mean - - return mean / resamples, std / resamples + # 1.5 is an empirical factor to make the confidence interval wider + return mean / resamples, 1.5 * std / resamples def erupt( diff --git a/notebooks/ERUPT basics.ipynb b/notebooks/ERUPT basics.ipynb index c4bdb54..94fcb30 100644 --- a/notebooks/ERUPT basics.ipynb +++ b/notebooks/ERUPT basics.ipynb @@ -103,33 +103,33 @@ " \n", " \n", " 0\n", - " 0.452636\n", - " 0\n", - " 1.684484\n", + " 0.898227\n", + " 1\n", + " 1.288637\n", " \n", " \n", " 1\n", - " 0.380215\n", + " 0.462092\n", " 0\n", - " 0.745268\n", + " 0.771976\n", " \n", " \n", " 2\n", - " 0.584036\n", - " 1\n", - " 0.762300\n", + " 0.858974\n", + " 0\n", + " 1.881019\n", " \n", " \n", " 3\n", - " 0.505191\n", - " 0\n", - " 1.425354\n", + " 0.228084\n", + " 1\n", + " 0.357797\n", " \n", " \n", " 4\n", - " 0.384110\n", + " 0.962512\n", " 1\n", - " 1.834628\n", + " 1.066413\n", " \n", " \n", "\n", @@ -137,11 +137,11 @@ ], "text/plain": [ " X T1 Y1\n", - "0 0.452636 0 1.684484\n", - "1 0.380215 0 0.745268\n", - "2 0.584036 1 0.762300\n", - "3 0.505191 0 1.425354\n", - "4 0.384110 1 1.834628" + "0 0.898227 1 1.288637\n", + "1 0.462092 0 0.771976\n", + "2 0.858974 0 1.881019\n", + "3 0.228084 1 0.357797\n", + "4 0.962512 1 1.066413" ] }, "execution_count": 2, @@ -216,53 +216,53 @@ " \n", " \n", " 0\n", - " 0.452636\n", - " 0\n", - " 1.684484\n", - " 0.726318\n", - " 0\n", - 
" 0.273682\n", - " 0.904259\n", + " 0.898227\n", + " 1\n", + " 1.288637\n", + " 0.949114\n", + " 1\n", + " 0.949114\n", + " 2.229118\n", " \n", " \n", " 1\n", - " 0.380215\n", + " 0.462092\n", " 0\n", - " 0.745268\n", - " 0.690108\n", - " 1\n", - " 0.690108\n", - " 1.930383\n", + " 0.771976\n", + " 0.731046\n", + " 0\n", + " 0.268954\n", + " 0.572308\n", " \n", " \n", " 2\n", - " 0.584036\n", - " 1\n", - " 0.762300\n", - " 0.792018\n", + " 0.858974\n", + " 0\n", + " 1.881019\n", + " 0.929487\n", " 1\n", - " 0.792018\n", - " 0.959608\n", + " 0.929487\n", + " 2.601592\n", " \n", " \n", " 3\n", - " 0.505191\n", - " 0\n", - " 1.425354\n", - " 0.752596\n", + " 0.228084\n", + " 1\n", + " 0.357797\n", + " 0.614042\n", " 1\n", - " 0.752596\n", - " 1.017777\n", + " 0.614042\n", + " 0.542638\n", " \n", " \n", " 4\n", - " 0.384110\n", + " 0.962512\n", " 1\n", - " 1.834628\n", - " 0.692055\n", + " 1.066413\n", + " 0.981256\n", " 1\n", - " 0.692055\n", - " 2.374030\n", + " 0.981256\n", + " 2.401383\n", " \n", " \n", "\n", @@ -270,11 +270,11 @@ ], "text/plain": [ " X T1 Y1 p T2 p_of_actual Y2\n", - "0 0.452636 0 1.684484 0.726318 0 0.273682 0.904259\n", - "1 0.380215 0 0.745268 0.690108 1 0.690108 1.930383\n", - "2 0.584036 1 0.762300 0.792018 1 0.792018 0.959608\n", - "3 0.505191 0 1.425354 0.752596 1 0.752596 1.017777\n", - "4 0.384110 1 1.834628 0.692055 1 0.692055 2.374030" + "0 0.898227 1 1.288637 0.949114 1 0.949114 2.229118\n", + "1 0.462092 0 0.771976 0.731046 0 0.268954 0.572308\n", + "2 0.858974 0 1.881019 0.929487 1 0.929487 2.601592\n", + "3 0.228084 1 0.357797 0.614042 1 0.614042 0.542638\n", + "4 0.962512 1 1.066413 0.981256 1 0.981256 2.401383" ] }, "execution_count": 3, @@ -319,10 +319,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average outcome of the actual biased assignment experiment: 1.411675477573636\n", - "Estimated outcome of random assignment: 1.251567372523789\n", - "95% confidence interval for estimated outcome: 1.2311928820519622 1.2719418629956158\n", - "Average outcome of the actual random assignment experiment: 1.2559621877416332\n" + "Average outcome of the actual biased assignment experiment: 1.4064676444383317\n", + "Estimated outcome of random assignment: 1.2594221770638483\n", + "95% confidence interval for estimated outcome: 1.230204391668238 1.2886399624594587\n", + "Average outcome of the actual random assignment experiment: 1.2461659092712785\n" ] } ], @@ -360,17 +360,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average outcome of the actual random assignment experiment: 1.2559621877416332\n", - "Estimated outcome of biased assignment: 1.4147647990746988\n", - "Confidence interval for estimated outcome: 1.398423601541284 1.4311059966081134\n", - "Average outcome of the actual biased assignment experiment: 1.411675477573636\n" + "Average outcome of the actual random assignment experiment: 1.2461659092712785\n", + "Estimated outcome of biased assignment: 1.405112521603215\n", + "95% confidence interval for estimated outcome: 1.3814865905561569 1.428738452650273\n", + "Average outcome of the actual biased assignment experiment: 1.4064676444383317\n" ] } ], "source": [ - "# Conversely, we can take the outcome of the fully random test and use it to estimate what the outcome of the biased assignment would have been\n", + "# Conversely, we can take the outcome of the fully random test and use it \n", + "# to estimate what the outcome of the biased assignment would have been\n", "\n", - "# Let's use data from biased assignment experiment 
to estimate the average effect of fully random assignment\n", "hypothetical_policy = df[\"T2\"]\n", "est, std = erupt_with_std(actual_propensity=0.5*pd.Series(np.ones(len(df))), \n", " actual_treatment=df[\"T1\"],\n", @@ -379,7 +379,7 @@ "\n", "print(\"Average outcome of the actual random assignment experiment:\", df[\"Y1\"].mean())\n", "print(\"Estimated outcome of biased assignment:\", est)\n", - "print(\"Confidence interval for estimated outcome:\", est-2*std, est + 2*std)\n", + "print(\"95% confidence interval for estimated outcome:\", est-2*std, est + 2*std)\n", "print(\"Average outcome of the actual biased assignment experiment:\", df[\"Y2\"].mean())" ] }, @@ -388,7 +388,7 @@ "id": "f724dbc3", "metadata": {}, "source": [ - "As you can see, the actual outcome is well within the confidence interval estimated by ERUPT" + "As you can see, the actual outcome is within the confidence interval estimated by ERUPT" ] }, { diff --git a/notebooks/RunExperiments/experiment_runner.py b/notebooks/RunExperiments/experiment_runner.py index b8eec36..033b86a 100644 --- a/notebooks/RunExperiments/experiment_runner.py +++ b/notebooks/RunExperiments/experiment_runner.py @@ -88,7 +88,6 @@ def get_estimator_list(dataset_name): return [est for est in estimator_list if "Dummy" not in est] - def run_experiment(args): # Process datasets data_sets = {} @@ -281,10 +280,12 @@ def get_all_test_scores(out_dir, dataset_name): return out -def generate_plots(out_dir: str, - log_scale: List[str]|None = None, - upper_bounds: dict|None = None, - lower_bounds: dict|None=None): +def generate_plots( + out_dir: str, + log_scale: List[str] | None = None, + upper_bounds: dict | None = None, + lower_bounds: dict | None = None, +): if log_scale is None: log_scale = ["energy_distance", "psw_energy_distance", "frobenius_norm"] metrics, datasets = extract_metrics_datasets(out_dir) @@ -404,14 +405,13 @@ def plot_mse_grid(title): legend_elements = [] for j, dataset in enumerate(datasets): df = get_all_test_scores(out_dir, dataset) - for m, value in upper_bounds.items(): + for m, value in upper_bounds.items(): if m in df.columns: df = df[df[m] < value].copy() for m, value in lower_bounds.items(): if m in df.columns: df = df[df[m] > value].copy() - for i, metric in enumerate(all_metrics): ax = axs[i, j] this_df = df[["estimator_name", metric, "MSE"]].dropna() @@ -502,4 +502,6 @@ def plot_mse_grid(title): out_dir = run_experiment(args) # upper_bounds = {"MSE": 1e2, "policy_risk": 0.2} # lower_bounds = {"erupt": 0.06, "bite": 0.75} - generate_plots(os.path.join(out_dir, "RCT"))#, upper_bounds=upper_bounds, lower_bounds=lower_bounds) + generate_plots( + os.path.join(out_dir, "RCT") + ) # , upper_bounds=upper_bounds, lower_bounds=lower_bounds)
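
A minimal driver sketch (not part of the patch series), assuming experiment_runner.py is importable — e.g. when run from notebooks/RunExperiments/ — and that the pickled datasets under RunDatasets/ have already been generated. The identifier is hypothetical; the metric list, dataset names and bound values mirror the commented-out examples in __main__ above rather than tested defaults.

import os

from causaltune.score.scoring import supported_metrics, metrics_to_minimize
from experiment_runner import parse_arguments, run_experiment, generate_plots

args = parse_arguments()
args.identifier = "my_experiment"  # hypothetical run identifier
args.metrics = supported_metrics("backdoor", False, False)
# run_experiment assumes large and small datasets are not mixed in the same call
args.datasets = ["Large Linear_RCT", "Large NonLinear_RCT"]
args.num_samples = 100
args.timestamp_in_dirname = False
args.outcome_model = "auto"  # or "nested" for the old-style nested model

# Metrics returned by metrics_to_minimize() are searched with mode="min";
# all other metrics are maximised (see the optimiser.py change above).
print("Minimised metrics:", metrics_to_minimize())

# Returns e.g. EXPERIMENT_RESULTS_my_experiment/Large; per-case results land in
# a subfolder named after the dataset suffix, here "RCT".
out_dir = run_experiment(args)

# Optional outlier filtering when plotting, using the bounds suggested in the
# commented-out lines of __main__; with the defaults as written, both dicts
# need to be supplied together.
generate_plots(
    os.path.join(out_dir, "RCT"),
    upper_bounds={"MSE": 1e2, "policy_risk": 0.2},
    lower_bounds={"erupt": 0.06, "bite": 0.75},
)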