Merge pull request #310 from ZmeiGorynych/flat_search
First rough cut of simultaneous search for estimator and outcome function
AlxdrPolyakov authored Aug 29, 2024
2 parents 4bba4d1 + cb6677e commit 7f68879
Showing 15 changed files with 575 additions and 187 deletions.
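The headline change: the tuner's outcome_model setting now accepts "auto", which folds the choice of outcome estimator and its hyperparameters into the same flat search space as the causal estimator, while the new default "nested" keeps the previous behavior of an inner FLAML search per trial. A minimal usage sketch, assuming the CausalTune class from causaltune/optimiser.py below; the CausalityDataset import path and the fit() call are assumptions based on this diff, not a verified API:

    from causaltune import CausalTune
    from causaltune.data_utils import CausalityDataset  # assumed import path

    cd = CausalityDataset(data=df, treatment="treatment", outcomes=["outcome"])  # df: your DataFrame
    ct = CausalTune(
        outcome_model="auto",  # sample outcome estimators jointly with the causal estimator (this PR)
        # outcome_model="nested",  # previous behavior: inner FLAML search for the outcome model
    )
    ct.fit(data=cd)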
5 changes: 4 additions & 1 deletion causaltune/dataset_processor.py
@@ -14,7 +14,10 @@ def __init__(self):
self.encoder = None

def fit(
self, cd: CausalityDataset, encoder_type: Optional[str] = "onehot", outcome: str = None
self,
cd: CausalityDataset,
encoder_type: Optional[str] = "onehot",
outcome: str = None,
):
cd = copy.deepcopy(cd)
self.preprocess_dataset(
86 changes: 86 additions & 0 deletions causaltune/models/regression.py
@@ -0,0 +1,86 @@
from sklearn.linear_model import ElasticNet, LassoLars


from flaml.automl.model import SKLearnEstimator
from flaml import tune

# These models are present in the flaml source code but, for some reason,
# are missing from the released flaml 2.2.0 package,
# so keep this file in the project for now


class ElasticNetEstimator(SKLearnEstimator):
"""The class for tuning Elastic Net regression model."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, data_size, task="regresssion", **params):
return {
"alpha": {
"domain": tune.loguniform(lower=0.0001, upper=1.0),
"init_value": 0.1,
},
"l1_ratio": {
"domain": tune.uniform(lower=0.0, upper=1.0),
"init_value": 0.5,
},
"selection": {
"domain": tune.choice(["cyclic", "random"]),
"init_value": "cyclic",
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
params["tol"] = params.get("tol", 0.0001)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "ElasticNet for regression task only"
self.estimator_class = ElasticNet


class LassoLarsEstimator(SKLearnEstimator):
"""The class for tuning Lasso model fit with Least Angle Regression a.k.a. Lars."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, task=None, **params):
return {
"alpha": {
"domain": tune.loguniform(lower=1e-4, upper=1.0),
"init_value": 0.1,
},
"fit_intercept": {
"domain": tune.choice([True, False]),
"init_value": True,
},
"eps": {
"domain": tune.loguniform(lower=1e-16, upper=1e-4),
"init_value": 2.220446049250313e-16,
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "LassoLars for regression task only"
self.estimator_class = LassoLars

def predict(self, X, **kwargs):
X = self._preprocess(X)
return self._model.predict(X, **kwargs)
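Both wrappers follow flaml's SKLearnEstimator contract, so they can also be registered with a plain FLAML run. A short sketch assuming FLAML's public AutoML.add_learner API; the learner names and the training data X_train, y_train are placeholders:

    from flaml import AutoML
    from causaltune.models.regression import ElasticNetEstimator, LassoLarsEstimator

    automl = AutoML()
    # register the custom learners under arbitrary names
    automl.add_learner(learner_name="elastic_net", learner_class=ElasticNetEstimator)
    automl.add_learner(learner_name="lasso_lars", learner_class=LassoLarsEstimator)
    # restrict the search to just these learners
    automl.fit(
        X_train, y_train,
        task="regression",
        time_budget=30,
        estimator_list=["elastic_net", "lasso_lars"],
    )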
158 changes: 87 additions & 71 deletions causaltune/optimiser.py
@@ -1,5 +1,4 @@
import copy
from copy import deepcopy
import warnings
from typing import List, Optional, Union
from collections import defaultdict
@@ -19,10 +18,9 @@

from joblib import Parallel, delayed

from causaltune.params import SimpleParamService
from causaltune.search.params import SimpleParamService
from causaltune.scoring import Scorer
from causaltune.r_score import RScoreWrapper
from causaltune.utils import clean_config, treatment_is_multivalue
from causaltune.utils import treatment_is_multivalue
from causaltune.models.monkey_patches import (
AutoML,
apply_multitreatment,
@@ -96,7 +94,7 @@ def __init__(
test_size=None,
num_samples=-1,
propensity_model="dummy",
outcome_model=None,
outcome_model="nested",
components_task="regression",
components_verbose=0,
components_pred_time_limit=10 / 1e6,
@@ -181,9 +179,9 @@ def __init__(
resources_per_trial if resources_per_trial is not None else {"cpu": 0.5}
)
self._settings["try_init_configs"] = try_init_configs
self._settings["include_experimental_estimators"] = (
include_experimental_estimators
)
self._settings[
"include_experimental_estimators"
] = include_experimental_estimators

# params for FLAML on component models:
self._settings["component_models"] = {}
@@ -246,33 +244,44 @@ def init_propensity_model(self, propensity_model: str):
)

def init_outcome_model(self, outcome_model):
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
# if we are only supplying certain features to the propensity function,
# make them invisible to the outcome component model
# This is a workaround for the DoWhy/EconML data model which doesn't
# support that out of the box
if outcome_model is not None:
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
self.outcome_model = outcome_model

if hasattr(outcome_model, "fit") and hasattr(outcome_model, "predict"):
return outcome_model
elif outcome_model == "auto":
# Will be dynamically chosen at optimization time
return outcome_model
elif outcome_model == "nested":
# The current default behavior
return self.auto_outcome_model()
else:
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML
raise ValueError(
'outcome_model valid values are "nested", "auto", or an estimator object'
)

self.outcome_model = outcome_model_class(
**self._settings["component_models"]
def auto_outcome_model(self):
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
# TODO: implement feature_filter for arbitrary outcome models
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML

return outcome_model_class(**self._settings["component_models"])

def fit(
self,
@@ -363,7 +372,6 @@ def fit(
)

self.init_propensity_model(self._settings["propensity_model"])
self.init_outcome_model(self._settings["outcome_model"])

self.identified_estimand: IdentifiedEstimand = (
self.causal_model.identify_effect(proceed_when_unidentifiable=True)
@@ -406,11 +414,10 @@

# config with method-specific params
self.cfg = SimpleParamService(
self.propensity_model,
self.outcome_model,
n_jobs=self._settings["component_models"]["n_jobs"],
include_experimental=self._settings["include_experimental_estimators"],
multivalue=treatment_is_multivalue(self._treatment_values),
sample_outcome_estimators=self._settings["outcome_model"] == "auto",
)

self.estimator_list = self.cfg.estimator_names_from_patterns(
@@ -454,24 +461,30 @@ def fit(
if self._settings["test_size"] is not None:
self.test_df = self.test_df.sample(self._settings["test_size"])

self.r_scorer = (
None
if "r_scorer" not in self.metrics_to_report
else RScoreWrapper(
self.outcome_model,
self.propensity_model,
self.train_df,
self.test_df,
outcome,
treatment,
common_causes,
effect_modifiers,
if "r_scorer" in self.metrics_to_report:
raise NotImplementedError(
"R-squared scorer no longer suported, please raise an issue if you want it back"
)
# self.r_scorer = (
# None
# if "r_scorer" not in self.metrics_to_report
# else RScoreWrapper(
# self.outcome_model,
# self.propensity_model,
# self.train_df,
# self.test_df,
# outcome,
# treatment,
# common_causes,
# effect_modifiers,
# )
# )

search_space = self.cfg.search_space(
self.estimator_list, data_size=data.data.shape
)

search_space = self.cfg.search_space(self.estimator_list)
init_cfg = (
self.cfg.default_configs(self.estimator_list)
self.cfg.default_configs(self.estimator_list, data_size=data.data.shape)
if self._settings["try_init_configs"]
else []
)
@@ -543,7 +556,7 @@ def _tune_with_config(self, config: dict) -> dict:
# to spawn a separate process to prevent cross-talk between tuner and automl on component models:

estimates = Parallel(n_jobs=2, backend="threading")(
delayed(self._estimate_effect)(config["estimator"]) for i in range(1)
delayed(self._estimate_effect)(config) for i in range(1)
)[0]
# estimates = self._estimate_effect(config["estimator"])

@@ -582,19 +595,15 @@ def _est_effect_stub(self, method_params):
def _estimate_effect(self, config):
"""estimates effect with chosen estimator"""

# add params that are tuned by flaml:
config = clean_config(copy.copy(config))
self.estimator_name = config.pop("estimator_name")
# params_to_tune = {
# k: v for k, v in config.items() if (not k == "estimator_name")
# }
cfg = self.cfg.method_params(self.estimator_name)
method_params = {
"init_params": {**deepcopy(config), **cfg.init_params},
"fit_params": {},
}
# Do we need an object property for this, instead of a local var?
self.estimator_name = config["estimator"]["estimator_name"]
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
config, outcome_model, self.propensity_model
)

try:
# This calls the causal model's estimate_effect method
estimate = self._est_effect_stub(method_params)
scores = {
"estimator_name": self.estimator_name,
@@ -611,8 +620,9 @@
return {
self.metric: scores["validation"][self.metric],
"estimator": estimate,
"estimator_name": scores.pop("estimator_name"),
"estimator_name": self.estimator_name,
"scores": scores,
# TODO: return full config!
"config": config,
}
except Exception as e:
@@ -641,7 +651,14 @@ def score_dataset(self, df: pd.DataFrame, dataset_name: str):
None.
"""
for scr in self.scores.values():
scr["scores"][dataset_name] = self._compute_metrics(scr["estimator"], df)
if scr["estimator"] is None:
warnings.warn(
    f"Skipping scoring for estimator {scr['estimator_name']}"
)
else:
scr["scores"][dataset_name] = self._compute_metrics(
scr["estimator"], df
)

@property
def best_estimator(self) -> str:
@@ -788,21 +805,20 @@ def effect_stderr(self, df, n_bootstrap_samples=5, n_jobs=1, *args, **kwargs):
if "Econml" in str(type(self.model)):
# Get a list of "Inference" objects from EconML, one per treatment
self.model.__class__.effect_stderr = effect_stderr
cfg = self.cfg.method_params(self.best_estimator)
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
self.best_config, outcome_model, self.propensity_model
)

if cfg.inference == "bootstrap":
if self.cfg.full_config(self.best_estimator).inference == "bootstrap":
# TODO: before bootstrapping, check whether that's already been done
bootstrap = BootstrapInference(
n_bootstrap_samples=n_bootstrap_samples, n_jobs=n_jobs
)

best_cfg = {
k: v for k, v in self.best_config.items() if k not in ["estimator"]
}
method_params = {
"init_params": {**best_cfg, **cfg.init_params},
"fit_params": {"inference": bootstrap},
}
method_params["fit_params"]["inference"] = bootstrap
self.estimator_name = (
self.best_estimator
) # needed for _est_effect_stub, just in case
self.bootstrapped_estimate = self._est_effect_stub(method_params)
est = self.bootstrapped_estimate.estimator
else:
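As the last hunk shows, effect_stderr now rebuilds method_params from the best config via init_outcome_model and cfg.method_params, and injects EconML's BootstrapInference into fit_params when the winning estimator's inference mode is "bootstrap". A sketch of the call, reusing the ct object from the first example; argument names are taken from the signature above:

    # after ct.fit(...) has selected a best estimator:
    stderr = ct.effect_stderr(ct.test_df, n_bootstrap_samples=10, n_jobs=1)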
Empty file added causaltune/search/__init__.py
(diffs for the remaining 11 changed files not shown)
