Merge pull request #310 from ZmeiGorynych/flat_search
First rough cut of simultaneous search for estimator and outcome function
AlxdrPolyakov authored Aug 29, 2024
2 parents 4bba4d1 + cb6677e commit 7f68879
Showing 15 changed files with 575 additions and 187 deletions.
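The headline change: the tuner's outcome_model setting now accepts "auto", which folds the choice of outcome estimator and its hyperparameters into the same flat search space as the causal estimator, while the new default "nested" keeps the previous behavior of an inner FLAML search per trial. A minimal usage sketch, assuming the CausalTune class from causaltune/optimiser.py below; the CausalityDataset import path and the fit() call are assumptions based on this diff, not a verified API:

    from causaltune import CausalTune
    from causaltune.data_utils import CausalityDataset  # assumed import path

    cd = CausalityDataset(data=df, treatment="treatment", outcomes=["outcome"])  # df: your DataFrame
    ct = CausalTune(
        outcome_model="auto",  # sample outcome estimators jointly with the causal estimator (this PR)
        # outcome_model="nested",  # previous behavior: inner FLAML search for the outcome model
    )
    ct.fit(data=cd)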
5 changes: 4 additions & 1 deletion causaltune/dataset_processor.py
@@ -14,7 +14,10 @@ def __init__(self):
self.encoder = None

def fit(
self, cd: CausalityDataset, encoder_type: Optional[str] = "onehot", outcome: str = None
self,
cd: CausalityDataset,
encoder_type: Optional[str] = "onehot",
outcome: str = None,
):
cd = copy.deepcopy(cd)
self.preprocess_dataset(
86 changes: 86 additions & 0 deletions causaltune/models/regression.py
@@ -0,0 +1,86 @@
from sklearn.linear_model import ElasticNet, LassoLars


from flaml.automl.model import SKLearnEstimator
from flaml import tune

# These models are present in the flaml source code but, for some reason,
# are missing from the released flaml 2.2.0 package,
# so keep this file in the project for now


class ElasticNetEstimator(SKLearnEstimator):
"""The class for tuning Elastic Net regression model."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, data_size, task="regresssion", **params):
return {
"alpha": {
"domain": tune.loguniform(lower=0.0001, upper=1.0),
"init_value": 0.1,
},
"l1_ratio": {
"domain": tune.uniform(lower=0.0, upper=1.0),
"init_value": 0.5,
},
"selection": {
"domain": tune.choice(["cyclic", "random"]),
"init_value": "cyclic",
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
params["tol"] = params.get("tol", 0.0001)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "ElasticNet for regression task only"
self.estimator_class = ElasticNet


class LassoLarsEstimator(SKLearnEstimator):
"""The class for tuning Lasso model fit with Least Angle Regression a.k.a. Lars."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, task=None, **params):
return {
"alpha": {
"domain": tune.loguniform(lower=1e-4, upper=1.0),
"init_value": 0.1,
},
"fit_intercept": {
"domain": tune.choice([True, False]),
"init_value": True,
},
"eps": {
"domain": tune.loguniform(lower=1e-16, upper=1e-4),
"init_value": 2.220446049250313e-16,
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "LassoLars for regression task only"
self.estimator_class = LassoLars

def predict(self, X, **kwargs):
X = self._preprocess(X)
return self._model.predict(X, **kwargs)
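Both wrappers follow flaml's SKLearnEstimator contract, so they can also be registered with a plain FLAML run. A short sketch assuming FLAML's public AutoML.add_learner API; the learner names and the training data X_train, y_train are placeholders:

    from flaml import AutoML
    from causaltune.models.regression import ElasticNetEstimator, LassoLarsEstimator

    automl = AutoML()
    # register the custom learners under arbitrary names
    automl.add_learner(learner_name="elastic_net", learner_class=ElasticNetEstimator)
    automl.add_learner(learner_name="lasso_lars", learner_class=LassoLarsEstimator)
    # restrict the search to just these learners
    automl.fit(
        X_train, y_train,
        task="regression",
        time_budget=30,
        estimator_list=["elastic_net", "lasso_lars"],
    )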
158 changes: 87 additions & 71 deletions causaltune/optimiser.py
@@ -1,5 +1,4 @@
import copy
from copy import deepcopy
import warnings
from typing import List, Optional, Union
from collections import defaultdict
@@ -19,10 +18,9 @@

from joblib import Parallel, delayed

from causaltune.params import SimpleParamService
from causaltune.search.params import SimpleParamService
from causaltune.scoring import Scorer
from causaltune.r_score import RScoreWrapper
from causaltune.utils import clean_config, treatment_is_multivalue
from causaltune.utils import treatment_is_multivalue
from causaltune.models.monkey_patches import (
AutoML,
apply_multitreatment,
@@ -96,7 +94,7 @@ def __init__(
test_size=None,
num_samples=-1,
propensity_model="dummy",
outcome_model=None,
outcome_model="nested",
components_task="regression",
components_verbose=0,
components_pred_time_limit=10 / 1e6,
@@ -181,9 +179,9 @@ def __init__(
resources_per_trial if resources_per_trial is not None else {"cpu": 0.5}
)
self._settings["try_init_configs"] = try_init_configs
self._settings["include_experimental_estimators"] = (
include_experimental_estimators
)
self._settings[
"include_experimental_estimators"
] = include_experimental_estimators

# params for FLAML on component models:
self._settings["component_models"] = {}
@@ -246,33 +244,44 @@ def init_propensity_model(self, propensity_model: str):
)

def init_outcome_model(self, outcome_model):
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
# if we are only supplying certain features to the propensity function,
# make them invisible to the outcome component model
# This is a workaround for the DoWhy/EconML data model which doesn't
# support that out of the box
if outcome_model is not None:
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
self.outcome_model = outcome_model

if hasattr(outcome_model, "fit") and hasattr(outcome_model, "predict"):
return outcome_model
elif outcome_model == "auto":
# Will be dynamically chosen at optimization time
return outcome_model
elif outcome_model == "nested":
# The current default behavior
return self.auto_outcome_model()
else:
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML
raise ValueError(
'outcome_model valid values are "nested", "auto", or an estimator object'
)

self.outcome_model = outcome_model_class(
**self._settings["component_models"]
def auto_outcome_model(self):
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
# TODO: implement feature_filter for arbitrary outcome models
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML

return outcome_model_class(**self._settings["component_models"])

def fit(
self,
@@ -363,7 +372,6 @@ def fit(
)

self.init_propensity_model(self._settings["propensity_model"])
self.init_outcome_model(self._settings["outcome_model"])

self.identified_estimand: IdentifiedEstimand = (
self.causal_model.identify_effect(proceed_when_unidentifiable=True)
@@ -406,11 +414,10 @@

# config with method-specific params
self.cfg = SimpleParamService(
self.propensity_model,
self.outcome_model,
n_jobs=self._settings["component_models"]["n_jobs"],
include_experimental=self._settings["include_experimental_estimators"],
multivalue=treatment_is_multivalue(self._treatment_values),
sample_outcome_estimators=self._settings["outcome_model"] == "auto",
)

self.estimator_list = self.cfg.estimator_names_from_patterns(
@@ -454,24 +461,30 @@ def fit(
if self._settings["test_size"] is not None:
self.test_df = self.test_df.sample(self._settings["test_size"])

self.r_scorer = (
None
if "r_scorer" not in self.metrics_to_report
else RScoreWrapper(
self.outcome_model,
self.propensity_model,
self.train_df,
self.test_df,
outcome,
treatment,
common_causes,
effect_modifiers,
if "r_scorer" in self.metrics_to_report:
raise NotImplementedError(
"R-squared scorer no longer suported, please raise an issue if you want it back"
)
# self.r_scorer = (
# None
# if "r_scorer" not in self.metrics_to_report
# else RScoreWrapper(
# self.outcome_model,
# self.propensity_model,
# self.train_df,
# self.test_df,
# outcome,
# treatment,
# common_causes,
# effect_modifiers,
# )
# )

search_space = self.cfg.search_space(
self.estimator_list, data_size=data.data.shape
)

search_space = self.cfg.search_space(self.estimator_list)
init_cfg = (
self.cfg.default_configs(self.estimator_list)
self.cfg.default_configs(self.estimator_list, data_size=data.data.shape)
if self._settings["try_init_configs"]
else []
)
@@ -543,7 +556,7 @@ def _tune_with_config(self, config: dict) -> dict:
# to spawn a separate process to prevent cross-talk between tuner and automl on component models:

estimates = Parallel(n_jobs=2, backend="threading")(
delayed(self._estimate_effect)(config["estimator"]) for i in range(1)
delayed(self._estimate_effect)(config) for i in range(1)
)[0]
# estimates = self._estimate_effect(config["estimator"])

@@ -582,19 +595,15 @@ def _est_effect_stub(self, method_params):
def _estimate_effect(self, config):
"""estimates effect with chosen estimator"""

# add params that are tuned by flaml:
config = clean_config(copy.copy(config))
self.estimator_name = config.pop("estimator_name")
# params_to_tune = {
# k: v for k, v in config.items() if (not k == "estimator_name")
# }
cfg = self.cfg.method_params(self.estimator_name)
method_params = {
"init_params": {**deepcopy(config), **cfg.init_params},
"fit_params": {},
}
# Do we need an object property for this, instead of a local var?
self.estimator_name = config["estimator"]["estimator_name"]
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
config, outcome_model, self.propensity_model
)

try:
# This calls the causal model's estimate_effect method
estimate = self._est_effect_stub(method_params)
scores = {
"estimator_name": self.estimator_name,
@@ -611,8 +620,9 @@
return {
self.metric: scores["validation"][self.metric],
"estimator": estimate,
"estimator_name": scores.pop("estimator_name"),
"estimator_name": self.estimator_name,
"scores": scores,
# TODO: return full config!
"config": config,
}
except Exception as e:
@@ -641,7 +651,14 @@ def score_dataset(self, df: pd.DataFrame, dataset_name: str):
None.
"""
for scr in self.scores.values():
scr["scores"][dataset_name] = self._compute_metrics(scr["estimator"], df)
if scr["estimator"] is None:
warnings.warn(
    f"Skipping scoring for estimator {scr['estimator_name']}"
)
else:
scr["scores"][dataset_name] = self._compute_metrics(
scr["estimator"], df
)

@property
def best_estimator(self) -> str:
@@ -788,21 +805,20 @@ def effect_stderr(self, df, n_bootstrap_samples=5, n_jobs=1, *args, **kwargs):
if "Econml" in str(type(self.model)):
# Get a list of "Inference" objects from EconML, one per treatment
self.model.__class__.effect_stderr = effect_stderr
cfg = self.cfg.method_params(self.best_estimator)
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
self.best_config, outcome_model, self.propensity_model
)

if cfg.inference == "bootstrap":
if self.cfg.full_config(self.best_estimator).inference == "bootstrap":
# TODO: before bootstrapping, check whether that's already been done
bootstrap = BootstrapInference(
n_bootstrap_samples=n_bootstrap_samples, n_jobs=n_jobs
)

best_cfg = {
k: v for k, v in self.best_config.items() if k not in ["estimator"]
}
method_params = {
"init_params": {**best_cfg, **cfg.init_params},
"fit_params": {"inference": bootstrap},
}
method_params["fit_params"]["inference"] = bootstrap
self.estimator_name = (
self.best_estimator
) # needed for _est_effect_stub, just in case
self.bootstrapped_estimate = self._est_effect_stub(method_params)
est = self.bootstrapped_estimate.estimator
else:
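As the last hunk shows, effect_stderr now rebuilds method_params from the best config via init_outcome_model and cfg.method_params, and injects EconML's BootstrapInference into fit_params when the winning estimator's inference mode is "bootstrap". A sketch of the call, reusing the ct object from the first example; argument names are taken from the signature above:

    # after ct.fit(...) has selected a best estimator:
    stderr = ct.effect_stderr(ct.test_df, n_bootstrap_samples=10, n_jobs=1)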
Empty file added causaltune/search/__init__.py
(diffs for the remaining 11 changed files not shown)
