First rough cut of simultaneous search for estimator and outcome function #310

Merged: 12 commits, Aug 29, 2024
5 changes: 4 additions & 1 deletion causaltune/dataset_processor.py
@@ -14,7 +14,10 @@ def __init__(self):
self.encoder = None

def fit(
self, cd: CausalityDataset, encoder_type: Optional[str] = "onehot", outcome: str = None
self,
cd: CausalityDataset,
encoder_type: Optional[str] = "onehot",
outcome: str = None,
):
cd = copy.deepcopy(cd)
self.preprocess_dataset(
86 changes: 86 additions & 0 deletions causaltune/models/regression.py
@@ -0,0 +1,86 @@
from sklearn.linear_model import ElasticNet, LassoLars


from flaml.automl.model import SKLearnEstimator
from flaml import tune

# These models exist in the flaml source code but, for some reason, are not included
# in the deployed flaml 2.2.0 release, so keep this file in the project for now.


class ElasticNetEstimator(SKLearnEstimator):
"""The class for tuning Elastic Net regression model."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, data_size, task="regression", **params):
return {
"alpha": {
"domain": tune.loguniform(lower=0.0001, upper=1.0),
"init_value": 0.1,
},
"l1_ratio": {
"domain": tune.uniform(lower=0.0, upper=1.0),
"init_value": 0.5,
},
"selection": {
"domain": tune.choice(["cyclic", "random"]),
"init_value": "cyclic",
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
params["tol"] = params.get("tol", 0.0001)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "ElasticNet for regression task only"
self.estimator_class = ElasticNet


class LassoLarsEstimator(SKLearnEstimator):
"""The class for tuning Lasso model fit with Least Angle Regression a.k.a. Lars."""

"""Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html"""

ITER_HP = "max_iter"

@classmethod
def search_space(cls, task=None, **params):
return {
"alpha": {
"domain": tune.loguniform(lower=1e-4, upper=1.0),
"init_value": 0.1,
},
"fit_intercept": {
"domain": tune.choice([True, False]),
"init_value": True,
},
"eps": {
"domain": tune.loguniform(lower=1e-16, upper=1e-4),
"init_value": 2.220446049250313e-16,
},
}

def config2params(self, config: dict) -> dict:
params = super().config2params(config)
if "n_jobs" in params:
params.pop("n_jobs")
return params

def __init__(self, task="regression", **config):
super().__init__(task, **config)
assert self._task.is_regression(), "LassoLars for regression task only"
self.estimator_class = LassoLars

def predict(self, X, **kwargs):
X = self._preprocess(X)
return self._model.predict(X, **kwargs)
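
Because both wrappers subclass flaml's SKLearnEstimator, they should also be usable outside causaltune by registering them with a plain flaml AutoML instance via add_learner. A minimal sketch, assuming flaml's documented add_learner/fit API; the learner names, data, and time budget below are illustrative, not taken from this PR:

import numpy as np
from flaml import AutoML

from causaltune.models.regression import ElasticNetEstimator, LassoLarsEstimator

# Toy regression data, purely for illustration
rng = np.random.default_rng(0)
X = rng.random((200, 5))
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * rng.standard_normal(200)

automl = AutoML()
# Register the two custom learners under arbitrary, illustrative names
automl.add_learner("elastic_net", ElasticNetEstimator)
automl.add_learner("lasso_lars", LassoLarsEstimator)

automl.fit(
    X_train=X,
    y_train=y,
    task="regression",
    estimator_list=["elastic_net", "lasso_lars"],  # restrict the search to the new learners
    time_budget=30,  # seconds
)
print(automl.best_estimator, automl.best_config)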
158 changes: 87 additions & 71 deletions causaltune/optimiser.py
@@ -1,5 +1,4 @@
import copy
from copy import deepcopy
import warnings
from typing import List, Optional, Union
from collections import defaultdict
@@ -19,10 +18,9 @@

from joblib import Parallel, delayed

from causaltune.params import SimpleParamService
from causaltune.search.params import SimpleParamService
from causaltune.scoring import Scorer
from causaltune.r_score import RScoreWrapper
from causaltune.utils import clean_config, treatment_is_multivalue
from causaltune.utils import treatment_is_multivalue
from causaltune.models.monkey_patches import (
AutoML,
apply_multitreatment,
@@ -96,7 +94,7 @@ def __init__(
test_size=None,
num_samples=-1,
propensity_model="dummy",
outcome_model=None,
outcome_model="nested",
components_task="regression",
components_verbose=0,
components_pred_time_limit=10 / 1e6,
@@ -181,9 +179,9 @@ def __init__(
resources_per_trial if resources_per_trial is not None else {"cpu": 0.5}
)
self._settings["try_init_configs"] = try_init_configs
self._settings["include_experimental_estimators"] = (
include_experimental_estimators
)
self._settings[
"include_experimental_estimators"
] = include_experimental_estimators

# params for FLAML on component models:
self._settings["component_models"] = {}
@@ -246,33 +244,44 @@ def init_propensity_model(self, propensity_model: str):
)

def init_outcome_model(self, outcome_model):
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
# if we are only supplying certain features to the propensity function,
# make them invisible to the outcome component model
# This is a workaround for the DoWhy/EconML data model which doesn't
# support that out of the box
if outcome_model is not None:
# TODO: implement filtering like below, when there are propensity-only features
# feature_filter below acts on classes not instances
# to preserve all the extra methods through inheritance
self.outcome_model = outcome_model

if hasattr(outcome_model, "fit") and hasattr(outcome_model, "predict"):
return outcome_model
elif outcome_model == "auto":
# Will be dynamically chosen at optimization time
return outcome_model
elif outcome_model == "nested":
# The current default behavior
return self.auto_outcome_model()
else:
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML
raise ValueError(
'outcome_model valid values are None, "auto", or an estimator object'
)

self.outcome_model = outcome_model_class(
**self._settings["component_models"]
def auto_outcome_model(self):
data = self.data
propensity_only_cols = [
p
for p in data.propensity_modifiers
if p not in data.common_causes + data.effect_modifiers
]

if len(propensity_only_cols):
# TODO: implement feature_filter for arbitrary outcome models
outcome_model_class = feature_filter(
AutoML, data.effect_modifiers + data.common_causes, first_cols=True
)
else:
outcome_model_class = AutoML

return outcome_model_class(**self._settings["component_models"])

def fit(
self,
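
The hunk above is the core of this PR: outcome_model now accepts either an estimator object (anything exposing fit and predict), the string "auto" (the outcome learner is chosen at optimization time, jointly with the causal estimator), or the previous default "nested" (a nested flaml AutoML search via auto_outcome_model). A hedged usage sketch, assuming the public entry point is the CausalTune class; everything beyond the outcome_model values shown in the diff is illustrative:

from sklearn.linear_model import Ridge

from causaltune import CausalTune  # assumed public entry point

# 1) New in this PR: sample the outcome learner jointly with the causal estimator
ct_auto = CausalTune(outcome_model="auto")

# 2) Previous default behaviour: a nested flaml AutoML search for the outcome model
ct_nested = CausalTune(outcome_model="nested")

# 3) Bring your own outcome model: any object with fit() and predict()
ct_fixed = CausalTune(outcome_model=Ridge(alpha=1.0))

# Fitting would then proceed as usual; cd and the column names are placeholders:
# ct_auto.fit(data=cd, treatment="treatment", outcome="outcome")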
@@ -363,7 +372,6 @@ def fit(
)

self.init_propensity_model(self._settings["propensity_model"])
self.init_outcome_model(self._settings["outcome_model"])

self.identified_estimand: IdentifiedEstimand = (
self.causal_model.identify_effect(proceed_when_unidentifiable=True)
@@ -406,11 +414,10 @@

# config with method-specific params
self.cfg = SimpleParamService(
self.propensity_model,
self.outcome_model,
n_jobs=self._settings["component_models"]["n_jobs"],
include_experimental=self._settings["include_experimental_estimators"],
multivalue=treatment_is_multivalue(self._treatment_values),
sample_outcome_estimators=self._settings["outcome_model"] == "auto",
)

self.estimator_list = self.cfg.estimator_names_from_patterns(
@@ -454,24 +461,30 @@ def fit(
if self._settings["test_size"] is not None:
self.test_df = self.test_df.sample(self._settings["test_size"])

self.r_scorer = (
None
if "r_scorer" not in self.metrics_to_report
else RScoreWrapper(
self.outcome_model,
self.propensity_model,
self.train_df,
self.test_df,
outcome,
treatment,
common_causes,
effect_modifiers,
if "r_scorer" in self.metrics_to_report:
raise NotImplementedError(
"R-squared scorer no longer suported, please raise an issue if you want it back"
)
# self.r_scorer = (
# None
# if "r_scorer" not in self.metrics_to_report
# else RScoreWrapper(
# self.outcome_model,
# self.propensity_model,
# self.train_df,
# self.test_df,
# outcome,
# treatment,
# common_causes,
# effect_modifiers,
# )
# )

search_space = self.cfg.search_space(
self.estimator_list, data_size=data.data.shape
)

search_space = self.cfg.search_space(self.estimator_list)
init_cfg = (
self.cfg.default_configs(self.estimator_list)
self.cfg.default_configs(self.estimator_list, data_size=data.data.shape)
if self._settings["try_init_configs"]
else []
)
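
For context, SimpleParamService is now constructed with sample_outcome_estimators=True whenever outcome_model == "auto", and both search_space and default_configs receive the data shape so the outcome learner candidates can depend on it. The snippet below is a hypothetical illustration of what such a joint search space might look like using flaml.tune primitives; the estimator and parameter names are illustrative and this is not the actual output of SimpleParamService:

from flaml import tune

# Hypothetical joint search space: each candidate causal estimator carries its own
# hyperparameters plus a choice of outcome learner to be tuned alongside it.
joint_space = {
    "estimator": tune.choice(
        [
            {
                "estimator_name": "backdoor.econml.dml.LinearDML",
                "outcome_estimator_name": tune.choice(["elastic_net", "lasso_lars"]),
                "outcome_alpha": tune.loguniform(1e-4, 1.0),
            },
            {
                "estimator_name": "backdoor.econml.dr.ForestDRLearner",
                "outcome_estimator_name": tune.choice(["elastic_net", "lasso_lars"]),
                "min_propensity": tune.loguniform(1e-6, 1e-1),
            },
        ]
    )
}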
@@ -543,7 +556,7 @@ def _tune_with_config(self, config: dict) -> dict:
# to spawn a separate process to prevent cross-talk between tuner and automl on component models:

estimates = Parallel(n_jobs=2, backend="threading")(
delayed(self._estimate_effect)(config["estimator"]) for i in range(1)
delayed(self._estimate_effect)(config) for i in range(1)
)[0]
# estimates = self._estimate_effect(config["estimator"])

@@ -582,19 +595,15 @@ def _est_effect_stub(self, method_params):
def _estimate_effect(self, config):
"""estimates effect with chosen estimator"""

# add params that are tuned by flaml:
config = clean_config(copy.copy(config))
self.estimator_name = config.pop("estimator_name")
# params_to_tune = {
# k: v for k, v in config.items() if (not k == "estimator_name")
# }
cfg = self.cfg.method_params(self.estimator_name)
method_params = {
"init_params": {**deepcopy(config), **cfg.init_params},
"fit_params": {},
}
# Do we need an object property for this, instead of a local var?
self.estimator_name = config["estimator"]["estimator_name"]
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
config, outcome_model, self.propensity_model
)

try: #
# if True: #
# This calls the causal model's estimate_effect method
estimate = self._est_effect_stub(method_params)
scores = {
"estimator_name": self.estimator_name,
Expand All @@ -611,8 +620,9 @@ def _estimate_effect(self, config):
return {
self.metric: scores["validation"][self.metric],
"estimator": estimate,
"estimator_name": scores.pop("estimator_name"),
"estimator_name": self.estimator_name,
"scores": scores,
# TODO: return full config!
"config": config,
}
except Exception as e:
@@ -641,7 +651,14 @@ def score_dataset(self, df: pd.DataFrame, dataset_name: str):
None.
"""
for scr in self.scores.values():
scr["scores"][dataset_name] = self._compute_metrics(scr["estimator"], df)
if scr["estimator"] is None:
warnings.warn(
"Skipping scoring for estimator %s", scr["estimator_name"]
)
else:
scr["scores"][dataset_name] = self._compute_metrics(
scr["estimator"], df
)

@property
def best_estimator(self) -> str:
@@ -788,21 +805,20 @@ def effect_stderr(self, df, n_bootstrap_samples=5, n_jobs=1, *args, **kwargs):
if "Econml" in str(type(self.model)):
# Get a list of "Inference" objects from EconML, one per treatment
self.model.__class__.effect_stderr = effect_stderr
cfg = self.cfg.method_params(self.best_estimator)
outcome_model = self.init_outcome_model(self._settings["outcome_model"])
method_params = self.cfg.method_params(
self.best_config, outcome_model, self.propensity_model
)

if cfg.inference == "bootstrap":
if self.cfg.full_config(self.best_estimator).inference == "bootstrap":
# TODO: before bootstrapping, check whether that's already been done
bootstrap = BootstrapInference(
n_bootstrap_samples=n_bootstrap_samples, n_jobs=n_jobs
)

best_cfg = {
k: v for k, v in self.best_config.items() if k not in ["estimator"]
}
method_params = {
"init_params": {**best_cfg, **cfg.init_params},
"fit_params": {"inference": bootstrap},
}
method_params["fit_params"]["inference"] = bootstrap
self.estimator_name = (
self.best_estimator
) # needed for _est_effect_stub, just in case
self.bootstrapped_estimate = self._est_effect_stub(method_params)
est = self.bootstrapped_estimate.estimator
else:
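
With the refactor above, effect_stderr rebuilds method_params from the best configuration and, when the chosen estimator's full config declares inference == "bootstrap", attaches an EconML BootstrapInference object to the fit parameters. A short usage sketch, assuming ct is a fitted CausalTune instance and test_df a held-out dataframe from an earlier split:

# Hypothetical usage; ct and test_df are assumed to come from an earlier fit/split.
stderr = ct.effect_stderr(test_df, n_bootstrap_samples=10, n_jobs=2)
print(stderr)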
Empty file added causaltune/search/__init__.py