
Commit

Merge pull request #302 from 1andrin/main
New Metrics, Optimiser fix, Linear Synthetic Datasets, and Comprehensive Testing Environment
AlxdrPolyakov authored Aug 29, 2024
2 parents 7f68879 + ff7f5f7 commit b5a94dc
Showing 6 changed files with 2,332 additions and 148 deletions.
158 changes: 142 additions & 16 deletions causaltune/datasets.py
@@ -12,8 +12,10 @@
 
 
 def linear_multi_dataset(
-    n_points=10000, impact=None, include_propensity=False, include_control=False
-) -> CausalityDataset:
+        n_points=10000,
+        impact=None,
+        include_propensity=False,
+        include_control=False) -> CausalityDataset:
     if impact is None:
         impact = {0: 0.0, 1: 2.0, 2: 1.0}
     df = pd.DataFrame(
@@ -78,9 +80,8 @@ def nhefs() -> CausalityDataset:
     df = df.loc[~missing]
 
     df = df[covariates + ["qsmk"] + ["wt82_71"]]
-    df.rename(
-        columns={c: "x" + str(i + 1) for i, c in enumerate(covariates)}, inplace=True
-    )
+    df.rename(columns={c: "x" + str(i + 1)
+                       for i, c in enumerate(covariates)}, inplace=True)
 
     return CausalityDataset(df, treatment="qsmk", outcomes=["wt82_71"])
@@ -171,7 +172,8 @@ def amazon_reviews(rating="pos") -> CausalityDataset:
         gdown.download(url, "amazon_" + rating + ".csv", fuzzy=True)
         df = pd.read_csv("amazon_" + rating + ".csv")
         df.drop(df.columns[[2, 3, 4]], axis=1, inplace=True)
-        df.columns = ["treatment", "y_factual"] + ["x" + str(i) for i in range(1, 301)]
+        df.columns = ["treatment", "y_factual"] + \
+            ["x" + str(i) for i in range(1, 301)]
         return CausalityDataset(df, "treatment", ["y_factual"])
     else:
         print(
@@ -224,10 +226,14 @@ def synth_ihdp(return_df=False) -> CausalityDataset:
     data.columns = col
     # drop the columns we don't care about
     ignore_patterns = ["y_cfactual", "mu"]
-    ignore_cols = [c for c in data.columns if any([s in c for s in ignore_patterns])]
+    ignore_cols = [c for c in data.columns if any(
+        [s in c for s in ignore_patterns])]
     data = data.drop(columns=ignore_cols)
 
-    return CausalityDataset(data, "treatment", ["y_factual"]) if not return_df else data
+    return CausalityDataset(
+        data,
+        "treatment",
+        ["y_factual"]) if not return_df else data
 
 
 def synth_acic(condition=1) -> CausalityDataset:
@@ -378,7 +384,7 @@ def generate_synthetic_data(
         p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + X[:, 2] * 3))
         p = np.clip(p, 0.1, 0.9)
         C = p > np.random.rand(n_samples)
-        print(min(p), max(p))
+        # print(min(p), max(p))
 
     else:
         p = 0.5 * np.ones(n_samples)
@@ -403,17 +409,129 @@
     err = np.random.randn(n_samples) * 0.05 if noisy_outcomes else 0
 
     # nonlinear dependence of Y on X:
-    mu = lambda X: X[:, 0] * X[:, 1] + X[:, 2] + X[:, 3] * X[:, 4]  # noqa E731
+    def mu(X):
+        return X[:, 0] * X[:, 1] + X[:, 2] + X[:, 3] * X[:, 4]  # noqa E731
 
     Y_base = mu(X) + err
     Y = tau * T + Y_base
 
     features = [f"X{i+1}" for i in range(n_covariates)]
-    df = pd.DataFrame(
-        np.array([*X.T, T, Y, tau, p, Y_base]).T,
-        columns=features
-        + ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
+    df = pd.DataFrame(np.array([*X.T,
+                                T,
+                                Y,
+                                tau,
+                                p,
+                                Y_base]).T,
+                      columns=features + ["treatment",
+                                          "outcome",
+                                          "true_effect",
+                                          "propensity",
+                                          "base_outcome"],
     )
+    data = CausalityDataset(
+        data=df,
+        treatment="treatment",
+        outcomes=["outcome"],
+        effect_modifiers=features,
+        propensity_modifiers=["propensity"],
+    )
+    if add_instrument:
+        df["instrument"] = Z
+        data.instruments = ["instrument"]
+    return data
+
+
+def generate_linear_synthetic_data(
+    n_samples: int = 100,
+    n_covariates: int = 5,
+    covariance: Union[str, np.ndarray] = "isotropic",
+    confounding: bool = True,
+    linear_confounder: bool = False,
+    noisy_outcomes: bool = False,
+    effect_size: Union[int, None] = None,
+    add_instrument: bool = False,
+) -> CausalityDataset:
+    """Generates synthetic dataset with linear treatment effect (CATE) and optional instrumental variable.
+    Supports RCT (unconfounded) and observational (confounded) data.
+    Args:
+        n_samples (int, optional): number of independent samples. Defaults to 100.
+        n_covariates (int, optional): number of covariates. Defaults to 5.
+        covariance (Union[str, np.ndarray], optional): covariance matrix of covariates. can be "isotropic",
+            "anisotropic" or user-supplied. Defaults to "isotropic".
+        confounding (bool, optional): whether or not values of covariates affect treatment effect.
+            Defaults to True.
+        linear_confounder (bool, optional): whether to use a linear confounder for treatment assignment.
+            Defaults to False.
+        noisy_outcomes (bool, optional): additive noise in the outcomes. Defaults to False.
+        add_instrument (bool, optional): include instrumental variable (yes/no). Defaults to False
+        effect_size (Union[int, None]): if provided, constant effect size (ATE). if None, generate CATE.
+            Defaults to None.
+    Returns:
+        CausalityDataset: columns for covariates, treatment assignment, outcome and true treatment effect
+    """
+
+    if covariance == "isotropic":
+        sigma = np.random.randn(1)
+        covmat = np.eye(n_covariates) * sigma**2
+    elif covariance == "anisotropic":
+        covmat = generate_psdmat(n_covariates)
+
+    X = np.random.multivariate_normal(
+        mean=[0] * n_covariates, cov=covmat, size=n_samples
+    )
+
+    if confounding:
+        if linear_confounder:
+            p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + 3 * X[:, 2]))
+        else:
+            p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + X[:, 2] * 3))
+
+        p = np.clip(p, 0.1, 0.9)
+        C = p > np.random.rand(n_samples)
+    else:
+        p = 0.5 * np.ones(n_samples)
+        C = np.random.binomial(n=1, p=0.5, size=n_samples)
+
+    if add_instrument:
+        Z = np.random.binomial(n=1, p=0.5, size=n_samples)
+        C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
+        T = C * Z + C0 * (1 - Z)
+    else:
+        T = C
+
+    # fixed effect size:
+    if effect_size is not None:
+        tau = [effect_size] * n_samples
+    else:
+        # heterogeneity in effect size:
+        weights = np.random.uniform(low=0.4, high=0.7, size=n_covariates)
+        e = np.random.randn(n_samples) * 0.01
+        tau = X @ weights.T + e + 0.1
+
+    err = np.random.randn(n_samples) * 0.05 if noisy_outcomes else 0
+
+    # linear dependence of Y on X:
+    def mu(X):
+        return X @ np.random.uniform(0.1, 0.3, size=n_covariates)  # noqa E731
+
+    Y_base = mu(X) + err
+    Y = tau * T + Y_base
+
+    features = [f"X{i+1}" for i in range(n_covariates)]
+    df = pd.DataFrame(np.array([*X.T,
+                                T,
+                                Y,
+                                tau,
+                                p,
+                                Y_base]).T,
+                      columns=features + ["treatment",
+                                          "outcome",
+                                          "true_effect",
+                                          "propensity",
+                                          "base_outcome"],
+    )
     data = CausalityDataset(
         data=df,
         treatment="treatment",
@@ -523,8 +641,16 @@ def generate_non_random_dataset(num_samples=1000):
     )
     treatment = np.random.binomial(1, propensity)
     outcome = (
-        0.2 * treatment + 0.5 * x1 - 0.2 * x2 + np.random.normal(0, 1, num_samples)
-    )
+        0.2
+        * treatment
+        + 0.5
+        * x1
+        - 0.2
+        * x2
+        + np.random.normal(
+            0,
+            1,
+            num_samples))
 
     dataset = {
         "T": treatment,
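A quick usage sketch for the new generator in causaltune/datasets.py (not part of the diff). It assumes causaltune is importable and that the returned CausalityDataset exposes the constructed DataFrame as a .data attribute; that attribute name is an assumption inferred from the data= constructor argument shown above.

from causaltune.datasets import generate_linear_synthetic_data

# Confounded observational sample with a heterogeneous (linear) CATE
cd = generate_linear_synthetic_data(
    n_samples=1000,
    n_covariates=5,
    confounding=True,
    noisy_outcomes=True,
    effect_size=None,      # None -> linear CATE; a number -> constant ATE
    add_instrument=False,
)

df = cd.data  # assumption: CausalityDataset keeps the DataFrame as .data
print(df[["treatment", "outcome", "true_effect", "propensity"]].describe())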
80 changes: 79 additions & 1 deletion causaltune/erupt.py
@@ -88,9 +88,87 @@ def weights(
         else:
             weight[weight > 1 / self.clip] = 1 / self.clip
 
-        # and just for paranoia's sake let's normalize, though it shouldn't matter for big samples
+        # and just for paranoia's sake let's normalize, though it shouldn't
+        # matter for big samples
         weight *= len(df) / sum(weight)
 
         assert not np.isnan(weight.sum()), "NaNs in ERUPT weights"
 
         return pd.Series(index=df.index, data=weight)

+    # NEW:
+
+    def probabilistic_erupt_score(
+            self,
+            df: pd.DataFrame,
+            outcome: pd.Series,
+            treatment_effects: pd.Series,
+            treatment_std_devs: pd.Series,
+            iterations: int = 1000
+    ) -> float:
+        """
+        Calculate the Probabilistic ERUPT (Expected Response Under Proposed
+        Treatments) score.
+        This method uses Monte Carlo simulation to estimate the expected
+        outcome under a probabilistic treatment policy, accounting for
+        uncertainty in treatment effects. It balances potential improvements
+        against estimation uncertainty and treatment rates.
+        Args:
+            df (pd.DataFrame): The input dataframe containing treatment
+                information.
+            outcome (pd.Series): The observed outcomes for each unit.
+            treatment_effects (pd.Series): Estimated treatment effects for
+                each unit.
+            treatment_std_devs (pd.Series): Standard deviations of treatment
+                effects.
+            iterations (int): Number of Monte Carlo iterations (default: 1000).
+        Returns:
+            float: The Probabilistic ERUPT score, representing the relative
+                improvement over the baseline outcome, adjusted for uncertainty.
+        """
+        # Calculate the baseline outcome (mean outcome for untreated units)
+        baseline_outcome = outcome[df[self.treatment_name] == 0].mean()
+
+        policy_values = []
+        treatment_decisions = []
+
+        # Perform Monte Carlo simulation
+        for _ in range(iterations):
+            # Sample treatment effects from normal distributions
+            sampled_effects = pd.Series(
+                np.random.normal(treatment_effects, treatment_std_devs),
+                index=treatment_effects.index
+            )
+
+            # Define policy: treat if sampled effect is positive
+            # Note: A more conservative policy could use: sampled_effects > 2 *
+            # treatment_std_devs
+            policy = (sampled_effects > 0).astype(int)
+
+            # Calculate expected outcome under this policy
+            expected_outcome = (
+                baseline_outcome
+                + (policy * sampled_effects).mean()
+            )
+
+            policy_values.append(expected_outcome)
+            treatment_decisions.append(policy.mean())
+
+        # Calculate mean and standard error of policy values
+        mean_value = np.mean(policy_values)
+        se_value = np.std(policy_values) / np.sqrt(iterations)
+
+        # Placeholder for potential treatment rate penalty
+        treatment_penalty = 0
+
+        # Calculate score: mean value minus 2 standard errors, adjusted for
+        # treatment penalty
+        score = (mean_value - 2 * se_value) * (1 - treatment_penalty)
+
+        # Calculate relative improvement over baseline
+        improvement = (score - baseline_outcome) / baseline_outcome
+
+        return improvement
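For context, here is a condensed standalone sketch of what the new Probabilistic ERUPT score computes, run on toy inputs. It mirrors the method body above rather than calling the class (the class constructor is not shown in this diff), so the data and variable names are illustrative only.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({"treatment": rng.integers(0, 2, n)})
outcome = pd.Series(5.0 + rng.normal(0.0, 1.0, n) + 0.3 * df["treatment"])
treatment_effects = pd.Series(rng.normal(0.3, 0.1, n))   # estimated per-unit effects
treatment_std_devs = pd.Series(np.full(n, 0.1))          # their standard deviations

baseline = outcome[df["treatment"] == 0].mean()
policy_values = []
for _ in range(1000):
    # sample effects, then treat wherever the sampled effect is positive
    sampled = np.random.normal(treatment_effects, treatment_std_devs)
    policy = (sampled > 0).astype(int)
    policy_values.append(baseline + (policy * sampled).mean())

mean_value = np.mean(policy_values)
se_value = np.std(policy_values) / np.sqrt(len(policy_values))
score = mean_value - 2 * se_value              # penalise Monte Carlo uncertainty
improvement = (score - baseline) / baseline    # relative gain over the untreated baseline
print(round(improvement, 4))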
