
Commit

Merge pull request #302 from 1andrin/main
New Metrics, Optimiser fix, Linear Synthetic Datasets, and Comprehensive Testing Environment
AlxdrPolyakov authored Aug 29, 2024
2 parents 7f68879 + ff7f5f7 commit b5a94dc
Showing 6 changed files with 2,332 additions and 148 deletions.
158 changes: 142 additions & 16 deletions causaltune/datasets.py
@@ -12,8 +12,10 @@
 
 
 def linear_multi_dataset(
-    n_points=10000, impact=None, include_propensity=False, include_control=False
-) -> CausalityDataset:
+        n_points=10000,
+        impact=None,
+        include_propensity=False,
+        include_control=False) -> CausalityDataset:
     if impact is None:
         impact = {0: 0.0, 1: 2.0, 2: 1.0}
     df = pd.DataFrame(
@@ -78,9 +80,8 @@ def nhefs() -> CausalityDataset:
     df = df.loc[~missing]
 
     df = df[covariates + ["qsmk"] + ["wt82_71"]]
-    df.rename(
-        columns={c: "x" + str(i + 1) for i, c in enumerate(covariates)}, inplace=True
-    )
+    df.rename(columns={c: "x" + str(i + 1)
+                       for i, c in enumerate(covariates)}, inplace=True)
 
     return CausalityDataset(df, treatment="qsmk", outcomes=["wt82_71"])
@@ -171,7 +172,8 @@ def amazon_reviews(rating="pos") -> CausalityDataset:
         gdown.download(url, "amazon_" + rating + ".csv", fuzzy=True)
         df = pd.read_csv("amazon_" + rating + ".csv")
         df.drop(df.columns[[2, 3, 4]], axis=1, inplace=True)
-        df.columns = ["treatment", "y_factual"] + ["x" + str(i) for i in range(1, 301)]
+        df.columns = ["treatment", "y_factual"] + \
+            ["x" + str(i) for i in range(1, 301)]
         return CausalityDataset(df, "treatment", ["y_factual"])
     else:
         print(
@@ -224,10 +226,14 @@ def synth_ihdp(return_df=False) -> CausalityDataset:
     data.columns = col
     # drop the columns we don't care about
     ignore_patterns = ["y_cfactual", "mu"]
-    ignore_cols = [c for c in data.columns if any([s in c for s in ignore_patterns])]
+    ignore_cols = [c for c in data.columns if any(
+        [s in c for s in ignore_patterns])]
     data = data.drop(columns=ignore_cols)
 
-    return CausalityDataset(data, "treatment", ["y_factual"]) if not return_df else data
+    return CausalityDataset(
+        data,
+        "treatment",
+        ["y_factual"]) if not return_df else data
 
 
 def synth_acic(condition=1) -> CausalityDataset:
@@ -378,7 +384,7 @@ def generate_synthetic_data(
         p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + X[:, 2] * 3))
         p = np.clip(p, 0.1, 0.9)
         C = p > np.random.rand(n_samples)
-        print(min(p), max(p))
+        # print(min(p), max(p))
 
     else:
         p = 0.5 * np.ones(n_samples)
@@ -403,17 +409,129 @@
     err = np.random.randn(n_samples) * 0.05 if noisy_outcomes else 0
 
     # nonlinear dependence of Y on X:
-    mu = lambda X: X[:, 0] * X[:, 1] + X[:, 2] + X[:, 3] * X[:, 4]  # noqa E731
+    def mu(X):
+        return X[:, 0] * X[:, 1] + X[:, 2] + X[:, 3] * X[:, 4]  # noqa E731
 
     Y_base = mu(X) + err
     Y = tau * T + Y_base
 
     features = [f"X{i+1}" for i in range(n_covariates)]
-    df = pd.DataFrame(
-        np.array([*X.T, T, Y, tau, p, Y_base]).T,
-        columns=features
-        + ["treatment", "outcome", "true_effect", "propensity", "base_outcome"],
+    df = pd.DataFrame(np.array([*X.T,
+                                T,
+                                Y,
+                                tau,
+                                p,
+                                Y_base]).T,
+                      columns=features + ["treatment",
+                                          "outcome",
+                                          "true_effect",
+                                          "propensity",
+                                          "base_outcome"],
     )
+    data = CausalityDataset(
+        data=df,
+        treatment="treatment",
+        outcomes=["outcome"],
+        effect_modifiers=features,
+        propensity_modifiers=["propensity"],
+    )
+    if add_instrument:
+        df["instrument"] = Z
+        data.instruments = ["instrument"]
+    return data
+
+
+def generate_linear_synthetic_data(
+    n_samples: int = 100,
+    n_covariates: int = 5,
+    covariance: Union[str, np.ndarray] = "isotropic",
+    confounding: bool = True,
+    linear_confounder: bool = False,
+    noisy_outcomes: bool = False,
+    effect_size: Union[int, None] = None,
+    add_instrument: bool = False,
+) -> CausalityDataset:
+    """Generates synthetic dataset with linear treatment effect (CATE) and optional instrumental variable.
+    Supports RCT (unconfounded) and observational (confounded) data.
+    Args:
+        n_samples (int, optional): number of independent samples. Defaults to 100.
+        n_covariates (int, optional): number of covariates. Defaults to 5.
+        covariance (Union[str, np.ndarray], optional): covariance matrix of covariates. can be "isotropic",
+            "anisotropic" or user-supplied. Defaults to "isotropic".
+        confounding (bool, optional): whether or not values of covariates affect treatment effect.
+            Defaults to True.
+        linear_confounder (bool, optional): whether to use a linear confounder for treatment assignment.
+            Defaults to False.
+        noisy_outcomes (bool, optional): additive noise in the outcomes. Defaults to False.
+        add_instrument (bool, optional): include instrumental variable (yes/no). Defaults to False
+        effect_size (Union[int, None]): if provided, constant effect size (ATE). if None, generate CATE.
+            Defaults to None.
+    Returns:
+        CausalityDataset: columns for covariates, treatment assignment, outcome and true treatment effect
+    """
+
+    if covariance == "isotropic":
+        sigma = np.random.randn(1)
+        covmat = np.eye(n_covariates) * sigma**2
+    elif covariance == "anisotropic":
+        covmat = generate_psdmat(n_covariates)
+
+    X = np.random.multivariate_normal(
+        mean=[0] * n_covariates, cov=covmat, size=n_samples
+    )
+
+    if confounding:
+        if linear_confounder:
+            p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + 3 * X[:, 2]))
+        else:
+            p = 1 / (1 + np.exp(X[:, 0] * X[:, 1] + X[:, 2] * 3))
+
+        p = np.clip(p, 0.1, 0.9)
+        C = p > np.random.rand(n_samples)
+    else:
+        p = 0.5 * np.ones(n_samples)
+        C = np.random.binomial(n=1, p=0.5, size=n_samples)
+
+    if add_instrument:
+        Z = np.random.binomial(n=1, p=0.5, size=n_samples)
+        C0 = np.random.binomial(n=1, p=0.006, size=n_samples)
+        T = C * Z + C0 * (1 - Z)
+    else:
+        T = C
+
+    # fixed effect size:
+    if effect_size is not None:
+        tau = [effect_size] * n_samples
+    else:
+        # heterogeneity in effect size:
+        weights = np.random.uniform(low=0.4, high=0.7, size=n_covariates)
+        e = np.random.randn(n_samples) * 0.01
+        tau = X @ weights.T + e + 0.1
+
+    err = np.random.randn(n_samples) * 0.05 if noisy_outcomes else 0
+
+    # linear dependence of Y on X:
+    def mu(X):
+        return X @ np.random.uniform(0.1, 0.3, size=n_covariates)  # noqa E731
+
+    Y_base = mu(X) + err
+    Y = tau * T + Y_base
+
+    features = [f"X{i+1}" for i in range(n_covariates)]
+    df = pd.DataFrame(np.array([*X.T,
+                                T,
+                                Y,
+                                tau,
+                                p,
+                                Y_base]).T,
+                      columns=features + ["treatment",
+                                          "outcome",
+                                          "true_effect",
+                                          "propensity",
+                                          "base_outcome"],
+    )
     data = CausalityDataset(
         data=df,
         treatment="treatment",
@@ -523,8 +641,16 @@ def generate_non_random_dataset(num_samples=1000):
     )
     treatment = np.random.binomial(1, propensity)
     outcome = (
-        0.2 * treatment + 0.5 * x1 - 0.2 * x2 + np.random.normal(0, 1, num_samples)
-    )
+        0.2
+        * treatment
+        + 0.5
+        * x1
+        - 0.2
+        * x2
+        + np.random.normal(
+            0,
+            1,
+            num_samples))
 
     dataset = {
         "T": treatment,
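A quick usage sketch for the new generator in causaltune/datasets.py (not part of the diff). It assumes causaltune is importable and that the returned CausalityDataset exposes the constructed DataFrame as a .data attribute; that attribute name is an assumption inferred from the data= constructor argument shown above.

from causaltune.datasets import generate_linear_synthetic_data

# Confounded observational sample with a heterogeneous (linear) CATE
cd = generate_linear_synthetic_data(
    n_samples=1000,
    n_covariates=5,
    confounding=True,
    noisy_outcomes=True,
    effect_size=None,      # None -> linear CATE; a number -> constant ATE
    add_instrument=False,
)

df = cd.data  # assumption: CausalityDataset keeps the DataFrame as .data
print(df[["treatment", "outcome", "true_effect", "propensity"]].describe())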
80 changes: 79 additions & 1 deletion causaltune/erupt.py
@@ -88,9 +88,87 @@ def weights(
         else:
             weight[weight > 1 / self.clip] = 1 / self.clip
 
-        # and just for paranoia's sake let's normalize, though it shouldn't matter for big samples
+        # and just for paranoia's sake let's normalize, though it shouldn't
+        # matter for big samples
         weight *= len(df) / sum(weight)
 
         assert not np.isnan(weight.sum()), "NaNs in ERUPT weights"
 
         return pd.Series(index=df.index, data=weight)

+    # NEW:
+
+    def probabilistic_erupt_score(
+            self,
+            df: pd.DataFrame,
+            outcome: pd.Series,
+            treatment_effects: pd.Series,
+            treatment_std_devs: pd.Series,
+            iterations: int = 1000
+    ) -> float:
+        """
+        Calculate the Probabilistic ERUPT (Expected Response Under Proposed
+        Treatments) score.
+        This method uses Monte Carlo simulation to estimate the expected
+        outcome under a probabilistic treatment policy, accounting for
+        uncertainty in treatment effects. It balances potential improvements
+        against estimation uncertainty and treatment rates.
+        Args:
+            df (pd.DataFrame): The input dataframe containing treatment
+                information.
+            outcome (pd.Series): The observed outcomes for each unit.
+            treatment_effects (pd.Series): Estimated treatment effects for
+                each unit.
+            treatment_std_devs (pd.Series): Standard deviations of treatment
+                effects.
+            iterations (int): Number of Monte Carlo iterations (default: 1000).
+        Returns:
+            float: The Probabilistic ERUPT score, representing the relative
+                improvement over the baseline outcome, adjusted for uncertainty.
+        """
+        # Calculate the baseline outcome (mean outcome for untreated units)
+        baseline_outcome = outcome[df[self.treatment_name] == 0].mean()
+
+        policy_values = []
+        treatment_decisions = []
+
+        # Perform Monte Carlo simulation
+        for _ in range(iterations):
+            # Sample treatment effects from normal distributions
+            sampled_effects = pd.Series(
+                np.random.normal(treatment_effects, treatment_std_devs),
+                index=treatment_effects.index
+            )
+
+            # Define policy: treat if sampled effect is positive
+            # Note: A more conservative policy could use: sampled_effects > 2 *
+            # treatment_std_devs
+            policy = (sampled_effects > 0).astype(int)
+
+            # Calculate expected outcome under this policy
+            expected_outcome = (
+                baseline_outcome
+                + (policy * sampled_effects).mean()
+            )
+
+            policy_values.append(expected_outcome)
+            treatment_decisions.append(policy.mean())
+
+        # Calculate mean and standard error of policy values
+        mean_value = np.mean(policy_values)
+        se_value = np.std(policy_values) / np.sqrt(iterations)
+
+        # Placeholder for potential treatment rate penalty
+        treatment_penalty = 0
+
+        # Calculate score: mean value minus 2 standard errors, adjusted for
+        # treatment penalty
+        score = (mean_value - 2 * se_value) * (1 - treatment_penalty)
+
+        # Calculate relative improvement over baseline
+        improvement = (score - baseline_outcome) / baseline_outcome
+
+        return improvement
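For context, here is a condensed standalone sketch of what the new Probabilistic ERUPT score computes, run on toy inputs. It mirrors the method body above rather than calling the class (the class constructor is not shown in this diff), so the data and variable names are illustrative only.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 500
df = pd.DataFrame({"treatment": rng.integers(0, 2, n)})
outcome = pd.Series(5.0 + rng.normal(0.0, 1.0, n) + 0.3 * df["treatment"])
treatment_effects = pd.Series(rng.normal(0.3, 0.1, n))   # estimated per-unit effects
treatment_std_devs = pd.Series(np.full(n, 0.1))          # their standard deviations

baseline = outcome[df["treatment"] == 0].mean()
policy_values = []
for _ in range(1000):
    # sample effects, then treat wherever the sampled effect is positive
    sampled = np.random.normal(treatment_effects, treatment_std_devs)
    policy = (sampled > 0).astype(int)
    policy_values.append(baseline + (policy * sampled).mean())

mean_value = np.mean(policy_values)
se_value = np.std(policy_values) / np.sqrt(len(policy_values))
score = mean_value - 2 * se_value              # penalise Monte Carlo uncertainty
improvement = (score - baseline) / baseline    # relative gain over the untreated baseline
print(round(improvement, 4))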
