Merge pull request #92 from gibsramen/nb-lme-single

Add NB LME Single as default model
biocore · Oct 7, 2023 · 58f763d · 58f763d
2 parents 4302281 + a1df859
commit 58f763d
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ birdman/templates/negative_binomial
 birdman/templates/multinomial
 birdman/templates/negative_binomial_single
 birdman/templates/negative_binomial_lme
+birdman/templates/negative_binomial_lme_single
 tests/custom_model
 
 *__pycache__/

diff --git a/birdman/__init__.py b/birdman/__init__.py
@@ -1,10 +1,10 @@
 from .model_base import (BaseModel, TableModel, SingleFeatureModel,
                          ModelIterator)
 from .default_models import (NegativeBinomial, NegativeBinomialLME,
-                             NegativeBinomialSingle)
+                             NegativeBinomialSingle, NegativeBinomialLMESingle)
 
 __version__ = "0.1.0"
 
 __all__ = ["BaseModel", "TableModel", "SingleFeatureModel", "ModelIterator",
            "NegativeBinomial", "NegativeBinomialSingle",
-           "NegativeBinomialLME"]
+           "NegativeBinomialLME", "NegativeBinomialLMESingle"]
diff --git a/birdman/default_models.py b/birdman/default_models.py
@@ -1,4 +1,4 @@
-import os
+from os.path import join as pjoin
 from pkg_resources import resource_filename
 
 import biom
@@ -10,9 +10,10 @@
 TEMPLATES = resource_filename("birdman", "templates")
 DEFAULT_MODEL_DICT = {
     "negative_binomial": {
-        "standard": os.path.join(TEMPLATES, "negative_binomial.stan"),
-        "single": os.path.join(TEMPLATES, "negative_binomial_single.stan"),
-        "lme": os.path.join(TEMPLATES, "negative_binomial_lme.stan")
+        "standard": pjoin(TEMPLATES, "negative_binomial.stan"),
+        "single": pjoin(TEMPLATES, "negative_binomial_single.stan"),
+        "full_lme": pjoin(TEMPLATES, "negative_binomial_lme.stan"),
+        "single_lme": pjoin(TEMPLATES, "negative_binomial_lme_single.stan")
     }
 }
 
@@ -271,7 +272,7 @@ def __init__(
         inv_disp_sd: float = 0.5,
         group_var_prior: float = 1.0
     ):
-        filepath = DEFAULT_MODEL_DICT["negative_binomial"]["lme"]
+        filepath = DEFAULT_MODEL_DICT["negative_binomial"]["full_lme"]
         super().__init__(
             table=table,
             model_path=filepath,
@@ -318,3 +319,116 @@ def __init__(
             posterior_predictive="y_predict",
             log_likelihood="log_lhood"
         )
+
+
+class NegativeBinomialLMESingle(SingleFeatureModel):
+    """Fit count data using negative binomial model on single feature.
+
+    .. math::
+
+        y_{ij} &\\sim \\textrm{NB}(\\mu_{ij}, \\phi_j)
+
+        \\log(\\mu_{ij}) &= \\log(\\textrm{Depth}_i) + x_i \\beta + z_i u
+
+    Priors:
+
+    .. math::
+
+        \\beta_j \\sim \\begin{cases}
+            \\textrm{Normal}(A, B_p), & j = 0
+
+            \\textrm{Normal}(0, B_p), & j > 0
+        \\end{cases}
+
+    .. math:: A = \\ln{\\frac{1}{D}},\\ D = \\textrm{Number of features}
+
+    .. math::
+
+        \\frac{1}{\\phi_j} \\sim \\textrm{Lognormal}(0, s),\\ s \\in
+            \\mathbb{R}_{>0}
+
+    .. math::
+
+        u_j &\\sim \\textrm{Normal}(0, u_p),\\ u_p \\in \\mathbb{R}_{>0}
+
+
+    :param table: Feature table (features x samples)
+    :type table: biom.table.Table
+
+    :param feature_id: ID of feature to fit
+    :type feature_id: str
+
+    :param formula: Design formula to use in model
+    :type formula: str
+
+    :param group_var: Variable in metadata to use as grouping
+    :type group_var: str
+
+    :param metadata: Metadata for design matrix
+    :type metadata: pd.DataFrame
+
+    :param beta_prior: Standard deviation for normally distributed prior values
+        of beta, defaults to 5.0
+    :type beta_prior: float
+
+    :param inv_disp_sd: Standard deviation for lognormally distributed prior
+        values of 1/phi, defaults to 0.5
+    :type inv_disp_sd: float
+    """
+    def __init__(
+        self,
+        table: biom.table.Table,
+        feature_id: str,
+        formula: str,
+        group_var: str,
+        metadata: pd.DataFrame,
+        beta_prior: float = 5.0,
+        inv_disp_sd: float = 0.5,
+        group_var_prior: float = 1.0
+    ):
+        filepath = DEFAULT_MODEL_DICT["negative_binomial"]["single_lme"]
+
+        super().__init__(
+            table=table,
+            feature_id=feature_id,
+            model_path=filepath,
+        )
+        self.create_regression(formula=formula, metadata=metadata)
+
+        D = table.shape[0]
+        A = np.log(1 / D)
+
+        # Encode group IDs starting at 1 because Stan 1-indexes arrays
+        group_var_series = metadata[group_var].loc[self.sample_names]
+        samp_subj_map = group_var_series.astype("category").cat.codes + 1
+        # Encoding as categories uses alphabetic sorting
+        self.groups = np.sort(group_var_series.unique())
+
+        param_dict = {
+            "depth": np.log(table.sum(axis="sample")),
+            "B_p": beta_prior,
+            "inv_disp_sd": inv_disp_sd,
+            "A": A,
+            "subj_ids": samp_subj_map,
+            "u_p": group_var_prior,
+            "S": len(self.groups)
+        }
+        self.add_parameters(param_dict)
+
+        self.specify_model(
+            params=["beta_var", "inv_disp", "subj_int"],
+            dims={
+                "beta_var": ["covariate"],
+                "log_lhood": ["tbl_sample"],
+                "y_predict": ["tbl_sample"],
+                "subj_int": ["group"]
+            },
+            coords={
+                "covariate": self.colnames,
+                "tbl_sample": self.sample_names,
+                "group": self.groups
+            },
+            include_observed_data=True,
+            posterior_predictive="y_predict",
+            log_likelihood="log_lhood"
+        )
diff --git a/birdman/templates/negative_binomial_lme.stan b/birdman/templates/negative_binomial_lme.stan
@@ -48,7 +48,7 @@ model {
   // generating counts
   for (n in 1:N){
     for (i in 1:D){
-      target += neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv_disp[i]);
+      target += neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv(inv_disp[i]));
     }
   }
 }
@@ -59,8 +59,8 @@ generated quantities {
 
   for (n in 1:N){
     for (i in 1:D){
-      y_predict[n, i] = neg_binomial_2_log_rng(lam_clr[n, i], inv_disp[i]);
-      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv_disp[i]);
+      y_predict[n, i] = neg_binomial_2_log_rng(lam_clr[n, i], inv(inv_disp[i]));
+      log_lhood[n, i] = neg_binomial_2_log_lpmf(y[n, i] | lam_clr[n, i], inv(inv_disp[i]));
     }
   }
 }
diff --git a/birdman/templates/negative_binomial_lme_single.stan b/birdman/templates/negative_binomial_lme_single.stan
@@ -0,0 +1,51 @@
+data {
+  int<lower=0> N;                           // number of samples
+  int<lower=0> S;                           // number of groups (subjects)
+  int<lower=0> p;                           // number of covariates
+  real A;                                   // mean intercept
+  vector[N] depth;                          // log sequencing depths of microbes
+  matrix[N, p] x;                           // covariate matrix
+  array[N] int y;                           // observed microbe abundances
+  array[N] int<lower=1, upper=S> subj_ids;  // mapping of samples to subject IDs
+
+  real<lower=0> B_p;                        // stdev for beta normal prior
+  real<lower=0> inv_disp_sd;                // stdev for inv disp lognormal prior
+  real<lower=0> u_p;                        // stdev for subject intercept normal prior
+}
+
+parameters {
+  real<offset=A, multiplier=B_p> beta_0;
+  vector<multiplier=B_p>[p-1] beta_x;
+  real<lower=0> inv_disp;
+  vector[S] subj_int;
+}
+
+transformed parameters {
+  vector[p] beta_var = append_row(beta_0, beta_x);
+  vector[N] lam = x * beta_var + depth;
+
+  for (n in 1:N){
+    lam[n] += subj_int[subj_ids[n]];
+  }
+}
+
+model {
+  inv_disp ~ lognormal(0., inv_disp_sd);
+  beta_0 ~ normal(A, B_p);
+  beta_x ~ normal(0, B_p);
+  for (j in 1:S){
+    subj_int[j] ~ normal(0., u_p);
+  }
+
+  y ~ neg_binomial_2_log(lam, inv(inv_disp));
+}
+
+generated quantities {
+  vector[N] log_lhood;
+  vector[N] y_predict;
+
+  for (n in 1:N){
+    y_predict[n] = neg_binomial_2_log_rng(lam[n], inv(inv_disp));
+    log_lhood[n] = neg_binomial_2_log_lpmf(y[n] | lam[n], inv(inv_disp));
+  }
+}
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -4,7 +4,8 @@
 import numpy as np
 
 from birdman import (NegativeBinomial, NegativeBinomialLME,
-                     NegativeBinomialSingle, ModelIterator)
+                     NegativeBinomialSingle, NegativeBinomialLMESingle,
+                     ModelIterator)
 
 TEMPLATES = resource_filename("birdman", "templates")
 
@@ -70,6 +71,22 @@ def test_single_feat(self, table_biom, metadata):
             nb.compile_model()
             nb.fit_model(method="mcmc", num_draws=100)
 
+    def test_lme_single_feat(self, table_biom, metadata):
+        md = metadata.copy()
+        np.random.seed(42)
+        md["group"] = np.random.randint(low=0, high=3, size=md.shape[0])
+        md["group"] = "G" + md["group"].astype(str)
+        for fid in table_biom.ids(axis="observation"):
+            nb = NegativeBinomialLMESingle(
+                table=table_biom,
+                feature_id=fid,
+                formula="host_common_name",
+                group_var="group",
+                metadata=md,
+            )
+            nb.compile_model()
+            nb.fit_model(num_draws=100)
+
 
 class TestToInference:
     def test_serial_to_inference(self, example_model):