Merge branch 'main' into edger-class

scverse · Nov 28, 2023 · 38fcf64 · 38fcf64
2 parents fe17448 + b11593a
commit 38fcf64
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ pip install multi-condition-comparisions
 1. Install the latest development version:
 
 ```bash
-pip install git+https://github.com/grst/multi-condition-comparisions.git@main
+pip install git+https://github.com/scverse/multi-condition-comparisions.git@main
 ```
 
 ## Release notes

diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ urls.Home-page = "https://github.com/scverse/multi-condition-comparisons"
 dependencies = [
     "anndata",
     "formulaic",
+    "pandas",
     "pydeseq2",
     "scanpy",
     "rpy2",

diff --git a/src/multi_condition_comparisions/tl/de.py b/src/multi_condition_comparisions/tl/de.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 import scanpy as sc
-import statsmodels.regression.linear_model
+import statsmodels.api as sm
 from anndata import AnnData
 from formulaic import model_matrix
 from formulaic.model_matrix import ModelMatrix
@@ -16,7 +16,12 @@
 
 class BaseMethod(ABC):
     def __init__(
-        self, adata: AnnData, design: str | np.ndarray, mask: str | None = None, layer: str | None = None, **kwargs
+        self,
+        adata: AnnData,
+        design: str | np.ndarray,
+        mask: str | None = None,
+        layer: str | None = None,
+        **kwargs,
     ):
         """
         Initialize the method
@@ -28,7 +33,9 @@ def __init__(
         design
             Model design. Can be either a design matrix or a formulaic formula.
         mask
-            a column in adata.var that contains a boolean mask with selected features.
+            A column in adata.var that contains a boolean mask with selected features.
+        layer
+            Layer to use in fit(). If None, use the X matrix.
         **kwargs
             Keyword arguments specific to the method implementation
         """
@@ -141,12 +148,36 @@ def contrast(self, column: str, baseline: str, group_to_compare: str) -> np.ndar
 class StatsmodelsDE(BaseMethod):
     """Differential expression test using a statsmodels linear regression"""
 
-    def fit(self):
-        """Fit the OLS model"""
+    def fit(
+        self,
+        regression_model: sm.OLS | sm.GLM = sm.OLS,
+        **kwargs,
+    ) -> None:
+        """
+        Fit the specified regression model.
+
+        Parameters
+        ----------
+        regression_model
+            A statsmodels regression model class, either OLS or GLM. Defaults to OLS.
+
+        **kwargs
+            Additional keyword arguments passed to the regression model constructor
+            (not to its fit() call); this is where you specify, e.g., the family for GLM.
+
+        Example
+        -------
+        >>> import statsmodels.api as sm
+        >>> model = StatsmodelsDE(adata, design="~condition")
+        >>> model.fit(sm.GLM, family=sm.families.NegativeBinomial(link=sm.families.links.Log()))
+        >>> results = model.test_contrasts(np.array([0, 1]))
+        """
         self.models = []
         for var in tqdm(self.adata.var_names):
-            mod = statsmodels.regression.linear_model.OLS(
-                sc.get.obs_df(self.adata, keys=[var], layer=self.layer)[var], self.design
+            mod = regression_model(
+                sc.get.obs_df(self.adata, keys=[var], layer=self.layer)[var],
+                self.design,
+                **kwargs,
             )
             mod = mod.fit()
             self.models.append(mod)
@@ -164,7 +195,8 @@ def _test_single_contrast(self, contrast, **kwargs) -> pd.DataFrame:
                     "fold_change": t_test.effect.item(),
                 }
             )
-        return pd.DataFrame(res).sort_values("pvalue")
+
+        return pd.DataFrame(res).sort_values("pvalue").set_index("variable")
 
 class EdgeRDE(BaseMethod):
     """Differential expression test using EdgeR"""
@@ -312,3 +344,4 @@ def _test_single_contrast(self, contrast: List[str]) -> pd.DataFrame:
 
         return de_res
 
+
diff --git a/tests/test_de.py b/tests/test_de.py
@@ -1,6 +1,8 @@
 import anndata as ad
 import numpy as np
 import pytest
+import statsmodels.api as sm
+from pandas import testing as tm
 from pydeseq2.utils import load_example_data
 
 import multi_condition_comparisions
@@ -28,11 +30,29 @@ def test_adata():
     return ad.AnnData(X=counts, obs=metadata)
 
 
-@pytest.mark.parametrize("method_class", [StatsmodelsDE])
-def test_de(test_adata, method_class: BaseMethod):
-    """Check that the method can be initialized and fitted and that the test_contrast
-    method returns a dataframe with the correct number of rows"""
+@pytest.mark.parametrize(
+    "method_class,kwargs",
+    [
+        # OLS
+        (StatsmodelsDE, {}),
+        # Negative Binomial
+        (
+            StatsmodelsDE,
+            {"regression_model": sm.GLM, "family": sm.families.NegativeBinomial()},
+        ),
+    ],
+)
+def test_de(test_adata, method_class: BaseMethod, kwargs):
+    """Check that the method can be initialized and fitted, and perform basic checks on
+    the result of test_contrasts."""
     method = method_class(adata=test_adata, design="~condition")
-    method.fit()
+    method.fit(**kwargs)
     res_df = method.test_contrasts(np.array([0, 1]))
+    # Check that the result has the correct number of rows
     assert len(res_df) == test_adata.n_vars
+    # Check that the index of the result matches the var_names of the adata
+    tm.assert_index_equal(test_adata.var_names, res_df.index, check_order=False, check_names=False)
+    # Check that there is a p-value column
+    assert "pvalue" in res_df.columns
+    # Check that p-values are between 0 and 1
+    assert np.all((0 <= res_df["pvalue"]) & (res_df["pvalue"] <= 1))