From 9860b69a8168ee172cec2f06482ace224d5f2bb9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Feb 2024 09:18:49 +0100 Subject: [PATCH 01/36] Stub custom materializer --- .../_util/__init__.py | 0 .../_util/formulaic.py | 27 +++++++++++++++++++ src/multi_condition_comparisions/tl/de.py | 4 +-- 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 src/multi_condition_comparisions/_util/__init__.py create mode 100644 src/multi_condition_comparisions/_util/formulaic.py diff --git a/src/multi_condition_comparisions/_util/__init__.py b/src/multi_condition_comparisions/_util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py new file mode 100644 index 0000000..a964fc3 --- /dev/null +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -0,0 +1,27 @@ +"""Helpers to interact with Formulaic Formulas""" + +from collections.abc import Sequence +from typing import Any + +from formulaic import ModelSpec +from formulaic.materializers import PandasMaterializer +from formulaic.materializers.types import EvaluatedFactor +from interface_meta import override + + +class CustomPandasMaterializer(PandasMaterializer): + """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" + + REGISTER_NAME = "custom_pandas" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") + + @override + def _encode_evaled_factor( + self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False + ) -> dict[str, Any]: + print(factor) + print(spec) + res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) + print(res) + return res diff --git a/src/multi_condition_comparisions/tl/de.py b/src/multi_condition_comparisions/tl/de.py index d4c9b97..8076d1c 100644 --- a/src/multi_condition_comparisions/tl/de.py +++ b/src/multi_condition_comparisions/tl/de.py @@ -62,7 +62,7 @@ def __init__( self.layer = layer if isinstance(design, str): - self.design = model_matrix(design, adata.obs) + self.design = model_matrix(design, adata.obs, materializer="custom_pandas") else: self.design = design @@ -220,7 +220,7 @@ class StatsmodelsDE(BaseMethod): """Differential expression test using a statsmodels linear regression""" def _check_counts(self) -> bool: - return self._check_count_matrix(self.adata.X) + pass def fit( self, From 91f8b1ca05913c2dc6620b81fbecdbd6548a19f1 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Feb 2024 09:47:04 +0100 Subject: [PATCH 02/36] Stub materializer factory --- .../_util/formulaic.py | 57 ++++++++++++++----- tests/conftest.py | 32 +++++++++++ tests/test_de.py | 4 ++ 3 files changed, 79 insertions(+), 14 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index a964fc3..f504605 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -1,6 +1,7 @@ """Helpers to interact with Formulaic Formulas""" from collections.abc import Sequence +from dataclasses import dataclass from typing import Any from formulaic import ModelSpec @@ -9,19 +10,47 @@ from interface_meta import override -class CustomPandasMaterializer(PandasMaterializer): - """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" +@dataclass +class FactorMetadata: + """Store (relevant) metadata for a factor of a formula.""" - REGISTER_NAME = "custom_pandas" - REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) - REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") + name: str + reduced_rank: bool + base_level: str - @override - def _encode_evaled_factor( - self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False - ) -> dict[str, Any]: - print(factor) - print(spec) - res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) - print(res) - return res + +class FactorMetadataStorage: + """ + Keep track of categorical factors used in a model spec. + + Generates a custom materializers that reports back certain metadata upon materialization of the model matrix. + """ + + def __init__(self) -> None: + self.factors: dict[str, FactorMetadata] = {} + self.materializer = self._make_materializer() + + def _make_materializer(self_factor_metadata_storage): + class CustomPandasMaterializer(PandasMaterializer): + """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" + + REGISTER_NAME = "custom_pandas" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") + + factor_metadata_storage = self_factor_metadata_storage.factors + + @override + def _encode_evaled_factor( + self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False + ) -> dict[str, Any]: + self.factor_metadata_storage[factor] = FactorMetadata( + name=str(factor), reduced_rank=factor.reduced_rank, base_level="TODO" + ) + print(factor) + print(spec) + res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) + print(res) + return res + + return CustomPandasMaterializer diff --git a/tests/conftest.py b/tests/conftest.py index 43d7c8a..5f99689 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,6 @@ import anndata as ad +import numpy as np +import pandas as pd import pytest from pydeseq2.utils import load_example_data @@ -23,6 +25,36 @@ def test_metadata(): ) +@pytest.fixture +def test_adata_minimal(): + obs = pd.DataFrame( + [ + ["A", "D1", "X", 0.2], + ["B", "D1", "X", 0.7], + ["A", "D2", "Y", 1.6], + ["B", "D2", "Y", 42], + ["A", "D3", "Y", 212], + ["B", "D3", "Y", 6023], + ], + columns=["condition", "donor", "other", "cont"], + ) + var = pd.DataFrame(index=["gene1", "gene2"]) + X = np.array( + [ + # gene1 differs between condition A/B by approx. FC=2 + # gene2 differs between donor D1/D2 by approx FC=2 + # [gene1, gene2] + [10, 200], + [20, 220], + [12, 400], + [25, 420], + [13, 600], + [24, 620], + ] + ) + return ad.AnnData(X=X, obs=obs, var=var) + + @pytest.fixture def test_adata(test_counts, test_metadata): return ad.AnnData(X=test_counts, obs=test_metadata) diff --git a/tests/test_de.py b/tests/test_de.py index e39cdd2..7614d85 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -11,6 +11,10 @@ def test_package_has_version(): assert multi_condition_comparisions.__version__ is not None +def test_foo(test_adata_minimal): + StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment(base='B')) + donor") + + @pytest.mark.parametrize( "method_class,kwargs", [ From 937e303b1a6e1f6d56aaac68fafa7a495a0dc03c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Feb 2024 10:02:29 +0100 Subject: [PATCH 03/36] Setup basic factor metadata registry --- .../_util/formulaic.py | 57 +++++++++---------- src/multi_condition_comparisions/tl/de.py | 9 ++- tests/test_de.py | 3 +- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index f504605..66e68b8 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -19,38 +19,35 @@ class FactorMetadata: base_level: str -class FactorMetadataStorage: +def get_factor_storage_and_materializer(): """ Keep track of categorical factors used in a model spec. Generates a custom materializers that reports back certain metadata upon materialization of the model matrix. """ - - def __init__(self) -> None: - self.factors: dict[str, FactorMetadata] = {} - self.materializer = self._make_materializer() - - def _make_materializer(self_factor_metadata_storage): - class CustomPandasMaterializer(PandasMaterializer): - """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" - - REGISTER_NAME = "custom_pandas" - REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) - REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") - - factor_metadata_storage = self_factor_metadata_storage.factors - - @override - def _encode_evaled_factor( - self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False - ) -> dict[str, Any]: - self.factor_metadata_storage[factor] = FactorMetadata( - name=str(factor), reduced_rank=factor.reduced_rank, base_level="TODO" - ) - print(factor) - print(spec) - res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) - print(res) - return res - - return CustomPandasMaterializer + factor_storage: dict[str, FactorMetadata] = {} + + class CustomPandasMaterializer(PandasMaterializer): + """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" + + REGISTER_NAME = "custom_pandas" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) + REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") + + factor_metadata_storage = factor_storage + + @override + def _encode_evaled_factor( + self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False + ) -> dict[str, Any]: + assert factor.expr not in self.factor_metadata_storage, "Factor already present in metadata storage" + self.factor_metadata_storage[factor.expr] = FactorMetadata( + name=factor.expr, reduced_rank=reduced_rank, base_level="TODO" + ) + print(factor) + print(spec) + res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) + print(res) + return res + + return factor_storage, CustomPandasMaterializer diff --git a/src/multi_condition_comparisions/tl/de.py b/src/multi_condition_comparisions/tl/de.py index 8076d1c..297f81d 100644 --- a/src/multi_condition_comparisions/tl/de.py +++ b/src/multi_condition_comparisions/tl/de.py @@ -8,7 +8,7 @@ import statsmodels import statsmodels.api as sm from anndata import AnnData -from formulaic import model_matrix +from formulaic import Formula from formulaic.model_matrix import ModelMatrix from pydeseq2.dds import DeseqDataSet from pydeseq2.default_inference import DefaultInference @@ -17,10 +17,14 @@ from scipy.sparse import issparse, spmatrix from tqdm.auto import tqdm +from multi_condition_comparisions._util.formulaic import get_factor_storage_and_materializer + class BaseMethod(ABC): """Base method for DE testing that is implemented per DE test.""" + factor_metadata_storage = None + def __init__( self, adata: AnnData, @@ -62,7 +66,8 @@ def __init__( self.layer = layer if isinstance(design, str): - self.design = model_matrix(design, adata.obs, materializer="custom_pandas") + self.factor_metadata_storage, materializer_class = get_factor_storage_and_materializer() + self.design = materializer_class(adata.obs).get_model_matrix(Formula(design)) else: self.design = design diff --git a/tests/test_de.py b/tests/test_de.py index 7614d85..7931589 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -12,7 +12,8 @@ def test_package_has_version(): def test_foo(test_adata_minimal): - StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment(base='B')) + donor") + model = StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment(base='B')) + donor") + print(model) @pytest.mark.parametrize( From 36f07dbee05eaf141df7598a3cb3a92c47743ba1 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Feb 2024 11:57:26 +0100 Subject: [PATCH 04/36] Record all required attributes --- .../_util/formulaic.py | 41 ++++++++++++++----- tests/test_de.py | 2 +- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 66e68b8..3f1e6f8 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -4,9 +4,10 @@ from dataclasses import dataclass from typing import Any -from formulaic import ModelSpec +from formulaic import FactorValues, ModelSpec from formulaic.materializers import PandasMaterializer from formulaic.materializers.types import EvaluatedFactor +from formulaic.parser.types import Factor from interface_meta import override @@ -15,11 +16,25 @@ class FactorMetadata: """Store (relevant) metadata for a factor of a formula.""" name: str + """The unambiguous factor name as specified in the formula. E.g. `donor`, or `C(donor, contr.treatment(base="A"))`""" + reduced_rank: bool - base_level: str + """Whether a column will be dropped because it is redundant""" + + drop_field: str = None + """The category that is dropped. Note that this may also be populated if `reduced_rank = False`""" + + kind: Factor.Kind = None + """Type of the factor""" + + categories: Sequence[str] = None + """The unique categories in this factor""" + colname_format: str = None + """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`""" -def get_factor_storage_and_materializer(): + +def get_factor_storage_and_materializer() -> tuple[dict[str, FactorMetadata], type]: """ Keep track of categorical factors used in a model spec. @@ -41,13 +56,17 @@ def _encode_evaled_factor( self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False ) -> dict[str, Any]: assert factor.expr not in self.factor_metadata_storage, "Factor already present in metadata storage" - self.factor_metadata_storage[factor.expr] = FactorMetadata( - name=factor.expr, reduced_rank=reduced_rank, base_level="TODO" - ) - print(factor) - print(spec) - res = super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) - print(res) - return res + self.factor_metadata_storage[factor.expr] = FactorMetadata(name=factor.expr, reduced_rank=reduced_rank) + return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) + + @override + def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]: + """Function is called at the end, here se still have access to the raw factor values.""" + factor_metadata = self.factor_metadata_storage[name] + factor_metadata.drop_field = values.__formulaic_metadata__.drop_field + factor_metadata.categories = values.__formulaic_metadata__.column_names + factor_metadata.colname_format = values.__formulaic_metadata__.format + + return super()._flatten_encoded_evaled_factor(name, values) return factor_storage, CustomPandasMaterializer diff --git a/tests/test_de.py b/tests/test_de.py index 7931589..eacd286 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -12,7 +12,7 @@ def test_package_has_version(): def test_foo(test_adata_minimal): - model = StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment(base='B')) + donor") + model = StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment) + donor") print(model) From 56e0b05a35b487200e8520ab7560268e2d59b0b6 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Feb 2024 12:51:58 +0100 Subject: [PATCH 05/36] Implement variable2term --- src/multi_condition_comparisions/tl/de.py | 53 +++++++++++++++-------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/multi_condition_comparisions/tl/de.py b/src/multi_condition_comparisions/tl/de.py index 297f81d..bf4dfeb 100644 --- a/src/multi_condition_comparisions/tl/de.py +++ b/src/multi_condition_comparisions/tl/de.py @@ -1,4 +1,3 @@ -import re import warnings from abc import ABC, abstractmethod @@ -9,7 +8,6 @@ import statsmodels.api as sm from anndata import AnnData from formulaic import Formula -from formulaic.model_matrix import ModelMatrix from pydeseq2.dds import DeseqDataSet from pydeseq2.default_inference import DefaultInference from pydeseq2.ds import DeseqStats @@ -91,6 +89,33 @@ def variables(self): """Get the names of the variables used in the model definition""" return self.design.model_spec.variables_by_source["data"] + def _get_factor_from_variable(self, variable: str): + """ + Get the corresponding factor from a variable specified in a formula. + + TODO unittests/examples + >>> _get_factor_from_variable("donor") + C(donor, contr.treatment) + """ + try: + variable_terms = self.design.model_spec.variable_terms + terms = self.design.model_spec.terms + except AttributeError: + raise ValueError( + "Getting the factor from a variable only works when the design was specified as a formula." + ) from None + # Either, the variable is a term + if variable in terms: + return variable + # Or, there's an unambiguous mapping from variable to term: + elif len(variable_terms) == 1: + return next(iter(variable_terms)) + # Otherwise, we can't get the term and the user requires to specify the term directly + else: + raise ValueError( + "No unambigous mapping from variable to term -- Please specify the term instead of its variable" + ) + @abstractmethod def _check_counts(self) -> bool: """ @@ -170,26 +195,23 @@ def cond(self, **kwargs) -> np.ndarray: """ The intention is to make contrasts using this function as in glmGamPoi - >>> res < -test_de( - ... fit, contrast=cond(cell="B cells", condition="stim") - cond(cell="B cells", condition="ctrl") - ... ) + ```r + res <- test_de( + fit, contrast=cond(cell="B cells", condition="stim") - cond(cell="B cells", condition="ctrl") + ) + ``` Parameters ---------- **kwargs """ - - # TODO this is hacky - reach out to formulaic authors how to do this properly - def _get_var_from_colname(colname): - regex = re.compile(r"^.+\[T\.(.+)\]$") - return regex.search(colname).groups()[0] - - if not isinstance(self.design, ModelMatrix): + if self.factor_metadata_storage is None: raise RuntimeError( "Building contrasts with `cond` only works if you specified the model using a " "formulaic formula. Please manually provide a contrast vector." ) + cond_dict = kwargs for var in self.variables: var_type = self.design.model_spec.encoder_state[var][0].value @@ -205,12 +227,7 @@ def _get_var_from_colname(colname): if var_type != "categorical": cond_dict[var] = 0 else: - var_cols = self.design.columns[self.design.columns.str.startswith(f"{var}[")] - - present_categories = {_get_var_from_colname(x) for x in var_cols} - dropped_category = all_categories - present_categories - assert len(dropped_category) == 1 - cond_dict[var] = next(iter(dropped_category)) + pass df = pd.DataFrame([kwargs]) From fdcded04fc5f7380ad45dde7b6a1eb274e9ccbff Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 27 Feb 2024 09:43:42 +0100 Subject: [PATCH 06/36] Reimplement model.cond using factor_metadata_storage --- .../_util/formulaic.py | 9 +++- src/multi_condition_comparisions/tl/de.py | 47 +++++++++---------- tests/test_de.py | 2 +- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 3f1e6f8..abdbe0d 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -55,7 +55,14 @@ class CustomPandasMaterializer(PandasMaterializer): def _encode_evaled_factor( self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False ) -> dict[str, Any]: - assert factor.expr not in self.factor_metadata_storage, "Factor already present in metadata storage" + if factor.expr in self.factor_metadata_storage and not ( + factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache + ): + # the same factor might be referred to multiple times in the same formula -- for instance, when using + # an interaction term such as group*condition. In that case formulaic is reuding a cached encoding. + # However, if it's not just reusing an existing encoding, something unexpected is happening that + # we haven't accounted for yet. + raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") self.factor_metadata_storage[factor.expr] = FactorMetadata(name=factor.expr, reduced_rank=reduced_rank) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) diff --git a/src/multi_condition_comparisions/tl/de.py b/src/multi_condition_comparisions/tl/de.py index bf4dfeb..6c1ab06 100644 --- a/src/multi_condition_comparisions/tl/de.py +++ b/src/multi_condition_comparisions/tl/de.py @@ -8,6 +8,7 @@ import statsmodels.api as sm from anndata import AnnData from formulaic import Formula +from formulaic.parser.types import Factor from pydeseq2.dds import DeseqDataSet from pydeseq2.default_inference import DefaultInference from pydeseq2.ds import DeseqStats @@ -53,14 +54,14 @@ def __init__( # Do some sanity checks on the input. Do them after the mask is applied. - # Check that counts have no NaN or Inf values. - if np.any(np.logical_or(adata.X < 0, np.isnan(self.adata.X))) or np.any(np.isinf(self.adata.X)): - raise ValueError("Counts cannot contain negative, NaN or Inf values.") - # Check that counts have numeric values. - if not np.issubdtype(self.adata.X.dtype, np.number): - raise ValueError("Counts must be numeric.") + # # Check that counts have no NaN or Inf values. + # if np.any(np.logical_or(adata.X < 0, np.isnan(self.adata.X))) or np.any(np.isinf(self.adata.X)): + # raise ValueError("Counts cannot contain negative, NaN or Inf values.") + # # Check that counts have numeric values. + # if not np.issubdtype(self.adata.X.dtype, np.number): + # raise ValueError("Counts must be numeric.") - self._check_counts() + # self._check_counts() self.layer = layer if isinstance(design, str): @@ -89,12 +90,12 @@ def variables(self): """Get the names of the variables used in the model definition""" return self.design.model_spec.variables_by_source["data"] - def _get_factor_from_variable(self, variable: str): + def _get_term_from_variable(self, variable: str): """ Get the corresponding factor from a variable specified in a formula. TODO unittests/examples - >>> _get_factor_from_variable("donor") + >>> _get_terms_from_variable("donor") C(donor, contr.treatment) """ try: @@ -213,25 +214,19 @@ def cond(self, **kwargs) -> np.ndarray: ) cond_dict = kwargs - for var in self.variables: - var_type = self.design.model_spec.encoder_state[var][0].value - if var_type == "categorical": - all_categories = set(self.design.model_spec.encoder_state[var][1]["categories"]) - if var in kwargs: - if var_type == "categorical" and kwargs[var] not in all_categories: - raise ValueError( - f"You specified a non-existant category for {var}. Possible categories: {', '.join(all_categories)}" - ) - else: - # fill with default values - if var_type != "categorical": - cond_dict[var] = 0 + # convert variables to terms in the cond dict (if possible) + cond_dict = {self._get_term_from_variable(term): value for term, value in cond_dict.items()} + + # fill cond dict with default values + for term in self.factor_metadata_storage: + if term not in cond_dict: + metadata = self.factor_metadata_storage[term] + if metadata.kind == Factor.Kind.CATEGORICAL: + cond_dict[term] = metadata.drop_field else: - pass - - df = pd.DataFrame([kwargs]) + cond_dict[term] = 0 - return self.design.model_spec.get_model_matrix(df) + return self.design.model_spec.get_model_matrix(pd.DataFrame([cond_dict])) def contrast(self, column: str, baseline: str, group_to_compare: str) -> object: """Build a simple contrast for pairwise comparisons. In the future all methods should be able to accept the output of :method:`StatsmodelsDE.contrast` but alas a big TODO.""" diff --git a/tests/test_de.py b/tests/test_de.py index eacd286..4ad7bbe 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -12,7 +12,7 @@ def test_package_has_version(): def test_foo(test_adata_minimal): - model = StatsmodelsDE(test_adata_minimal, "~ 0 + C(condition, contr.treatment) + donor") + model = StatsmodelsDE(test_adata_minimal, "~ condition * donor") print(model) From 8ddd5bb385f5b270aeeaf70312dd6b810ad012c5 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 14:59:39 +0200 Subject: [PATCH 07/36] Cleanup after merge --- .../_util/__init__.py | 3 ++ .../{_util.py => _util/checks.py} | 0 .../methods/_statsmodels.py | 2 +- tests/conftest.py | 30 ------------------- tests/test_formulaic.py | 8 +++++ tests/test_statsmodels.py | 5 ---- 6 files changed, 12 insertions(+), 36 deletions(-) rename src/multi_condition_comparisions/{_util.py => _util/checks.py} (100%) create mode 100644 tests/test_formulaic.py diff --git a/src/multi_condition_comparisions/_util/__init__.py b/src/multi_condition_comparisions/_util/__init__.py index e69de29..a98b31a 100644 --- a/src/multi_condition_comparisions/_util/__init__.py +++ b/src/multi_condition_comparisions/_util/__init__.py @@ -0,0 +1,3 @@ +from .checks import check_is_integer_matrix, check_is_numeric_matrix + +__all__ = ["check_is_integer_matrix", "check_is_numeric_matrix"] diff --git a/src/multi_condition_comparisions/_util.py b/src/multi_condition_comparisions/_util/checks.py similarity index 100% rename from src/multi_condition_comparisions/_util.py rename to src/multi_condition_comparisions/_util/checks.py diff --git a/src/multi_condition_comparisions/methods/_statsmodels.py b/src/multi_condition_comparisions/methods/_statsmodels.py index 1bc4ef5..1fda634 100644 --- a/src/multi_condition_comparisions/methods/_statsmodels.py +++ b/src/multi_condition_comparisions/methods/_statsmodels.py @@ -18,7 +18,7 @@ def _check_counts(self): def fit( self, - regression_model: sm.OLS | sm.GLM = sm.OLS, + regression_model: type[sm.OLS] | type[sm.GLM] = sm.OLS, **kwargs, ) -> None: """ diff --git a/tests/conftest.py b/tests/conftest.py index 904e618..936ccf3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,36 +24,6 @@ def test_metadata(): ) -# @pytest.fixture -# def test_adata_minimal(): -# obs = pd.DataFrame( -# [ -# ["A", "D1", "X", 0.2], -# ["B", "D1", "X", 0.7], -# ["A", "D2", "Y", 1.6], -# ["B", "D2", "Y", 42], -# ["A", "D3", "Y", 212], -# ["B", "D3", "Y", 6023], -# ], -# columns=["condition", "donor", "other", "cont"], -# ) -# var = pd.DataFrame(index=["gene1", "gene2"]) -# X = np.array( -# [ -# # gene1 differs between condition A/B by approx. FC=2 -# # gene2 differs between donor D1/D2 by approx FC=2 -# # [gene1, gene2] -# [10, 200], -# [20, 220], -# [12, 400], -# [25, 420], -# [13, 600], -# [24, 620], -# ] -# ) -# return ad.AnnData(X=X, obs=obs, var=var) - - @pytest.fixture def test_adata(test_counts, test_metadata): return ad.AnnData(X=test_counts, obs=test_metadata) diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py new file mode 100644 index 0000000..cc62244 --- /dev/null +++ b/tests/test_formulaic.py @@ -0,0 +1,8 @@ +from multi_condition_comparisions._util.formulaic import get_factor_storage_and_materializer + + +def test_custom_materializer(test_adata_minimal, formula): + """Test that the custom materializer correctly stores the baseline category.""" + factor_storage, materializer = get_factor_storage_and_materializer() + materializer(test_adata_minimal.obs).get_model_matrix(formula) + raise AssertionError() diff --git a/tests/test_statsmodels.py b/tests/test_statsmodels.py index ffa195a..962e3a0 100644 --- a/tests/test_statsmodels.py +++ b/tests/test_statsmodels.py @@ -30,8 +30,3 @@ def test_statsmodels(test_adata, method_class, kwargs): # TODO: there should be a test checking if, for a concrete example, the output p-values and effect sizes are what # we expect (-> frozen snapshot, that way we also get a heads-up if something changes upstream) - - -def test_foo(test_adata_minimal): - model = Statsmodels(test_adata_minimal, "~ condition * donor") - print(model) From ee79d70ebd19ddd1fd1035f1b05a41486f4639b1 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 15:33:29 +0200 Subject: [PATCH 08/36] stub test cases --- .../_util/formulaic.py | 2 +- tests/test_formulaic.py | 23 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index abdbe0d..f1eb4e3 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -68,7 +68,7 @@ def _encode_evaled_factor( @override def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]: - """Function is called at the end, here se still have access to the raw factor values.""" + """Function is called at the end, here we still have access to the raw factor values.""" factor_metadata = self.factor_metadata_storage[name] factor_metadata.drop_field = values.__formulaic_metadata__.drop_field factor_metadata.categories = values.__formulaic_metadata__.column_names diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index cc62244..446a950 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -1,8 +1,29 @@ +import pytest + from multi_condition_comparisions._util.formulaic import get_factor_storage_and_materializer +@pytest.mark.parametrize( + "formula", + [ + "~ donor", + "~ C(donor)", + "~ C(donor, contr.treatment(base='D2'))", + "~ C(donor, contr.sum)", + "~ condition", + "~ C(condition)", + "~ C(condition, contr.treatment(base='B'))", + "~ C(condition, contr.sum)", + "~ 0 + condition", + "~ condition + donor", + "~ condition * donor", + "~ condition + C(condition) + C(condition, contr.treatment(base='B'))", + "~ condition + continuous + np.log(continuous)", + "~ condition * donor + continuous", + ], +) def test_custom_materializer(test_adata_minimal, formula): """Test that the custom materializer correctly stores the baseline category.""" factor_storage, materializer = get_factor_storage_and_materializer() materializer(test_adata_minimal.obs).get_model_matrix(formula) - raise AssertionError() + raise AssertionError(factor_storage) From 402cff4428629db4b28e23ae5ecb7ab162758f05 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 16:51:55 +0200 Subject: [PATCH 09/36] WIP: deal with custom encoder classes --- .../_util/formulaic.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index f1eb4e3..d2d818a 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -21,8 +21,16 @@ class FactorMetadata: reduced_rank: bool """Whether a column will be dropped because it is redundant""" + custom_encoder: str = None + drop_field: str = None - """The category that is dropped. Note that this may also be populated if `reduced_rank = False`""" + """ + The category that is dropped. + + Note that + * this may also be populated if `reduced_rank = False` + * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`. + """ kind: Factor.Kind = None """Type of the factor""" @@ -30,9 +38,26 @@ class FactorMetadata: categories: Sequence[str] = None """The unique categories in this factor""" + column_names: Sequence[str] = None + """The column names for this factor included in the design matrix""" + colname_format: str = None """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`""" + @property + def base(self) -> str | None: + """The base category for this categorical factor""" + if not self.reduced_rank: + return None + else: + if self.custom_encoder: + tmp_base = set(self.categories) - set(self.column_names) + assert len(tmp_base) == 1 + return tmp_base.pop() + else: + assert self.drop_field is not None + return self.drop_field + def get_factor_storage_and_materializer() -> tuple[dict[str, FactorMetadata], type]: """ @@ -64,6 +89,7 @@ def _encode_evaled_factor( # we haven't accounted for yet. raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") self.factor_metadata_storage[factor.expr] = FactorMetadata(name=factor.expr, reduced_rank=reduced_rank) + tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) @override From 438a46919caa8e5a271795bb5ee34f3513fe6f5c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 17:08:09 +0200 Subject: [PATCH 10/36] WIP stub testcase --- .../_util/formulaic.py | 37 +++++--- tests/test_formulaic.py | 88 +++++++++++++++---- 2 files changed, 97 insertions(+), 28 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index d2d818a..cb9b5d2 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -21,7 +21,14 @@ class FactorMetadata: reduced_rank: bool """Whether a column will be dropped because it is redundant""" - custom_encoder: str = None + custom_encoder: bool + """Whether or not a custom encoder (e.g. `C(...)`) was used.""" + + categories: Sequence[str] + """The unique categories in this factor (after applying `drop_rows`)""" + + kind: Factor.Kind = None + """Type of the factor""" drop_field: str = None """ @@ -32,21 +39,25 @@ class FactorMetadata: * this is only populated when no encoder was used (e.g. `~ donor` but NOT `~ C(donor)`. """ - kind: Factor.Kind = None - """Type of the factor""" - - categories: Sequence[str] = None - """The unique categories in this factor""" - column_names: Sequence[str] = None - """The column names for this factor included in the design matrix""" + """ + The column names for this factor included in the design matrix. + + This may be the same as `categories` if the default encoder is used, or + categories without the base level if a custom encoder (e.g. `C(...)`) is used. + """ colname_format: str = None """A formattable string that can be used to generate the column name in the design matrix, e.g. `{name}[T.{field}]`""" @property def base(self) -> str | None: - """The base category for this categorical factor""" + """ + The base category for this categorical factor. + + This is derived from `drop_field` (for default encoding) or by comparing the column names in + the design matrix with all categories (for custom encoding, e.g. `C(...)`). + """ if not self.reduced_rank: return None else: @@ -88,8 +99,12 @@ def _encode_evaled_factor( # However, if it's not just reusing an existing encoding, something unexpected is happening that # we haven't accounted for yet. raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") - self.factor_metadata_storage[factor.expr] = FactorMetadata(name=factor.expr, reduced_rank=reduced_rank) - tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())) + self.factor_metadata_storage[factor.expr] = FactorMetadata( + name=factor.expr, + reduced_rank=reduced_rank, + categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), + custom_encoder=factor.metadata.encoder is not None, + ) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) @override diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index 446a950..b6916ee 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -4,26 +4,80 @@ @pytest.mark.parametrize( - "formula", + "formula,expected_factor_metadata", [ - "~ donor", - "~ C(donor)", - "~ C(donor, contr.treatment(base='D2'))", - "~ C(donor, contr.sum)", - "~ condition", - "~ C(condition)", - "~ C(condition, contr.treatment(base='B'))", - "~ C(condition, contr.sum)", - "~ 0 + condition", - "~ condition + donor", - "~ condition * donor", - "~ condition + C(condition) + C(condition, contr.treatment(base='B'))", - "~ condition + continuous + np.log(continuous)", - "~ condition * donor + continuous", + [ + "~ donor", + {"donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ C(donor)", + {"donor": {"reduced_rank": True, "custom_encoder": True, "base": ""}}, + ], + [ + "~ C(donor, contr.treatment(base='D2'))", + {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D2"}}, + ], + [ + "~ C(donor, contr.sum)", + {"donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ condition", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ C(condition)", + {"condition": {"reduced_rank": True, "custom_encoder": True, "base": ""}}, + ], + [ + "~ C(condition, contr.treatment(base='B'))", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": "B"}}, + ], + [ + "~ C(condition, contr.sum)", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ 0 + condition", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ condition + donor", + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + }, + ], + [ + "~ condition * donor", + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + }, + ], + [ + "~ condition + C(condition) + C(condition, contr.treatment(base='B'))", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ condition + continuous + np.log(continuous)", + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + ], + [ + "~ condition * donor + continuous", + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + }, + ], ], ) -def test_custom_materializer(test_adata_minimal, formula): +def test_custom_materializer(test_adata_minimal, formula, expected_factor_metadata): """Test that the custom materializer correctly stores the baseline category.""" factor_storage, materializer = get_factor_storage_and_materializer() materializer(test_adata_minimal.obs).get_model_matrix(formula) - raise AssertionError(factor_storage) + for factor, expected_metadata in expected_factor_metadata.items(): + actual_metadata = factor_storage[factor] + for k in expected_metadata: + assert getattr(actual_metadata, k) == expected_metadata[k] From f6be15db33ac5f1e3d81d8c7993516577d56aeb9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 20:29:09 +0200 Subject: [PATCH 11/36] Add testcases for custom materializer --- .../_util/formulaic.py | 23 +++- tests/test_formulaic.py | 126 +++++++++++++++--- 2 files changed, 126 insertions(+), 23 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index cb9b5d2..585cfaa 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -27,7 +27,7 @@ class FactorMetadata: categories: Sequence[str] """The unique categories in this factor (after applying `drop_rows`)""" - kind: Factor.Kind = None + kind: Factor.Kind """Type of the factor""" drop_field: str = None @@ -75,6 +75,13 @@ def get_factor_storage_and_materializer() -> tuple[dict[str, FactorMetadata], ty Keep track of categorical factors used in a model spec. Generates a custom materializers that reports back certain metadata upon materialization of the model matrix. + + Returns + ------- + factor_storage + A dictionary pointing to Metadata for each factor processed by the custom materializer + CustomPandasMaterializer + A materializer class that is tied to the particular instance of `factor_storage`. """ factor_storage: dict[str, FactorMetadata] = {} @@ -91,6 +98,11 @@ class CustomPandasMaterializer(PandasMaterializer): def _encode_evaled_factor( self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False ) -> dict[str, Any]: + """ + Function is called just before the factor is evaluated. + + We can record some metadata, before we call the original function. + """ if factor.expr in self.factor_metadata_storage and not ( factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache ): @@ -104,15 +116,20 @@ def _encode_evaled_factor( reduced_rank=reduced_rank, categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), custom_encoder=factor.metadata.encoder is not None, + kind=factor.metadata.kind, ) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) @override def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) -> dict[str, Any]: - """Function is called at the end, here we still have access to the raw factor values.""" + """ + Function is called at the end, before the design matrix gets materialized. + + Here we have access to additional metadata, such as `drop_field`. + """ factor_metadata = self.factor_metadata_storage[name] factor_metadata.drop_field = values.__formulaic_metadata__.drop_field - factor_metadata.categories = values.__formulaic_metadata__.column_names + factor_metadata.column_names = values.__formulaic_metadata__.column_names factor_metadata.colname_format = values.__formulaic_metadata__.format return super()._flatten_encoded_evaled_factor(name, values) diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index b6916ee..e099199 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -1,80 +1,166 @@ +import pandas as pd import pytest +from formulaic.parser.types import Factor from multi_condition_comparisions._util.formulaic import get_factor_storage_and_materializer @pytest.mark.parametrize( - "formula,expected_factor_metadata", + "formula,reorder_categorical,expected_factor_metadata", [ [ "~ donor", - {"donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}}, + ], + [ + "~ donor", + {"donor": ["D2", "D1", "D0", "D3"]}, + {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D2"}}, ], [ "~ C(donor)", - {"donor": {"reduced_rank": True, "custom_encoder": True, "base": ""}}, + None, + {"C(donor)": {"reduced_rank": True, "custom_encoder": True, "base": "D0"}}, ], [ "~ C(donor, contr.treatment(base='D2'))", - {"donor": {"reduced_rank": True, "custom_encoder": False, "base": "D2"}}, + None, + {"C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}}, ], [ "~ C(donor, contr.sum)", - {"donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D3"}}, + ], + [ + "~ C(donor, contr.sum)", + {"donor": ["D1", "D0", "D3", "D2"]}, + {"C(donor, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}}, ], [ "~ condition", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + {"condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}}, ], [ "~ C(condition)", - {"condition": {"reduced_rank": True, "custom_encoder": True, "base": ""}}, + None, + {"C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"}}, ], [ "~ C(condition, contr.treatment(base='B'))", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": "B"}}, + None, + {"C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"}}, ], [ "~ C(condition, contr.sum)", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + {"C(condition, contr.sum)": {"reduced_rank": True, "custom_encoder": True, "base": "B"}}, ], [ "~ 0 + condition", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + {"condition": {"reduced_rank": False, "custom_encoder": False, "base": None}}, ], [ "~ condition + donor", + None, { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, + }, + ], + [ + "~ 0 + condition + donor", + None, + { + "condition": {"reduced_rank": False, "custom_encoder": False, "base": None}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, }, ], [ "~ condition * donor", + None, + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, + }, + ], + [ + "~ condition * C(donor, contr.treatment(base='D2'))", + None, { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "C(donor, contr.treatment(base='D2'))": {"reduced_rank": True, "custom_encoder": True, "base": "D2"}, }, ], [ "~ condition + C(condition) + C(condition, contr.treatment(base='B'))", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "C(condition)": {"reduced_rank": True, "custom_encoder": True, "base": "A"}, + "C(condition, contr.treatment(base='B'))": {"reduced_rank": True, "custom_encoder": True, "base": "B"}, + }, ], [ "~ condition + continuous + np.log(continuous)", - {"condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}}, + None, + { + "condition": { + "reduced_rank": True, + "custom_encoder": False, + "base": "A", + "kind": Factor.Kind.CATEGORICAL, + }, + "continuous": { + "reduced_rank": False, + "custom_encoder": False, + "base": None, + "kind": Factor.Kind.NUMERICAL, + }, + "np.log(continuous)": { + "reduced_rank": False, + "custom_encoder": False, + "base": None, + "kind": Factor.Kind.NUMERICAL, + }, + }, ], [ "~ condition * donor + continuous", + None, { - "condition": {"reduced_rank": True, "custom_encoder": False, "base": ""}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": ""}, + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, + "continuous": { + "reduced_rank": False, + "custom_encoder": False, + "base": None, + "kind": Factor.Kind.NUMERICAL, + }, }, ], ], ) -def test_custom_materializer(test_adata_minimal, formula, expected_factor_metadata): - """Test that the custom materializer correctly stores the baseline category.""" +def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, expected_factor_metadata): + """Test that the custom materializer correctly stores the baseline category. + + Parameters + ---------- + test_adata_minimal + adata fixture + formula + Formula to test + reorder_categorical + Create a pandas categorical for a given column with a certain order of categories + expected_factor_metadata + dict with expected values for each factor + """ + if reorder_categorical is not None: + for col, order in reorder_categorical.items(): + test_adata_minimal.obs[col] = pd.Categorical(test_adata_minimal.obs[col], categories=order) factor_storage, materializer = get_factor_storage_and_materializer() materializer(test_adata_minimal.obs).get_model_matrix(formula) for factor, expected_metadata in expected_factor_metadata.items(): From 784266b77022c90402ea4633c2b54476f942ef19 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 21:07:25 +0200 Subject: [PATCH 12/36] Update docstring for linear model base --- .../methods/_base.py | 47 ++++++++++++++----- tests/test_base.py | 0 2 files changed, 34 insertions(+), 13 deletions(-) create mode 100644 tests/test_base.py diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index b834207..a78f6ca 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -252,22 +252,25 @@ def fit(self, **kwargs) -> None: ... @abstractmethod - def _test_single_contrast(self, contrast, **kwargs) -> pd.DataFrame: ... + def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame: ... - def test_contrasts( - self, contrasts: list[str] | dict[str, np.ndarray] | dict[str, list] | np.ndarray, **kwargs - ) -> pd.DataFrame: + def test_contrasts(self, contrasts: Sequence[float] | dict[str, Sequence[float]], **kwargs) -> pd.DataFrame: """ - Conduct a specific test. + Perform a comparison as specified in a contrast vector. + + The contrast vector is a numeric vector with one element for each column in the design matrix. + We recommend building the contrast vector using {func}`~LinearModelBase.contrast` or + {func}`~LinearModelBase.cond`. - Please use :method:`contrast` to build the contrasts instead of building it on your own. + Multiple comparisons can be specified as a dictionary of contrast vectors. Parameters ---------- contrasts: - either a single contrast, or a dictionary of contrasts where the key is the name for that particular contrast. - Each contrast can be either a vector of coefficients (the most general case), a string, a DSL (work in progress) - or a tuple with three elements contrasts = ("condition", "control", "treatment") + Either a numeric contrast vector, or a dictionary of numeric contrast vectors. The dictionary keys + are added in a column named `contrast` in the result dataframe. + kwargs + are passed to the respective implementation """ if not isinstance(contrasts, dict): contrasts = {None: contrasts} @@ -348,9 +351,27 @@ def _get_var_from_colname(colname): return self.design.model_spec.get_model_matrix(df) - def contrast(self, column: str, baseline: str, group_to_compare: str) -> list: - """Build a simple contrast for pairwise comparisons. + def contrast(self, column: str, baseline: str, group_to_compare: str) -> pd.Series: + """ + Build a simple contrast for pairwise comparisons. + + This is an alias for + + ``` + model.cond(column=group_to_compare) - model.cond(column=baseline) + ``` + + Parameters + ---------- + column + column in adata.obs to test on + baseline + baseline category (denominator) + group_to_compare + category to compare against baseline (nominator) - In the future all methods should be able to accept the output of :method:`StatsmodelsDE.contrast` but alas a big TODO. + Returns + ------- + Numeric contrast vector """ - return [column, baseline, group_to_compare] + return self.cond(**{column: group_to_compare}) - self.cond(**{column: baseline}) diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..e69de29 From 3a1ef9ae4656c79a05adfa6b4b3bb87d9136b015 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 21:46:55 +0200 Subject: [PATCH 13/36] WIP reimplement model.cond --- docs/notebooks/example.ipynb | 132 ++++++++++++++++-- .../_util/formulaic.py | 1 - .../methods/_base.py | 95 ++++++++----- tests/test_base.py | 19 +++ 4 files changed, 201 insertions(+), 46 deletions(-) diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index ce7dbad..d80220a 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -9,32 +9,146 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import anndata as ad\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pytest\n", + "from pydeseq2.utils import load_example_data\n", + "from formulaic import model_matrix\n", + "\n", + "\n", + "@pytest.fixture\n", + "def test_counts():\n", + " return load_example_data(\n", + " modality=\"raw_counts\",\n", + " dataset=\"synthetic\",\n", + " debug=False,\n", + " )\n", + "\n", + "\n", + "@pytest.fixture\n", + "def test_metadata():\n", + " return load_example_data(\n", + " modality=\"metadata\",\n", + " dataset=\"synthetic\",\n", + " debug=False,\n", + " )\n", + "\n", + "\n", + "@pytest.fixture\n", + "def test_adata(test_counts, test_metadata):\n", + " return ad.AnnData(X=test_counts, obs=test_metadata)\n", + "\n", + "\n", + "def test_adata_minimal():\n", + " matrix_format = np.array\n", + " n_obs = 80\n", + " n_donors = n_obs // 4\n", + " rng = np.random.default_rng(9) # make tests deterministic\n", + " obs = pd.DataFrame(\n", + " {\n", + " \"condition\": [\"A\", \"B\"] * (n_obs // 2),\n", + " \"donor\": sum(([f\"D{i}\"] * n_donors for i in range(n_obs // n_donors)), []),\n", + " \"other\": ([\"X\"] * (n_obs // 4)) + ([\"Y\"] * ((3 * n_obs) // 4)),\n", + " \"pairing\": sum(([str(i), str(i)] for i in range(n_obs // 2)), []),\n", + " \"continuous\": [rng.uniform(0, 1) * 4000 for _ in range(n_obs)],\n", + " },\n", + " )\n", + " var = pd.DataFrame(index=[\"gene1\", \"gene2\"])\n", + " group1 = rng.negative_binomial(20, 0.1, n_obs // 2) # large mean\n", + " group2 = rng.negative_binomial(5, 0.5, n_obs // 2) # small mean\n", + "\n", + " condition_data = np.empty((n_obs,), dtype=group1.dtype)\n", + " condition_data[0::2] = group1\n", + " condition_data[1::2] = group2\n", + "\n", + " donor_data = np.empty((n_obs,), dtype=group1.dtype)\n", + " donor_data[0:n_donors] = group2[:n_donors]\n", + " donor_data[n_donors : (2 * n_donors)] = group1[n_donors:]\n", + "\n", + " donor_data[(2 * n_donors) : (3 * n_donors)] = group2[:n_donors]\n", + " donor_data[(3 * n_donors) :] = group1[n_donors:]\n", + "\n", + " X = matrix_format(np.vstack([condition_data, donor_data]).T)\n", + " return ad.AnnData(X=X, obs=obs, var=var)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/python3.10/site-packages/anndata/_core/anndata.py:183: ImplicitModificationWarning: Transforming to str index.\n", + " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" ] } ], "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "from multi_condition_comparisions.tl.de import StatsmodelsDE\n", - "import scanpy as sc" + "adata = test_adata_minimal()" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "adata = sc.read_h5ad(\"./airway.h5ad\")" + "design = model_matrix(\"~ condition + C(condition) + donor + continuous\", adata.obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1: set(),\n", + " condition: {'condition'},\n", + " C(condition): {'C', 'condition'},\n", + " donor: {'donor'},\n", + " continuous: {'continuous'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "design.model_spec.term_variables" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'condition': {C(condition), condition},\n", + " 'C': {C(condition)},\n", + " 'donor': {donor},\n", + " 'continuous': {continuous}}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "design.model_spec.variable_terms" ] }, { diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 585cfaa..25e7b20 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -7,7 +7,6 @@ from formulaic import FactorValues, ModelSpec from formulaic.materializers import PandasMaterializer from formulaic.materializers.types import EvaluatedFactor -from formulaic.parser.types import Factor from interface_meta import override diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index a78f6ca..a85b462 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -1,4 +1,3 @@ -import re from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence from dataclasses import dataclass @@ -7,10 +6,9 @@ import numpy as np import pandas as pd from anndata import AnnData -from formulaic import model_matrix -from formulaic.model_matrix import ModelMatrix from multi_condition_comparisions._util import check_is_numeric_matrix +from multi_condition_comparisions._util.formulaic import Factor, get_factor_storage_and_materializer @dataclass @@ -119,6 +117,11 @@ def compare_groups( class LinearModelBase(MethodBase): """Base class for DE methods that have a linear model interface (i.e. support design matrices and contrast testing)""" + materializer = None + factor_storage = None + """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix + is passed directly, this remains None.""" + def __init__( self, adata: AnnData, @@ -148,7 +151,8 @@ def __init__( self._check_counts() if isinstance(design, str): - self.design = model_matrix(design, adata.obs) + self.factor_storage, self.materializer = get_factor_storage_and_materializer() + self.design = self.materializer(adata.obs).get_model_matrix(design) else: self.design = design @@ -216,7 +220,7 @@ def compare_groups( return de_res @property - def variables(self): + def variables(self) -> set: """Get the names of the variables used in the model definition""" try: return self.design.model_spec.variables_by_source["data"] @@ -303,53 +307,74 @@ def test_reduced(self, modelB: "MethodBase") -> pd.DataFrame: def cond(self, **kwargs) -> np.ndarray: """ - The intention is to make contrasts using this function as in glmGamPoi + Get a contrast vector representing a specific condition. + + The resulting contrast vector can be combined using arithmetic operations to generate a contrast + vector for a specific comparison, e.g. for a simple comparison treated vs. control + + >>> contrast = model.cond(condition="treated") - model.cond(condition="ctrl") + + When using an interaction term `cell * condition`, a comparison within a specific cell-type can be + easily built as follows: + + >>> contrast = model.cond(cell="B cells", condition="ctrl") - cond(cell="B cells", condition="treated") - >>> res < -test_de( - ... fit, contrast=cond(cell="B cells", condition="stim") - cond(cell="B cells", condition="ctrl") - ... ) + This way of building contrast vectors is inspired by [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html). Parameters ---------- **kwargs + column/value pairs + Returns + ------- + A contrast vector that aligns to the columns of the design matrix. """ - - # TODO this is hacky - reach out to formulaic authors how to do this properly - def _get_var_from_colname(colname): - regex = re.compile(r"^.+\[T\.(.+)\]$") - return regex.search(colname).groups()[0] - - if not isinstance(self.design, ModelMatrix): + if self.factor_storage is None: raise RuntimeError( "Building contrasts with `cond` only works if you specified the model using a " "formulaic formula. Please manually provide a contrast vector." ) + cond_dict = kwargs - for var in self.variables: - var_type = self.design.model_spec.encoder_state[var][0].value - if var_type == "categorical": - all_categories = set(self.design.model_spec.encoder_state[var][1]["categories"]) - if var in kwargs: - if var_type == "categorical" and kwargs[var] not in all_categories: + + if not set(cond_dict.keys()).issubset(self.variables): + raise ValueError( + "You specified a variable that is not part of the model. Available variables: " + + ",".join(self.variables) + ) + + # We now fill `cond_dict` such that it is equivalent to a data row from `adata.obs`. + # This data can then be passed to the `get_model_matrix` of formulaic to retreive a correpsonding + # contrast vector. + # To do so, we keep the values that were already specified, and fill all other values with the default category + # (the one that is usually dropped from the model for being redundant). + # + # `model_spec.variable_terms` is a mapping from variable to a set of terms. Unless a variable is used twice in the + # same formula (which for don't support for now), it contains exactly one element. + for var, term in self.design.model_spec.variable_terms.items(): + if len(term) != 1: + raise RuntimeError( + "Ambiguous variable! Building contrasts with model.cond only works " + "when each variable occurs only once per formula" + ) + term = term.pop() + term_metadata = self.factor_storage[term] + if var in cond_dict: + # In this case we keep the specified value in the dict, but we verify that it's a valid category + if term_metadata.kind == Factor.Kind.CATEGORICAL and cond_dict[var] not in term_metadata.categories: raise ValueError( - f"You specified a non-existant category for {var}. Possible categories: {', '.join(all_categories)}" + f"You specified a non-existant category for {var}. Possible categories: {', '.join(term_metadata.categories)}" ) else: # fill with default values - if var_type != "categorical": - cond_dict[var] = 0 + if term_metadata.kind == Factor.Kind.CATEGORICAL: + cond_dict[var] = term_metadata.base else: - var_cols = self.design.columns[self.design.columns.str.startswith(f"{var}[")] - - present_categories = {_get_var_from_colname(x) for x in var_cols} - dropped_category = all_categories - present_categories - assert len(dropped_category) == 1 - cond_dict[var] = next(iter(dropped_category)) + cond_dict[var] = 0 df = pd.DataFrame([kwargs]) - - return self.design.model_spec.get_model_matrix(df) + return self.design.model_spec.get_model_matrix(df).iloc[0] def contrast(self, column: str, baseline: str, group_to_compare: str) -> pd.Series: """ @@ -357,9 +382,7 @@ def contrast(self, column: str, baseline: str, group_to_compare: str) -> pd.Seri This is an alias for - ``` - model.cond(column=group_to_compare) - model.cond(column=baseline) - ``` + >>> model.cond(column=group_to_compare) - model.cond(column=baseline) Parameters ---------- diff --git a/tests/test_base.py b/tests/test_base.py index e69de29..a7cbbb6 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -0,0 +1,19 @@ +from collections.abc import Sequence + +import pytest +from pandas.core.api import DataFrame as DataFrame + +from multi_condition_comparisions.methods import LinearModelBase + + +@pytest.fixture +def MockLinearModel(): + class _MockLinearModel(LinearModelBase): + def _check_counts(self) -> None: + pass + + def fit(self, **kwargs) -> None: + pass + + def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFrame: + pass From 90e82474420d2e2937d97ad78382e49ea445f280 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 2 Apr 2024 21:58:19 +0200 Subject: [PATCH 14/36] Stub testcase for model.cond --- .../_util/formulaic.py | 1 + tests/test_base.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 25e7b20..f8b1811 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -8,6 +8,7 @@ from formulaic.materializers import PandasMaterializer from formulaic.materializers.types import EvaluatedFactor from interface_meta import override +from formulaic.parser.types import Factor @dataclass diff --git a/tests/test_base.py b/tests/test_base.py index a7cbbb6..3deed9e 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -17,3 +17,18 @@ def fit(self, **kwargs) -> None: def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFrame: pass + + return _MockLinearModel + + +@pytest.mark.parametrize( + "formula,cond_kwargs,expected_contrast", + [ + ["~ condition", {"condition": "A"}, [0, 0]], + ], +) +def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): + mod = MockLinearModel(test_adata_minimal, formula) + actual_contrast = mod.cond(**cond_kwargs) + assert actual_contrast.tolist() == expected_contrast + assert actual_contrast.index.tolist() == mod.design.columns.tolist() From 068c7cc949b06e20a2d3ef58d4169140b62cee83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Apr 2024 19:58:30 +0000 Subject: [PATCH 15/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/multi_condition_comparisions/_util/formulaic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index f8b1811..585cfaa 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -7,8 +7,8 @@ from formulaic import FactorValues, ModelSpec from formulaic.materializers import PandasMaterializer from formulaic.materializers.types import EvaluatedFactor -from interface_meta import override from formulaic.parser.types import Factor +from interface_meta import override @dataclass From 46f142028f8c4c88d3a0ad613c5872a93785a16c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 11:45:08 +0200 Subject: [PATCH 16/36] Fix that contrasts couldn't be build from model spec --- .../_util/formulaic.py | 50 ++++++++++++------- .../methods/_base.py | 16 +++--- tests/test_base.py | 2 +- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 585cfaa..c10f56a 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -93,6 +93,16 @@ class CustomPandasMaterializer(PandasMaterializer): REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") factor_metadata_storage = factor_storage + recording_active = True + + def stop_recording(self): + """ + Call this function after the model matrix has been completely initalized. + + Then the materializer can be reused for creating contrast vectors without modifying the stored values. + """ + # Use class variable here + CustomPandasMaterializer.recording_active = False @override def _encode_evaled_factor( @@ -103,21 +113,22 @@ def _encode_evaled_factor( We can record some metadata, before we call the original function. """ - if factor.expr in self.factor_metadata_storage and not ( - factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache - ): - # the same factor might be referred to multiple times in the same formula -- for instance, when using - # an interaction term such as group*condition. In that case formulaic is reuding a cached encoding. - # However, if it's not just reusing an existing encoding, something unexpected is happening that - # we haven't accounted for yet. - raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") - self.factor_metadata_storage[factor.expr] = FactorMetadata( - name=factor.expr, - reduced_rank=reduced_rank, - categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), - custom_encoder=factor.metadata.encoder is not None, - kind=factor.metadata.kind, - ) + if self.recording_active: + if factor.expr in self.factor_metadata_storage and not ( + factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache + ): + # the same factor might be referred to multiple times in the same formula -- for instance, when using + # an interaction term such as group*condition. In that case formulaic is reuding a cached encoding. + # However, if it's not just reusing an existing encoding, something unexpected is happening that + # we haven't accounted for yet. + raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") + self.factor_metadata_storage[factor.expr] = FactorMetadata( + name=factor.expr, + reduced_rank=reduced_rank, + categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), + custom_encoder=factor.metadata.encoder is not None, + kind=factor.metadata.kind, + ) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) @override @@ -127,10 +138,11 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) Here we have access to additional metadata, such as `drop_field`. """ - factor_metadata = self.factor_metadata_storage[name] - factor_metadata.drop_field = values.__formulaic_metadata__.drop_field - factor_metadata.column_names = values.__formulaic_metadata__.column_names - factor_metadata.colname_format = values.__formulaic_metadata__.format + if self.recording_active: + factor_metadata = self.factor_metadata_storage[name] + factor_metadata.drop_field = values.__formulaic_metadata__.drop_field + factor_metadata.column_names = values.__formulaic_metadata__.column_names + factor_metadata.colname_format = values.__formulaic_metadata__.format return super()._flatten_encoded_evaled_factor(name, values) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index a85b462..afc46b5 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -117,11 +117,6 @@ def compare_groups( class LinearModelBase(MethodBase): """Base class for DE methods that have a linear model interface (i.e. support design matrices and contrast testing)""" - materializer = None - factor_storage = None - """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix - is passed directly, this remains None.""" - def __init__( self, adata: AnnData, @@ -150,9 +145,16 @@ def __init__( super().__init__(adata, mask=mask, layer=layer) self._check_counts() + self.materializer = None + self.factor_storage = None + """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix + is passed directly, this remains None.""" + if isinstance(design, str): - self.factor_storage, self.materializer = get_factor_storage_and_materializer() - self.design = self.materializer(adata.obs).get_model_matrix(design) + self.factor_storage, materializer_class = get_factor_storage_and_materializer() + self.materializer = materializer_class(adata.obs) + self.design = self.materializer.get_model_matrix(design) + self.materializer.stop_recording() else: self.design = design diff --git a/tests/test_base.py b/tests/test_base.py index 3deed9e..09fe93d 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -24,7 +24,7 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram @pytest.mark.parametrize( "formula,cond_kwargs,expected_contrast", [ - ["~ condition", {"condition": "A"}, [0, 0]], + ["~ condition", {"condition": "A"}, [1, 0]], ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): From c7d32906a71560c2a8cf822482ee68c76e5a01de Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 11:54:27 +0200 Subject: [PATCH 17/36] Fix edgeR type hints --- src/multi_condition_comparisions/methods/_edger.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_edger.py b/src/multi_condition_comparisions/methods/_edger.py index dd328bb..897102c 100644 --- a/src/multi_condition_comparisions/methods/_edger.py +++ b/src/multi_condition_comparisions/methods/_edger.py @@ -1,3 +1,5 @@ +from collections.abc import Sequence + import numpy as np import pandas as pd from scanpy import logging @@ -81,7 +83,7 @@ def fit(self, **kwargs): # adata, design, mask, layer ro.globalenv["fit"] = fit self.fit = fit - def _test_single_contrast(self, contrast: list[str], **kwargs) -> pd.DataFrame: + def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataFrame: """ Conduct test for each contrast and return a data frame @@ -119,11 +121,7 @@ def _test_single_contrast(self, contrast: list[str], **kwargs) -> pd.DataFrame: ) from None # Convert vector to R, which drops a category like `self.design_matrix` to use the intercept for the left out. - contrast_vec = [0] * len(self.design.columns) - make_contrast_column_key = lambda ind: f"{contrast[0]}[T.{contrast[ind]}]" - for index in [1, 2]: - if make_contrast_column_key(index) in self.design.columns: - contrast_vec[self.design.columns.to_list().index(f"{contrast[0]}[T.{contrast[index]}]")] = 1 + contrast_vec = contrast contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast_vec)) ro.globalenv["contrast_vec"] = contrast_vec_r From 36ed50919d08a3b9b468238355a4383d9ca51d89 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 12:48:54 +0200 Subject: [PATCH 18/36] Fix pydeseq2 function signatures --- .../methods/_pydeseq2.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_pydeseq2.py b/src/multi_condition_comparisions/methods/_pydeseq2.py index f46d775..5cdc666 100644 --- a/src/multi_condition_comparisions/methods/_pydeseq2.py +++ b/src/multi_condition_comparisions/methods/_pydeseq2.py @@ -63,18 +63,27 @@ def fit(self, **kwargs) -> pd.DataFrame: dds.deseq2() self.dds = dds - def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame: + # TODO: PyDeseq2 doesn't support arbitrary designs and contrasts yet + # see https://github.com/owkin/PyDESeq2/issues/213 + # + # Therefore these functions are overridden in a way to make it work with PyDESeq2, + # ingoring the inconsistency of function signatures. Once arbitrary design + # matrices and contrasts are supported by PyDEseq2, we can fully support the + # Linear model interface. + def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd.DataFrame: # type: ignore """ Conduct a specific test and returns a Pandas DataFrame. + Note that PyDESeq2 doesn't support arbitrary contrasts yet. + Parameters ---------- - contrasts: - list of three strings of the form - ["variable", "tested level", "reference level"] - alpha: p value threshold used for controlling fdr with - independent hypothesis weighting - kwargs: extra arguments to pass to DeseqStats() + contrasts + list of three strings of the form `["variable", "tested level", "reference level"]`. + alpha + p value threshold used for controlling fdr with independent hypothesis weighting + kwargs + extra arguments to pass to DeseqStats() """ stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs) # Calling `.summary()` is required to fill the `results_df` data frame @@ -87,3 +96,11 @@ def _test_single_contrast(self, contrast: list[str], alpha=0.05, **kwargs) -> pd res_df.index.name = "variable" res_df = res_df.reset_index() return res_df + + def cond(self, **kwargs) -> ndarray: + raise NotImplementedError( + "PyDESeq2 currently doesn't support arbitratry contrasts, see https://github.com/owkin/PyDESeq2/issues/213" + ) + + def contrast(self, column: str, baseline: str, group_to_compare: str) -> tuple[str, str, str]: # type: ignore + return (column, group_to_compare, baseline) From 9d8add0da700ad800bac573a64a9c1648b11238a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 13:14:09 +0200 Subject: [PATCH 19/36] Fix edgeR tests --- src/multi_condition_comparisions/methods/_base.py | 2 +- src/multi_condition_comparisions/methods/_edger.py | 7 +++---- tests/test_edger.py | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index afc46b5..7413a99 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -360,7 +360,7 @@ def cond(self, **kwargs) -> np.ndarray: "Ambiguous variable! Building contrasts with model.cond only works " "when each variable occurs only once per formula" ) - term = term.pop() + term = next(iter(term)) term_metadata = self.factor_storage[term] if var in cond_dict: # In this case we keep the specified value in the dict, but we verify that it's a valid category diff --git a/src/multi_condition_comparisions/methods/_edger.py b/src/multi_condition_comparisions/methods/_edger.py index 897102c..a7691c5 100644 --- a/src/multi_condition_comparisions/methods/_edger.py +++ b/src/multi_condition_comparisions/methods/_edger.py @@ -50,11 +50,11 @@ def fit(self, **kwargs): # adata, design, mask, layer try: edger = importr("edgeR") - except ImportError: + except ImportError as e: raise ImportError( "edgeR requires a valid R installation with the following packages:\n" "edgeR, BiocParallel, RhpcBLASctl" - ) from None + ) from e # Convert dataframe with localconverter(ro.default_converter + numpy2ri.converter): @@ -121,8 +121,7 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> pd.DataF ) from None # Convert vector to R, which drops a category like `self.design_matrix` to use the intercept for the left out. - contrast_vec = contrast - contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast_vec)) + contrast_vec_r = ro.conversion.py2rpy(np.asarray(contrast)) ro.globalenv["contrast_vec"] = contrast_vec_r # Test contrast with R diff --git a/tests/test_edger.py b/tests/test_edger.py index 521403b..b55fb36 100644 --- a/tests/test_edger.py +++ b/tests/test_edger.py @@ -12,7 +12,7 @@ def test_edger_simple(test_adata): """ method = EdgeR(adata=test_adata, design="~condition") method.fit() - res_df = method.test_contrasts(["condition", "A", "B"]) + res_df = method.test_contrasts(method.contrast("condition", "A", "B")) assert len(res_df) == test_adata.n_vars @@ -24,7 +24,7 @@ def test_edger_complex(test_adata): test_adata.obs["condition1"] = test_adata.obs["condition"].copy() method = EdgeR(adata=test_adata, design="~condition1+group") method.fit() - res_df = method.test_contrasts(["condition1", "A", "B"]) + res_df = method.test_contrasts(method.contrast("condition1", "A", "B")) assert len(res_df) == test_adata.n_vars # Check that the index of the result matches the var_names of the AnnData object From 28faf64547e4c9566380d7d0912fb8bd10c2aec6 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 14:15:38 +0200 Subject: [PATCH 20/36] Fix model.cond term iteration --- src/multi_condition_comparisions/methods/_base.py | 6 ++++-- tests/test_base.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 7413a99..7ef0f8b 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -353,8 +353,10 @@ def cond(self, **kwargs) -> np.ndarray: # (the one that is usually dropped from the model for being redundant). # # `model_spec.variable_terms` is a mapping from variable to a set of terms. Unless a variable is used twice in the - # same formula (which for don't support for now), it contains exactly one element. - for var, term in self.design.model_spec.variable_terms.items(): + # same formula (which for don't support for now), it contains exactly one element. It also contains + # "non-data" variables such as `C` - therefore we use `self.variables` to loop through. + for var in self.variables: + term = self.design.model_spec.variable_terms[var] if len(term) != 1: raise RuntimeError( "Ambiguous variable! Building contrasts with model.cond only works " diff --git a/tests/test_base.py b/tests/test_base.py index 09fe93d..05a2438 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -25,6 +25,14 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram "formula,cond_kwargs,expected_contrast", [ ["~ condition", {"condition": "A"}, [1, 0]], + ["~ condition", {"condition": "B"}, [1, 1]], + ["~ 0 + condition", {"condition": "A"}, [1, 0]], + ["~ 0 + condition", {"condition": "B"}, [0, 1]], + ["~ donor", {"donor": "D0"}, [1, 0, 0, 0]], + ["~ C(donor)", {"donor": "D0"}, [1, 0, 0, 0]], + ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D2"}, [1, 0, 0, 0]], + ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D0"}, [1, 1, 0, 0]], + # TODO: also include tests for errors that should be caught by .cond ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): From 822c6232510c1eec204ed0461f453bf834e39d7a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 3 Apr 2024 14:24:59 +0200 Subject: [PATCH 21/36] Get rid of class variable stuff --- .../_util/formulaic.py | 36 +++++++++++++++---- .../methods/_base.py | 5 +-- tests/test_formulaic.py | 2 +- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index c10f56a..c1f7281 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -1,6 +1,6 @@ """Helpers to interact with Formulaic Formulas""" -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from dataclasses import dataclass from typing import Any @@ -92,8 +92,32 @@ class CustomPandasMaterializer(PandasMaterializer): REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") - factor_metadata_storage = factor_storage - recording_active = True + def __init__( + self, + data: Any, + context: Mapping[str, Any] | None = None, + record_factor_metadata: bool = False, + **params: Any, + ): + """ + Initialize the Materializer + + Parameters + ---------- + data + passed to PandasMaterializer + context + passed to PandasMaterializer + record_factor_metadata + Flag that tells whether this particular instance of the custom materializer class + is supposed to record factor metadata. Only the instance that is used for building the design + matrix should record the metadata. All other instances (e.g. used to generate contrast vectors) + should not record metadata to not overwrite the specifications from the design matrix. + **params + passed to PandasMaterializer + """ + self.factor_metadata_storage = factor_storage if record_factor_metadata else None + super().__init__(data, context, **params) def stop_recording(self): """ @@ -102,7 +126,7 @@ def stop_recording(self): Then the materializer can be reused for creating contrast vectors without modifying the stored values. """ # Use class variable here - CustomPandasMaterializer.recording_active = False + CustomPandasMaterializer.record_factor_metadata = False @override def _encode_evaled_factor( @@ -113,7 +137,7 @@ def _encode_evaled_factor( We can record some metadata, before we call the original function. """ - if self.recording_active: + if self.factor_metadata_storage is not None: if factor.expr in self.factor_metadata_storage and not ( factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache ): @@ -138,7 +162,7 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) Here we have access to additional metadata, such as `drop_field`. """ - if self.recording_active: + if self.factor_metadata_storage is not None: factor_metadata = self.factor_metadata_storage[name] factor_metadata.drop_field = values.__formulaic_metadata__.drop_field factor_metadata.column_names = values.__formulaic_metadata__.column_names diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 7ef0f8b..b55a7e7 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -145,16 +145,13 @@ def __init__( super().__init__(adata, mask=mask, layer=layer) self._check_counts() - self.materializer = None self.factor_storage = None """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix is passed directly, this remains None.""" if isinstance(design, str): self.factor_storage, materializer_class = get_factor_storage_and_materializer() - self.materializer = materializer_class(adata.obs) - self.design = self.materializer.get_model_matrix(design) - self.materializer.stop_recording() + self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design) else: self.design = design diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index e099199..92b261b 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -162,7 +162,7 @@ def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, e for col, order in reorder_categorical.items(): test_adata_minimal.obs[col] = pd.Categorical(test_adata_minimal.obs[col], categories=order) factor_storage, materializer = get_factor_storage_and_materializer() - materializer(test_adata_minimal.obs).get_model_matrix(formula) + materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula) for factor, expected_metadata in expected_factor_metadata.items(): actual_metadata = factor_storage[factor] for k in expected_metadata: From ea027a2f0d28f42f2a292d97e4fa887ebd49d331 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 9 Apr 2024 16:32:41 +0200 Subject: [PATCH 22/36] Account for multiple factors being generated --- .../_util/formulaic.py | 59 +++++++++---------- .../methods/_base.py | 18 ++++-- tests/conftest.py | 5 ++ tests/test_base.py | 32 +++++++++- tests/test_formulaic.py | 10 ++++ 5 files changed, 84 insertions(+), 40 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index c1f7281..b9f4a8a 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -1,5 +1,6 @@ """Helpers to interact with Formulaic Formulas""" +from collections import defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass from typing import Any @@ -70,7 +71,7 @@ def base(self) -> str | None: return self.drop_field -def get_factor_storage_and_materializer() -> tuple[dict[str, FactorMetadata], type]: +def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], type]: """ Keep track of categorical factors used in a model spec. @@ -83,7 +84,9 @@ def get_factor_storage_and_materializer() -> tuple[dict[str, FactorMetadata], ty CustomPandasMaterializer A materializer class that is tied to the particular instance of `factor_storage`. """ - factor_storage: dict[str, FactorMetadata] = {} + # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction + # term, it generates the factor with both full rank and reduced rank. + factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list) class CustomPandasMaterializer(PandasMaterializer): """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" @@ -117,17 +120,10 @@ def __init__( passed to PandasMaterializer """ self.factor_metadata_storage = factor_storage if record_factor_metadata else None + # temporary pointer to metadata of factor that is currently evaluated + self._current_factor: FactorMetadata = None super().__init__(data, context, **params) - def stop_recording(self): - """ - Call this function after the model matrix has been completely initalized. - - Then the materializer can be reused for creating contrast vectors without modifying the stored values. - """ - # Use class variable here - CustomPandasMaterializer.record_factor_metadata = False - @override def _encode_evaled_factor( self, factor: EvaluatedFactor, spec: ModelSpec, drop_rows: Sequence[int], reduced_rank: bool = False @@ -137,22 +133,21 @@ def _encode_evaled_factor( We can record some metadata, before we call the original function. """ + assert ( + self._current_factor is None + ), "_current_factor should always be None when we start recording metadata" if self.factor_metadata_storage is not None: - if factor.expr in self.factor_metadata_storage and not ( - factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache - ): - # the same factor might be referred to multiple times in the same formula -- for instance, when using - # an interaction term such as group*condition. In that case formulaic is reuding a cached encoding. - # However, if it's not just reusing an existing encoding, something unexpected is happening that - # we haven't accounted for yet. - raise AssertionError("Factor already present in metadata storage and not reusing cached encoding") - self.factor_metadata_storage[factor.expr] = FactorMetadata( - name=factor.expr, - reduced_rank=reduced_rank, - categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), - custom_encoder=factor.metadata.encoder is not None, - kind=factor.metadata.kind, - ) + # Don't store if the factor is cached (then we should already have recorded it) + if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache: + assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached" + else: + self._current_factor = FactorMetadata( + name=factor.expr, + reduced_rank=reduced_rank, + categories=tuple(sorted(factor.values.drop(index=factor.values.index[drop_rows]).unique())), + custom_encoder=factor.metadata.encoder is not None, + kind=factor.metadata.kind, + ) return super()._encode_evaled_factor(factor, spec, drop_rows, reduced_rank) @override @@ -162,11 +157,13 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) Here we have access to additional metadata, such as `drop_field`. """ - if self.factor_metadata_storage is not None: - factor_metadata = self.factor_metadata_storage[name] - factor_metadata.drop_field = values.__formulaic_metadata__.drop_field - factor_metadata.column_names = values.__formulaic_metadata__.column_names - factor_metadata.colname_format = values.__formulaic_metadata__.format + if self._current_factor is not None: + assert self._current_factor.name == name + self._current_factor.drop_field = values.__formulaic_metadata__.drop_field + self._current_factor.column_names = values.__formulaic_metadata__.column_names + self._current_factor.colname_format = values.__formulaic_metadata__.format + self.factor_metadata_storage[name].append(self._current_factor) + self._current_factor = None return super()._flatten_encoded_evaled_factor(name, values) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index b55a7e7..f640cc1 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -353,14 +353,18 @@ def cond(self, **kwargs) -> np.ndarray: # same formula (which for don't support for now), it contains exactly one element. It also contains # "non-data" variables such as `C` - therefore we use `self.variables` to loop through. for var in self.variables: - term = self.design.model_spec.variable_terms[var] - if len(term) != 1: - raise RuntimeError( + # A variable can refer to one or multiple terms. Either if a variable is specified + # multiple times in the model (e.g. ~ var + C(var); ~ continuous + np.log(continuous)) + # or when there's an interaction term (e.g. ~ A * B ==> terms `A`, `B`, `A:B`) + terms = self.design.model_spec.variable_terms[var] + # Only terms tied to a single variable are in the factor storage. Interaction terms (A:B) are not. + terms_metadata = [self.factor_storage[term] for term in terms if term in self.factor_storage] + if len(terms_metadata) != 1: + raise ValueError( "Ambiguous variable! Building contrasts with model.cond only works " "when each variable occurs only once per formula" ) - term = next(iter(term)) - term_metadata = self.factor_storage[term] + term_metadata = terms_metadata[0] if var in cond_dict: # In this case we keep the specified value in the dict, but we verify that it's a valid category if term_metadata.kind == Factor.Kind.CATEGORICAL and cond_dict[var] not in term_metadata.categories: @@ -370,7 +374,9 @@ def cond(self, **kwargs) -> np.ndarray: else: # fill with default values if term_metadata.kind == Factor.Kind.CATEGORICAL: - cond_dict[var] = term_metadata.base + cond_dict[var] = ( + term_metadata.base if term_metadata.base is not None else "\0" + ) # can be any other string that is not a category, but not None else: cond_dict[var] = 0 diff --git a/tests/conftest.py b/tests/conftest.py index 936ccf3..5eb3cd0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,6 @@ +# TODO: temp workaround for Gregor +import os + import anndata as ad import numpy as np import pandas as pd @@ -5,6 +8,8 @@ import scipy.sparse as sp from pydeseq2.utils import load_example_data +os.environ["R_HOME"] = "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/R" + @pytest.fixture def test_counts(): diff --git a/tests/test_base.py b/tests/test_base.py index 05a2438..010585f 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -26,17 +26,43 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram [ ["~ condition", {"condition": "A"}, [1, 0]], ["~ condition", {"condition": "B"}, [1, 1]], + ["~ condition", {"condition": "42"}, ValueError], # non-existant category ["~ 0 + condition", {"condition": "A"}, [1, 0]], ["~ 0 + condition", {"condition": "B"}, [0, 1]], ["~ donor", {"donor": "D0"}, [1, 0, 0, 0]], ["~ C(donor)", {"donor": "D0"}, [1, 0, 0, 0]], ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D2"}, [1, 0, 0, 0]], ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D0"}, [1, 1, 0, 0]], + ["~ donor + continuous", {"donor": "D1"}, [1, 1, 0, 0, 0]], + ["~ donor + np.log1p(continuous)", {"donor": "D1"}, [1, 1, 0, 0, 0]], + [ + "~ donor + continuous + np.log(continuous)", + {"donor": "D0"}, + ValueError, + ], # current limitation: variables may only be used once per formula + [ + "~ donor + C(donor)", + {"donor": "D0"}, + ValueError, + ], # current limitation: variables may only be used once per formula + ["~ C(donor, contr.sum)", {"donor": "D0"}, [1, 1, 0, 0]], + ["~ C(donor, contr.sum)", {"donor": "D3"}, [1, -1, -1, -1]], + ["~ condition + donor", {"condition": "A"}, [1, 0, 0, 0, 0]], + ["~ 0 + condition + donor", {"donor": "D1"}, [0, 0, 1, 0, 0]], + ["~ condition + donor", {"donor": "D2"}, [1, 0, 0, 1, 0]], + ["~ condition + donor", {"condition": "B", "donor": "D2"}, [1, 1, 0, 1, 0]], + ["~ condition * donor", {"condition": "A"}, [1, 0, 0, 0, 0]], + ["~ condition + donor + condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0]], + ["~ condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0]], # TODO: also include tests for errors that should be caught by .cond ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): mod = MockLinearModel(test_adata_minimal, formula) - actual_contrast = mod.cond(**cond_kwargs) - assert actual_contrast.tolist() == expected_contrast - assert actual_contrast.index.tolist() == mod.design.columns.tolist() + if isinstance(expected_contrast, type): + with pytest.raises(expected_contrast): + mod.cond(**cond_kwargs) + else: + actual_contrast = mod.cond(**cond_kwargs) + assert actual_contrast.tolist() == expected_contrast + assert actual_contrast.index.tolist() == mod.design.columns.tolist() diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index 92b261b..33a3f19 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -142,6 +142,14 @@ }, }, ], + [ + "~ condition:donor", + None, + { + "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, + "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, + }, + ], ], ) def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, expected_factor_metadata): @@ -165,5 +173,7 @@ def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, e materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula) for factor, expected_metadata in expected_factor_metadata.items(): actual_metadata = factor_storage[factor] + assert len(actual_metadata) == 1 + actual_metadata = actual_metadata[0] for k in expected_metadata: assert getattr(actual_metadata, k) == expected_metadata[k] From cab97501eb4137ba2da40467f64998659af4afef Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 9 Apr 2024 17:03:13 +0200 Subject: [PATCH 23/36] only fail cond if variable is ambiguous --- .../_util/formulaic.py | 16 ++++++++++ .../methods/_base.py | 32 +++++++++++-------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index b9f4a8a..54e8ab3 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -168,3 +168,19 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) return super()._flatten_encoded_evaled_factor(name, values) return factor_storage, CustomPandasMaterializer + + +def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any: + """Given a list of objects, return an attribute if it is the same between all object. Otherwise raise an error.""" + if not objs: + raise ValueError("Collection is empty") + + first_obj_attr = getattr(objs[0], attr) + + # Check if the attribute is the same for all objects + for obj in objs[1:]: + if getattr(obj, attr) != first_obj_attr: + raise ValueError(f"Ambiguous attribute '{attr}': values differ between objects") + + # If attribute is the same for all objects, return it + return first_obj_attr diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index f640cc1..a00c2e9 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections.abc import Mapping, Sequence from dataclasses import dataclass +from itertools import chain from types import MappingProxyType import numpy as np @@ -8,7 +9,7 @@ from anndata import AnnData from multi_condition_comparisions._util import check_is_numeric_matrix -from multi_condition_comparisions._util.formulaic import Factor, get_factor_storage_and_materializer +from multi_condition_comparisions._util.formulaic import Factor, get_factor_storage_and_materializer, resolve_ambiguous @dataclass @@ -350,32 +351,35 @@ def cond(self, **kwargs) -> np.ndarray: # (the one that is usually dropped from the model for being redundant). # # `model_spec.variable_terms` is a mapping from variable to a set of terms. Unless a variable is used twice in the - # same formula (which for don't support for now), it contains exactly one element. It also contains + # same formula, it contains exactly one element. It also contains # "non-data" variables such as `C` - therefore we use `self.variables` to loop through. for var in self.variables: # A variable can refer to one or multiple terms. Either if a variable is specified # multiple times in the model (e.g. ~ var + C(var); ~ continuous + np.log(continuous)) # or when there's an interaction term (e.g. ~ A * B ==> terms `A`, `B`, `A:B`) terms = self.design.model_spec.variable_terms[var] - # Only terms tied to a single variable are in the factor storage. Interaction terms (A:B) are not. - terms_metadata = [self.factor_storage[term] for term in terms if term in self.factor_storage] - if len(terms_metadata) != 1: - raise ValueError( - "Ambiguous variable! Building contrasts with model.cond only works " - "when each variable occurs only once per formula" - ) - term_metadata = terms_metadata[0] + + # Get list of all Metadata objects for the associated terms. + # Some terms don't have metadata (e.g. instead of an interaction term `A:B`, metadata exists only for the + # individual variablesl `A` and `B`). Some terms have multiple metadata objects, because they are specified + # multiple times in the formula, or forumlaic resolves them multiple times internally. + terms_metadata = list(chain.from_iterable(self.factor_storage[term] for term in terms)) if var in cond_dict: # In this case we keep the specified value in the dict, but we verify that it's a valid category - if term_metadata.kind == Factor.Kind.CATEGORICAL and cond_dict[var] not in term_metadata.categories: + tmp_categories = resolve_ambiguous(terms_metadata, "categories") + if ( + resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL + and cond_dict[var] not in tmp_categories + ): raise ValueError( - f"You specified a non-existant category for {var}. Possible categories: {', '.join(term_metadata.categories)}" + f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}" ) else: # fill with default values - if term_metadata.kind == Factor.Kind.CATEGORICAL: + if resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL: + tmp_base = resolve_ambiguous(terms_metadata, "base") cond_dict[var] = ( - term_metadata.base if term_metadata.base is not None else "\0" + tmp_base if tmp_base is not None else "\0" ) # can be any other string that is not a category, but not None else: cond_dict[var] = 0 From 4039e2387b522be85d390814f056b21e391d70b8 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 9 Apr 2024 17:17:36 +0200 Subject: [PATCH 24/36] Update comments --- .../_util/formulaic.py | 6 +++- .../methods/_base.py | 33 +++++++++++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 54e8ab3..96eca7e 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -170,6 +170,10 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) return factor_storage, CustomPandasMaterializer +class AmbiguousAttributeError(ValueError): + pass + + def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any: """Given a list of objects, return an attribute if it is the same between all object. Otherwise raise an error.""" if not objs: @@ -180,7 +184,7 @@ def resolve_ambiguous(objs: Sequence[Any], attr: str) -> Any: # Check if the attribute is the same for all objects for obj in objs[1:]: if getattr(obj, attr) != first_obj_attr: - raise ValueError(f"Ambiguous attribute '{attr}': values differ between objects") + raise AmbiguousAttributeError(f"Ambiguous attribute '{attr}': values differ between objects") # If attribute is the same for all objects, return it return first_obj_attr diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index a00c2e9..1379d44 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -9,7 +9,12 @@ from anndata import AnnData from multi_condition_comparisions._util import check_is_numeric_matrix -from multi_condition_comparisions._util.formulaic import Factor, get_factor_storage_and_materializer, resolve_ambiguous +from multi_condition_comparisions._util.formulaic import ( + AmbiguousAttributeError, + Factor, + get_factor_storage_and_materializer, + resolve_ambiguous, +) @dataclass @@ -364,8 +369,12 @@ def cond(self, **kwargs) -> np.ndarray: # individual variablesl `A` and `B`). Some terms have multiple metadata objects, because they are specified # multiple times in the formula, or forumlaic resolves them multiple times internally. terms_metadata = list(chain.from_iterable(self.factor_storage[term] for term in terms)) + + # If the variable is specified in the cond_dict explicitly, we just keep it as is. + # We still verify that it's a valid category, otherwise simple typos are not caught and lead to + # zeros in the design matrix. if var in cond_dict: - # In this case we keep the specified value in the dict, but we verify that it's a valid category + # it should never be non-ambiguous. If it happens, it's an edge case we don't know about (-> let it fail) tmp_categories = resolve_ambiguous(terms_metadata, "categories") if ( resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL @@ -374,14 +383,24 @@ def cond(self, **kwargs) -> np.ndarray: raise ValueError( f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}" ) + + # If the variable is not specified, we want to fill it with its default value (i.e. the base category) else: - # fill with default values if resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL: - tmp_base = resolve_ambiguous(terms_metadata, "base") - cond_dict[var] = ( - tmp_base if tmp_base is not None else "\0" - ) # can be any other string that is not a category, but not None + # In this case it can be ambiguous for some formulas -> Tell the user to specify the variable explicitly + try: + tmp_base = resolve_ambiguous(terms_metadata, "base") + except AmbiguousAttributeError as e: + raise ValueError( + f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`." + ) from e + + # if tmp_base is None (no-intercept model), set it to the NUL string \0. + # In principle, it could be any string that is not a valid category, but it cannot be None + # because this leads to an error in `get_model_matrix` + cond_dict[var] = tmp_base if tmp_base is not None else "\0" else: + # Set to zero for continuous variables cond_dict[var] = 0 df = pd.DataFrame([kwargs]) From ef99eb4cc3437380b0f1cf060d345885ef19eaef Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 9 Apr 2024 17:30:51 +0200 Subject: [PATCH 25/36] Add test for resolve ambiguous --- tests/test_formulaic.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index 33a3f19..a6d8ae2 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -2,7 +2,12 @@ import pytest from formulaic.parser.types import Factor -from multi_condition_comparisions._util.formulaic import get_factor_storage_and_materializer +from multi_condition_comparisions._util.formulaic import ( + AmbiguousAttributeError, + FactorMetadata, + get_factor_storage_and_materializer, + resolve_ambiguous, +) @pytest.mark.parametrize( @@ -147,7 +152,10 @@ None, { "condition": {"reduced_rank": True, "custom_encoder": False, "base": "A"}, - "donor": {"reduced_rank": True, "custom_encoder": False, "base": "D0"}, + "donor": { + "custom_encoder": False, + "drop_field": "D0", + }, # `reduced_rank` and `base` will be ambigous here because Formulaic generates both version of the factor internally }, ], ], @@ -173,7 +181,29 @@ def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, e materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula) for factor, expected_metadata in expected_factor_metadata.items(): actual_metadata = factor_storage[factor] - assert len(actual_metadata) == 1 - actual_metadata = actual_metadata[0] for k in expected_metadata: - assert getattr(actual_metadata, k) == expected_metadata[k] + assert resolve_ambiguous(actual_metadata, k) == expected_metadata[k] + + +def test_resolve_ambiguous(): + obj1 = FactorMetadata("F1", True, True, ["A", "B"], Factor.Kind.CATEGORICAL) + obj2 = FactorMetadata("F2", True, False, ["A", "B"], Factor.Kind.CATEGORICAL) + obj3 = FactorMetadata("F3", True, False, None, Factor.Kind.NUMERICAL) + + with pytest.raises(ValueError): + resolve_ambiguous([], "foo") + + with pytest.raises(AttributeError): + resolve_ambiguous([obj1, obj2], "doesntexist") + + with pytest.raises(AmbiguousAttributeError): + assert resolve_ambiguous([obj1, obj2], "name") + + assert resolve_ambiguous([obj1, obj2, obj3], "reduced_rank") is True + assert resolve_ambiguous([obj1, obj2], "categories") == ["A", "B"] + + with pytest.raises(AmbiguousAttributeError): + assert resolve_ambiguous([obj1, obj2, obj3], "categories") + + with pytest.raises(AmbiguousAttributeError): + assert resolve_ambiguous([obj1, obj3], "kind") From e59e760a2e20b80bcc73f56894e3c02d5b045e15 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 12:19:37 +0200 Subject: [PATCH 26/36] Add more test cases and fix others --- .../methods/_base.py | 2 +- tests/test_base.py | 37 +++++++++++++------ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 1379d44..2cd81db 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -374,7 +374,7 @@ def cond(self, **kwargs) -> np.ndarray: # We still verify that it's a valid category, otherwise simple typos are not caught and lead to # zeros in the design matrix. if var in cond_dict: - # it should never be non-ambiguous. If it happens, it's an edge case we don't know about (-> let it fail) + # Getting the categories should never be non-ambiguous. If it happens, it's an edge case we don't know about (-> let it fail) tmp_categories = resolve_ambiguous(terms_metadata, "categories") if ( resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL diff --git a/tests/test_base.py b/tests/test_base.py index 010585f..7a06ad1 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -24,37 +24,50 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram @pytest.mark.parametrize( "formula,cond_kwargs,expected_contrast", [ + # single variable + ["~ condition", {}, [1, 0]], ["~ condition", {"condition": "A"}, [1, 0]], ["~ condition", {"condition": "B"}, [1, 1]], ["~ condition", {"condition": "42"}, ValueError], # non-existant category + # no-intercept models ["~ 0 + condition", {"condition": "A"}, [1, 0]], ["~ 0 + condition", {"condition": "B"}, [0, 1]], + # Different way of specifying dummy coding ["~ donor", {"donor": "D0"}, [1, 0, 0, 0]], ["~ C(donor)", {"donor": "D0"}, [1, 0, 0, 0]], ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D2"}, [1, 0, 0, 0]], ["~ C(donor, contr.treatment(base='D2'))", {"donor": "D0"}, [1, 1, 0, 0]], + # Handle continuous covariates ["~ donor + continuous", {"donor": "D1"}, [1, 1, 0, 0, 0]], ["~ donor + np.log1p(continuous)", {"donor": "D1"}, [1, 1, 0, 0, 0]], + ["~ donor + continuous + np.log1p(continuous)", {"donor": "D0"}, [1, 0, 0, 0, 0, 0]], + # Nonsense models repeating the same variable, which are nonetheless allowed by formulaic + ["~ donor + C(donor)", {"donor": "D1"}, [1, 1, 0, 0, 1, 0, 0]], + ["~ donor + C(donor, contr.treatment(base='D2'))", {"donor": "D0"}, [1, 0, 0, 0, 1, 0, 0]], [ - "~ donor + continuous + np.log(continuous)", - {"donor": "D0"}, + "~ condition + donor + C(donor, contr.treatment(base='D2'))", + {"condition": "A"}, ValueError, - ], # current limitation: variables may only be used once per formula - [ - "~ donor + C(donor)", - {"donor": "D0"}, - ValueError, - ], # current limitation: variables may only be used once per formula + ], # donor base category can't be resolved because it's ambiguous -> ValueError + # Sum2zero coding ["~ C(donor, contr.sum)", {"donor": "D0"}, [1, 1, 0, 0]], ["~ C(donor, contr.sum)", {"donor": "D3"}, [1, -1, -1, -1]], + # Multiple categorical variables ["~ condition + donor", {"condition": "A"}, [1, 0, 0, 0, 0]], - ["~ 0 + condition + donor", {"donor": "D1"}, [0, 0, 1, 0, 0]], ["~ condition + donor", {"donor": "D2"}, [1, 0, 0, 1, 0]], ["~ condition + donor", {"condition": "B", "donor": "D2"}, [1, 1, 0, 1, 0]], - ["~ condition * donor", {"condition": "A"}, [1, 0, 0, 0, 0]], - ["~ condition + donor + condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0]], + ["~ 0 + condition + donor", {"donor": "D1"}, [0, 0, 1, 0, 0]], + # Interaction terms + ["~ condition * donor", {"condition": "A"}, [1, 0, 0, 0, 0, 0, 0, 0]], + ["~ condition + donor + condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0, 0, 0, 0]], + ["~ condition * donor", {"condition": "B", "donor": "D2"}, [1, 1, 0, 1, 0, 0, 1, 0]], + ["~ condition * C(donor, contr.treatment(base='D2'))", {"condition": "A"}, [1, 0, 0, 0, 0, 0, 0, 0]], + [ + "~ condition * C(donor, contr.treatment(base='D2'))", + {"condition": "B", "donor": "D0"}, + [1, 1, 1, 0, 0, 1, 0, 0], + ], ["~ condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0]], - # TODO: also include tests for errors that should be caught by .cond ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): From 43f77e18649e04b4459a444db7720fe9a1ddbe24 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 12:51:00 +0200 Subject: [PATCH 27/36] Use mapping variable -> factor instead of variable -> term --- .../_util/formulaic.py | 18 ++++++++++++++++-- .../methods/_base.py | 12 +++++++----- tests/test_base.py | 4 +++- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 96eca7e..3125350 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -1,4 +1,11 @@ -"""Helpers to interact with Formulaic Formulas""" +"""Helpers to interact with Formulaic Formulas + +TODO Glossary: + * factor + * term + * variable + +""" from collections import defaultdict from collections.abc import Mapping, Sequence @@ -81,12 +88,16 @@ def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata ------- factor_storage A dictionary pointing to Metadata for each factor processed by the custom materializer + variable_to_factors + A dictionary mapping variables to factor names (similar to model_spec.variable_terms), except that it maps + to *factors* rather than *terms* CustomPandasMaterializer A materializer class that is tied to the particular instance of `factor_storage`. """ # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction # term, it generates the factor with both full rank and reduced rank. factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list) + variable_to_factors: dict[str, list[set]] = defaultdict(set) class CustomPandasMaterializer(PandasMaterializer): """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" @@ -120,6 +131,7 @@ def __init__( passed to PandasMaterializer """ self.factor_metadata_storage = factor_storage if record_factor_metadata else None + self.variable_to_factors = variable_to_factors if record_factor_metadata else None # temporary pointer to metadata of factor that is currently evaluated self._current_factor: FactorMetadata = None super().__init__(data, context, **params) @@ -141,6 +153,8 @@ def _encode_evaled_factor( if factor.expr in self.encoded_cache or (factor.expr, reduced_rank) in self.encoded_cache: assert factor.expr in self.factor_metadata_storage, "Factor should be there since it's cached" else: + for var in factor.variables: + self.variable_to_factors[var].add(factor.expr) self._current_factor = FactorMetadata( name=factor.expr, reduced_rank=reduced_rank, @@ -167,7 +181,7 @@ def _flatten_encoded_evaled_factor(self, name: str, values: FactorValues[dict]) return super()._flatten_encoded_evaled_factor(name, values) - return factor_storage, CustomPandasMaterializer + return factor_storage, variable_to_factors, CustomPandasMaterializer class AmbiguousAttributeError(ValueError): diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 2cd81db..666286f 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -152,11 +152,12 @@ def __init__( self._check_counts() self.factor_storage = None + self.variable_to_factors = None """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix is passed directly, this remains None.""" if isinstance(design, str): - self.factor_storage, materializer_class = get_factor_storage_and_materializer() + self.factor_storage, self.variable_to_factors, materializer_class = get_factor_storage_and_materializer() self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design) else: self.design = design @@ -362,13 +363,14 @@ def cond(self, **kwargs) -> np.ndarray: # A variable can refer to one or multiple terms. Either if a variable is specified # multiple times in the model (e.g. ~ var + C(var); ~ continuous + np.log(continuous)) # or when there's an interaction term (e.g. ~ A * B ==> terms `A`, `B`, `A:B`) - terms = self.design.model_spec.variable_terms[var] + factors = self.variable_to_factors[var] - # Get list of all Metadata objects for the associated terms. + # Get list of all Metadata objects for the associated Factors. # Some terms don't have metadata (e.g. instead of an interaction term `A:B`, metadata exists only for the - # individual variablesl `A` and `B`). Some terms have multiple metadata objects, because they are specified + # individual variables `A` and `B`). Some terms have multiple metadata objects, because they are specified # multiple times in the formula, or forumlaic resolves them multiple times internally. - terms_metadata = list(chain.from_iterable(self.factor_storage[term] for term in terms)) + # terms_metadata = list(chain.from_iterable(self.factor_storage[term] for term in terms)) + terms_metadata = list(chain.from_iterable(self.factor_storage[f] for f in factors)) # If the variable is specified in the cond_dict explicitly, we just keep it as is. # We still verify that it's a valid category, otherwise simple typos are not caught and lead to diff --git a/tests/test_base.py b/tests/test_base.py index 7a06ad1..ec8caf0 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -67,7 +67,9 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram {"condition": "B", "donor": "D0"}, [1, 1, 1, 0, 0, 1, 0, 0], ], - ["~ condition:donor", {"condition": "A"}, [1, 0, 0, 0, 0]], + ["~ condition:donor", {"condition": "A"}, [1, 0, 0, 0]], + ["~ condition:C(donor)", {"condition": "A", "donor": "D1"}, [1, 0, 0, 0]], + ["~ condition:donor", {"condition": "A", "donor": "D1"}, [1, 0, 0, 0]], ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): From 54a0d0b73365899c34699346b477830c23c1bcb6 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 16:09:42 +0200 Subject: [PATCH 28/36] Fix remaining model.cond testcases --- tests/test_base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index ec8caf0..cf825d5 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -67,9 +67,13 @@ def _test_single_contrast(self, contrast: Sequence[float], **kwargs) -> DataFram {"condition": "B", "donor": "D0"}, [1, 1, 1, 0, 0, 1, 0, 0], ], - ["~ condition:donor", {"condition": "A"}, [1, 0, 0, 0]], - ["~ condition:C(donor)", {"condition": "A", "donor": "D1"}, [1, 0, 0, 0]], - ["~ condition:donor", {"condition": "A", "donor": "D1"}, [1, 0, 0, 0]], + [ + "~ condition:donor", + {"condition": "A"}, + ValueError, + ], # Can't automatically resolve base category, because Formulaic builds a reduced-rank and full-rank factor internally + ["~ condition:donor", {"condition": "A", "donor": "D1"}, [1, 1, 0, 0, 0, 0, 0, 0]], + ["~ condition:C(donor)", {"condition": "A", "donor": "D1"}, [1, 1, 0, 0, 0, 0, 0, 0]], ], ) def test_model_cond(test_adata_minimal, MockLinearModel, formula, cond_kwargs, expected_contrast): From 86f114a2f025a4360bc4048df4c690b3f1bc4217 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 16:12:09 +0200 Subject: [PATCH 29/36] Fix formulaic testcase --- tests/test_formulaic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_formulaic.py b/tests/test_formulaic.py index a6d8ae2..da64be5 100644 --- a/tests/test_formulaic.py +++ b/tests/test_formulaic.py @@ -177,7 +177,7 @@ def test_custom_materializer(test_adata_minimal, formula, reorder_categorical, e if reorder_categorical is not None: for col, order in reorder_categorical.items(): test_adata_minimal.obs[col] = pd.Categorical(test_adata_minimal.obs[col], categories=order) - factor_storage, materializer = get_factor_storage_and_materializer() + factor_storage, _, materializer = get_factor_storage_and_materializer() materializer(test_adata_minimal.obs, record_factor_metadata=True).get_model_matrix(formula) for factor, expected_metadata in expected_factor_metadata.items(): actual_metadata = factor_storage[factor] From 2c8df267615b4815c523626ebc23477ae68d0f75 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 16:12:59 +0200 Subject: [PATCH 30/36] Reset example notebook --- docs/notebooks/example.ipynb | 132 +++-------------------------------- 1 file changed, 9 insertions(+), 123 deletions(-) diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb index d80220a..ce7dbad 100644 --- a/docs/notebooks/example.ipynb +++ b/docs/notebooks/example.ipynb @@ -9,146 +9,32 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import anndata as ad\n", - "import numpy as np\n", - "import pandas as pd\n", - "import pytest\n", - "from pydeseq2.utils import load_example_data\n", - "from formulaic import model_matrix\n", - "\n", - "\n", - "@pytest.fixture\n", - "def test_counts():\n", - " return load_example_data(\n", - " modality=\"raw_counts\",\n", - " dataset=\"synthetic\",\n", - " debug=False,\n", - " )\n", - "\n", - "\n", - "@pytest.fixture\n", - "def test_metadata():\n", - " return load_example_data(\n", - " modality=\"metadata\",\n", - " dataset=\"synthetic\",\n", - " debug=False,\n", - " )\n", - "\n", - "\n", - "@pytest.fixture\n", - "def test_adata(test_counts, test_metadata):\n", - " return ad.AnnData(X=test_counts, obs=test_metadata)\n", - "\n", - "\n", - "def test_adata_minimal():\n", - " matrix_format = np.array\n", - " n_obs = 80\n", - " n_donors = n_obs // 4\n", - " rng = np.random.default_rng(9) # make tests deterministic\n", - " obs = pd.DataFrame(\n", - " {\n", - " \"condition\": [\"A\", \"B\"] * (n_obs // 2),\n", - " \"donor\": sum(([f\"D{i}\"] * n_donors for i in range(n_obs // n_donors)), []),\n", - " \"other\": ([\"X\"] * (n_obs // 4)) + ([\"Y\"] * ((3 * n_obs) // 4)),\n", - " \"pairing\": sum(([str(i), str(i)] for i in range(n_obs // 2)), []),\n", - " \"continuous\": [rng.uniform(0, 1) * 4000 for _ in range(n_obs)],\n", - " },\n", - " )\n", - " var = pd.DataFrame(index=[\"gene1\", \"gene2\"])\n", - " group1 = rng.negative_binomial(20, 0.1, n_obs // 2) # large mean\n", - " group2 = rng.negative_binomial(5, 0.5, n_obs // 2) # small mean\n", - "\n", - " condition_data = np.empty((n_obs,), dtype=group1.dtype)\n", - " condition_data[0::2] = group1\n", - " condition_data[1::2] = group2\n", - "\n", - " donor_data = np.empty((n_obs,), dtype=group1.dtype)\n", - " donor_data[0:n_donors] = group2[:n_donors]\n", - " donor_data[n_donors : (2 * n_donors)] = group1[n_donors:]\n", - "\n", - " donor_data[(2 * n_donors) : (3 * n_donors)] = group2[:n_donors]\n", - " donor_data[(3 * n_donors) :] = group1[n_donors:]\n", - "\n", - " X = matrix_format(np.vstack([condition_data, donor_data]).T)\n", - " return ad.AnnData(X=X, obs=obs, var=var)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/python3.10/site-packages/anndata/_core/anndata.py:183: ImplicitModificationWarning: Transforming to str index.\n", - " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" + "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ - "adata = test_adata_minimal()" + "%load_ext autoreload\n", + "%autoreload 2\n", + "from multi_condition_comparisions.tl.de import StatsmodelsDE\n", + "import scanpy as sc" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "design = model_matrix(\"~ condition + C(condition) + donor + continuous\", adata.obs)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{1: set(),\n", - " condition: {'condition'},\n", - " C(condition): {'C', 'condition'},\n", - " donor: {'donor'},\n", - " continuous: {'continuous'}}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.model_spec.term_variables" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'condition': {C(condition), condition},\n", - " 'C': {C(condition)},\n", - " 'donor': {donor},\n", - " 'continuous': {continuous}}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "design.model_spec.variable_terms" + "adata = sc.read_h5ad(\"./airway.h5ad\")" ] }, { From 99992c1565d6ab8d84ecbf6586247dbc27f327b5 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 16:13:37 +0200 Subject: [PATCH 31/36] Restet conftest --- tests/conftest.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5eb3cd0..936ccf3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,3 @@ -# TODO: temp workaround for Gregor -import os - import anndata as ad import numpy as np import pandas as pd @@ -8,8 +5,6 @@ import scipy.sparse as sp from pydeseq2.utils import load_example_data -os.environ["R_HOME"] = "/home/sturm/apps/micromamba/envs/multi-condition-comparisons/lib/R" - @pytest.fixture def test_counts(): From 6104af082e68177656e64d704351d042d9439a78 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 11 Apr 2024 16:52:31 +0200 Subject: [PATCH 32/36] Refactor --- .../_util/formulaic.py | 4 +- .../methods/_base.py | 134 ++++++++++++------ 2 files changed, 94 insertions(+), 44 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 3125350..4a61a44 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -78,7 +78,7 @@ def base(self) -> str | None: return self.drop_field -def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], type]: +def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata]], dict[str, set[str]], type]: """ Keep track of categorical factors used in a model spec. @@ -97,7 +97,7 @@ def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata # There can be multiple FactorMetadata entries per sample, for instance when formulaic generates an interaction # term, it generates the factor with both full rank and reduced rank. factor_storage: dict[str, list[FactorMetadata]] = defaultdict(list) - variable_to_factors: dict[str, list[set]] = defaultdict(set) + variable_to_factors: dict[str, set[str]] = defaultdict(set) class CustomPandasMaterializer(PandasMaterializer): """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 666286f..1e27d1a 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from itertools import chain from types import MappingProxyType +from typing import Any import numpy as np import pandas as pd @@ -12,6 +13,7 @@ from multi_condition_comparisions._util.formulaic import ( AmbiguousAttributeError, Factor, + FactorMetadata, get_factor_storage_and_materializer, resolve_ambiguous, ) @@ -151,12 +153,15 @@ def __init__( super().__init__(adata, mask=mask, layer=layer) self._check_counts() - self.factor_storage = None - self.variable_to_factors = None + self.factor_storage: dict[str, list[FactorMetadata]] = None """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix is passed directly, this remains None.""" + self.variable_to_factors: dict[str, set[str]] = None + """Stores mapping from variables to formulaic factors""" + if isinstance(design, str): + # Use custom formulaic materializer that will record factor metadata while creating the model matrix self.factor_storage, self.variable_to_factors, materializer_class = get_factor_storage_and_materializer() self.design = materializer_class(adata.obs, record_factor_metadata=True).get_model_matrix(design) else: @@ -356,58 +361,103 @@ def cond(self, **kwargs) -> np.ndarray: # To do so, we keep the values that were already specified, and fill all other values with the default category # (the one that is usually dropped from the model for being redundant). # - # `model_spec.variable_terms` is a mapping from variable to a set of terms. Unless a variable is used twice in the - # same formula, it contains exactly one element. It also contains - # "non-data" variables such as `C` - therefore we use `self.variables` to loop through. + # self.variables only contains "data-variables", but no "factor variables", such as `C` for var in self.variables: - # A variable can refer to one or multiple terms. Either if a variable is specified - # multiple times in the model (e.g. ~ var + C(var); ~ continuous + np.log(continuous)) - # or when there's an interaction term (e.g. ~ A * B ==> terms `A`, `B`, `A:B`) - factors = self.variable_to_factors[var] - - # Get list of all Metadata objects for the associated Factors. - # Some terms don't have metadata (e.g. instead of an interaction term `A:B`, metadata exists only for the - # individual variables `A` and `B`). Some terms have multiple metadata objects, because they are specified - # multiple times in the formula, or forumlaic resolves them multiple times internally. - # terms_metadata = list(chain.from_iterable(self.factor_storage[term] for term in terms)) - terms_metadata = list(chain.from_iterable(self.factor_storage[f] for f in factors)) - # If the variable is specified in the cond_dict explicitly, we just keep it as is. # We still verify that it's a valid category, otherwise simple typos are not caught and lead to # zeros in the design matrix. if var in cond_dict: - # Getting the categories should never be non-ambiguous. If it happens, it's an edge case we don't know about (-> let it fail) - tmp_categories = resolve_ambiguous(terms_metadata, "categories") - if ( - resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL - and cond_dict[var] not in tmp_categories - ): - raise ValueError( - f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}" - ) + self._check_category(var, cond_dict[var]) # If the variable is not specified, we want to fill it with its default value (i.e. the base category) else: - if resolve_ambiguous(terms_metadata, "kind") == Factor.Kind.CATEGORICAL: - # In this case it can be ambiguous for some formulas -> Tell the user to specify the variable explicitly - try: - tmp_base = resolve_ambiguous(terms_metadata, "base") - except AmbiguousAttributeError as e: - raise ValueError( - f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`." - ) from e - - # if tmp_base is None (no-intercept model), set it to the NUL string \0. - # In principle, it could be any string that is not a valid category, but it cannot be None - # because this leads to an error in `get_model_matrix` - cond_dict[var] = tmp_base if tmp_base is not None else "\0" - else: - # Set to zero for continuous variables - cond_dict[var] = 0 + cond_dict[var] = self._get_default_value(var) df = pd.DataFrame([kwargs]) return self.design.model_spec.get_model_matrix(df).iloc[0] + def _get_factor_metadata_for_variable(self, var) -> list[FactorMetadata]: + """ + Get the Metadata objects of all factors defined by a given variable. + + A variable can refer to one or multiple factors. Either if a variable is specified + multiple times in the model (e.g. ~ var + C(var); ~ continuous + np.log(continuous)) + or when there's an interaction term (e.g. ~ A * B ==> terms `A`, `B`, `A:B`). + + Some Factors can have multiple metadata objects, because they are specified + multiple times in the formula, or forumlaic resolves them multiple times internally. + + Even if there are multiple factors per variable, they could still contain the same metadata and we + can unambiguously retreive metadata for a given variable, e.g. using the `_util.formulaic.resolve_ambiguous` + function. + + Returns + ------- + a list of FactorMetadata objects + """ + factors = self.variable_to_factors[var] + + return list(chain.from_iterable(self.factor_storage[f] for f in factors)) + + def _get_default_value(self, var) -> Any: + r""" + Get the default value (base category) for variable `var`. + + Special cases: + * If the variable is continuous, return 0. + * For a variable without reduced rank (as it happens for the first variable in a no-intercept model `~0 + xxx`) + the function returns the null string "\0". This is to ensure that when passed to `formulaic.get_model_matrix`, + a row is returned that sets all columns of that variable to 0. It could be any string that is not + a valid category. It cannot be `None`, because this results in failure of `get_model_matrix`, therefore the + null string was chosen because of clear semantics and it being surely not a valid category. + + Raises + ------ + ValueError + If there is no way to unambiguously infer the base category from the formulaic formula. + + Returns + ------- + The default value of the given variable. + """ + factor_metadata = self._get_factor_metadata_for_variable(var) + if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL: + # In this case it can be ambiguous for some formulas -> Tell the user to specify the variable explicitly + try: + tmp_base = resolve_ambiguous(factor_metadata, "base") + except AmbiguousAttributeError as e: + raise ValueError( + f"Could not automatically resolve base category for variable {var}. Please specify it explicity in `model.cond`." + ) from e + + # if tmp_base is None (no-intercept model), set it to the NUL string \0. + # In principle, it could be any string that is not a valid category, but it cannot be None + # because this leads to an error in `get_model_matrix` + return tmp_base if tmp_base is not None else "\0" + else: + # Set to zero for continuous variables + return 0 + + def _check_category(self, var, value) -> None: + """ + Check if `value` is a valid category of the factor defined by `var`. + + If `var` is a continuous variable this passes silently. + + Raises + ------ + ValueError + If `value` is not a valid category. + """ + factor_metadata = self._get_factor_metadata_for_variable(var) + + # Getting the categories should never be non-ambiguous. If it happens, it's an edge case we don't know about (-> let it fail) + tmp_categories = resolve_ambiguous(factor_metadata, "categories") + if resolve_ambiguous(factor_metadata, "kind") == Factor.Kind.CATEGORICAL and value not in tmp_categories: + raise ValueError( + f"You specified a non-existant category for {var}. Possible categories: {', '.join(tmp_categories)}" + ) + def contrast(self, column: str, baseline: str, group_to_compare: str) -> pd.Series: """ Build a simple contrast for pairwise comparisons. From e180e7ae6e686361ae457a8219d2bd54e91306d4 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Apr 2024 12:26:59 +0200 Subject: [PATCH 33/36] Add formulaic glossary --- src/multi_condition_comparisions/_util/formulaic.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 4a61a44..86e3b3d 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -1,10 +1,9 @@ """Helpers to interact with Formulaic Formulas -TODO Glossary: - * factor - * term - * variable - +Some helpful definitions for working with formulaic formulas (e.g. `~ 0 + C(donor):treatment + np.log1p(continuous)`): + * A *term* refers to an expression in the formula, separated by `+`, e.g. `C(donor):treatment`, or `np.log1p(continuous)`. + * A *variable* refers to a column of the data frame passed to formulaic, e.g. `donor`. + * A *factor* is the specification of how a certain variable is represented in the design matrix, e.g. treatment coding with base level "A" and reduced rank. """ from collections import defaultdict From c2d76877d9065686de6714f2ae3141d383eee7c8 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Fri, 12 Apr 2024 16:06:15 +0200 Subject: [PATCH 34/36] Fix typo --- src/multi_condition_comparisions/methods/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 1e27d1a..28b46bf 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -356,7 +356,7 @@ def cond(self, **kwargs) -> np.ndarray: ) # We now fill `cond_dict` such that it is equivalent to a data row from `adata.obs`. - # This data can then be passed to the `get_model_matrix` of formulaic to retreive a correpsonding + # This data can then be passed to the `get_model_matrix` of formulaic to retrieve a corresponding # contrast vector. # To do so, we keep the values that were already specified, and fill all other values with the default category # (the one that is usually dropped from the model for being redundant). From 66f9919c5aa73e28ebcf097bcb3ee7d35beda567 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Fri, 12 Apr 2024 16:06:24 +0200 Subject: [PATCH 35/36] Fix typo --- src/multi_condition_comparisions/_util/formulaic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/multi_condition_comparisions/_util/formulaic.py b/src/multi_condition_comparisions/_util/formulaic.py index 86e3b3d..11a9d99 100644 --- a/src/multi_condition_comparisions/_util/formulaic.py +++ b/src/multi_condition_comparisions/_util/formulaic.py @@ -99,7 +99,7 @@ def get_factor_storage_and_materializer() -> tuple[dict[str, list[FactorMetadata variable_to_factors: dict[str, set[str]] = defaultdict(set) class CustomPandasMaterializer(PandasMaterializer): - """An extension of the PandasMaterializer that records all cateogrical variables and their (base) categories.""" + """An extension of the PandasMaterializer that records all categorical variables and their (base) categories.""" REGISTER_NAME = "custom_pandas" REGISTER_INPUTS = ("pandas.core.frame.DataFrame",) From 4cc220dc39d170e96a3bcd8fa2b86a1187aafe4b Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 18 Apr 2024 20:51:53 +0200 Subject: [PATCH 36/36] Update src/multi_condition_comparisions/methods/_base.py Co-authored-by: Ilan Gold --- src/multi_condition_comparisions/methods/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/multi_condition_comparisions/methods/_base.py b/src/multi_condition_comparisions/methods/_base.py index 28b46bf..a8b05ab 100644 --- a/src/multi_condition_comparisions/methods/_base.py +++ b/src/multi_condition_comparisions/methods/_base.py @@ -153,11 +153,11 @@ def __init__( super().__init__(adata, mask=mask, layer=layer) self._check_counts() - self.factor_storage: dict[str, list[FactorMetadata]] = None + self.factor_storage: dict[str, list[FactorMetadata]] | None = None """Object to store metadata of formulaic factors which is used for building contrasts later. If a design matrix is passed directly, this remains None.""" - self.variable_to_factors: dict[str, set[str]] = None + self.variable_to_factors: dict[str, set[str]] | None = None """Stores mapping from variables to formulaic factors""" if isinstance(design, str):