Merge pull request #155 from matthewwardrop/fix_poly
Improve null handling in `poly` and when generating sparse matrices
matthewwardrop authored Sep 25, 2023
2 parents 0f4ea0b + db58b80 commit 77cbba6
Showing 6 changed files with 331 additions and 32 deletions.
46 changes: 23 additions & 23 deletions formulaic/materializers/pandas.py
@@ -9,6 +9,7 @@
 import scipy.sparse as spsparse
 from interface_meta import override
 from formulaic.utils.cast import as_columns
+from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls
 
 from .base import FormulaMaterializer
 from .types import NAAction
@@ -39,24 +40,24 @@ def _check_for_nulls(
         if na_action is NAAction.IGNORE:
             return
 
-        if isinstance(
-            values, dict
-        ):  # pragma: no cover; no formulaic transforms return dictionaries any more
-            for key, vs in values.items():
-                self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows)
-
-        elif na_action is NAAction.RAISE:
-            if isinstance(values, pandas.Series) and values.isnull().values.any():
-                raise ValueError(f"`{name}` contains null values after evaluation.")
-
-        elif na_action is NAAction.DROP:
-            if isinstance(values, pandas.Series):
-                drop_rows.update(numpy.flatnonzero(values.isnull().values))
-
-        else:
-            raise ValueError(
-                f"Do not know how to interpret `na_action` = {repr(na_action)}."
-            )  # pragma: no cover; this is currently impossible to reach
+        try:
+            null_indices = find_nulls(values)
+
+            if na_action is NAAction.RAISE:
+                if null_indices:
+                    raise ValueError(f"`{name}` contains null values after evaluation.")
+
+            elif na_action is NAAction.DROP:
+                drop_rows.update(null_indices)
+
+            else:
+                raise ValueError(
+                    f"Do not know how to interpret `na_action` = {repr(na_action)}."
+                )  # pragma: no cover; this is currently impossible to reach
+        except ValueError as e:
+            raise ValueError(
+                f"Error encountered while checking for nulls in `{name}`: {e}"
+            ) from e

     @override
     def _encode_constant(
@@ -67,13 +68,10 @@ def _encode_constant(
         spec: ModelSpec,
         drop_rows: Sequence[int],
     ) -> Any:
+        nrows = self.nrows - len(drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(
-                numpy.array([value] * self.nrows).reshape(
-                    (self.nrows - len(drop_rows), 1)
-                )
-            )
-        series = value * numpy.ones(self.nrows - len(drop_rows))
+            return spsparse.csc_matrix(numpy.array([value] * nrows).reshape((nrows, 1)))
+        series = value * numpy.ones(nrows)
         return series

     @override
@@ -86,9 +84,11 @@ def _encode_numerical(
         drop_rows: Sequence[int],
     ) -> Any:
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(numpy.array(values).reshape((self.nrows, 1)))
+            return spsparse.csc_matrix(
+                numpy.array(values).reshape((values.shape[0], 1))
+            )
         return values

     @override
@@ -107,7 +107,7 @@ def _encode_categorical(
         from formulaic.transforms import encode_contrasts
 
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         return as_columns(
             encode_contrasts(
                 values,
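A note on the sparse branch above: once null rows have been dropped, `values` is shorter than the original frame, so the old reshape to `self.nrows` raised. A minimal illustration of the failure mode with plain numpy (not code from this commit):

import numpy

nrows = 5                         # self.nrows: rows in the original data
values = numpy.ones(nrows - 2)    # two rows already dropped as nulls

try:
    values.reshape((nrows, 1))    # old behaviour: reshape to the original row count
except ValueError as e:
    print(e)                      # cannot reshape array of size 3 into shape (5,1)

column = values.reshape((values.shape[0], 1))  # new behaviour: use the actual length
print(column.shape)               # (3, 1)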
19 changes: 14 additions & 5 deletions formulaic/transforms/poly.py
@@ -62,7 +62,7 @@ def poly(  # pylint: disable=dangerous-default-value  # always replaced by state
     regression.
 
     `nan` values in `x` will be ignored and propagated through to generated
-    polynomials.
+    polynomials in the corresponding row indices.
 
     The signature of this transform is intentionally chosen to be compatible
     with R.
@@ -71,7 +71,15 @@ def poly(  # pylint: disable=dangerous-default-value  # always replaced by state
     if raw:
         return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)
 
-    x = numpy.array(x)
+    x = numpy.array(x, dtype=numpy.float64)
+
+    # Prepare output matrix (needed because we resize x below if there are nulls)
+    out = numpy.empty((x.shape[0], degree))
+    out.fill(numpy.nan)
+
+    # Filter to non-null indices (we will invert this later)
+    nonnull_indices = numpy.flatnonzero(~numpy.isnan(x))
+    x = x[nonnull_indices]

     # Check if we already have generated the alpha and beta coefficients.
     # If not, we enter "training" mode.
@@ -114,7 +122,8 @@ def get_beta(k: int) -> float:
         _state["alpha"] = alpha
         _state["norms2"] = norms2
 
+    # Populate output matrix with evaluated polynomials
+    out[nonnull_indices, :] = P[:, 1:]
+
     # Return basis dropping the first (constant) column
-    return FactorValues(
-        P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1))
-    )
+    return FactorValues(out, column_names=tuple(str(i) for i in range(1, degree + 1)))
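The net effect on `poly` can be seen in a short sketch (assuming `poly` is importable from `formulaic.transforms`, as in the tests below; this is illustrative, not code from the commit): null rows now come back as nan rows in the basis instead of being mixed into the orthogonalization:

import numpy
from formulaic.transforms import poly

state = {}
x = numpy.array([numpy.nan, 0.0, 0.25, 0.5, 0.75, 1.0])
basis = numpy.asarray(poly(x, degree=2, _state=state))

print(basis.shape)                   # (6, 2): one column per degree
print(numpy.isnan(basis[0]).all())   # True: the null row propagates as nans
print(numpy.isnan(basis[1:]).any())  # False: other rows are orthogonalized as before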
145 changes: 145 additions & 0 deletions formulaic/utils/null_handling.py
@@ -0,0 +1,145 @@
from functools import singledispatch
from typing import Any, Sequence, Set, Union

import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.materializers.types import FactorValues


@singledispatch
def find_nulls(values: Any) -> Set[int]:
    """
    Find the indices of rows in `values` that have null/nan values.

    Args:
        values: The values in which to find nulls.
    """
    raise ValueError(
        f"No implementation of `find_nulls()` for type `{repr(type(values))}`."
    )


@find_nulls.register
def _(values: None) -> Set[int]:
    # Literal `None` values have special meaning and are checked elsewhere.
    return set()


@find_nulls.register
def _(values: str) -> Set[int]:
    return set()


@find_nulls.register
def _(values: int) -> Set[int]:
    return _drop_nulls_scalar(values)


@find_nulls.register
def _(values: float) -> Set[int]:
    return _drop_nulls_scalar(values)


def _drop_nulls_scalar(values: Union[int, float]) -> Set[int]:
    if isinstance(values, FactorValues):
        values = values.__wrapped__
    if numpy.isnan(values):
        raise ValueError("Constant value is null, invalidating all rows.")
    return set()


@find_nulls.register
def _(values: list) -> Set[int]:
    if isinstance(values, FactorValues):
        # Older versions of pandas (<1.2) cannot unpack this automatically.
        values = values.__wrapped__
    return find_nulls(pandas.Series(values))


@find_nulls.register
def _(values: dict) -> Set[int]:
    indices = set()
    for vs in values.values():
        indices.update(find_nulls(vs))
    return indices


@find_nulls.register
def _(values: pandas.Series) -> Set[int]:
    return set(numpy.flatnonzero(values.isnull().values))


@find_nulls.register
def _(values: numpy.ndarray) -> Set[int]:
    if len(values.shape) == 0:
        if numpy.isnan(values):
            raise ValueError("Constant value is null, invalidating all rows.")
        return set()

    if len(values.shape) == 1:
        return set(numpy.flatnonzero(numpy.isnan(values)))

    if len(values.shape) == 2:
        return set(numpy.flatnonzero(numpy.any(numpy.isnan(values), axis=1)))

    raise ValueError(
        "Cannot check for null indices for arrays of more than 2 dimensions."
    )


@find_nulls.register
def _(values: spsparse.spmatrix) -> Set[int]:
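    # Only explicitly stored entries of a sparse matrix can hold nan, and
    # `spsparse.find` returns exactly those entries as (rows, cols, data),
    # so scanning `data` for nans and mapping back to `rows` is sufficient.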
    rows, _, data = spsparse.find(values)
    null_data_indices = numpy.flatnonzero(numpy.isnan(data))
    return set(rows[null_data_indices])


@singledispatch
def drop_rows(values: Any, indices: Sequence[int]) -> Any:
    """
    Drop rows corresponding to the given indices in `values`.

    Args:
        values: The vector from which to drop rows with the given `indices`.
        indices: The indices of the rows to be dropped.
    """
    raise ValueError(
        f"No implementation of `drop_rows()` for values of type `{repr(type(values))}`."
    )


@drop_rows.register
def _(values: list, indices: Sequence[int]) -> list:
    return [value for i, value in enumerate(values) if i not in indices]


@drop_rows.register
def _(values: pandas.Series, indices: Sequence[int]) -> pandas.Series:
    return values.drop(index=values.index[indices])


@drop_rows.register
def _(values: numpy.ndarray, indices: Sequence[int]) -> numpy.ndarray:
    return numpy.delete(values, indices, axis=0)


@drop_rows.register
def _(values: spsparse.spmatrix, indices: Sequence[int]) -> spsparse.spmatrix:
    """
    Remove the rows denoted by ``indices`` from the sparse matrix ``values``.
    """
    was_csc = False
    if isinstance(values, spsparse.csc_matrix):
        was_csc = True
        values = values.tocsr()

    indices = list(indices)
    mask = numpy.ones(values.shape[0], dtype=bool)
    mask[indices] = False
    masked = values[mask]

    if was_csc:
        masked = masked.tocsc()

    return masked
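A hedged usage sketch (not part of the commit) of the two helpers across the container types registered above:

import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.utils.null_handling import drop_rows, find_nulls

series = pandas.Series([1.0, numpy.nan, 3.0])
assert find_nulls(series) == {1}                   # index of the null row
assert list(drop_rows(series, [1])) == [1.0, 3.0]  # null row removed

matrix = numpy.array([[1.0, numpy.nan], [2.0, 3.0]])
assert find_nulls(matrix) == {0}                   # any nan in a row flags the row

sparse = spsparse.csc_matrix(numpy.array([[numpy.nan], [2.0]]))
assert find_nulls(sparse) == {0}
assert drop_rows(sparse, [0]).shape == (1, 1)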
18 changes: 14 additions & 4 deletions tests/materializers/test_pandas.py
@@ -160,11 +160,21 @@ def test_get_model_matrix_invalid_output(self, materializer):
         )
 
     @pytest.mark.parametrize("formula,tests", PANDAS_TESTS.items())
-    def test_na_handling(self, data_with_nulls, formula, tests):
-        mm = PandasMaterializer(data_with_nulls).get_model_matrix(formula)
-        assert isinstance(mm, pandas.DataFrame)
+    @pytest.mark.parametrize("output", ["pandas", "numpy", "sparse"])
+    def test_na_handling(self, data_with_nulls, formula, tests, output):
+        mm = PandasMaterializer(data_with_nulls).get_model_matrix(
+            formula, output=output
+        )
+        if output == "pandas":
+            assert isinstance(mm, pandas.DataFrame)
+            assert list(mm.columns) == tests[2]
+        elif output == "numpy":
+            assert isinstance(mm, numpy.ndarray)
+
+        else:
+            assert isinstance(mm, spsparse.csc_matrix)
         assert mm.shape == (tests[3], len(tests[2]))
-        assert list(mm.columns) == tests[2]
+        assert list(mm.model_spec.column_names) == tests[2]

if formula == "A:B":
return
Expand Down
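End to end, null rows are now dropped consistently for every output type. A hedged sketch via the top-level API (the `model_matrix` entry point and its pass-through of `output` are assumed here, not shown in this diff):

import pandas
from formulaic import model_matrix

df = pandas.DataFrame({"A": [1.0, None, 3.0]})
mm = model_matrix("A", df, output="sparse")  # na_action defaults to dropping nulls
print(mm.shape)                              # (2, 2): Intercept + A, null row dropped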
57 changes: 57 additions & 0 deletions tests/transforms/test_poly.py
@@ -153,3 +153,60 @@ def test_raw(self, data):
         assert numpy.allclose(
             poly(data, 3, raw=True), numpy.array([data, data**2, data**3]).T
         )
+
+    def test_nulls(self, data):
+        state = {}
+        data = numpy.concatenate([[numpy.nan, numpy.nan, numpy.nan], data])
+        data[:3] = numpy.nan
+        V = poly(data, degree=3, _state=state)
+
+        assert V.shape[1] == 3
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > poly(data, 3)
+        r_reference = numpy.array(
+            [
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [-3.603750e-01, 0.42285541, -4.332979e-01],
+                [-3.243375e-01, 0.29599879, -1.733191e-01],
+                [-2.883000e-01, 0.18249549, 1.824412e-02],
+                [-2.522625e-01, 0.08234553, 1.489937e-01],
+                [-2.162250e-01, -0.00445111, 2.265312e-01],
+                [-1.801875e-01, -0.07789442, 2.584584e-01],
+                [-1.441500e-01, -0.13798440, 2.523770e-01],
+                [-1.081125e-01, -0.18472105, 2.158888e-01],
+                [-7.207500e-02, -0.21810437, 1.565954e-01],
+                [-3.603750e-02, -0.23813436, 8.209854e-02],
+                [-3.000725e-17, -0.24481103, -4.395626e-17],
+                [3.603750e-02, -0.23813436, -8.209854e-02],
+                [7.207500e-02, -0.21810437, -1.565954e-01],
+                [1.081125e-01, -0.18472105, -2.158888e-01],
+                [1.441500e-01, -0.13798440, -2.523770e-01],
+                [1.801875e-01, -0.07789442, -2.584584e-01],
+                [2.162250e-01, -0.00445111, -2.265312e-01],
+                [2.522625e-01, 0.08234553, -1.489937e-01],
+                [2.883000e-01, 0.18249549, -1.824412e-02],
+                [3.243375e-01, 0.29599879, 1.733191e-01],
+                [3.603750e-01, 0.42285541, 4.332979e-01],
+            ]
+        )
+
+        assert numpy.allclose(
+            V,
+            r_reference,
+            equal_nan=True,
+        )
+
+        assert state["alpha"] == pytest.approx({0: 0.5, 1: 0.5, 2: 0.5})
+        assert state["norms2"] == pytest.approx(
+            {0: 21.0, 1: 1.925, 2: 0.14020416666666669, 3: 0.009734175}
+        )
+
+        assert numpy.allclose(
+            poly(data, degree=3, _state=state),
+            r_reference,
+            equal_nan=True,
+        )