Merge pull request #155 from matthewwardrop/fix_poly
Improve null handling in `poly` and when generating sparse matrices
matthewwardrop authored Sep 25, 2023
2 parents 0f4ea0b + db58b80 commit 77cbba6
Showing 6 changed files with 331 additions and 32 deletions.
46 changes: 23 additions & 23 deletions formulaic/materializers/pandas.py
@@ -9,6 +9,7 @@
 import scipy.sparse as spsparse
 from interface_meta import override
 from formulaic.utils.cast import as_columns
+from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls
 
 from .base import FormulaMaterializer
 from .types import NAAction
@@ -39,24 +40,24 @@ def _check_for_nulls(
         if na_action is NAAction.IGNORE:
             return
 
-        if isinstance(
-            values, dict
-        ):  # pragma: no cover; no formulaic transforms return dictionaries any more
-            for key, vs in values.items():
-                self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows)
-
-        elif na_action is NAAction.RAISE:
-            if isinstance(values, pandas.Series) and values.isnull().values.any():
-                raise ValueError(f"`{name}` contains null values after evaluation.")
-
-        elif na_action is NAAction.DROP:
-            if isinstance(values, pandas.Series):
-                drop_rows.update(numpy.flatnonzero(values.isnull().values))
-
-        else:
-            raise ValueError(
-                f"Do not know how to interpret `na_action` = {repr(na_action)}."
-            )  # pragma: no cover; this is currently impossible to reach
+        try:
+            null_indices = find_nulls(values)
+
+            if na_action is NAAction.RAISE:
+                if null_indices:
+                    raise ValueError(f"`{name}` contains null values after evaluation.")
+
+            elif na_action is NAAction.DROP:
+                drop_rows.update(null_indices)
+
+            else:
+                raise ValueError(
+                    f"Do not know how to interpret `na_action` = {repr(na_action)}."
+                )  # pragma: no cover; this is currently impossible to reach
+        except ValueError as e:
+            raise ValueError(
+                f"Error encountered while checking for nulls in `{name}`: {e}"
+            ) from e

     @override
     def _encode_constant(
@@ -67,13 +68,10 @@ def _encode_constant(
         spec: ModelSpec,
         drop_rows: Sequence[int],
     ) -> Any:
+        nrows = self.nrows - len(drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(
-                numpy.array([value] * self.nrows).reshape(
-                    (self.nrows - len(drop_rows), 1)
-                )
-            )
-        series = value * numpy.ones(self.nrows - len(drop_rows))
+            return spsparse.csc_matrix(numpy.array([value] * nrows).reshape((nrows, 1)))
+        series = value * numpy.ones(nrows)
         return series

     @override
@@ -86,9 +84,11 @@ def _encode_numerical(
         drop_rows: Sequence[int],
     ) -> Any:
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(numpy.array(values).reshape((self.nrows, 1)))
+            return spsparse.csc_matrix(
+                numpy.array(values).reshape((values.shape[0], 1))
+            )
         return values

     @override
@@ -107,7 +107,7 @@ def _encode_categorical(
         from formulaic.transforms import encode_contrasts
 
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         return as_columns(
             encode_contrasts(
                 values,
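A note on the sparse branch above: once null rows have been dropped, `values` is shorter than the original frame, so the old reshape to `self.nrows` raised. A minimal illustration of the failure mode with plain numpy (not code from this commit):

import numpy

nrows = 5                         # self.nrows: rows in the original data
values = numpy.ones(nrows - 2)    # two rows already dropped as nulls

try:
    values.reshape((nrows, 1))    # old behaviour: reshape to the original row count
except ValueError as e:
    print(e)                      # cannot reshape array of size 3 into shape (5,1)

column = values.reshape((values.shape[0], 1))  # new behaviour: use the actual length
print(column.shape)               # (3, 1)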
19 changes: 14 additions & 5 deletions formulaic/transforms/poly.py
@@ -62,7 +62,7 @@ def poly(  # pylint: disable=dangerous-default-value  # always replaced by state
     regression.
 
     `nan` values in `x` will be ignored and propagated through to generated
-    polynomials.
+    polynomials in the corresponding row indices.
 
     The signature of this transform is intentionally chosen to be compatible
     with R.
@@ -71,7 +71,15 @@ def poly(  # pylint: disable=dangerous-default-value  # always replaced by state
     if raw:
         return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)
 
-    x = numpy.array(x)
+    x = numpy.array(x, dtype=numpy.float64)
+
+    # Prepare output matrix (needed because we resize x below if there are nulls)
+    out = numpy.empty((x.shape[0], degree))
+    out.fill(numpy.nan)
+
+    # Filter to non-null indices (we will invert this later)
+    nonnull_indices = numpy.flatnonzero(~numpy.isnan(x))
+    x = x[nonnull_indices]

     # Check if we already have generated the alpha and beta coefficients.
     # If not, we enter "training" mode.
@@ -114,7 +122,8 @@ def get_beta(k: int) -> float:
         _state["alpha"] = alpha
         _state["norms2"] = norms2
 
+    # Populate output matrix with evaluated polynomials
+    out[nonnull_indices, :] = P[:, 1:]
+
     # Return basis dropping the first (constant) column
-    return FactorValues(
-        P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1))
-    )
+    return FactorValues(out, column_names=tuple(str(i) for i in range(1, degree + 1)))
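The net effect on `poly` can be seen in a short sketch (assuming `poly` is importable from `formulaic.transforms`, as in the tests below; this is illustrative, not code from the commit): null rows now come back as nan rows in the basis instead of being mixed into the orthogonalization:

import numpy
from formulaic.transforms import poly

state = {}
x = numpy.array([numpy.nan, 0.0, 0.25, 0.5, 0.75, 1.0])
basis = numpy.asarray(poly(x, degree=2, _state=state))

print(basis.shape)                   # (6, 2): one column per degree
print(numpy.isnan(basis[0]).all())   # True: the null row propagates as nans
print(numpy.isnan(basis[1:]).any())  # False: other rows are orthogonalized as before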
145 changes: 145 additions & 0 deletions formulaic/utils/null_handling.py
@@ -0,0 +1,145 @@
from functools import singledispatch
from typing import Any, Sequence, Set, Union

import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.materializers.types import FactorValues


@singledispatch
def find_nulls(values: Any) -> Set[int]:
    """
    Find the indices of rows in `values` that have null/nan values.

    Args:
        values: The values in which to find nulls.
    """
    raise ValueError(
        f"No implementation of `find_nulls()` for type `{repr(type(values))}`."
    )


@find_nulls.register
def _(values: None) -> Set[int]:
    # Literal `None` values have special meaning and are checked elsewhere.
    return set()


@find_nulls.register
def _(values: str) -> Set[int]:
    return set()


@find_nulls.register
def _(values: int) -> Set[int]:
    return _drop_nulls_scalar(values)


@find_nulls.register
def _(values: float) -> Set[int]:
    return _drop_nulls_scalar(values)


def _drop_nulls_scalar(values: Union[int, float]) -> Set[int]:
    if isinstance(values, FactorValues):
        values = values.__wrapped__
    if numpy.isnan(values):
        raise ValueError("Constant value is null, invalidating all rows.")
    return set()


@find_nulls.register
def _(values: list) -> Set[int]:
    if isinstance(values, FactorValues):
        # Older versions of pandas (<1.2) cannot unpack this automatically.
        values = values.__wrapped__
    return find_nulls(pandas.Series(values))


@find_nulls.register
def _(values: dict) -> Set[int]:
    indices = set()
    for vs in values.values():
        indices.update(find_nulls(vs))
    return indices


@find_nulls.register
def _(values: pandas.Series) -> Set[int]:
    return set(numpy.flatnonzero(values.isnull().values))


@find_nulls.register
def _(values: numpy.ndarray) -> Set[int]:
    if len(values.shape) == 0:
        if numpy.isnan(values):
            raise ValueError("Constant value is null, invalidating all rows.")
        return set()

    if len(values.shape) == 1:
        return set(numpy.flatnonzero(numpy.isnan(values)))

    if len(values.shape) == 2:
        return set(numpy.flatnonzero(numpy.any(numpy.isnan(values), axis=1)))

    raise ValueError(
        "Cannot check for null indices for arrays of more than 2 dimensions."
    )


@find_nulls.register
def _(values: spsparse.spmatrix) -> Set[int]:
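    # Only explicitly stored entries of a sparse matrix can hold nan, and
    # `spsparse.find` returns exactly those entries as (rows, cols, data),
    # so scanning `data` for nans and mapping back to `rows` is sufficient.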
    rows, _, data = spsparse.find(values)
    null_data_indices = numpy.flatnonzero(numpy.isnan(data))
    return set(rows[null_data_indices])


@singledispatch
def drop_rows(values: Any, indices: Sequence[int]) -> Any:
    """
    Drop rows corresponding to the given indices in `values`.

    Args:
        values: The vector from which to drop rows with the given `indices`.
        indices: The indices of the rows to be dropped.
    """
    raise ValueError(
        f"No implementation of `drop_rows()` for values of type `{repr(type(values))}`."
    )


@drop_rows.register
def _(values: list, indices: Sequence[int]) -> list:
    return [value for i, value in enumerate(values) if i not in indices]


@drop_rows.register
def _(values: pandas.Series, indices: Sequence[int]) -> pandas.Series:
    return values.drop(index=values.index[indices])


@drop_rows.register
def _(values: numpy.ndarray, indices: Sequence[int]) -> numpy.ndarray:
    return numpy.delete(values, indices, axis=0)


@drop_rows.register
def _(values: spsparse.spmatrix, indices: Sequence[int]) -> spsparse.spmatrix:
    """
    Remove the rows denoted by ``indices`` from the sparse matrix ``values``.
    """
    was_csc = False
    if isinstance(values, spsparse.csc_matrix):
        was_csc = True
        values = values.tocsr()

    indices = list(indices)
    mask = numpy.ones(values.shape[0], dtype=bool)
    mask[indices] = False
    masked = values[mask]

    if was_csc:
        masked = masked.tocsc()

    return masked
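A hedged usage sketch (not part of the commit) of the two helpers across the container types registered above:

import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.utils.null_handling import drop_rows, find_nulls

series = pandas.Series([1.0, numpy.nan, 3.0])
assert find_nulls(series) == {1}                   # index of the null row
assert list(drop_rows(series, [1])) == [1.0, 3.0]  # null row removed

matrix = numpy.array([[1.0, numpy.nan], [2.0, 3.0]])
assert find_nulls(matrix) == {0}                   # any nan in a row flags the row

sparse = spsparse.csc_matrix(numpy.array([[numpy.nan], [2.0]]))
assert find_nulls(sparse) == {0}
assert drop_rows(sparse, [0]).shape == (1, 1)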
18 changes: 14 additions & 4 deletions tests/materializers/test_pandas.py
@@ -160,11 +160,21 @@ def test_get_model_matrix_invalid_output(self, materializer):
         )
 
     @pytest.mark.parametrize("formula,tests", PANDAS_TESTS.items())
-    def test_na_handling(self, data_with_nulls, formula, tests):
-        mm = PandasMaterializer(data_with_nulls).get_model_matrix(formula)
-        assert isinstance(mm, pandas.DataFrame)
+    @pytest.mark.parametrize("output", ["pandas", "numpy", "sparse"])
+    def test_na_handling(self, data_with_nulls, formula, tests, output):
+        mm = PandasMaterializer(data_with_nulls).get_model_matrix(
+            formula, output=output
+        )
+        if output == "pandas":
+            assert isinstance(mm, pandas.DataFrame)
+            assert list(mm.columns) == tests[2]
+        elif output == "numpy":
+            assert isinstance(mm, numpy.ndarray)
+
+        else:
+            assert isinstance(mm, spsparse.csc_matrix)
         assert mm.shape == (tests[3], len(tests[2]))
-        assert list(mm.columns) == tests[2]
+        assert list(mm.model_spec.column_names) == tests[2]

if formula == "A:B":
return
Expand Down
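End to end, null rows are now dropped consistently for every output type. A hedged sketch via the top-level API (the `model_matrix` entry point and its pass-through of `output` are assumed here, not shown in this diff):

import pandas
from formulaic import model_matrix

df = pandas.DataFrame({"A": [1.0, None, 3.0]})
mm = model_matrix("A", df, output="sparse")  # na_action defaults to dropping nulls
print(mm.shape)                              # (2, 2): Intercept + A, null row dropped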
57 changes: 57 additions & 0 deletions tests/transforms/test_poly.py
@@ -153,3 +153,60 @@ def test_raw(self, data):
         assert numpy.allclose(
             poly(data, 3, raw=True), numpy.array([data, data**2, data**3]).T
         )
+
+    def test_nulls(self, data):
+        state = {}
+        data = numpy.concatenate([[numpy.nan, numpy.nan, numpy.nan], data])
+        data[:3] = numpy.nan
+        V = poly(data, degree=3, _state=state)
+
+        assert V.shape[1] == 3
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > poly(data, 3)
+        r_reference = numpy.array(
+            [
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [-3.603750e-01, 0.42285541, -4.332979e-01],
+                [-3.243375e-01, 0.29599879, -1.733191e-01],
+                [-2.883000e-01, 0.18249549, 1.824412e-02],
+                [-2.522625e-01, 0.08234553, 1.489937e-01],
+                [-2.162250e-01, -0.00445111, 2.265312e-01],
+                [-1.801875e-01, -0.07789442, 2.584584e-01],
+                [-1.441500e-01, -0.13798440, 2.523770e-01],
+                [-1.081125e-01, -0.18472105, 2.158888e-01],
+                [-7.207500e-02, -0.21810437, 1.565954e-01],
+                [-3.603750e-02, -0.23813436, 8.209854e-02],
+                [-3.000725e-17, -0.24481103, -4.395626e-17],
+                [3.603750e-02, -0.23813436, -8.209854e-02],
+                [7.207500e-02, -0.21810437, -1.565954e-01],
+                [1.081125e-01, -0.18472105, -2.158888e-01],
+                [1.441500e-01, -0.13798440, -2.523770e-01],
+                [1.801875e-01, -0.07789442, -2.584584e-01],
+                [2.162250e-01, -0.00445111, -2.265312e-01],
+                [2.522625e-01, 0.08234553, -1.489937e-01],
+                [2.883000e-01, 0.18249549, -1.824412e-02],
+                [3.243375e-01, 0.29599879, 1.733191e-01],
+                [3.603750e-01, 0.42285541, 4.332979e-01],
+            ]
+        )
+
+        assert numpy.allclose(
+            V,
+            r_reference,
+            equal_nan=True,
+        )
+
+        assert state["alpha"] == pytest.approx({0: 0.5, 1: 0.5, 2: 0.5})
+        assert state["norms2"] == pytest.approx(
+            {0: 21.0, 1: 1.925, 2: 0.14020416666666669, 3: 0.009734175}
+        )
+
+        assert numpy.allclose(
+            poly(data, degree=3, _state=state),
+            r_reference,
+            equal_nan=True,
+        )