Improve null handling in poly and when generating sparse matrices #155

Merged: 2 commits, Sep 25, 2023
formulaic/materializers/pandas.py (23 additions & 23 deletions)

```diff
@@ -9,6 +9,7 @@
 import scipy.sparse as spsparse
 from interface_meta import override
 from formulaic.utils.cast import as_columns
+from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls
 
 from .base import FormulaMaterializer
 from .types import NAAction
```
```diff
@@ -39,24 +40,24 @@ def _check_for_nulls(
         if na_action is NAAction.IGNORE:
             return
 
-        if isinstance(
-            values, dict
-        ):  # pragma: no cover; no formulaic transforms return dictionaries any more
-            for key, vs in values.items():
-                self._check_for_nulls(f"{name}[{key}]", vs, na_action, drop_rows)
+        try:
+            null_indices = find_nulls(values)
 
-        elif na_action is NAAction.RAISE:
-            if isinstance(values, pandas.Series) and values.isnull().values.any():
-                raise ValueError(f"`{name}` contains null values after evaluation.")
+            if na_action is NAAction.RAISE:
+                if null_indices:
+                    raise ValueError(f"`{name}` contains null values after evaluation.")
 
-        elif na_action is NAAction.DROP:
-            if isinstance(values, pandas.Series):
-                drop_rows.update(numpy.flatnonzero(values.isnull().values))
+            elif na_action is NAAction.DROP:
+                drop_rows.update(null_indices)
 
-        else:
-            raise ValueError(
-                f"Do not know how to interpret `na_action` = {repr(na_action)}."
-            )  # pragma: no cover; this is currently impossible to reach
+            else:
+                raise ValueError(
+                    f"Do not know how to interpret `na_action` = {repr(na_action)}."
+                )  # pragma: no cover; this is currently impossible to reach
+        except ValueError as e:
+            raise ValueError(
+                f"Error encountered while checking for nulls in `{name}`: {e}"
+            ) from e
```
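To make the new control flow concrete, here is a minimal sketch of how the `NAAction` modes surface to callers. This assumes, as in the updated tests below, that `PandasMaterializer.get_model_matrix` accepts `na_action` (including its string aliases) as an override; the data frame is made up:

```python
import pandas
from formulaic.materializers import PandasMaterializer

df = pandas.DataFrame({"a": [1.0, None, 3.0], "b": [2.0, 4.0, None]})

# "drop": find_nulls() marks rows 1 and 2, which are removed from the output.
mm = PandasMaterializer(df).get_model_matrix("a + b", na_action="drop")
assert list(mm["a"]) == [1.0]

# "raise": the same nulls now surface as the wrapped ValueError above.
try:
    PandasMaterializer(df).get_model_matrix("a + b", na_action="raise")
except ValueError as e:
    print(e)  # Error encountered while checking for nulls in `a`: ...

# "ignore": nulls pass through untouched.
mm = PandasMaterializer(df).get_model_matrix("a + b", na_action="ignore")
assert len(mm) == 3
```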
```diff
@@ -67,13 +68,10 @@ def _encode_constant(
         spec: ModelSpec,
         drop_rows: Sequence[int],
     ) -> Any:
+        nrows = self.nrows - len(drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(
-                numpy.array([value] * self.nrows).reshape(
-                    (self.nrows - len(drop_rows), 1)
-                )
-            )
-        series = value * numpy.ones(self.nrows - len(drop_rows))
+            return spsparse.csc_matrix(numpy.array([value] * nrows).reshape((nrows, 1)))
+        series = value * numpy.ones(nrows)
         return series
```
```diff
@@ -86,9 +84,11 @@ def _encode_numerical(
         drop_rows: Sequence[int],
     ) -> Any:
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         if spec.output == "sparse":
-            return spsparse.csc_matrix(numpy.array(values).reshape((self.nrows, 1)))
+            return spsparse.csc_matrix(
+                numpy.array(values).reshape((values.shape[0], 1))
+            )
         return values
```
```diff
@@ -107,7 +107,7 @@ def _encode_categorical(
         from formulaic.transforms import encode_contrasts
 
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         return as_columns(
             encode_contrasts(
                 values,
```
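Why size outputs from the data rather than `self.nrows`? A standalone sketch of the arithmetic (`nrows` and `drop_rows` are made-up values):

```python
import numpy
import scipy.sparse as spsparse

nrows, drop_rows = 4, [1, 3]  # hypothetical materializer state
n_kept = nrows - len(drop_rows)

# _encode_constant: the reshape target must match the array length, so both
# must use the post-drop row count (the old code mixed nrows and n_kept,
# which errored whenever rows had been dropped).
constant = spsparse.csc_matrix(numpy.array([1.0] * n_kept).reshape((n_kept, 1)))

# _encode_numerical: after drop_nulls, size from the values themselves.
values = numpy.array([0.1, 0.2])  # rows remaining after drop_nulls
numerical = spsparse.csc_matrix(numpy.array(values).reshape((values.shape[0], 1)))

assert constant.shape == numerical.shape == (2, 1)
```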
formulaic/transforms/poly.py (14 additions & 5 deletions)
```diff
@@ -62,7 +62,7 @@ def poly(  # pylint: disable=dangerous-default-value  # always replaced by state
     regression.
 
     `nan` values in `x` will be ignored and propagated through to generated
-    polynomials.
+    polynomials in the corresponding row indices.
 
     The signature of this transform is intentionally chosen to be compatible
     with R.
```
```diff
@@ -71,7 +71,15 @@
     if raw:
         return numpy.stack([numpy.power(x, k) for k in range(1, degree + 1)], axis=1)
 
-    x = numpy.array(x)
+    x = numpy.array(x, dtype=numpy.float64)
+
+    # Prepare output matrix (needed because we resize x below if there are nulls)
+    out = numpy.empty((x.shape[0], degree))
+    out.fill(numpy.nan)
+
+    # Filter to non-null indices (we will invert this later)
+    nonnull_indices = numpy.flatnonzero(~numpy.isnan(x))
+    x = x[nonnull_indices]
 
     # Check if we already have generated the alpha and beta coefficients.
     # If not, we enter "training" mode.
```
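The prepare/filter/scatter pattern above, shown in isolation (a hedged sketch; `P` here is a stand-in for the orthogonal basis the real code computes):

```python
import numpy

x = numpy.array([numpy.nan, 2.0, numpy.nan, 4.0])
degree = 2

# Prepare a NaN-filled output with one row per *original* observation.
out = numpy.empty((x.shape[0], degree))
out.fill(numpy.nan)

# Work only on the non-null entries...
nonnull_indices = numpy.flatnonzero(~numpy.isnan(x))  # array([1, 3])
x = x[nonnull_indices]

P = numpy.stack([x, x**2], axis=1)  # stand-in for the evaluated polynomials

# ...then scatter the results back; the null rows remain NaN.
out[nonnull_indices, :] = P
```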
```diff
@@ -114,7 +122,8 @@ def get_beta(k: int) -> float:
         _state["alpha"] = alpha
         _state["norms2"] = norms2
 
+    # Populate output matrix with evaluated polynomials
+    out[nonnull_indices, :] = P[:, 1:]
+
     # Return basis dropping the first (constant) column
-    return FactorValues(
-        P[:, 1:], column_names=tuple(str(i) for i in range(1, degree + 1))
-    )
+    return FactorValues(out, column_names=tuple(str(i) for i in range(1, degree + 1)))
```
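End to end, the transform now degrades gracefully on nulls. A short sketch (the import path is assumed from the file location above; the values are illustrative):

```python
import numpy
from formulaic.transforms.poly import poly

state = {}
x = numpy.array([numpy.nan, 0.0, 0.5, 1.0])
P = poly(x, degree=2, _state=state)

assert P.shape == (4, 2)
assert numpy.isnan(P[0]).all()        # the null row is propagated
assert not numpy.isnan(P[1:]).any()   # remaining rows hold the basis

# Reusing the recorded state applies the fitted alpha/norms2 coefficients
# instead of refitting, so repeated application is stable.
P2 = poly(x, degree=2, _state=state)
assert numpy.allclose(P, P2, equal_nan=True)
```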
formulaic/utils/null_handling.py (new file, 145 additions)

```python
from functools import singledispatch
from typing import Any, Sequence, Set, Union

import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.materializers.types import FactorValues


@singledispatch
def find_nulls(values: Any) -> Set[int]:
    """
    Find the indices of rows in `values` that have null/nan values.

    Args:
        values: The values in which to find nulls.
    """
    raise ValueError(
        f"No implementation of `find_nulls()` for type `{repr(type(values))}`."
    )


@find_nulls.register
def _(values: None) -> Set[int]:
    # Literal `None` values have special meaning and are checked elsewhere.
    return set()


@find_nulls.register
def _(values: str) -> Set[int]:
    return set()


@find_nulls.register
def _(values: int) -> Set[int]:
    return _drop_nulls_scalar(values)


@find_nulls.register
def _(values: float) -> Set[int]:
    return _drop_nulls_scalar(values)


def _drop_nulls_scalar(values: Union[int, float]) -> Set[int]:
    if isinstance(values, FactorValues):
        values = values.__wrapped__
    if numpy.isnan(values):
        raise ValueError("Constant value is null, invalidating all rows.")
    return set()


@find_nulls.register
def _(values: list) -> Set[int]:
    if isinstance(values, FactorValues):
        # Older versions of pandas (<1.2) cannot unpack this automatically.
        values = values.__wrapped__
    return find_nulls(pandas.Series(values))


@find_nulls.register
def _(values: dict) -> Set[int]:
    indices = set()
    for vs in values.values():
        indices.update(find_nulls(vs))
    return indices


@find_nulls.register
def _(values: pandas.Series) -> Set[int]:
    return set(numpy.flatnonzero(values.isnull().values))


@find_nulls.register
def _(values: numpy.ndarray) -> Set[int]:
    if len(values.shape) == 0:
        if numpy.isnan(values):
            raise ValueError("Constant value is null, invalidating all rows.")
        return set()

    if len(values.shape) == 1:
        return set(numpy.flatnonzero(numpy.isnan(values)))

    if len(values.shape) == 2:
        return set(numpy.flatnonzero(numpy.any(numpy.isnan(values), axis=1)))

    raise ValueError(
        "Cannot check for null indices for arrays of more than 2 dimensions."
    )


@find_nulls.register
def _(values: spsparse.spmatrix) -> Set[int]:
    rows, _, data = spsparse.find(values)
    null_data_indices = numpy.flatnonzero(numpy.isnan(data))
    return set(rows[null_data_indices])


@singledispatch
def drop_rows(values: Any, indices: Sequence[int]) -> Any:
    """
    Drop rows corresponding to the given indices in `values`.

    Args:
        values: The vector from which to drop rows with the given `indices`.
        indices: The indices of the rows to be dropped.
    """
    raise ValueError(
        f"No implementation of `drop_rows()` for values of type `{repr(type(values))}`."
    )


@drop_rows.register
def _(values: list, indices: Sequence[int]) -> list:
    return [value for i, value in enumerate(values) if i not in indices]


@drop_rows.register
def _(values: pandas.Series, indices: Sequence[int]) -> pandas.Series:
    return values.drop(index=values.index[indices])


@drop_rows.register
def _(values: numpy.ndarray, indices: Sequence[int]) -> numpy.ndarray:
    return numpy.delete(values, indices, axis=0)


@drop_rows.register
def _(values: spsparse.spmatrix, indices: Sequence[int]) -> spsparse.spmatrix:
    """
    Remove the rows denoted by ``indices`` from the sparse matrix ``values``.
    """
    was_csc = False
    if isinstance(values, spsparse.csc_matrix):
        was_csc = True
        values = values.tocsr()
    indices = list(indices)
    mask = numpy.ones(values.shape[0], dtype=bool)
    mask[indices] = False
    masked = values[mask]

    if was_csc:
        masked = masked.tocsc()

    return masked
```
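A short tour of the two dispatchers (a hedged sketch; the inputs are made up, and comments show expected results):

```python
import numpy
import pandas
import scipy.sparse as spsparse

from formulaic.utils.null_handling import drop_rows, find_nulls

s = pandas.Series([1.0, numpy.nan, 3.0])
print(find_nulls(s))                # indices {1}
print(drop_rows(s, [1]).tolist())   # [1.0, 3.0]

arr = numpy.array([[1.0, numpy.nan], [3.0, 4.0]])
print(find_nulls(arr))              # {0}: any-NaN rows count for 2-D arrays

m = spsparse.csc_matrix(numpy.array([[1.0, numpy.nan], [0.0, 2.0]]))
print(find_nulls(m))                # {0}: only explicitly stored entries are scanned
print(drop_rows(m, [0]).toarray())  # [[0. 2.]], still a CSC matrix before toarray()
```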
tests/materializers/test_pandas.py (14 additions & 4 deletions)
```diff
@@ -160,11 +160,21 @@ def test_get_model_matrix_invalid_output(self, materializer):
         )
 
     @pytest.mark.parametrize("formula,tests", PANDAS_TESTS.items())
-    def test_na_handling(self, data_with_nulls, formula, tests):
-        mm = PandasMaterializer(data_with_nulls).get_model_matrix(formula)
-        assert isinstance(mm, pandas.DataFrame)
+    @pytest.mark.parametrize("output", ["pandas", "numpy", "sparse"])
+    def test_na_handling(self, data_with_nulls, formula, tests, output):
+        mm = PandasMaterializer(data_with_nulls).get_model_matrix(
+            formula, output=output
+        )
+        if output == "pandas":
+            assert isinstance(mm, pandas.DataFrame)
+            assert list(mm.columns) == tests[2]
+        elif output == "numpy":
+            assert isinstance(mm, numpy.ndarray)
+        else:
+            assert isinstance(mm, spsparse.csc_matrix)
+
         assert mm.shape == (tests[3], len(tests[2]))
-        assert list(mm.columns) == tests[2]
+        assert list(mm.model_spec.column_names) == tests[2]
 
         if formula == "A:B":
             return
```
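The switch from `mm.columns` to `mm.model_spec.column_names` is what lets the column assertion run for all three output types (a sketch; the data frame and exact column labels are illustrative):

```python
import pandas
from formulaic.materializers import PandasMaterializer

data = pandas.DataFrame({"a": [1.0, 2.0, 3.0], "A": ["a", "b", "c"]})

# `.columns` only exists on the pandas output; the attached model spec records
# the generated column names for pandas, numpy and sparse outputs alike.
mm = PandasMaterializer(data).get_model_matrix("a + A", output="sparse")
print(mm.model_spec.column_names)  # e.g. ('Intercept', 'a', 'A[T.b]', 'A[T.c]')
```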
tests/transforms/test_poly.py (57 additions & 0 deletions)

```diff
@@ -153,3 +153,60 @@ def test_raw(self, data):
         assert numpy.allclose(
             poly(data, 3, raw=True), numpy.array([data, data**2, data**3]).T
         )
+
+    def test_nulls(self, data):
+        state = {}
+        data = numpy.concatenate([[numpy.nan, numpy.nan, numpy.nan], data])
+        data[:3] = numpy.nan
+        V = poly(data, degree=3, _state=state)
+
+        assert V.shape[1] == 3
+
+        # Comparison data copied from R output of:
+        # > data = seq(from=0, to=1, by=0.05)
+        # > poly(data, 3)
+        r_reference = numpy.array(
+            [
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [numpy.nan, numpy.nan, numpy.nan],
+                [-3.603750e-01, 0.42285541, -4.332979e-01],
+                [-3.243375e-01, 0.29599879, -1.733191e-01],
+                [-2.883000e-01, 0.18249549, 1.824412e-02],
+                [-2.522625e-01, 0.08234553, 1.489937e-01],
+                [-2.162250e-01, -0.00445111, 2.265312e-01],
+                [-1.801875e-01, -0.07789442, 2.584584e-01],
+                [-1.441500e-01, -0.13798440, 2.523770e-01],
+                [-1.081125e-01, -0.18472105, 2.158888e-01],
+                [-7.207500e-02, -0.21810437, 1.565954e-01],
+                [-3.603750e-02, -0.23813436, 8.209854e-02],
+                [-3.000725e-17, -0.24481103, -4.395626e-17],
+                [3.603750e-02, -0.23813436, -8.209854e-02],
+                [7.207500e-02, -0.21810437, -1.565954e-01],
+                [1.081125e-01, -0.18472105, -2.158888e-01],
+                [1.441500e-01, -0.13798440, -2.523770e-01],
+                [1.801875e-01, -0.07789442, -2.584584e-01],
+                [2.162250e-01, -0.00445111, -2.265312e-01],
+                [2.522625e-01, 0.08234553, -1.489937e-01],
+                [2.883000e-01, 0.18249549, -1.824412e-02],
+                [3.243375e-01, 0.29599879, 1.733191e-01],
+                [3.603750e-01, 0.42285541, 4.332979e-01],
+            ]
+        )
+
+        assert numpy.allclose(
+            V,
+            r_reference,
+            equal_nan=True,
+        )
+
+        assert state["alpha"] == pytest.approx({0: 0.5, 1: 0.5, 2: 0.5})
+        assert state["norms2"] == pytest.approx(
+            {0: 21.0, 1: 1.925, 2: 0.14020416666666669, 3: 0.009734175}
+        )
+
+        assert numpy.allclose(
+            poly(data, degree=3, _state=state),
+            r_reference,
+            equal_nan=True,
+        )
```