Skip to content

Commit

Permalink
Merge pull request #188 from matthewwardrop/update_tooling
Browse files Browse the repository at this point in the history
Migrate to `ruff`, update `mypy`, and include pre-commit configuration.
  • Loading branch information
matthewwardrop authored Aug 6, 2024
2 parents 2aab6f9 + 1a87262 commit edd315b
Show file tree
Hide file tree
Showing 59 changed files with 248 additions and 190 deletions.
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# pre-commit configuration: each entry under `repos` pins a hook repository
# to a revision and lists the hooks to run from it.
repos:
  # Generic checks from the pre-commit-hooks collection.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: check-yaml
      - id: check-toml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  # Ruff linting and formatting for Python sources, stubs, and notebooks.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.5.6
    hooks:
      - id: ruff  # Run the linter.
        types_or: [ python, pyi, jupyter ]
        args: [ --fix ]
      - id: ruff-format  # Run the formatter.
        types_or: [ python, pyi, jupyter ]
5 changes: 3 additions & 2 deletions benchmarks/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
import time
from collections import namedtuple

import formulaic
import numpy
import pandas
import patsy
from formulaic import Formula
from uncertainties import ufloat

import formulaic
from formulaic import Formula

ALL_TOOLINGS = ["patsy", "formulaic", "formulaic_sparse", "R", "R_sparse"]

formulas = {
Expand Down
36 changes: 27 additions & 9 deletions benchmarks/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,48 @@
import numpy as np
import pandas as pd


# Load the benchmark results stored next to this script, sorted by the
# `mean` column (ascending). Read once only.
data = pd.read_csv(
    os.path.join(os.path.dirname(__file__), "benchmarks.csv")
).sort_values("mean")


def grouped_barplot(df, cat, subcat, val, err, subcats=None, **kwargs):
    """Draw a grouped bar chart with error bars using `plt`.

    Based on https://stackoverflow.com/a/42033734.

    Args:
        df: Data frame with one row per (category, sub-category) pair.
        cat: Column whose unique values become the groups on the x-axis.
        subcat: Column whose values distinguish the bars inside each group.
        val: Column providing the bar heights.
        err: Column providing the error-bar sizes.
        subcats: Optional sub-category values to draw, in drawing order.
            When omitted or empty, every value in `df[subcat]` is used.
        **kwargs: Forwarded unchanged to `plt.bar`.

    Raises:
        ValueError: If a sub-category has more than one row for the same
            category (pandas cannot reindex on duplicate labels).
    """
    categories = df[cat].unique()
    x = np.arange(len(categories))

    # An explicit check is required here: `subcats or ...` raises for
    # multi-element numpy arrays / pandas Series.
    if subcats is None or len(subcats) == 0:
        subcats = df[subcat].unique()

    positions = np.arange(len(subcats))
    offsets = (positions - positions.mean()) / (len(subcats) + 1.0)
    # Consecutive offsets are always 1 / (n + 1) apart. Computing the width
    # directly (rather than via `np.diff(offsets).mean()`) keeps it finite
    # when there is only a single sub-category.
    width = 1.0 / (len(subcats) + 1.0)

    for offset, gr in zip(offsets, subcats):
        # Align this group's rows with `categories` so that every bar sits
        # above its own tick label; categories missing from the group become
        # NaN instead of shifting the remaining bars.
        dfg = df[df[subcat] == gr].set_index(cat).reindex(categories)
        plt.bar(
            x + offset,
            dfg[val].values,
            width=width,
            label=f"{gr}",
            yerr=dfg[err].values,
            capsize=6,
            **kwargs,
        )

    plt.xlabel(cat)
    plt.ylabel(val)
    plt.xticks(x, categories)
    plt.legend(title=subcat, loc="center left", bbox_to_anchor=(1, 0.5))


def plot_benchmarks(toolings=None):
plt.figure(dpi=120, figsize=(10, 5))
grouped_barplot(data, cat='formula', subcat='tooling', val='mean', err='stderr', subcats=toolings, log=True)
grouped_barplot(
data,
cat="formula",
subcat="tooling",
val="mean",
err="stderr",
subcats=toolings,
log=True,
)
plt.ylim(1e-2, None)
plt.grid()
plt.gca().set_axisbelow(True)
Expand All @@ -36,5 +54,5 @@ def plot_benchmarks(toolings=None):
plt.tight_layout()


# Render the benchmark chart once for the selected toolings and write it next
# to this script.
plot_benchmarks(toolings=["formulaic", "R", "patsy", "formulaic_sparse", "R_sparse"])
plt.savefig(os.path.join(os.path.dirname(__file__), "benchmarks.png"))
2 changes: 1 addition & 1 deletion formulaic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .formula import Formula, FormulaSpec
from .materializers import FactorValues
from .model_matrix import ModelMatrix, ModelMatrices
from .model_matrix import ModelMatrices, ModelMatrix
from .model_spec import ModelSpec, ModelSpecs
from .sugar import model_matrix

Expand Down
1 change: 0 additions & 1 deletion formulaic/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from .parser.types import FormulaParser, OrderedSet, Structured, Term
from .utils.calculus import differentiate_term


FormulaSpec: TypeAlias = Union[
str,
List[Union[str, Term]],
Expand Down
15 changes: 10 additions & 5 deletions formulaic/materializers/arrow.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Dict, Sequence
from collections.abc import Mapping
from typing import TYPE_CHECKING, Any, Dict, Iterator, Sequence

import pandas
from interface_meta import override


from .pandas import PandasMaterializer

if TYPE_CHECKING: # pragma: no cover
import pyarrow


class ArrowMaterializer(PandasMaterializer):

REGISTER_NAME: str = "arrow"
REGISTER_NAME = "arrow"
REGISTER_INPUTS: Sequence[str] = ("pyarrow.lib.Table",)

@override
Expand All @@ -27,7 +26,7 @@ def data_context(self):
return self.__data_context


class LazyArrowTableProxy:
class LazyArrowTableProxy(Mapping):
def __init__(self, table: pyarrow.Table):
self.table = table
self.column_names = set(self.table.column_names)
Expand All @@ -43,3 +42,9 @@ def __getitem__(self, key: str) -> Any:
if key not in self._cache:
self._cache[key] = self.table.column(key).to_pandas()
return self._cache[key]

def __iter__(self) -> Iterator[str]:
return iter(self.column_names)

def __len__(self) -> int:
return len(self.column_names)
40 changes: 23 additions & 17 deletions formulaic/materializers/base.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
from __future__ import annotations
import ast

import ast
import functools
import inspect
import itertools
import operator
from abc import abstractmethod
from collections import defaultdict, namedtuple
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
Hashable,
List,
Iterable,
List,
Mapping,
Optional,
Sequence,
Set,
Tuple,
Type,
Union,
TYPE_CHECKING,
cast,
)

Expand Down Expand Up @@ -56,7 +56,6 @@


class FormulaMaterializerMeta(InterfaceMeta):

INTERFACE_RAISE_ON_VIOLATION = True

REGISTERED_NAMES: Dict[str, Type[FormulaMaterializer]] = {}
Expand Down Expand Up @@ -119,7 +118,6 @@ def for_data(cls, data: Any, output: Hashable = None) -> Type[FormulaMaterialize


class FormulaMaterializer(metaclass=FormulaMaterializerMeta):

REGISTER_NAME: Optional[str] = None
REGISTER_INPUTS: Sequence[str] = ()
REGISTER_OUTPUTS: Sequence[Hashable] = ()
Expand Down Expand Up @@ -209,7 +207,6 @@ def get_model_matrix(
def _build_model_matrix(
self, spec: ModelSpec, drop_rows: Sequence[int]
) -> ModelMatrix:

# Step 0: Apply any requested column/term clustering
# This must happen before Step 1 otherwise the greedy rank reduction
# below would result in a different outcome than if the columns had
Expand All @@ -228,10 +225,9 @@ def _build_model_matrix(
scoped_cols = {}
for scoped_term in scoped_terms:
if not scoped_term.factors:
scoped_cols[
"Intercept"
] = scoped_term.scale * self._encode_constant(
1, None, {}, spec, drop_rows
scoped_cols["Intercept"] = (
scoped_term.scale
* self._encode_constant(1, None, {}, spec, drop_rows)
)
else:
scoped_cols.update(
Expand Down Expand Up @@ -455,7 +451,9 @@ def _get_scoped_terms_spanned_by_evaled_factors(
else:
factors.append((ScopedFactor(factor),))
return OrderedSet(
ScopedTerm(factors=(p for p in prod if p != 1), scale=scale)
ScopedTerm(
factors=(cast(ScopedFactor, p) for p in prod if p != 1), scale=scale
)
for prod in itertools.product(*factors)
)

Expand Down Expand Up @@ -534,7 +532,9 @@ def _evaluate_factor(
raise FactorEvaluationError(
f"The evaluation method `{factor.eval_method.value}` for factor `{factor}` is not understood."
)
except FactorEvaluationError: # pragma: no cover; future proofing against new eval methods
except (
FactorEvaluationError
): # pragma: no cover; future proofing against new eval methods
raise
except Exception as e:
raise FactorEvaluationError(
Expand Down Expand Up @@ -746,7 +746,8 @@ def wrapped(
and reduced_rank
):
encoded = FactorValues(
encoded.copy(), metadata=encoded.__formulaic_metadata__ # type: ignore
encoded.copy(),
metadata=encoded.__formulaic_metadata__, # type: ignore
)
del encoded[encoded.__formulaic_metadata__.drop_field]

Expand Down Expand Up @@ -831,7 +832,10 @@ def _enforce_structure(
) -> Generator[Tuple[Term, List[ScopedTerm], Dict[str, Any]], None, None]:
# TODO: Verify that imputation strategies are intuitive and make sense.
structure = cast(List[EncodedTermStructure], spec.structure)
assert len(cols) == len(structure)
if not len(cols) == len(structure): # pragma: no cover
raise RuntimeError(
"Specification structure and columns are mismatched. Please report this error with examples!"
)
for i, col_spec in enumerate(cols):
scoped_cols = col_spec[2]
target_cols = structure[i][2]
Expand All @@ -854,9 +858,11 @@ def _enforce_structure(
f"Term `{col_spec[0]}` has generated columns that are inconsistent with specification: generated {list(scoped_cols)}, expecting {target_cols}."
)

yield col_spec[0], col_spec[1], {
col: scoped_cols[col] for col in target_cols
}
yield (
col_spec[0],
col_spec[1],
{col: scoped_cols[col] for col in target_cols},
)

def _get_columns_for_term(
self, factors: List[Dict[str, Any]], spec: ModelSpec, scale: float = 1
Expand Down
10 changes: 5 additions & 5 deletions formulaic/materializers/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@

import functools
import itertools
from typing import Any, Dict, List, Sequence, Set, Tuple, cast, TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Set, Tuple, cast

import numpy
import pandas
import scipy.sparse as spsparse
from interface_meta import override

from formulaic.utils.cast import as_columns
from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls
from formulaic.utils.null_handling import drop_rows as drop_nulls
from formulaic.utils.null_handling import find_nulls

from .base import FormulaMaterializer
from .types import NAAction
Expand All @@ -19,8 +21,7 @@


class PandasMaterializer(FormulaMaterializer):

REGISTER_NAME: str = "pandas"
REGISTER_NAME = "pandas"
REGISTER_INPUTS: Sequence[str] = ("pandas.core.frame.DataFrame", "pandas.DataFrame")
REGISTER_OUTPUTS: Sequence[str] = ("pandas", "numpy", "sparse")

Expand All @@ -36,7 +37,6 @@ def _is_categorical(self, values: Any) -> bool:
def _check_for_nulls(
self, name: str, values: Any, na_action: NAAction, drop_rows: Set[int]
) -> None:

if na_action is NAAction.IGNORE:
return

Expand Down
4 changes: 2 additions & 2 deletions formulaic/materializers/types/factor_values.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from __future__ import annotations
import copy

import copy
from dataclasses import dataclass, replace
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Expand All @@ -13,7 +14,6 @@
Tuple,
TypeVar,
Union,
TYPE_CHECKING,
)

import wrapt
Expand Down
2 changes: 1 addition & 1 deletion formulaic/model_matrix.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import copy
from typing import Any, Generic, Optional, TypeVar, TYPE_CHECKING, cast
from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, cast

import wrapt

Expand Down
6 changes: 3 additions & 3 deletions formulaic/model_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict
from dataclasses import dataclass, field, replace
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Expand All @@ -11,17 +12,16 @@
Sequence,
Set,
Union,
TYPE_CHECKING,
cast,
)

from formulaic.materializers.base import EncodedTermStructure
from formulaic.parser.types import Structured, Term
from formulaic.utils.constraints import LinearConstraintSpec, LinearConstraints
from formulaic.utils.constraints import LinearConstraints, LinearConstraintSpec
from formulaic.utils.variables import Variable

from .formula import Formula, FormulaSpec
from .materializers import FormulaMaterializer, NAAction, ClusterBy
from .materializers import ClusterBy, FormulaMaterializer, NAAction

if TYPE_CHECKING: # pragma: no cover
from .model_matrix import ModelMatrices, ModelMatrix
Expand Down
8 changes: 6 additions & 2 deletions formulaic/parser/algos/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,16 @@ def tokenize(
continue # pragma: no cover; workaround bug in coverage

if word_chars.match(char):
assert token.kind in (
if token.kind not in (
None,
Token.Kind.OPERATOR,
Token.Kind.VALUE,
Token.Kind.NAME,
), f"Unexpected token kind {token.kind}."
):
raise exc_for_token( # pragma: no cover
Token(source=formula, source_start=i, source_end=i),
f"Unexpected token kind {token.kind} for character '{char}'.",
)
if token and token.kind is Token.Kind.OPERATOR:
yield token
token = Token(source=formula)
Expand Down
Loading

0 comments on commit edd315b

Please sign in to comment.