Merge pull request #188 from matthewwardrop/update_tooling

Migrate to `ruff`, update `mypy`, and include pre-commit configuration.
matthewwardrop · Aug 6, 2024 · edd315b
2 parents 2aab6f9 + 1a87262
commit edd315b
Show file tree

Hide file tree

Showing 59 changed files with 248 additions and 190 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-toml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.5.6
+    hooks:
+      - id: ruff  # Run the linter.
+        types_or: [ python, pyi, jupyter ]
+        args: [ --fix ]
+      - id: ruff-format  # Run the formatter.
+        types_or: [ python, pyi, jupyter ]
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
@@ -4,13 +4,14 @@
 import time
 from collections import namedtuple
 
-import formulaic
 import numpy
 import pandas
 import patsy
-from formulaic import Formula
 from uncertainties import ufloat
 
+import formulaic
+from formulaic import Formula
+
 ALL_TOOLINGS = ["patsy", "formulaic", "formulaic_sparse", "R", "R_sparse"]
 
 formulas = {

diff --git a/benchmarks/plot.py b/benchmarks/plot.py
@@ -4,30 +4,48 @@
 import numpy as np
 import pandas as pd
 
-
-data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'benchmarks.csv')).sort_values('mean')
+data = pd.read_csv(
+    os.path.join(os.path.dirname(__file__), "benchmarks.csv")
+).sort_values("mean")
 
 
 def grouped_barplot(df, cat, subcat, val, err, subcats=None, **kwargs):
     # based on https://stackoverflow.com/a/42033734
     categories = df[cat].unique()
     x = np.arange(len(categories))
     subcats = subcats or df[subcat].unique()
-    offsets = (np.arange(len(subcats)) - np.arange(len(subcats)).mean()) / (len(subcats) + 1.)
+    offsets = (np.arange(len(subcats)) - np.arange(len(subcats)).mean()) / (
+        len(subcats) + 1.0
+    )
     width = np.diff(offsets).mean()
     for i, gr in enumerate(subcats):
         dfg = df[df[subcat] == gr]
-        plt.bar(x + offsets[i], dfg[val].values, width=width,
-                label="{}".format(gr), yerr=dfg[err].values, capsize=6, **kwargs)
+        plt.bar(
+            x + offsets[i],
+            dfg[val].values,
+            width=width,
+            label="{}".format(gr),
+            yerr=dfg[err].values,
+            capsize=6,
+            **kwargs,
+        )
     plt.xlabel(cat)
     plt.ylabel(val)
     plt.xticks(x, categories)
-    plt.legend(title=subcat, loc='center left', bbox_to_anchor=(1, 0.5))
+    plt.legend(title=subcat, loc="center left", bbox_to_anchor=(1, 0.5))
 
 
 def plot_benchmarks(toolings=None):
     plt.figure(dpi=120, figsize=(10, 5))
-    grouped_barplot(data, cat='formula', subcat='tooling', val='mean', err='stderr', subcats=toolings, log=True)
+    grouped_barplot(
+        data,
+        cat="formula",
+        subcat="tooling",
+        val="mean",
+        err="stderr",
+        subcats=toolings,
+        log=True,
+    )
     plt.ylim(1e-2, None)
     plt.grid()
     plt.gca().set_axisbelow(True)
@@ -36,5 +54,5 @@ def plot_benchmarks(toolings=None):
     plt.tight_layout()
 
 
-plot_benchmarks(toolings=['formulaic', 'R', 'patsy', 'formulaic_sparse', 'R_sparse'])
-plt.savefig(os.path.join(os.path.dirname(__file__), 'benchmarks.png'))
+plot_benchmarks(toolings=["formulaic", "R", "patsy", "formulaic_sparse", "R_sparse"])
+plt.savefig(os.path.join(os.path.dirname(__file__), "benchmarks.png"))
diff --git a/formulaic/__init__.py b/formulaic/__init__.py
@@ -1,6 +1,6 @@
 from .formula import Formula, FormulaSpec
 from .materializers import FactorValues
-from .model_matrix import ModelMatrix, ModelMatrices
+from .model_matrix import ModelMatrices, ModelMatrix
 from .model_spec import ModelSpec, ModelSpecs
 from .sugar import model_matrix
 

diff --git a/formulaic/formula.py b/formulaic/formula.py
@@ -11,7 +11,6 @@
 from .parser.types import FormulaParser, OrderedSet, Structured, Term
 from .utils.calculus import differentiate_term
 
-
 FormulaSpec: TypeAlias = Union[
     str,
     List[Union[str, Term]],

diff --git a/formulaic/materializers/arrow.py b/formulaic/materializers/arrow.py
@@ -1,20 +1,19 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Dict, Sequence
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Dict, Iterator, Sequence
 
 import pandas
 from interface_meta import override
 
-
 from .pandas import PandasMaterializer
 
 if TYPE_CHECKING:  # pragma: no cover
     import pyarrow
 
 
 class ArrowMaterializer(PandasMaterializer):
-
-    REGISTER_NAME: str = "arrow"
+    REGISTER_NAME = "arrow"
     REGISTER_INPUTS: Sequence[str] = ("pyarrow.lib.Table",)
 
     @override
@@ -27,7 +26,7 @@ def data_context(self):
         return self.__data_context
 
 
-class LazyArrowTableProxy:
+class LazyArrowTableProxy(Mapping):
     def __init__(self, table: pyarrow.Table):
         self.table = table
         self.column_names = set(self.table.column_names)
@@ -43,3 +42,9 @@ def __getitem__(self, key: str) -> Any:
         if key not in self._cache:
             self._cache[key] = self.table.column(key).to_pandas()
         return self._cache[key]
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.column_names)
+
+    def __len__(self) -> int:
+        return len(self.column_names)
diff --git a/formulaic/materializers/base.py b/formulaic/materializers/base.py
@@ -1,27 +1,27 @@
 from __future__ import annotations
-import ast
 
+import ast
 import functools
 import inspect
 import itertools
 import operator
 from abc import abstractmethod
 from collections import defaultdict, namedtuple
 from typing import (
+    TYPE_CHECKING,
     Any,
     Dict,
     Generator,
     Hashable,
-    List,
     Iterable,
+    List,
     Mapping,
     Optional,
     Sequence,
     Set,
     Tuple,
     Type,
     Union,
-    TYPE_CHECKING,
     cast,
 )
 
@@ -56,7 +56,6 @@
 
 
 class FormulaMaterializerMeta(InterfaceMeta):
-
     INTERFACE_RAISE_ON_VIOLATION = True
 
     REGISTERED_NAMES: Dict[str, Type[FormulaMaterializer]] = {}
@@ -119,7 +118,6 @@ def for_data(cls, data: Any, output: Hashable = None) -> Type[FormulaMaterialize
 
 
 class FormulaMaterializer(metaclass=FormulaMaterializerMeta):
-
     REGISTER_NAME: Optional[str] = None
     REGISTER_INPUTS: Sequence[str] = ()
     REGISTER_OUTPUTS: Sequence[Hashable] = ()
@@ -209,7 +207,6 @@ def get_model_matrix(
     def _build_model_matrix(
         self, spec: ModelSpec, drop_rows: Sequence[int]
     ) -> ModelMatrix:
-
         # Step 0: Apply any requested column/term clustering
         # This must happen before Step 1 otherwise the greedy rank reduction
         # below would result in a different outcome than if the columns had
@@ -228,10 +225,9 @@ def _build_model_matrix(
             scoped_cols = {}
             for scoped_term in scoped_terms:
                 if not scoped_term.factors:
-                    scoped_cols[
-                        "Intercept"
-                    ] = scoped_term.scale * self._encode_constant(
-                        1, None, {}, spec, drop_rows
+                    scoped_cols["Intercept"] = (
+                        scoped_term.scale
+                        * self._encode_constant(1, None, {}, spec, drop_rows)
                     )
                 else:
                     scoped_cols.update(
@@ -455,7 +451,9 @@ def _get_scoped_terms_spanned_by_evaled_factors(
             else:
                 factors.append((ScopedFactor(factor),))
         return OrderedSet(
-            ScopedTerm(factors=(p for p in prod if p != 1), scale=scale)
+            ScopedTerm(
+                factors=(cast(ScopedFactor, p) for p in prod if p != 1), scale=scale
+            )
             for prod in itertools.product(*factors)
         )
 
@@ -534,7 +532,9 @@ def _evaluate_factor(
                     raise FactorEvaluationError(
                         f"The evaluation method `{factor.eval_method.value}` for factor `{factor}` is not understood."
                     )
-            except FactorEvaluationError:  # pragma: no cover; future proofing against new eval methods
+            except (
+                FactorEvaluationError
+            ):  # pragma: no cover; future proofing against new eval methods
                 raise
             except Exception as e:
                 raise FactorEvaluationError(
@@ -746,7 +746,8 @@ def wrapped(
             and reduced_rank
         ):
             encoded = FactorValues(
-                encoded.copy(), metadata=encoded.__formulaic_metadata__  # type: ignore
+                encoded.copy(),
+                metadata=encoded.__formulaic_metadata__,  # type: ignore
             )
             del encoded[encoded.__formulaic_metadata__.drop_field]
 
@@ -831,7 +832,10 @@ def _enforce_structure(
     ) -> Generator[Tuple[Term, List[ScopedTerm], Dict[str, Any]], None, None]:
         # TODO: Verify that imputation strategies are intuitive and make sense.
         structure = cast(List[EncodedTermStructure], spec.structure)
-        assert len(cols) == len(structure)
+        if not len(cols) == len(structure):  # pragma: no cover
+            raise RuntimeError(
+                "Specification structure and columns are mismatched. Please report this error with examples!"
+            )
         for i, col_spec in enumerate(cols):
             scoped_cols = col_spec[2]
             target_cols = structure[i][2]
@@ -854,9 +858,11 @@ def _enforce_structure(
                     f"Term `{col_spec[0]}` has generated columns that are inconsistent with specification: generated {list(scoped_cols)}, expecting {target_cols}."
                 )
 
-            yield col_spec[0], col_spec[1], {
-                col: scoped_cols[col] for col in target_cols
-            }
+            yield (
+                col_spec[0],
+                col_spec[1],
+                {col: scoped_cols[col] for col in target_cols},
+            )
 
     def _get_columns_for_term(
         self, factors: List[Dict[str, Any]], spec: ModelSpec, scale: float = 1

diff --git a/formulaic/materializers/pandas.py b/formulaic/materializers/pandas.py
@@ -2,14 +2,16 @@
 
 import functools
 import itertools
-from typing import Any, Dict, List, Sequence, Set, Tuple, cast, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Set, Tuple, cast
 
 import numpy
 import pandas
 import scipy.sparse as spsparse
 from interface_meta import override
+
 from formulaic.utils.cast import as_columns
-from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls
+from formulaic.utils.null_handling import drop_rows as drop_nulls
+from formulaic.utils.null_handling import find_nulls
 
 from .base import FormulaMaterializer
 from .types import NAAction
@@ -19,8 +21,7 @@
 
 
 class PandasMaterializer(FormulaMaterializer):
-
-    REGISTER_NAME: str = "pandas"
+    REGISTER_NAME = "pandas"
     REGISTER_INPUTS: Sequence[str] = ("pandas.core.frame.DataFrame", "pandas.DataFrame")
     REGISTER_OUTPUTS: Sequence[str] = ("pandas", "numpy", "sparse")
 
@@ -36,7 +37,6 @@ def _is_categorical(self, values: Any) -> bool:
     def _check_for_nulls(
         self, name: str, values: Any, na_action: NAAction, drop_rows: Set[int]
     ) -> None:
-
         if na_action is NAAction.IGNORE:
             return
 

diff --git a/formulaic/materializers/types/factor_values.py b/formulaic/materializers/types/factor_values.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
-import copy
 
+import copy
 from dataclasses import dataclass, replace
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -13,7 +14,6 @@
     Tuple,
     TypeVar,
     Union,
-    TYPE_CHECKING,
 )
 
 import wrapt

diff --git a/formulaic/model_matrix.py b/formulaic/model_matrix.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import copy
-from typing import Any, Generic, Optional, TypeVar, TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, cast
 
 import wrapt
 

diff --git a/formulaic/model_spec.py b/formulaic/model_spec.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from dataclasses import dataclass, field, replace
 from typing import (
+    TYPE_CHECKING,
     Any,
     Dict,
     List,
@@ -11,17 +12,16 @@
     Sequence,
     Set,
     Union,
-    TYPE_CHECKING,
     cast,
 )
 
 from formulaic.materializers.base import EncodedTermStructure
 from formulaic.parser.types import Structured, Term
-from formulaic.utils.constraints import LinearConstraintSpec, LinearConstraints
+from formulaic.utils.constraints import LinearConstraints, LinearConstraintSpec
 from formulaic.utils.variables import Variable
 
 from .formula import Formula, FormulaSpec
-from .materializers import FormulaMaterializer, NAAction, ClusterBy
+from .materializers import ClusterBy, FormulaMaterializer, NAAction
 
 if TYPE_CHECKING:  # pragma: no cover
     from .model_matrix import ModelMatrices, ModelMatrix

diff --git a/formulaic/parser/algos/tokenize.py b/formulaic/parser/algos/tokenize.py
@@ -150,12 +150,16 @@ def tokenize(
             continue  # pragma: no cover; workaround bug in coverage
 
         if word_chars.match(char):
-            assert token.kind in (
+            if token.kind not in (
                 None,
                 Token.Kind.OPERATOR,
                 Token.Kind.VALUE,
                 Token.Kind.NAME,
-            ), f"Unexpected token kind {token.kind}."
+            ):
+                raise exc_for_token(  # pragma: no cover
+                    Token(source=formula, source_start=i, source_end=i),
+                    f"Unexpected token kind {token.kind} for character '{char}'.",
+                )
             if token and token.kind is Token.Kind.OPERATOR:
                 yield token
                 token = Token(source=formula)