From 3d24d0e378faac4359be4f2ae4f862a5820842da Mon Sep 17 00:00:00 2001 From: Matthew Wardrop Date: Tue, 6 Aug 2024 09:30:42 -0700 Subject: [PATCH] Migrate to `ruff`, update `mypy`, and include pre-commit configuration. --- .pre-commit-config.yaml | 16 ++++ benchmarks/benchmark.py | 5 +- benchmarks/plot.py | 36 ++++++--- formulaic/__init__.py | 2 +- formulaic/formula.py | 1 - formulaic/materializers/arrow.py | 8 +- formulaic/materializers/base.py | 36 +++++---- formulaic/materializers/pandas.py | 14 ++-- .../materializers/types/factor_values.py | 4 +- formulaic/model_matrix.py | 2 +- formulaic/model_spec.py | 6 +- formulaic/parser/algos/tokenize.py | 8 +- formulaic/parser/algos/tokens_to_ast.py | 10 ++- formulaic/parser/parser.py | 4 +- formulaic/parser/types/__init__.py | 1 - formulaic/parser/types/ast_node.py | 1 - formulaic/parser/types/factor.py | 2 +- formulaic/parser/types/formula_parser.py | 2 +- formulaic/parser/types/ordered_set.py | 1 - formulaic/parser/types/structured.py | 5 +- formulaic/parser/types/term.py | 2 +- formulaic/parser/types/token.py | 2 +- formulaic/parser/utils.py | 4 +- formulaic/transforms/__init__.py | 4 +- formulaic/transforms/contrasts.py | 12 +-- formulaic/transforms/hashed.py | 4 +- formulaic/transforms/patsy_compat.py | 7 +- formulaic/transforms/scale.py | 4 +- formulaic/utils/calculus.py | 5 +- formulaic/utils/constraints.py | 11 +-- formulaic/utils/context.py | 4 +- formulaic/utils/sentinels.py | 1 + formulaic/utils/sparse.py | 2 +- formulaic/utils/stateful_transforms.py | 22 ++--- pyproject.toml | 81 +++++++++---------- tests/materializers/test_base.py | 6 +- tests/materializers/test_pandas.py | 1 - .../types/test_evaluated_factor.py | 2 +- tests/materializers/types/test_scoped_term.py | 2 +- tests/parser/algos/test_tokenize.py | 3 +- tests/parser/algos/test_tokens_to_ast.py | 6 +- tests/parser/test_parser.py | 4 +- tests/parser/test_utils.py | 3 +- tests/parser/types/test_formula_parser.py | 1 - 
tests/parser/types/test_ordered_set.py | 1 - tests/parser/types/test_structured.py | 4 +- tests/test_model_matrix.py | 8 +- tests/test_model_spec.py | 8 +- tests/test_sugar.py | 3 +- tests/transforms/test_basis_spline.py | 2 +- tests/transforms/test_contrasts.py | 4 +- tests/transforms/test_patsy_compat.py | 3 +- tests/utils/test_constraints.py | 4 +- tests/utils/test_iterators.py | 1 - tests/utils/test_layered_mapping.py | 2 - tests/utils/test_null_handling.py | 4 +- tests/utils/test_sentinels.py | 2 +- tests/utils/test_stateful_transforms.py | 4 +- 58 files changed, 215 insertions(+), 192 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..39e7e3c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,16 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.6 + hooks: + - id: ruff # Run the linter. + types_or: [ python, pyi, jupyter ] + args: [ --fix ] + - id: ruff-format # Run the formatter. 
+ types_or: [ python, pyi, jupyter ] diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index dbaffd9..0d7ba3e 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -4,13 +4,14 @@ import time from collections import namedtuple -import formulaic import numpy import pandas import patsy -from formulaic import Formula from uncertainties import ufloat +import formulaic +from formulaic import Formula + ALL_TOOLINGS = ["patsy", "formulaic", "formulaic_sparse", "R", "R_sparse"] formulas = { diff --git a/benchmarks/plot.py b/benchmarks/plot.py index c85a324..12a6a11 100644 --- a/benchmarks/plot.py +++ b/benchmarks/plot.py @@ -4,8 +4,9 @@ import numpy as np import pandas as pd - -data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'benchmarks.csv')).sort_values('mean') +data = pd.read_csv( + os.path.join(os.path.dirname(__file__), "benchmarks.csv") +).sort_values("mean") def grouped_barplot(df, cat, subcat, val, err, subcats=None, **kwargs): @@ -13,21 +14,38 @@ def grouped_barplot(df, cat, subcat, val, err, subcats=None, **kwargs): categories = df[cat].unique() x = np.arange(len(categories)) subcats = subcats or df[subcat].unique() - offsets = (np.arange(len(subcats)) - np.arange(len(subcats)).mean()) / (len(subcats) + 1.) 
+ offsets = (np.arange(len(subcats)) - np.arange(len(subcats)).mean()) / ( + len(subcats) + 1.0 + ) width = np.diff(offsets).mean() for i, gr in enumerate(subcats): dfg = df[df[subcat] == gr] - plt.bar(x + offsets[i], dfg[val].values, width=width, - label="{}".format(gr), yerr=dfg[err].values, capsize=6, **kwargs) + plt.bar( + x + offsets[i], + dfg[val].values, + width=width, + label="{}".format(gr), + yerr=dfg[err].values, + capsize=6, + **kwargs, + ) plt.xlabel(cat) plt.ylabel(val) plt.xticks(x, categories) - plt.legend(title=subcat, loc='center left', bbox_to_anchor=(1, 0.5)) + plt.legend(title=subcat, loc="center left", bbox_to_anchor=(1, 0.5)) def plot_benchmarks(toolings=None): plt.figure(dpi=120, figsize=(10, 5)) - grouped_barplot(data, cat='formula', subcat='tooling', val='mean', err='stderr', subcats=toolings, log=True) + grouped_barplot( + data, + cat="formula", + subcat="tooling", + val="mean", + err="stderr", + subcats=toolings, + log=True, + ) plt.ylim(1e-2, None) plt.grid() plt.gca().set_axisbelow(True) @@ -36,5 +54,5 @@ def plot_benchmarks(toolings=None): plt.tight_layout() -plot_benchmarks(toolings=['formulaic', 'R', 'patsy', 'formulaic_sparse', 'R_sparse']) -plt.savefig(os.path.join(os.path.dirname(__file__), 'benchmarks.png')) +plot_benchmarks(toolings=["formulaic", "R", "patsy", "formulaic_sparse", "R_sparse"]) +plt.savefig(os.path.join(os.path.dirname(__file__), "benchmarks.png")) diff --git a/formulaic/__init__.py b/formulaic/__init__.py index 5de2752..0cdaaf6 100644 --- a/formulaic/__init__.py +++ b/formulaic/__init__.py @@ -1,6 +1,6 @@ from .formula import Formula, FormulaSpec from .materializers import FactorValues -from .model_matrix import ModelMatrix, ModelMatrices +from .model_matrix import ModelMatrices, ModelMatrix from .model_spec import ModelSpec, ModelSpecs from .sugar import model_matrix diff --git a/formulaic/formula.py b/formulaic/formula.py index 321f957..7557c39 100644 --- a/formulaic/formula.py +++ b/formulaic/formula.py @@ 
-11,7 +11,6 @@ from .parser.types import FormulaParser, OrderedSet, Structured, Term from .utils.calculus import differentiate_term - FormulaSpec: TypeAlias = Union[ str, List[Union[str, Term]], diff --git a/formulaic/materializers/arrow.py b/formulaic/materializers/arrow.py index 501e82b..894b1e7 100644 --- a/formulaic/materializers/arrow.py +++ b/formulaic/materializers/arrow.py @@ -1,11 +1,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, Sequence +from typing import TYPE_CHECKING, Any, Dict import pandas from interface_meta import override - from .pandas import PandasMaterializer if TYPE_CHECKING: # pragma: no cover @@ -13,9 +12,8 @@ class ArrowMaterializer(PandasMaterializer): - - REGISTER_NAME: str = "arrow" - REGISTER_INPUTS: Sequence[str] = ("pyarrow.lib.Table",) + REGISTER_NAME = "arrow" + REGISTER_INPUTS = ("pyarrow.lib.Table",) @override def _init(self) -> None: diff --git a/formulaic/materializers/base.py b/formulaic/materializers/base.py index 6393769..b07c627 100644 --- a/formulaic/materializers/base.py +++ b/formulaic/materializers/base.py @@ -1,6 +1,6 @@ from __future__ import annotations -import ast +import ast import functools import inspect import itertools @@ -8,12 +8,13 @@ from abc import abstractmethod from collections import defaultdict, namedtuple from typing import ( + TYPE_CHECKING, Any, Dict, Generator, Hashable, - List, Iterable, + List, Mapping, Optional, Sequence, @@ -21,7 +22,6 @@ Tuple, Type, Union, - TYPE_CHECKING, cast, ) @@ -56,7 +56,6 @@ class FormulaMaterializerMeta(InterfaceMeta): - INTERFACE_RAISE_ON_VIOLATION = True REGISTERED_NAMES: Dict[str, Type[FormulaMaterializer]] = {} @@ -119,7 +118,6 @@ def for_data(cls, data: Any, output: Hashable = None) -> Type[FormulaMaterialize class FormulaMaterializer(metaclass=FormulaMaterializerMeta): - REGISTER_NAME: Optional[str] = None REGISTER_INPUTS: Sequence[str] = () REGISTER_OUTPUTS: Sequence[Hashable] = () @@ -209,7 +207,6 @@ def get_model_matrix( 
def _build_model_matrix( self, spec: ModelSpec, drop_rows: Sequence[int] ) -> ModelMatrix: - # Step 0: Apply any requested column/term clustering # This must happen before Step 1 otherwise the greedy rank reduction # below would result in a different outcome than if the columns had @@ -228,10 +225,9 @@ def _build_model_matrix( scoped_cols = {} for scoped_term in scoped_terms: if not scoped_term.factors: - scoped_cols[ - "Intercept" - ] = scoped_term.scale * self._encode_constant( - 1, None, {}, spec, drop_rows + scoped_cols["Intercept"] = ( + scoped_term.scale + * self._encode_constant(1, None, {}, spec, drop_rows) ) else: scoped_cols.update( @@ -534,7 +530,9 @@ def _evaluate_factor( raise FactorEvaluationError( f"The evaluation method `{factor.eval_method.value}` for factor `{factor}` is not understood." ) - except FactorEvaluationError: # pragma: no cover; future proofing against new eval methods + except ( + FactorEvaluationError + ): # pragma: no cover; future proofing against new eval methods raise except Exception as e: raise FactorEvaluationError( @@ -746,7 +744,8 @@ def wrapped( and reduced_rank ): encoded = FactorValues( - encoded.copy(), metadata=encoded.__formulaic_metadata__ # type: ignore + encoded.copy(), + metadata=encoded.__formulaic_metadata__, # type: ignore ) del encoded[encoded.__formulaic_metadata__.drop_field] @@ -831,7 +830,10 @@ def _enforce_structure( ) -> Generator[Tuple[Term, List[ScopedTerm], Dict[str, Any]], None, None]: # TODO: Verify that imputation strategies are intuitive and make sense. structure = cast(List[EncodedTermStructure], spec.structure) - assert len(cols) == len(structure) + if not len(cols) == len(structure): + raise RuntimeError( + "Specification structure and columns are mismatched. Please report this error with examples!" 
+ ) for i, col_spec in enumerate(cols): scoped_cols = col_spec[2] target_cols = structure[i][2] @@ -854,9 +856,11 @@ def _enforce_structure( f"Term `{col_spec[0]}` has generated columns that are inconsistent with specification: generated {list(scoped_cols)}, expecting {target_cols}." ) - yield col_spec[0], col_spec[1], { - col: scoped_cols[col] for col in target_cols - } + yield ( + col_spec[0], + col_spec[1], + {col: scoped_cols[col] for col in target_cols}, + ) def _get_columns_for_term( self, factors: List[Dict[str, Any]], spec: ModelSpec, scale: float = 1 diff --git a/formulaic/materializers/pandas.py b/formulaic/materializers/pandas.py index 525a222..2be07ea 100644 --- a/formulaic/materializers/pandas.py +++ b/formulaic/materializers/pandas.py @@ -2,14 +2,16 @@ import functools import itertools -from typing import Any, Dict, List, Sequence, Set, Tuple, cast, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Set, Tuple, cast import numpy import pandas import scipy.sparse as spsparse from interface_meta import override + from formulaic.utils.cast import as_columns -from formulaic.utils.null_handling import find_nulls, drop_rows as drop_nulls +from formulaic.utils.null_handling import drop_rows as drop_nulls +from formulaic.utils.null_handling import find_nulls from .base import FormulaMaterializer from .types import NAAction @@ -19,10 +21,9 @@ class PandasMaterializer(FormulaMaterializer): - - REGISTER_NAME: str = "pandas" - REGISTER_INPUTS: Sequence[str] = ("pandas.core.frame.DataFrame", "pandas.DataFrame") - REGISTER_OUTPUTS: Sequence[str] = ("pandas", "numpy", "sparse") + REGISTER_NAME = "pandas" + REGISTER_INPUTS = ("pandas.core.frame.DataFrame", "pandas.DataFrame") + REGISTER_OUTPUTS = ("pandas", "numpy", "sparse") @override def _is_categorical(self, values: Any) -> bool: @@ -36,7 +37,6 @@ def _is_categorical(self, values: Any) -> bool: def _check_for_nulls( self, name: str, values: Any, na_action: NAAction, drop_rows: Set[int] ) 
-> None: - if na_action is NAAction.IGNORE: return diff --git a/formulaic/materializers/types/factor_values.py b/formulaic/materializers/types/factor_values.py index 4f92973..09392a6 100644 --- a/formulaic/materializers/types/factor_values.py +++ b/formulaic/materializers/types/factor_values.py @@ -1,8 +1,9 @@ from __future__ import annotations -import copy +import copy from dataclasses import dataclass, replace from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -13,7 +14,6 @@ Tuple, TypeVar, Union, - TYPE_CHECKING, ) import wrapt diff --git a/formulaic/model_matrix.py b/formulaic/model_matrix.py index 147d8ad..2f3458d 100644 --- a/formulaic/model_matrix.py +++ b/formulaic/model_matrix.py @@ -1,7 +1,7 @@ from __future__ import annotations import copy -from typing import Any, Generic, Optional, TypeVar, TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, cast import wrapt diff --git a/formulaic/model_spec.py b/formulaic/model_spec.py index 649c724..f8c8ce0 100644 --- a/formulaic/model_spec.py +++ b/formulaic/model_spec.py @@ -3,6 +3,7 @@ from collections import defaultdict from dataclasses import dataclass, field, replace from typing import ( + TYPE_CHECKING, Any, Dict, List, @@ -11,17 +12,16 @@ Sequence, Set, Union, - TYPE_CHECKING, cast, ) from formulaic.materializers.base import EncodedTermStructure from formulaic.parser.types import Structured, Term -from formulaic.utils.constraints import LinearConstraintSpec, LinearConstraints +from formulaic.utils.constraints import LinearConstraints, LinearConstraintSpec from formulaic.utils.variables import Variable from .formula import Formula, FormulaSpec -from .materializers import FormulaMaterializer, NAAction, ClusterBy +from .materializers import ClusterBy, FormulaMaterializer, NAAction if TYPE_CHECKING: # pragma: no cover from .model_matrix import ModelMatrices, ModelMatrix diff --git a/formulaic/parser/algos/tokenize.py b/formulaic/parser/algos/tokenize.py index 
7c5e512..255ab54 100644 --- a/formulaic/parser/algos/tokenize.py +++ b/formulaic/parser/algos/tokenize.py @@ -150,12 +150,16 @@ def tokenize( continue # pragma: no cover; workaround bug in coverage if word_chars.match(char): - assert token.kind in ( + if token.kind not in ( None, Token.Kind.OPERATOR, Token.Kind.VALUE, Token.Kind.NAME, - ), f"Unexpected token kind {token.kind}." + ): + raise exc_for_token( + Token(source=formula, source_start=i, source_end=i), + f"Unexpected token kind {token.kind} for character '{char}'.", + ) if token and token.kind is Token.Kind.OPERATOR: yield token token = Token(source=formula) diff --git a/formulaic/parser/algos/tokens_to_ast.py b/formulaic/parser/algos/tokens_to_ast.py index 0bb3c07..1fe7dda 100644 --- a/formulaic/parser/algos/tokens_to_ast.py +++ b/formulaic/parser/algos/tokens_to_ast.py @@ -2,8 +2,7 @@ from typing import Iterable, List, Union from ..types import ASTNode, Operator, OperatorResolver, Token -from ..utils import exc_for_token, exc_for_missing_operator - +from ..utils import exc_for_missing_operator, exc_for_token OrderedOperator = namedtuple("OrderedOperator", ("operator", "token", "index")) CONTEXT_OPENERS = {"(", "["} @@ -53,7 +52,11 @@ def operate( operator, token, index = ordered_operator if operator.fixity is Operator.Fixity.INFIX: - assert operator.arity == 2 + if operator.arity != 2: + raise exc_for_token( + token, + f"Infix operator `{token.token}` must have an arity of 2 (got: {operator.arity}).", + ) min_index = index - 1 max_index = index + 1 elif operator.fixity is Operator.Fixity.PREFIX: @@ -107,7 +110,6 @@ def operate( ) for operator in operators: - while ( operator_stack and operator_stack[-1].token.kind is not Token.Kind.CONTEXT diff --git a/formulaic/parser/parser.py b/formulaic/parser/parser.py index 3100c21..7232e9f 100644 --- a/formulaic/parser/parser.py +++ b/formulaic/parser/parser.py @@ -1,9 +1,9 @@ import ast -import itertools import functools +import itertools import re from dataclasses 
import dataclass, field -from typing import List, Iterable, Sequence, Tuple, Union, cast +from typing import Iterable, List, Sequence, Tuple, Union, cast from .algos.sanitize_tokens import sanitize_tokens from .algos.tokenize import tokenize diff --git a/formulaic/parser/types/__init__.py b/formulaic/parser/types/__init__.py index 01ded9b..c7662ac 100644 --- a/formulaic/parser/types/__init__.py +++ b/formulaic/parser/types/__init__.py @@ -8,7 +8,6 @@ from .term import Term from .token import Token - __all__ = [ "ASTNode", "Factor", diff --git a/formulaic/parser/types/ast_node.py b/formulaic/parser/types/ast_node.py index 3b901e8..cf93f22 100644 --- a/formulaic/parser/types/ast_node.py +++ b/formulaic/parser/types/ast_node.py @@ -7,7 +7,6 @@ from .structured import Structured from .term import Term - ItemType = TypeVar("ItemType") diff --git a/formulaic/parser/types/factor.py b/formulaic/parser/types/factor.py index 1075588..d8af694 100644 --- a/formulaic/parser/types/factor.py +++ b/formulaic/parser/types/factor.py @@ -1,7 +1,7 @@ from __future__ import annotations from enum import Enum -from typing import Any, Dict, Optional, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Optional, Union from .ordered_set import OrderedSet from .term import Term diff --git a/formulaic/parser/types/formula_parser.py b/formulaic/parser/types/formula_parser.py index 444d977..7cedc88 100644 --- a/formulaic/parser/types/formula_parser.py +++ b/formulaic/parser/types/formula_parser.py @@ -41,8 +41,8 @@ def get_tokens(self, formula: str) -> Iterable[Token]: Args: formula: The formula string to be tokenized. 
""" - from ..algos.tokenize import tokenize from ..algos.sanitize_tokens import sanitize_tokens + from ..algos.tokenize import tokenize return sanitize_tokens(tokenize(formula)) diff --git a/formulaic/parser/types/ordered_set.py b/formulaic/parser/types/ordered_set.py index dc3d7c3..f961e80 100644 --- a/formulaic/parser/types/ordered_set.py +++ b/formulaic/parser/types/ordered_set.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections.abc import Set - from typing import Any, Generic, Iterable, Iterator, TypeVar ItemType = TypeVar("ItemType") diff --git a/formulaic/parser/types/structured.py b/formulaic/parser/types/structured.py index fd3bd1b..da290e3 100644 --- a/formulaic/parser/types/structured.py +++ b/formulaic/parser/types/structured.py @@ -17,7 +17,6 @@ Union, ) - ItemType = TypeVar("ItemType") _MISSING = object() @@ -437,9 +436,7 @@ def __setitem__(self, key: Any, value: Any) -> Any: self._structure[key] = self.__prepare_item(key, value) def __iter__(self) -> Generator[Any, None, None]: - if ( - self._has_root and not self._has_keys and isinstance(self.root, Iterable) - ): # pylint: disable=isinstance-second-argument-not-valid-type + if self._has_root and not self._has_keys and isinstance(self.root, Iterable): # pylint: disable=isinstance-second-argument-not-valid-type yield from self.root else: if self._has_root: # Always yield root first. 
diff --git a/formulaic/parser/types/term.py b/formulaic/parser/types/term.py index 1cb575d..158bb23 100644 --- a/formulaic/parser/types/term.py +++ b/formulaic/parser/types/term.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Iterable, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Iterable if TYPE_CHECKING: from .factor import Factor # pragma: no cover diff --git a/formulaic/parser/types/token.py b/formulaic/parser/types/token.py index cf6bd95..3679396 100644 --- a/formulaic/parser/types/token.py +++ b/formulaic/parser/types/token.py @@ -5,8 +5,8 @@ from enum import Enum from typing import Any, Iterable, Optional, Tuple, Union -from .ordered_set import OrderedSet from .factor import Factor +from .ordered_set import OrderedSet from .term import Term diff --git a/formulaic/parser/utils.py b/formulaic/parser/utils.py index ae9ca2e..b26dec3 100644 --- a/formulaic/parser/utils.py +++ b/formulaic/parser/utils.py @@ -2,10 +2,10 @@ from typing import Iterable, Optional, Sequence, Set, Tuple, Type, Union from formulaic.errors import FormulaSyntaxError + from .types.ast_node import ASTNode from .types.token import Token - # Exception handling @@ -241,7 +241,7 @@ def merge_operator_tokens( # `token` is an operator that can be collapsed on the left if pooled_token: pooled_token = token.copy_with_attrs(token=pooled_token.token + token.token) - if symbols and not pooled_token.token[-1] in symbols: + if symbols and pooled_token.token[-1] not in symbols: yield pooled_token pooled_token = None continue diff --git a/formulaic/transforms/__init__.py b/formulaic/transforms/__init__.py index e94b40c..85502be 100644 --- a/formulaic/transforms/__init__.py +++ b/formulaic/transforms/__init__.py @@ -3,9 +3,9 @@ from formulaic.utils.stateful_transforms import stateful_transform from .basis_spline import basis_spline -from .identity import identity -from .contrasts import C, encode_contrasts, ContrastsRegistry +from .contrasts import C, 
ContrastsRegistry, encode_contrasts from .hashed import hashed +from .identity import identity from .patsy_compat import PATSY_COMPAT_TRANSFORMS from .poly import poly from .scale import center, scale diff --git a/formulaic/transforms/contrasts.py b/formulaic/transforms/contrasts.py index 8b5e573..54e5215 100644 --- a/formulaic/transforms/contrasts.py +++ b/formulaic/transforms/contrasts.py @@ -1,20 +1,20 @@ from __future__ import annotations -from abc import abstractmethod import inspect import warnings +from abc import abstractmethod from numbers import Number from typing import ( + TYPE_CHECKING, Any, - Hashable, - Sequence, - Tuple, - Union, Dict, + Hashable, Iterable, List, Optional, - TYPE_CHECKING, + Sequence, + Tuple, + Union, cast, ) diff --git a/formulaic/transforms/hashed.py b/formulaic/transforms/hashed.py index 5418c49..c1aec07 100644 --- a/formulaic/transforms/hashed.py +++ b/formulaic/transforms/hashed.py @@ -3,7 +3,7 @@ import sys from hashlib import md5 from numbers import Number -from typing import Any, Callable, Dict, Iterable, List, Optional, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Union import numpy as np @@ -19,7 +19,7 @@ def md5_to_int(s: str) -> int: # pragma: no cover; branched code if sys.version_info >= (3, 9): hashed = md5(s.encode(), usedforsecurity=False) else: - hashed = md5(s.encode()) + hashed = md5(s.encode()) # noqa: S324 ; use of insecure hash function return int(hashed.hexdigest(), 16) diff --git a/formulaic/transforms/patsy_compat.py b/formulaic/transforms/patsy_compat.py index c66ed55..a58e593 100644 --- a/formulaic/transforms/patsy_compat.py +++ b/formulaic/transforms/patsy_compat.py @@ -1,12 +1,13 @@ from typing import Any, Dict, Mapping, Optional + from formulaic.utils.stateful_transforms import stateful_transform from .contrasts import ( - TreatmentContrasts, + DiffContrasts, + HelmertContrasts, PolyContrasts, SumContrasts, - HelmertContrasts, - DiffContrasts, 
+ TreatmentContrasts, ) from .scale import scale diff --git a/formulaic/transforms/scale.py b/formulaic/transforms/scale.py index 8384714..596de48 100644 --- a/formulaic/transforms/scale.py +++ b/formulaic/transforms/scale.py @@ -1,4 +1,5 @@ from typing import Any + import numpy import scipy.sparse as spsparse @@ -60,7 +61,8 @@ def scale( # pylint: disable=dangerous-default-value # always replaced by stat @scale.register # type: ignore[attr-defined] def _(data: spsparse.spmatrix, *args: Any, **kwargs: Any) -> numpy.ndarray: - assert data.shape[1] == 1 + if data.shape[1] != 1: + raise ValueError("Cannot scale a sparse matrix with more than one column.") return scale(data.toarray()[:, 0], *args, **kwargs) diff --git a/formulaic/utils/calculus.py b/formulaic/utils/calculus.py index 43e2f5a..8f7e1a5 100644 --- a/formulaic/utils/calculus.py +++ b/formulaic/utils/calculus.py @@ -102,7 +102,10 @@ def _differentiate_factors( "`sympy` is not available. Install it using `pip install formulaic[calculus]` or `pip install sympy`." ) from e else: - assert len(factors) == 1 + if len(factors) != 1: + raise RuntimeError( + "Cannot differentiate non-trivial factors without `sympy`." 
+ ) expr = 1 eval_method = next(iter(factors)).eval_method diff --git a/formulaic/utils/constraints.py b/formulaic/utils/constraints.py index eca87e4..f4d5fe0 100644 --- a/formulaic/utils/constraints.py +++ b/formulaic/utils/constraints.py @@ -16,22 +16,21 @@ Union, cast, ) -from typing_extensions import Literal import numpy +from typing_extensions import Literal from formulaic.parser.algos.tokenize import tokenize from formulaic.parser.algos.tokens_to_ast import tokens_to_ast from formulaic.parser.types import ( ASTNode, Factor, - OperatorResolver, Operator, + OperatorResolver, Token, ) from formulaic.parser.utils import exc_for_token - LinearConstraintSpec = Union[ str, Dict[str, Number], @@ -364,9 +363,7 @@ def __repr__(self) -> str: return f"{self.scale}*{self.factor}" # pragma: no cover -class ConstraintOperatorResolver( - OperatorResolver -): # pylint: disable=unnecessary-lambda +class ConstraintOperatorResolver(OperatorResolver): # pylint: disable=unnecessary-lambda """ The default constraint `OperatorResolver` implementation. 
@@ -386,7 +383,6 @@ def join_tuples(lhs: Any, rhs: Any) -> Tuple: def add_terms( terms_left: Set[ScaledFactor], terms_right: Set[ScaledFactor] ) -> Set[ScaledFactor]: - terms_left = {term: term for term in terms_left} terms_right = {term: term for term in terms_right} @@ -403,7 +399,6 @@ def add_terms( def sub_terms( terms_left: Set[ScaledFactor], terms_right: Set[ScaledFactor] ) -> Set[ScaledFactor]: - terms_left = {term: term for term in terms_left} terms_right = {term: term for term in terms_right} diff --git a/formulaic/utils/context.py b/formulaic/utils/context.py index a1e4cb4..2784665 100644 --- a/formulaic/utils/context.py +++ b/formulaic/utils/context.py @@ -1,11 +1,11 @@ import sys -from typing import Any, Optional, Mapping, Union +from typing import Any, Mapping, Optional, Union from .layered_mapping import LayeredMapping def capture_context( - context: Optional[Union[int, Mapping[str, Any]]] = 0 + context: Optional[Union[int, Mapping[str, Any]]] = 0, ) -> Optional[Mapping[str, Any]]: """ Explicitly capture the context to be used by subsequent formula diff --git a/formulaic/utils/sentinels.py b/formulaic/utils/sentinels.py index b4f438d..7d25413 100644 --- a/formulaic/utils/sentinels.py +++ b/formulaic/utils/sentinels.py @@ -1,6 +1,7 @@ from __future__ import annotations from typing import Dict + from typing_extensions import Self diff --git a/formulaic/utils/sparse.py b/formulaic/utils/sparse.py index 65ded25..bc5a52c 100644 --- a/formulaic/utils/sparse.py +++ b/formulaic/utils/sparse.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Tuple, List +from typing import Iterable, List, Optional, Tuple import numpy import pandas diff --git a/formulaic/utils/stateful_transforms.py b/formulaic/utils/stateful_transforms.py index 74b9c70..fd7e4a8 100644 --- a/formulaic/utils/stateful_transforms.py +++ b/formulaic/utils/stateful_transforms.py @@ -2,6 +2,7 @@ import functools import inspect from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ 
-9,13 +10,12 @@ MutableMapping, Optional, Set, - TYPE_CHECKING, cast, ) from .code import format_expr, sanitize_variable_names from .layered_mapping import LayeredMapping -from .variables import get_expression_variables, Variable +from .variables import Variable, get_expression_variables if TYPE_CHECKING: from formulaic.model_spec import ModelSpec # pragma: no cover @@ -177,10 +177,16 @@ def stateful_eval( # Compile mutated AST compiled = compile(ast.fix_missing_locations(code), "", "eval") - assert "__FORMULAIC_CONTEXT__" not in env - assert "__FORMULAIC_METADATA__" not in env - assert "__FORMULAIC_STATE__" not in env - assert "__FORMULAIC_SPEC__" not in env + if used_reserved := { + "__FORMULAIC_CONTEXT__", + "__FORMULAIC_METADATA__", + "__FORMULAIC_STATE__", + "__FORMULAIC_SPEC__", + }.intersection(env): + raise ValueError( + f"Reserved names {repr(used_reserved)} are already in use in the " + "evaluation environment." + ) # Evaluate and return return eval( @@ -216,9 +222,7 @@ def _is_stateful_transform(node: ast.AST, env: Mapping) -> bool: return False try: - func = eval( - compile(format_expr(node.func), "", "eval"), {}, env - ) # nosec; Get function handle (assuming it exists in env) + func = eval(compile(format_expr(node.func), "", "eval"), {}, env) # nosec; Get function handle (assuming it exists in env) return getattr(func, "__is_stateful_transform__", False) except NameError: return False diff --git a/pyproject.toml b/pyproject.toml index 8b1273a..43b0b27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,61 +105,58 @@ dependencies = [ [tool.hatch.envs.lint] dependencies = [ - "black==22.6", - "flake8==5.0.4", - "flake8-pyproject", - "mypy==1.4.1", + "mypy==1.11.1", "mypy-extensions==1.0.0", - "pylint==2.17.4", - "pytest-cov==3.0.0", - "pytest==6.2.5", + "ruff==0.5.6", ] [tool.hatch.envs.lint.scripts] check = [ - "black --check formulaic tests", - "flake8 formulaic", - "pylint formulaic", + "ruff format --check", "mypy formulaic", ] -format = "black 
formulaic tests" +format = "ruff format" # Linting configuration -[tool.flake8] -ignore = [ - "C901","E203","E501","E712","E722","E731","W503","W504","W601" +[tool.ruff] +target-version = "py310" + +exclude = [ + "*.egg-info", + "*.pyc", + ".cache", + ".coverage.*", + ".gradle", + ".tox", + "build", + "dist", + "htmlcov.*", +] + +[tool.ruff.lint] +select = [ + "F", # pyflakes + "E", # pycodestyle + "W", # pycodestyle + "S", # bandit + "I001", # import sorting + # "D", # documentation ] -max-complexity = 25 -import-order-style = "edited" -application-import-names = "formulaic" - -[tool.pylint."MESSAGES CONTROL"] -disable = [ - "cyclic-import", - "duplicate-code", - "eval-used", - "fixme", - "import-outside-toplevel", - "invalid-name", - "line-too-long", - "missing-class-docstring", - "missing-function-docstring", - "missing-module-docstring", - "no-member", - "protected-access", - "redefined-outer-name", - "too-few-public-methods", - "too-many-arguments", - "too-many-branches", - "too-many-instance-attributes", - "too-many-statements", - "ungrouped-imports", - "unnecessary-lambda-assignment", - "unused-argument", - "use-dict-literal", + +ignore = [ + "C901", + "E203", + "E501", + "E712", + "E722", + "E731", + "S307", # suspicious-eval-usage: `eval` is used intentionally to evaluate formula expressions ] +[tool.ruff.lint.per-file-ignores] +"**/tests/*" = ["F", "E", "W", "S"] + [tool.mypy] allow_redefinition = true disallow_untyped_defs = true diff --git a/tests/materializers/test_base.py b/tests/materializers/test_base.py index a5ef656..b465f95 100644 --- a/tests/materializers/test_base.py +++ b/tests/materializers/test_base.py @@ -2,14 +2,14 @@ import pytest from formulaic.errors import FactorEncodingError, FormulaMaterializerNotFoundError +from formulaic.materializers.base import FormulaMaterializer +from formulaic.materializers.pandas import PandasMaterializer from formulaic.materializers.types import ( EvaluatedFactor, FactorValues, ScopedFactor, ScopedTerm, ) -from formulaic.materializers.base import 
FormulaMaterializer -from formulaic.materializers.pandas import PandasMaterializer from formulaic.model_spec import ModelSpec from formulaic.parser.types import Factor @@ -105,7 +105,6 @@ def test__simplify_scoped_terms(self, evaled_factors): ) == [ScopedTerm((A, B, C_))] def test__flatten_encoded_evaled_factor(self): - flattened = PandasMaterializer(data=None)._flatten_encoded_evaled_factor( "name", FactorValues( @@ -120,7 +119,6 @@ def test__flatten_encoded_evaled_factor(self): assert list(flattened.values()) == [1, 2, 3, 4] def test__enforce_structure(self): - # TODO: Make sure that imputations are intuitive df = pandas.DataFrame({"a": [1]}) diff --git a/tests/materializers/test_pandas.py b/tests/materializers/test_pandas.py index b85cda4..dc3da5d 100644 --- a/tests/materializers/test_pandas.py +++ b/tests/materializers/test_pandas.py @@ -19,7 +19,6 @@ from formulaic.model_spec import ModelSpec from formulaic.parser.types import Factor, Structured - PANDAS_TESTS = { # '': (, , , ) "a": (["Intercept", "a"], ["Intercept", "a"], ["Intercept", "a"], 2), diff --git a/tests/materializers/types/test_evaluated_factor.py b/tests/materializers/types/test_evaluated_factor.py index 2460962..249b0e8 100644 --- a/tests/materializers/types/test_evaluated_factor.py +++ b/tests/materializers/types/test_evaluated_factor.py @@ -1,7 +1,7 @@ import pytest -from formulaic.parser.types import Factor from formulaic.materializers.types import EvaluatedFactor, FactorValues +from formulaic.parser.types import Factor class TestEvaluatedFactor: diff --git a/tests/materializers/types/test_scoped_term.py b/tests/materializers/types/test_scoped_term.py index 7b6a957..5696609 100644 --- a/tests/materializers/types/test_scoped_term.py +++ b/tests/materializers/types/test_scoped_term.py @@ -1,6 +1,6 @@ import pytest -from formulaic.materializers.types import ScopedFactor, ScopedTerm, EvaluatedFactor +from formulaic.materializers.types import EvaluatedFactor, ScopedFactor, ScopedTerm from 
formulaic.parser.types import Factor from formulaic.utils.variables import Variable diff --git a/tests/parser/algos/test_tokenize.py b/tests/parser/algos/test_tokenize.py index e148c8b..2c29770 100644 --- a/tests/parser/algos/test_tokenize.py +++ b/tests/parser/algos/test_tokenize.py @@ -1,8 +1,7 @@ import pytest -from formulaic.parser.algos.tokenize import tokenize from formulaic.errors import FormulaSyntaxError - +from formulaic.parser.algos.tokenize import tokenize TOKEN_TESTS = { "": [], diff --git a/tests/parser/algos/test_tokens_to_ast.py b/tests/parser/algos/test_tokens_to_ast.py index d91365d..9e3a10f 100644 --- a/tests/parser/algos/test_tokens_to_ast.py +++ b/tests/parser/algos/test_tokens_to_ast.py @@ -1,12 +1,12 @@ -import pytest - import functools import itertools +import pytest + from formulaic.errors import FormulaSyntaxError from formulaic.parser import DefaultOperatorResolver -from formulaic.parser.algos.tokens_to_ast import tokens_to_ast from formulaic.parser.algos.tokenize import tokenize +from formulaic.parser.algos.tokens_to_ast import tokens_to_ast from formulaic.parser.types import Operator diff --git a/tests/parser/test_parser.py b/tests/parser/test_parser.py index d548eac..081f61c 100644 --- a/tests/parser/test_parser.py +++ b/tests/parser/test_parser.py @@ -1,6 +1,6 @@ -from io import BytesIO import pickle import re +from io import BytesIO from typing import List from xml.etree.ElementInclude import include @@ -11,7 +11,6 @@ from formulaic.parser.types import Structured, Token from formulaic.parser.types.term import Term - FORMULA_TO_TOKENS = { "": ["1"], " ": ["1"], @@ -206,7 +205,6 @@ def resolver(self): return DefaultOperatorResolver() def test_resolve(self, resolver): - assert len(resolver.resolve(Token("+++++"), 1, [])) == 1 assert resolver.resolve(Token("+++++"), 1, [])[0].symbol == "+" assert resolver.resolve(Token("+++++"), 1, [])[0].arity == 2 diff --git a/tests/parser/test_utils.py b/tests/parser/test_utils.py index 
a2bbbd9..62c28d5 100644 --- a/tests/parser/test_utils.py +++ b/tests/parser/test_utils.py @@ -1,11 +1,12 @@ from ntpath import join + import pytest from formulaic.parser.types import Token from formulaic.parser.utils import ( - replace_tokens, insert_tokens_after, merge_operator_tokens, + replace_tokens, ) diff --git a/tests/parser/types/test_formula_parser.py b/tests/parser/types/test_formula_parser.py index 2f8dc4b..19bf6e9 100644 --- a/tests/parser/types/test_formula_parser.py +++ b/tests/parser/types/test_formula_parser.py @@ -2,7 +2,6 @@ from formulaic.parser.types import FormulaParser - FORMULA_TO_TOKENS = { "": [], " ": [], diff --git a/tests/parser/types/test_ordered_set.py b/tests/parser/types/test_ordered_set.py index 29d1db5..48adebb 100644 --- a/tests/parser/types/test_ordered_set.py +++ b/tests/parser/types/test_ordered_set.py @@ -2,7 +2,6 @@ def test_ordered_set(): - assert OrderedSet() == OrderedSet() assert len(OrderedSet()) == 0 diff --git a/tests/parser/types/test_structured.py b/tests/parser/types/test_structured.py index 75cf925..30d54c3 100644 --- a/tests/parser/types/test_structured.py +++ b/tests/parser/types/test_structured.py @@ -1,6 +1,6 @@ -from ast import Str import pickle import re +from ast import Str from io import BytesIO import pytest @@ -27,7 +27,6 @@ def test_constructor(self): Structured(_invalid=True) def test_access_structure(self): - s = Structured("Hello", key="asd") assert s.root == "Hello" assert s[None] == "Hello" @@ -120,7 +119,6 @@ def test__update(self): assert Structured(_metadata={"a": 1})._update()._metadata == {"a": 1} def test__merge(self): - _m = Structured._merge assert _m() == Structured() diff --git a/tests/test_model_matrix.py b/tests/test_model_matrix.py index ebd1c7d..1ac912a 100644 --- a/tests/test_model_matrix.py +++ b/tests/test_model_matrix.py @@ -5,12 +5,12 @@ import pytest from formulaic import ( - model_matrix, - ModelSpec, - ModelMatrix, + FactorValues, ModelMatrices, + ModelMatrix, + ModelSpec, 
ModelSpecs, - FactorValues, + model_matrix, ) diff --git a/tests/test_model_spec.py b/tests/test_model_spec.py index b21fafb..abdc935 100644 --- a/tests/test_model_spec.py +++ b/tests/test_model_spec.py @@ -1,12 +1,12 @@ -from pyexpat import model import re - -import pytest +from pyexpat import model import numpy import pandas +import pytest import scipy.sparse -from formulaic import Formula, ModelSpec, ModelSpecs, ModelMatrix, ModelMatrices + +from formulaic import Formula, ModelMatrices, ModelMatrix, ModelSpec, ModelSpecs from formulaic.materializers.base import FormulaMaterializerMeta from formulaic.materializers.pandas import PandasMaterializer from formulaic.parser.types import Factor, Term diff --git a/tests/test_sugar.py b/tests/test_sugar.py index e6b2a02..cb1b155 100644 --- a/tests/test_sugar.py +++ b/tests/test_sugar.py @@ -1,6 +1,5 @@ -import pytest - import pandas +import pytest from formulaic import model_matrix from formulaic.errors import FactorEvaluationError diff --git a/tests/transforms/test_basis_spline.py b/tests/transforms/test_basis_spline.py index a416a66..6dae069 100644 --- a/tests/transforms/test_basis_spline.py +++ b/tests/transforms/test_basis_spline.py @@ -3,9 +3,9 @@ import numpy import pytest -from formulaic.transforms.basis_spline import basis_spline from formulaic import model_matrix from formulaic.errors import FactorEvaluationError +from formulaic.transforms.basis_spline import basis_spline class TestBasisSpline: diff --git a/tests/transforms/test_contrasts.py b/tests/transforms/test_contrasts.py index 9993c1f..d123ca8 100644 --- a/tests/transforms/test_contrasts.py +++ b/tests/transforms/test_contrasts.py @@ -9,10 +9,12 @@ from formulaic.errors import DataMismatchWarning from formulaic.materializers import FactorValues from formulaic.model_spec import ModelSpec +from formulaic.transforms.contrasts import ( + ContrastsRegistry as contr, +) from formulaic.transforms.contrasts import ( SumContrasts, encode_contrasts, - 
ContrastsRegistry as contr, ) from formulaic.utils.sparse import categorical_encode_series_to_sparse_csc_matrix diff --git a/tests/transforms/test_patsy_compat.py b/tests/transforms/test_patsy_compat.py index ce6a2d4..19ed693 100644 --- a/tests/transforms/test_patsy_compat.py +++ b/tests/transforms/test_patsy_compat.py @@ -1,8 +1,9 @@ import numpy import pandas + from formulaic import model_matrix from formulaic.transforms.contrasts import TreatmentContrasts -from formulaic.transforms.patsy_compat import standardize, Treatment +from formulaic.transforms.patsy_compat import Treatment, standardize from formulaic.transforms.scale import scale diff --git a/tests/utils/test_constraints.py b/tests/utils/test_constraints.py index 84b7f3b..a043cd5 100644 --- a/tests/utils/test_constraints.py +++ b/tests/utils/test_constraints.py @@ -4,11 +4,10 @@ import pytest from formulaic.errors import FormulaSyntaxError -from formulaic.utils.constraints import LinearConstraints, LinearConstraintParser +from formulaic.utils.constraints import LinearConstraintParser, LinearConstraints class TestLinearConstraints: - REF_MATRICES = { 1: [[1, 1, 1]], 2: [[1, 1, 1], [1, 0, -1]], @@ -188,7 +187,6 @@ def test_repr(self): class TestLinearConstraintParser: - COLUMNS = list("abcd") TEST_CASES = { diff --git a/tests/utils/test_iterators.py b/tests/utils/test_iterators.py index 297be4b..f8abf8c 100644 --- a/tests/utils/test_iterators.py +++ b/tests/utils/test_iterators.py @@ -4,7 +4,6 @@ def test_peekable_iter(): - it = peekable_iter([1, 2, 3, 4, 5, 6]) assert it.peek() == 1 diff --git a/tests/utils/test_layered_mapping.py b/tests/utils/test_layered_mapping.py index 28a70b2..8f47b5e 100644 --- a/tests/utils/test_layered_mapping.py +++ b/tests/utils/test_layered_mapping.py @@ -6,7 +6,6 @@ def test_layered_context(): - layer1 = {"a": 1, "b": 2, "c": 3} layer2 = {"a": 2, "d": 4} @@ -43,7 +42,6 @@ def test_layered_context(): def test_named_layered_mappings(): - data_layer = LayeredMapping({"data": 1}, 
name="data") context_layer = LayeredMapping({"context": "context"}, name="context") layers = LayeredMapping({"data": None, "context": None}, data_layer, context_layer) diff --git a/tests/utils/test_null_handling.py b/tests/utils/test_null_handling.py index 20a3d01..80a0ed4 100644 --- a/tests/utils/test_null_handling.py +++ b/tests/utils/test_null_handling.py @@ -6,11 +6,10 @@ import scipy.sparse from formulaic.materializers.types import FactorValues -from formulaic.utils.null_handling import find_nulls, drop_rows +from formulaic.utils.null_handling import drop_rows, find_nulls def test_find_nulls(): - assert find_nulls(None) == set() assert find_nulls(FactorValues(None)) == set() assert find_nulls(1) == set() @@ -58,7 +57,6 @@ def test_find_nulls(): def test_drop_rows(): - assert drop_rows([1, 2, 3], [1]) == [1, 3] assert numpy.all( drop_rows(pandas.Series([1, 2, 3]), [0]) == pandas.Series([2, 3], index=[1, 2]) diff --git a/tests/utils/test_sentinels.py b/tests/utils/test_sentinels.py index 45ed18b..b21479d 100644 --- a/tests/utils/test_sentinels.py +++ b/tests/utils/test_sentinels.py @@ -1,6 +1,6 @@ import copy -from formulaic.utils.sentinels import _MissingType, MISSING +from formulaic.utils.sentinels import MISSING, _MissingType def test_missing(): diff --git a/tests/utils/test_stateful_transforms.py b/tests/utils/test_stateful_transforms.py index d0dbb53..cc4b93e 100644 --- a/tests/utils/test_stateful_transforms.py +++ b/tests/utils/test_stateful_transforms.py @@ -1,6 +1,5 @@ -import pytest - import numpy +import pytest from formulaic.utils.stateful_transforms import stateful_eval, stateful_transform @@ -15,7 +14,6 @@ def dummy_transform(data, _state=None, _spec=None, _metadata=None): def test_stateful_transform(): - state = {} metadata = {} assert dummy_transform(1, _state=state, _metadata=metadata) == 1