From 2b125d881f0f019d8b8cce78a9ae7202bee59e56 Mon Sep 17 00:00:00 2001 From: Fede Date: Mon, 29 Apr 2024 18:08:45 +0300 Subject: [PATCH 1/2] Fix typing and warnings as much as we can --- julearn/api.py | 25 ++- julearn/base/estimators.py | 34 +++- julearn/base/tests/test_base_estimators.py | 2 +- julearn/inspect/_cv.py | 26 +-- julearn/inspect/_preprocess.py | 2 +- julearn/inspect/tests/test_cv.py | 6 +- julearn/inspect/tests/test_inspector.py | 21 +-- julearn/inspect/tests/test_pipeline.py | 41 +++-- julearn/inspect/tests/test_preprocess.py | 9 +- .../model_selection/available_searchers.py | 6 +- .../model_selection/stratified_bootstrap.py | 12 +- julearn/models/dynamic.py | 2 +- julearn/models/tests/test_models.py | 12 +- julearn/pipeline/pipeline_creator.py | 43 +++-- julearn/pipeline/target_pipeline.py | 2 +- julearn/pipeline/tests/test_merger.py | 29 ++-- .../pipeline/tests/test_pipeline_creator.py | 2 +- julearn/prepare.py | 6 +- julearn/scoring/available_scorers.py | 18 ++- julearn/stats/corrected_ttest.py | 12 +- julearn/stats/tests/test_corrected_ttest.py | 27 +++- julearn/tests/test_api.py | 148 ++++++++++++------ julearn/transformers/confound_remover.py | 4 +- .../dataframe/tests/test_drop_columns.py | 2 +- .../dataframe/tests/test_filter_columns.py | 2 +- .../dataframe/tests/test_set_column_types.py | 16 +- julearn/transformers/ju_column_transformer.py | 2 +- .../target/ju_transformed_target_model.py | 2 +- .../target/target_confound_remover.py | 2 +- .../tests/test_ju_transformed_target_model.py | 4 +- julearn/transformers/tests/test_cbpm.py | 6 +- julearn/transformers/tests/test_confounds.py | 23 ++- .../tests/test_jucolumntransformers.py | 14 +- julearn/utils/checks.py | 4 +- julearn/utils/logging.py | 2 +- julearn/utils/testing.py | 32 ++-- julearn/utils/typing.py | 58 ++++--- julearn/viz/_scores.py | 9 +- pyproject.toml | 16 +- 39 files changed, 454 insertions(+), 229 deletions(-) diff --git a/julearn/api.py b/julearn/api.py index b89b42ba7..039254dfc 100644 --- a/julearn/api.py +++ b/julearn/api.py @@ -4,13 +4,13 @@ # Sami Hamdan # License: AGPL -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd +import sklearn from sklearn.base import BaseEstimator from sklearn.model_selection import ( - BaseCrossValidator, check_cv, cross_validate, ) @@ -23,6 +23,7 @@ from .prepare import check_consistency, prepare_input_data from .scoring import check_scoring from .utils import _compute_cvmdsum, logger, raise_error +from .utils.typing import CVLike def run_cross_validation( # noqa: C901 @@ -36,7 +37,7 @@ def run_cross_validation( # noqa: C901 return_estimator: Optional[str] = None, return_inspector: bool = False, return_train_score: bool = False, - cv: Optional[Union[int, BaseCrossValidator, Iterable]] = None, + cv: Optional[CVLike] = None, groups: Optional[str] = None, scoring: Union[str, List[str], None] = None, pos_labels: Union[str, List[str], None] = None, @@ -357,20 +358,32 @@ def run_cross_validation( # noqa: C901 # Prepare cross validation cv_outer = check_cv( - cv, classifier=problem_type == "classification" # type: ignore + cv, # type: ignore + classifier=problem_type == "classification", ) logger.info(f"Using outer CV scheme {cv_outer}") check_consistency(df_y, cv, groups, problem_type) # type: ignore cv_return_estimator = return_estimator in ["cv", "all"] - scoring = check_scoring(pipeline, scoring, wrap_score=wrap_score) + scoring = check_scoring( + pipeline, # type: ignore + 
scoring, + wrap_score=wrap_score, + ) cv_mdsum = _compute_cvmdsum(cv_outer) fit_params = {} if df_groups is not None: if isinstance(pipeline, BaseSearchCV): fit_params["groups"] = df_groups.values + + _sklearn_deprec_fit_params = {} + if sklearn.__version__ >= "1.4.0": + _sklearn_deprec_fit_params["params"] = fit_params + else: + _sklearn_deprec_fit_params["fit_params"] = fit_params + scores = cross_validate( pipeline, df_X, @@ -382,7 +395,7 @@ def run_cross_validation( # noqa: C901 n_jobs=n_jobs, return_train_score=return_train_score, verbose=verbose, # type: ignore - fit_params=fit_params, + **_sklearn_deprec_fit_params, ) n_repeats = getattr(cv_outer, "n_repeats", 1) diff --git a/julearn/base/estimators.py b/julearn/base/estimators.py index 7296e5927..e198c4874 100644 --- a/julearn/base/estimators.py +++ b/julearn/base/estimators.py @@ -13,11 +13,11 @@ try: # sklearn < 1.4.0 - from sklearn.utils.validation import _check_fit_params + from sklearn.utils.validation import _check_fit_params # type: ignore fit_params_checker = _check_fit_params except ImportError: # sklearn >= 1.4.0 - from sklearn.utils.validation import _check_method_params + from sklearn.utils.validation import _check_method_params # type: ignore fit_params_checker = _check_method_params @@ -180,7 +180,12 @@ def __init__( self.row_select_col_type = row_select_col_type self.row_select_vals = row_select_vals - def fit(self, X, y=None, **fit_params): # noqa: N803 + def fit( + self, + X: pd.DataFrame, # noqa: N803 + y: Optional[pd.Series] = None, + **fit_params, + ): """Fit the model. This method will fit the model using only the columns selected by @@ -217,8 +222,21 @@ def fit(self, X, y=None, **fit_params): # noqa: N803 self.row_select_vals = [self.row_select_vals] return self._fit(**self._select_rows(X, y, **fit_params)) + def _fit( + self, + X: pd.DataFrame, # noqa: N803, + y: Optional[pd.Series], + **kwargs, + ) -> None: + raise_error( + "This method should be implemented in the concrete class", + klass=NotImplementedError, + ) + def _add_backed_filtered( - self, X: pd.DataFrame, X_trans: pd.DataFrame # noqa: N803 + self, + X: pd.DataFrame, # noqa: N803 + X_trans: pd.DataFrame, # noqa: N803 ) -> pd.DataFrame: """Add the left-out columns back to the transformed data. @@ -301,7 +319,7 @@ def __init__( def fit( self, - X: pd.DataFrame, # noqa: N803 + X: DataLike, # noqa: N803 y: Optional[DataLike] = None, **fit_params: Any, ) -> "WrapModel": @@ -312,7 +330,7 @@ def fit( Parameters ---------- - X : pd.DataFrame + X : DataLike The data to fit the model on. y : DataLike, optional The target data (default is None). 
@@ -329,9 +347,9 @@ def fit( if self.needed_types is not None: self.needed_types = ensure_column_types(self.needed_types) - Xt = self.filter_columns(X) + Xt = self.filter_columns(X) # type: ignore self.model_ = self.model - self.model_.fit(Xt, y, **fit_params) + self.model_.fit(Xt, y, **fit_params) # type: ignore return self def predict(self, X: pd.DataFrame) -> DataLike: # noqa: N803 diff --git a/julearn/base/tests/test_base_estimators.py b/julearn/base/tests/test_base_estimators.py index 1f85b2223..67cd546ca 100644 --- a/julearn/base/tests/test_base_estimators.py +++ b/julearn/base/tests/test_base_estimators.py @@ -110,7 +110,7 @@ def test_WrapModel( np.random.seed(42) lr = model() - lr.fit(X_iris_selected, y_iris) + lr.fit(X_iris_selected, y_iris) # type: ignore pred_sk = lr.predict(X_iris_selected) np.random.seed(42) diff --git a/julearn/inspect/_cv.py b/julearn/inspect/_cv.py index c8ecded72..a22869493 100644 --- a/julearn/inspect/_cv.py +++ b/julearn/inspect/_cv.py @@ -4,13 +4,14 @@ # Sami Hamdan # License: AGPL -from typing import List, Optional, Union +from typing import Optional, Union import pandas as pd from sklearn.model_selection import BaseCrossValidator, check_cv from sklearn.utils.metaestimators import available_if from ..utils import _compute_cvmdsum, is_nonoverlapping_cv, raise_error +from ..utils.typing import DataLike from ._pipeline import PipelineInspector @@ -60,14 +61,13 @@ class FoldsInspector: def __init__( self, scores: pd.DataFrame, - cv: BaseCrossValidator, - X: Union[str, List[str]], # noqa: N803 - y: str, + cv: Union[BaseCrossValidator, int], + X: DataLike, # noqa: N803 + y: pd.Series, func: str = "predict", - groups: Optional[str] = None, + groups: Optional[pd.Series] = None, ): self._scores = scores - self._cv = cv self._X = X self._y = y self._func = func @@ -92,7 +92,7 @@ def __init__( ) cv = check_cv(cv) - + self._cv = cv t_cv_mdsum = _compute_cvmdsum(cv) if t_cv_mdsum != cv_mdsums[0]: raise_error( @@ -120,10 +120,16 @@ def _get_predictions(self, func): predictions = [] for i_fold, (_, test) in enumerate( - self._cv.split(self._X, self._y, groups=self._groups) + self._cv.split( + self._X, # type: ignore + self._y, + groups=self._groups, + ) ): t_model = self._scores["estimator"][i_fold] - t_values = getattr(t_model, func)(self._X.iloc[test]) + t_values = getattr(t_model, func)( + self._X.iloc[test] # type: ignore + ) if t_values.ndim == 1: t_values = t_values[:, None] column_names = [f"p{i}" for i in range(t_values.shape[1])] @@ -152,7 +158,7 @@ def _get_predictions(self, func): t_df.columns = [f"fold{i_fold}_{x}" for x in t_df.columns] predictions = pd.concat(predictions, axis=1) predictions = predictions.sort_index() - predictions["target"] = self._y.values + predictions["target"] = self._y.values # type: ignore return predictions def __getitem__(self, key): diff --git a/julearn/inspect/_preprocess.py b/julearn/inspect/_preprocess.py index 38bd2f4bb..27e2a76ef 100644 --- a/julearn/inspect/_preprocess.py +++ b/julearn/inspect/_preprocess.py @@ -53,7 +53,7 @@ def preprocess( else: raise_error(f"No step named {until} found.") df_out = pipeline[:i].transform(_X) - + df_out = df_out.copy() if not isinstance(df_out, pd.DataFrame) and with_column_types is False: raise_error( "The output of the pipeline is not a DataFrame. 
Cannot remove " diff --git a/julearn/inspect/tests/test_cv.py b/julearn/inspect/tests/test_cv.py index cced789ea..012d59f25 100644 --- a/julearn/inspect/tests/test_cv.py +++ b/julearn/inspect/tests/test_cv.py @@ -3,7 +3,6 @@ # Authors: Federico Raimondo # Sami Hamdan # License: AGPL - import numpy as np import pandas as pd import pytest @@ -70,7 +69,10 @@ def scores(df_typed_iris, n_iters=5, mock_model=None): if mock_model is None: mock_model = MockModelReturnsIndex - estimators = [WrapModel(mock_model()).fit(X, y) for _ in range(n_iters)] + estimators = [ + WrapModel(mock_model()).fit(X, y) # type: ignore + for _ in range(n_iters) + ] return pd.DataFrame( { diff --git a/julearn/inspect/tests/test_inspector.py b/julearn/inspect/tests/test_inspector.py index 6bd5d1487..8643cee1d 100644 --- a/julearn/inspect/tests/test_inspector.py +++ b/julearn/inspect/tests/test_inspector.py @@ -18,28 +18,28 @@ def test_no_cv() -> None: """Test inspector with no cross-validation.""" - inspector = Inspector({}) + inspector = Inspector({}) # type: ignore with pytest.raises(ValueError, match="No cv"): _ = inspector.folds def test_no_X() -> None: """Test inspector with no features.""" - inspector = Inspector({}, cv=5) + inspector = Inspector({}, cv=5) # type: ignore with pytest.raises(ValueError, match="No X"): _ = inspector.folds def test_no_y() -> None: """Test inspector with no targets.""" - inspector = Inspector({}, cv=5, X=[1, 2, 3]) + inspector = Inspector({}, cv=5, X=[1, 2, 3]) # type: ignore with pytest.raises(ValueError, match="No y"): _ = inspector.folds def test_no_model() -> None: """Test inspector with no model.""" - inspector = Inspector({}) + inspector = Inspector({}) # type: ignore with pytest.raises(ValueError, match="No model"): _ = inspector.model @@ -63,8 +63,11 @@ def test_normal_usage(df_iris: "pd.DataFrame") -> None: return_inspector=True, problem_type="classification", ) - assert pipe == inspect.model._model - for (_, score), inspect_fold in zip(scores.iterrows(), inspect.folds): + assert pipe == inspect.model._model # type: ignore + for (_, score), inspect_fold in zip( + scores.iterrows(), # type: ignore + inspect.folds, # type: ignore + ): assert score["estimator"] == inspect_fold.model._model @@ -88,6 +91,6 @@ def test_normal_usage_with_search(df_iris: "pd.DataFrame") -> None: return_estimator="all", return_inspector=True, ) - assert pipe == inspect.model._model - inspect.model.get_fitted_params() - inspect.model.get_params() + assert pipe == inspect.model._model # type: ignore + inspect.model.get_fitted_params() # type: ignore + inspect.model.get_params() # type: ignore diff --git a/julearn/inspect/tests/test_pipeline.py b/julearn/inspect/tests/test_pipeline.py index afbe631c4..73b1d9690 100644 --- a/julearn/inspect/tests/test_pipeline.py +++ b/julearn/inspect/tests/test_pipeline.py @@ -4,8 +4,9 @@ # Sami Hamdan # License: AGPL -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type +import pandas as pd import pytest from sklearn.base import BaseEstimator from sklearn.decomposition import PCA @@ -17,10 +18,6 @@ from julearn.transformers import JuColumnTransformer -if TYPE_CHECKING: - import pandas as pd - - class MockTestEst(BaseEstimator): """Class for estimator tests. 
@@ -39,8 +36,8 @@ def __init__(self, hype_0: int = 0, hype_1: int = 1) -> None: def fit( self, - X: List[str], # noqa: N803 - y: Optional[str] = None, + X: pd.DataFrame, # noqa: N803 + y: Optional[pd.Series] = None, **fit_params: Any, ) -> "MockTestEst": """Fit the estimator. @@ -64,7 +61,7 @@ def fit( self.param_1_ = 1 return self - def transform(self, X: List[str]) -> List[str]: # noqa: N803 + def transform(self, X: pd.DataFrame) -> pd.DataFrame: # noqa: N803 """Transform the estimator. Parameters @@ -90,7 +87,7 @@ def transform(self, X: List[str]) -> List[str]: # noqa: N803 ["zscore", "pca", "svm"], ], ) -def test_get_stepnames(steps: List[str], df_iris: "pd.DataFrame") -> None: +def test_get_stepnames(steps: List[str], df_iris: pd.DataFrame) -> None: """Test step names fetch. Parameters @@ -157,7 +154,11 @@ def test_steps( [ [MockTestEst(), {"param_0_": 0, "param_1_": 1}], [ - JuColumnTransformer("test", MockTestEst(), "continuous"), + JuColumnTransformer( + "test", + MockTestEst(), # type: ignore + "continuous", + ), {"param_0_": 0, "param_1_": 1}, ], ], @@ -201,8 +202,14 @@ def test_inspect_pipeline(df_iris: "pd.DataFrame") -> None: pipe = ( PipelineCreator(problem_type="classification") - .add(JuColumnTransformer("test", MockTestEst(), "continuous")) - .add(SVC()) + .add( + JuColumnTransformer( + "test", + MockTestEst(), # type: ignore + "continuous", + ) + ) + .add(SVC()) # type: ignore TODO: fix typing hints .to_pipeline() ) pipe.fit(df_iris.iloc[:, :-1], df_iris.species) @@ -230,8 +237,14 @@ def test_get_estimator(df_iris: "pd.DataFrame") -> None: """ pipe = ( PipelineCreator(problem_type="classification") - .add(JuColumnTransformer("test", MockTestEst(), "continuous")) - .add(SVC()) + .add( + JuColumnTransformer( + "test", + MockTestEst(), # type: ignore + "continuous", + ) + ) + .add(SVC()) # type: ignore TODO: fix typing hints .to_pipeline() ) pipe.fit(df_iris.iloc[:, :-1], df_iris.species) diff --git a/julearn/inspect/tests/test_preprocess.py b/julearn/inspect/tests/test_preprocess.py index ec281c519..df38c4c14 100644 --- a/julearn/inspect/tests/test_preprocess.py +++ b/julearn/inspect/tests/test_preprocess.py @@ -157,13 +157,20 @@ def test_preprocess_with_column_types(df_iris: pd.DataFrame) -> None: X = list(df_iris.iloc[:, :-1].columns) y = "species" + X_types = {"continuous": X} _, model = run_cross_validation( X=X, y=y, data=df_iris, + X_types=X_types, problem_type="classification", model="rf", return_estimator="final", ) - X_t = preprocess(model, X=X, data=df_iris, with_column_types=False) + X_t = preprocess( + model, # type: ignore + X=X, + data=df_iris, + with_column_types=False, + ) assert list(X_t.columns) == X diff --git a/julearn/model_selection/available_searchers.py b/julearn/model_selection/available_searchers.py index 499ba2edc..9da12e601 100644 --- a/julearn/model_selection/available_searchers.py +++ b/julearn/model_selection/available_searchers.py @@ -46,7 +46,7 @@ def list_searchers() -> List[str]: return list(_available_searchers) -def get_searcher(name: str) -> object: +def get_searcher(name: str) -> Type: """Get a searcher by name. Parameters @@ -56,8 +56,8 @@ def get_searcher(name: str) -> object: Returns ------- - obj - scikit-learn compatible searcher. + out + scikit-learn compatible searcher class. 
Raises ------ diff --git a/julearn/model_selection/stratified_bootstrap.py b/julearn/model_selection/stratified_bootstrap.py index 5206b532a..29f186328 100644 --- a/julearn/model_selection/stratified_bootstrap.py +++ b/julearn/model_selection/stratified_bootstrap.py @@ -9,7 +9,9 @@ import numpy as np from numpy.random import RandomState from sklearn.model_selection import BaseShuffleSplit -from sklearn.model_selection._split import _validate_shuffle_split +from sklearn.model_selection._split import ( + _validate_shuffle_split, # type: ignore +) class StratifiedBootstrap(BaseShuffleSplit): @@ -87,13 +89,13 @@ def _iter_indices( n_samples = [ _validate_shuffle_split( len(t_inds), - self.test_size, - self.train_size, - default_test_size=self._default_test_size, + self.test_size, # type: ignore + self.train_size, # type: ignore + default_test_size=self._default_test_size, # type: ignore ) for t_inds in y_inds ] - for _ in range(self.n_splits): + for _ in range(self.n_splits): # type: ignore train = [] test = [] for t_inds, (n_train, _) in zip(y_inds, n_samples): diff --git a/julearn/models/dynamic.py b/julearn/models/dynamic.py index 38b9eff6e..a04629c47 100644 --- a/julearn/models/dynamic.py +++ b/julearn/models/dynamic.py @@ -132,7 +132,7 @@ def fit( y_train = y[train] y_dsel = y[test] - self.ensemble.fit(X_train, y_train) + self.ensemble.fit(X_train, y_train) # type: ignore self._dsmodel = self._get_algorithm() self._dsmodel.fit(X_dsel, y_dsel) diff --git a/julearn/models/tests/test_models.py b/julearn/models/tests/test_models.py index 870fca671..011986cc8 100644 --- a/julearn/models/tests/test_models.py +++ b/julearn/models/tests/test_models.py @@ -113,7 +113,7 @@ def test_naive_bayes_estimators( "preprocess": None, "problem_type": "classification", } - clf = make_pipeline(clone(t_model)) + clf = make_pipeline(clone(t_model)) # type: ignore do_scoring_test( X, y, @@ -129,7 +129,7 @@ def test_naive_bayes_estimators( "preprocess": None, "problem_type": "classification", } - clf = make_pipeline(clone(t_model)) + clf = make_pipeline(clone(t_model)) # type: ignore do_scoring_test( X, y, @@ -150,7 +150,7 @@ def test_naive_bayes_estimators( "preprocess": None, "problem_type": "classification", } - clf = make_pipeline(clone(t_model)) + clf = make_pipeline(clone(t_model)) # type: ignore do_scoring_test( X, y, @@ -236,7 +236,7 @@ def test_classificationestimators( "problem_type": "classification", "preprocess": "zscore", } - clf = make_pipeline(StandardScaler(), clone(t_model)) + clf = make_pipeline(StandardScaler(), clone(t_model)) # type: ignore do_scoring_test( X, y, @@ -258,7 +258,7 @@ def test_classificationestimators( "problem_type": "classification", "preprocess": "zscore", } - clf = make_pipeline(StandardScaler(), clone(t_model)) + clf = make_pipeline(StandardScaler(), clone(t_model)) # type: ignore do_scoring_test( X, y, @@ -333,7 +333,7 @@ def test_regression_estimators( "preprocess": "zscore", "problem_type": "regression", } - clf = make_pipeline(StandardScaler(), clone(t_model)) + clf = make_pipeline(StandardScaler(), clone(t_model)) # type: ignore do_scoring_test( X, y, diff --git a/julearn/pipeline/pipeline_creator.py b/julearn/pipeline/pipeline_creator.py index 77f54189e..652e060d2 100644 --- a/julearn/pipeline/pipeline_creator.py +++ b/julearn/pipeline/pipeline_creator.py @@ -4,6 +4,7 @@ # Sami Hamdan # License: AGPL +import typing from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Union @@ -216,7 +217,7 @@ def add( "TargetPipelineCreator 
can only be added to the target." ) step = step.to_pipeline() # type: ignore - + step = typing.cast(JuTargetPipeline, step) # Validate the step self._validate_step(step, apply_to) @@ -266,7 +267,7 @@ def add( step, self.problem_type, **params_to_set ) elif len(params_to_set) > 0: - step.set_params(**params_to_set) + step.set_params(**params_to_set) # type: ignore # JuEstimators accept the apply_to parameter and return needed types if isinstance(step, JuEstimatorLike): @@ -301,7 +302,7 @@ def add( self._steps.append( Step( name=name, - estimator=step, + estimator=step, # type: ignore apply_to=apply_to, needed_types=needed_types, params_to_tune=params_to_tune, @@ -480,12 +481,12 @@ def to_pipeline( X_types = self._check_X_types(X_types) model_step = self._steps[-1] - target_transformer_step = None + target_trans_step = None transformer_steps = [] for _step in self._steps[:-1]: if "target" in _step.apply_to: - target_transformer_step = _step + target_trans_step = _step else: transformer_steps.append(_step) @@ -543,12 +544,12 @@ def to_pipeline( target_model_step = self._wrap_target_model( model_name, model_estimator, # type: ignore - target_transformer_step, # type: ignore + target_trans_step, # type: ignore ) target_step_to_tune = { f"{model_name}_target_transform__transformer__{param}": val for param, val in ( - target_transformer_step.params_to_tune.items() + target_trans_step.params_to_tune.items() # type: ignore ) } step_params_to_tune = { @@ -563,7 +564,7 @@ def to_pipeline( params_to_tune.update(step_params_to_tune) pipeline_steps.append((model_name, model_estimator)) pipeline = Pipeline(pipeline_steps).set_output(transform="pandas") - + pipeline = typing.cast(Pipeline, pipeline) # damn typing.. # Deal with the Hyperparameter tuning out = _prepare_hyperparameter_tuning( params_to_tune, search_params, pipeline @@ -573,7 +574,7 @@ def to_pipeline( @staticmethod def _wrap_target_model( - model_name: str, model: ModelLike, target_transformer_step: Step + model_name: str, model: ModelLike, target_trans_step: Step ) -> Tuple[str, JuTransformedTargetModel]: """Wrap the model in a JuTransformedTargetModel. @@ -583,7 +584,7 @@ def _wrap_target_model( The name of the model model : ModelLike The model to wrap - target_transformer_step : Step + target_trans_step : Step The step with the target transformer. Returns @@ -599,7 +600,7 @@ def _wrap_target_model( If the target transformer is not a JuTargetPipeline. """ - transformer = target_transformer_step.estimator + transformer = target_trans_step.estimator if not isinstance(transformer, JuTargetPipeline): raise_error( "The target transformer should be a JuTargetPipeline. " @@ -639,7 +640,9 @@ def _validate_model_params( ) def _get_step_name( - self, name: Optional[str], step: Union[EstimatorLike, str] + self, + name: Optional[str], + step: Union[EstimatorLike, str, TargetPipelineCreator], ) -> str: """Get the name of a step, with a count if it is repeated. @@ -657,7 +660,7 @@ def _get_step_name( """ out = name - if name is None: + if out is None: name = ( step if isinstance(step, str) @@ -670,7 +673,9 @@ def _get_step_name( return out def _validate_step( - self, step: Union[EstimatorLike, str], apply_to: ColumnTypesLike + self, + step: Union[EstimatorLike, str, TargetPipelineCreator], + apply_to: ColumnTypesLike, ) -> None: """Validate a step. @@ -689,7 +694,7 @@ def _validate_step( transformer. 
""" - if self._is_transfromer_step(step): + if self._is_transformer_step(step): if self._added_model: raise_error("Cannot add a transformer after adding a model") if self._added_target_transformer and not self._is_model_step( @@ -778,7 +783,9 @@ def _check_X_types( return X_types @staticmethod - def _is_transfromer_step(step: Union[str, EstimatorLike]) -> bool: + def _is_transformer_step( + step: Union[str, EstimatorLike, TargetPipelineCreator] + ) -> bool: """Check if a step is a transformer.""" if step in list_transformers(): return True @@ -787,7 +794,9 @@ def _is_transfromer_step(step: Union[str, EstimatorLike]) -> bool: return False @staticmethod - def _is_model_step(step: Union[EstimatorLike, str]) -> bool: + def _is_model_step( + step: Union[EstimatorLike, str, TargetPipelineCreator] + ) -> bool: """Check if a step is a model.""" if step in list_models(): return True diff --git a/julearn/pipeline/target_pipeline.py b/julearn/pipeline/target_pipeline.py index 415f34c6d..ff4cf5e8e 100644 --- a/julearn/pipeline/target_pipeline.py +++ b/julearn/pipeline/target_pipeline.py @@ -169,6 +169,6 @@ def needed_types(self): needed_types = [] for _, t_step in self.steps: if getattr(t_step, "needed_types", None) is not None: - needed_types.extend(t_step.needed_types) + needed_types.extend(t_step.needed_types) # type: ignore needed_types = set(needed_types) return needed_types if len(needed_types) > 0 else None diff --git a/julearn/pipeline/tests/test_merger.py b/julearn/pipeline/tests/test_merger.py index 96caaca17..54468ebf0 100644 --- a/julearn/pipeline/tests/test_merger.py +++ b/julearn/pipeline/tests/test_merger.py @@ -26,15 +26,15 @@ def test_merger_pipelines() -> None: pipe1 = creator1.to_pipeline() pipe2 = creator2.to_pipeline() - merged = merge_pipelines(pipe1, pipe2, search_params=None) + merged = merge_pipelines(pipe1, pipe2, search_params=None) # type: ignore assert isinstance(merged, GridSearchCV) - assert isinstance(merged.estimator, Pipeline) - assert len(merged.estimator.named_steps) == 3 - named_steps = list(merged.estimator.named_steps.keys()) + assert isinstance(merged.estimator, Pipeline) # type: ignore + assert len(merged.estimator.named_steps) == 3 # type: ignore + named_steps = list(merged.estimator.named_steps.keys()) # type: ignore assert "scaler" == named_steps[1] assert "rf" == named_steps[2] - assert len(merged.param_grid) == 2 + assert len(merged.param_grid) == 2 # type: ignore search_params = {"kind": "random"} creator3 = PipelineCreator(problem_type="classification") @@ -45,13 +45,16 @@ def test_merger_pipelines() -> None: merged = merge_pipelines(pipe1, pipe2, pipe3, search_params=search_params) assert isinstance(merged, RandomizedSearchCV) - assert isinstance(merged.estimator, Pipeline) - assert len(merged.estimator.named_steps) == 3 - named_steps = list(merged.estimator.named_steps.keys()) + assert isinstance(merged.estimator, Pipeline) # type: ignore + assert len(merged.estimator.named_steps) == 3 # type: ignore + named_steps = list(merged.estimator.named_steps.keys()) # type: ignore assert "scaler" == named_steps[1] assert "rf" == named_steps[2] - assert len(merged.param_distributions) == 3 - assert merged.param_distributions[-1]["rf__max_features"] == [2, 3, 7, 42] + assert len(merged.param_distributions) == 3 # type: ignore + assert ( + merged.param_distributions[-1]["rf__max_features"] # type: ignore + == [2, 3, 7, 42] + ) def test_merger_errors() -> None: @@ -68,7 +71,7 @@ def test_merger_errors() -> None: pipe2 = creator2.to_pipeline(search_params={"kind": 
"grid"}) with pytest.raises(ValueError, match="Only pipelines and searchers"): - merge_pipelines(pipe1, SVC(), search_params=None) + merge_pipelines(pipe1, SVC(), search_params=None) # type: ignore search_params = {"kind": "random"} @@ -92,7 +95,7 @@ def test_merger_errors() -> None: ValueError, match="All searchers must use a pipeline.", ): - merge_pipelines(pipe1, pipe3, search_params=None) + merge_pipelines(pipe1, pipe3, search_params=None) # type: ignore creator4 = PipelineCreator(problem_type="classification") creator4.add("scaler_robust", name="scaler", apply_to="continuous") @@ -103,7 +106,7 @@ def test_merger_errors() -> None: ValueError, match="must have the same named steps.", ): - merge_pipelines(pipe1, pipe4, search_params=None) + merge_pipelines(pipe1, pipe4, search_params=None) # type: ignore search_params = {"kind": "grid"} pipe5 = creator2.to_pipeline(search_params={"kind": "bayes"}) diff --git a/julearn/pipeline/tests/test_pipeline_creator.py b/julearn/pipeline/tests/test_pipeline_creator.py index e431a6264..e299b1008 100644 --- a/julearn/pipeline/tests/test_pipeline_creator.py +++ b/julearn/pipeline/tests/test_pipeline_creator.py @@ -378,7 +378,7 @@ def test_hyperparameter_tuning_distributions_bayes( ) assert isinstance(pipeline, BayesSearchCV) - _compare_param_grids(pipeline.search_spaces, param_grid) + _compare_param_grids(pipeline.search_spaces, param_grid) # type: ignore @pytest.mark.parametrize( diff --git a/julearn/prepare.py b/julearn/prepare.py index 0f4359145..5001520f2 100644 --- a/julearn/prepare.py +++ b/julearn/prepare.py @@ -11,15 +11,12 @@ import numpy as np import pandas as pd from sklearn.model_selection import ( - BaseCrossValidator, - BaseShuffleSplit, GroupKFold, GroupShuffleSplit, LeaveOneGroupOut, LeavePGroupsOut, StratifiedGroupKFold, ) -from sklearn.model_selection._split import _RepeatedSplits from .config import get_config from .model_selection import ( @@ -27,6 +24,7 @@ RepeatedContinuousStratifiedGroupKFold, ) from .utils import logger, raise_error, warn_with_log +from .utils.typing import CVLike def _validate_input_data_df( @@ -335,7 +333,7 @@ def prepare_input_data( def check_consistency( y: pd.Series, - cv: Union[int, BaseCrossValidator, BaseShuffleSplit, _RepeatedSplits], + cv: CVLike, groups: Optional[pd.Series], problem_type: str, ) -> None: diff --git a/julearn/scoring/available_scorers.py b/julearn/scoring/available_scorers.py index 6254d5429..a1dca3c9f 100644 --- a/julearn/scoring/available_scorers.py +++ b/julearn/scoring/available_scorers.py @@ -10,7 +10,7 @@ from typing import Callable, Dict, List, Optional, Union from sklearn.metrics import _scorer, get_scorer_names, make_scorer -from sklearn.metrics._scorer import _check_multimetric_scoring +from sklearn.metrics._scorer import _check_multimetric_scoring # type: ignore from sklearn.metrics._scorer import check_scoring as sklearn_check_scoring from ..transformers.target.ju_transformed_target_model import ( @@ -29,7 +29,7 @@ _extra_available_scorers_reset = deepcopy(_extra_available_scorers) -def get_scorer(name: str) -> ScorerLike: +def get_scorer(name: str) -> ScorerLike: # type: ignore TODO: deprecate sklearn < 1.4.0 """Get available scorer by name. 
Parameters @@ -72,7 +72,9 @@ def list_scorers() -> List[str]: def register_scorer( - scorer_name: str, scorer: ScorerLike, overwrite: Optional[bool] = None + scorer_name: str, + scorer: ScorerLike, # type: ignore TODO: deprecate sklearn < 1.4.0 + overwrite: Optional[bool] = None, ) -> None: """Register a scorer, so that it can be accessed by name. @@ -130,9 +132,9 @@ def reset_scorer_register(): def check_scoring( estimator: EstimatorLike, - scoring: Union[ScorerLike, str, Callable, List[str], None], + scoring: Union[ScorerLike, str, Callable, List[str], None], # type: ignore wrap_score: bool, -) -> Union[None, ScorerLike, Callable, Dict[str, ScorerLike]]: +) -> Union[None, ScorerLike, Callable, Dict[str, ScorerLike]]: # type: ignore """Check the scoring. Parameters @@ -152,7 +154,11 @@ def check_scoring( scoring = _extend_scorer(get_scorer(scoring), wrap_score) if callable(scoring): return _extend_scorer( - sklearn_check_scoring(estimator, scoring=scoring), wrap_score + sklearn_check_scoring( + estimator, # type: ignore + scoring=scoring, + ), + wrap_score, ) if isinstance(scoring, list): scorer_names = typing.cast(List[str], scoring) diff --git a/julearn/stats/corrected_ttest.py b/julearn/stats/corrected_ttest.py index 4b7040967..d4d7fbf53 100644 --- a/julearn/stats/corrected_ttest.py +++ b/julearn/stats/corrected_ttest.py @@ -5,6 +5,7 @@ # Federico Raimondo # License: BSD 3 clause +import typing from itertools import combinations from typing import Optional, Tuple @@ -18,7 +19,7 @@ def _corrected_std( - differences: np.ndarray, n_train: int, n_test: int + differences: pd.DataFrame, n_train: int, n_test: int ) -> float: """Corrects standard deviation using Nadeau and Bengio's approach. @@ -48,12 +49,12 @@ def _corrected_std( def _compute_corrected_ttest( - differences: np.ndarray, + differences: pd.DataFrame, n_train: int, n_test: int, df: Optional[int] = None, alternative: str = "two-sided", -) -> Tuple[float, float]: +) -> Tuple[pd.Series, pd.Series]: """Compute paired t-test with corrected variance. Parameters @@ -167,12 +168,15 @@ def corrected_ttest( n_train = i_scores["n_train"].values n_test = i_scores["n_test"].values + n_train = typing.cast(np.ndarray, n_train) + n_test = typing.cast(np.ndarray, n_test) + if np.unique(n_train).size > 1: warn_with_log( "The training set sizes are not the same. Will use a rounded " "average." 
) - n_train = int(np.mean(n_train).round()) + n_train = int(np.mean(n_train).round()) # type: ignore else: n_train = n_train[0] diff --git a/julearn/stats/tests/test_corrected_ttest.py b/julearn/stats/tests/test_corrected_ttest.py index 9aec2fd18..221c240a1 100644 --- a/julearn/stats/tests/test_corrected_ttest.py +++ b/julearn/stats/tests/test_corrected_ttest.py @@ -21,27 +21,44 @@ def test__compute_corrected_ttest_alternatives(): rvs1 = stats.norm.rvs(loc=0.5, scale=0.2, size=20, random_state=42) rvs2 = stats.norm.rvs(loc=0.51, scale=0.2, size=20, random_state=45) rvs3 = stats.norm.rvs(loc=0.9, scale=0.2, size=20, random_state=50) - _, p1 = _compute_corrected_ttest(rvs1 - rvs2, n_train=70, n_test=30) + _, p1 = _compute_corrected_ttest( + rvs1 - rvs2, # type: ignore + n_train=70, + n_test=30, + ) assert p1 > 0.7 - _, p2 = _compute_corrected_ttest(rvs1 - rvs3, n_train=70, n_test=30) + _, p2 = _compute_corrected_ttest( + rvs1 - rvs3, # type: ignore + n_train=70, + n_test=30, + ) assert p2 < 0.1 _, p3 = _compute_corrected_ttest( - rvs1 - rvs3, n_train=70, n_test=30, alternative="less" + rvs1 - rvs3, # type: ignore + n_train=70, + n_test=30, + alternative="less", ) assert p3 < 0.05 # rvs1 is less than rvs3 _, p4 = _compute_corrected_ttest( - rvs1 - rvs3, n_train=70, n_test=30, alternative="greater" + rvs1 - rvs3, # type: ignore + n_train=70, + n_test=30, + alternative="greater", ) assert p4 > 0.90 # rvs1 is less than rvs3, so this should be high with pytest.raises(ValueError, match="Invalid alternative"): _compute_corrected_ttest( - rvs1 - rvs3, n_train=70, n_test=30, alternative="not_valid" + rvs1 - rvs3, # type: ignore + n_train=70, + n_test=30, + alternative="not_valid", ) diff --git a/julearn/tests/test_api.py b/julearn/tests/test_api.py index c12300388..5f9def69b 100644 --- a/julearn/tests/test_api.py +++ b/julearn/tests/test_api.py @@ -79,7 +79,7 @@ def test_run_cv_simple_binary( # now let"s try target-dependent scores scorers = ["recall", "precision", "f1"] - sk_y = (df_iris[y].values == "virginica").astype(int) + sk_y = (df_iris[y].values == "virginica").astype(int) # type: ignore model = PipelineCreator(apply_to="features", problem_type="classification") model.add("svm") @@ -95,7 +95,7 @@ def test_run_cv_simple_binary( data=df_iris, api_params=api_params, X_types=X_types, - sklearn_model=sklearn_model, + sklearn_model=sklearn_model, # type: ignore scorers=scorers, sk_y=sk_y, ) @@ -103,7 +103,7 @@ def test_run_cv_simple_binary( # now let"s try proba-dependent scores X = ["sepal_length", "petal_length"] scorers = ["accuracy", "roc_auc"] - sk_y = (df_iris[y].values == "virginica").astype(int) + sk_y = (df_iris[y].values == "virginica").astype(int) # type: ignore with pytest.warns(RuntimeWarning, match="treated as continuous"): api_params = { "model": "svm", @@ -126,7 +126,7 @@ def test_run_cv_simple_binary( # e.g. 
svm with probability=False X = ["sepal_length", "petal_length"] scorers = ["accuracy", "roc_auc"] - sk_y = (df_iris[y].values == "virginica").astype(int) + sk_y = (df_iris[y].values == "virginica").astype(int) # type: ignore with pytest.warns(RuntimeWarning, match="treated as continuous"): api_params = { "model": "svm", @@ -284,7 +284,7 @@ def test_run_cv_errors(df_iris: pd.DataFrame) -> None: y=y, data=df_iris, X_types=X_types, - model=model, + model=model, # type: ignore ) model = "svm" @@ -305,7 +305,7 @@ def test_run_cv_errors(df_iris: pd.DataFrame) -> None: data=df_iris, X_types=X_types, model=model, - preprocess=2, + preprocess=2, # type: ignore problem_type="classification", ) @@ -440,19 +440,35 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None: cv_inner = RepeatedKFold(n_splits=2, n_repeats=1) clf = make_pipeline(SVC()) - gs = GridSearchCV(clf, {"svc__C": [0.01, 0.001]}, cv=cv_inner) + gs = GridSearchCV( + clf, + {"svc__C": [0.01, 0.001]}, + cv=cv_inner, # type: ignore + ) - expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring]) + expected = cross_validate( + gs, + sk_X, + sk_y, # type: ignore + cv=cv_outer, # type: ignore + scoring=[scoring], + ) - assert len(actual.columns) == len(expected) + 5 - assert len(actual["test_accuracy"]) == len(expected["test_accuracy"]) + assert len(actual.columns) == len(expected) + 5 # type: ignore + assert ( + len(actual["test_accuracy"]) # type: ignore + == len(expected["test_accuracy"]) + ) assert all( a == b - for a, b in zip(actual["test_accuracy"], expected["test_accuracy"]) + for a, b in zip( + actual["test_accuracy"], # type: ignore + expected["test_accuracy"], + ) ) # Compare the models - clf1 = actual_estimator.best_estimator_.steps[-1][1] + clf1 = actual_estimator.best_estimator_.steps[-1][1] # type: ignore clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1] compare_models(clf1, clf2) @@ -516,22 +532,28 @@ def test_tune_hyperparam_gridsearch_groups(df_iris: pd.DataFrame) -> None: expected = cross_validate( gs, sk_X, - sk_y, + sk_y, # type: ignore cv=cv_outer, scoring=[scoring], - groups=sk_groups, + groups=sk_groups, # type: ignore fit_params={"groups": sk_groups}, ) - assert len(actual.columns) == len(expected) + 5 - assert len(actual["test_accuracy"]) == len(expected["test_accuracy"]) + assert len(actual.columns) == len(expected) + 5 # type: ignore + assert ( + len(actual["test_accuracy"]) # type: ignore + == len(expected["test_accuracy"]) + ) assert all( a == b - for a, b in zip(actual["test_accuracy"], expected["test_accuracy"]) + for a, b in zip( + actual["test_accuracy"], # type: ignore + expected["test_accuracy"], + ) ) # Compare the models - clf1 = actual_estimator.best_estimator_.steps[-1][1] + clf1 = actual_estimator.best_estimator_.steps[-1][1] # type: ignore clf2 = ( clone(gs) .fit(sk_X, sk_y, groups=sk_groups) @@ -593,20 +615,35 @@ def test_tune_hyperparam_randomsearch(df_iris: pd.DataFrame) -> None: clf = make_pipeline(SVC()) gs = RandomizedSearchCV( - clf, {"svc__C": [0.01, 0.001]}, cv=cv_inner, n_iter=2 + clf, + {"svc__C": [0.01, 0.001]}, + cv=cv_inner, # type: ignore + n_iter=2, ) - expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring]) + expected = cross_validate( + gs, + sk_X, + sk_y, # type: ignore + cv=cv_outer, # type: ignore + scoring=[scoring], + ) - assert len(actual.columns) == len(expected) + 5 - assert len(actual["test_accuracy"]) == len(expected["test_accuracy"]) + assert len(actual.columns) == len(expected) + 5 # type: ignore + assert ( + 
len(actual["test_accuracy"]) # type: ignore + == len(expected["test_accuracy"]) + ) assert all( a == b - for a, b in zip(actual["test_accuracy"], expected["test_accuracy"]) + for a, b in zip( + actual["test_accuracy"], # type: ignore + expected["test_accuracy"], + ) ) # Compare the models - clf1 = actual_estimator.best_estimator_.steps[-1][1] + clf1 = actual_estimator.best_estimator_.steps[-1][1] # type: ignore clf2 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1] compare_models(clf1, clf2) @@ -697,25 +734,43 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None: "svc__C": [0.01, 0.1], }, ] - gs = GridSearchCV(clf, grid, cv=cv_inner) + gs = GridSearchCV(clf, grid, cv=cv_inner) # type: ignore - expected = cross_validate(gs, sk_X, sk_y, cv=cv_outer, scoring=[scoring]) + expected = cross_validate( + gs, + sk_X, + sk_y, # type: ignore + cv=cv_outer, # type: ignore + scoring=[scoring], + ) - assert len(actual1.columns) == len(expected) + 5 - assert len(actual2.columns) == len(expected) + 5 - assert len(actual1["test_accuracy"]) == len(expected["test_accuracy"]) - assert len(actual2["test_accuracy"]) == len(expected["test_accuracy"]) + assert len(actual1.columns) == len(expected) + 5 # type: ignore + assert len(actual2.columns) == len(expected) + 5 # type: ignore + assert ( + len(actual1["test_accuracy"]) # type: ignore + == len(expected["test_accuracy"]) + ) + assert ( + len(actual2["test_accuracy"]) # type: ignore + == len(expected["test_accuracy"]) + ) assert all( a == b - for a, b in zip(actual1["test_accuracy"], expected["test_accuracy"]) + for a, b in zip( + actual1["test_accuracy"], # type: ignore + expected["test_accuracy"], + ) ) assert all( a == b - for a, b in zip(actual2["test_accuracy"], expected["test_accuracy"]) + for a, b in zip( + actual2["test_accuracy"], # type: ignore + expected["test_accuracy"], + ) ) # Compare the models - clf1 = actual_estimator1.best_estimator_.steps[-1][1] - clf2 = actual_estimator2.best_estimator_.steps[-1][1] + clf1 = actual_estimator1.best_estimator_.steps[-1][1] # type: ignore + clf2 = actual_estimator2.best_estimator_.steps[-1][1] # type: ignore clf3 = clone(gs).fit(sk_X, sk_y).best_estimator_.steps[-1][1] compare_models(clf1, clf2) compare_models(clf1, clf3) @@ -746,7 +801,7 @@ def test_return_estimators(df_iris: pd.DataFrame) -> None: model="svm", problem_type="classification", cv=cv, - return_estimator=True, + return_estimator=True, # type: ignore ) scores = run_cross_validation( @@ -776,7 +831,7 @@ def test_return_estimators(df_iris: pd.DataFrame) -> None: assert isinstance(scores, pd.DataFrame) assert "estimator" not in scores - assert isinstance(final["svm"], SVC) + assert isinstance(final["svm"], SVC) # type: ignore scores = run_cross_validation( X=X, @@ -805,7 +860,7 @@ def test_return_estimators(df_iris: pd.DataFrame) -> None: assert isinstance(scores, pd.DataFrame) assert "estimator" in scores - assert isinstance(final["svm"], SVC) + assert isinstance(final["svm"], SVC) # type: ignore def test_return_train_scores(df_iris: pd.DataFrame) -> None: @@ -838,8 +893,8 @@ def test_return_train_scores(df_iris: pd.DataFrame) -> None: train_scores = [f"train_{s}" for s in scoring] test_scores = [f"test_{s}" for s in scoring] - assert all(s not in scores.columns for s in train_scores) - assert all(s in scores.columns for s in test_scores) + assert all(s not in scores.columns for s in train_scores) # type: ignore + assert all(s in scores.columns for s in test_scores) # type: ignore with pytest.warns(RuntimeWarning, 
match="treated as continuous"): scores = run_cross_validation( @@ -856,8 +911,8 @@ def test_return_train_scores(df_iris: pd.DataFrame) -> None: train_scores = [f"train_{s}" for s in scoring] test_scores = [f"test_{s}" for s in scoring] - assert all(s in scores.columns for s in train_scores) - assert all(s in scores.columns for s in test_scores) + assert all(s in scores.columns for s in train_scores) # type: ignore + assert all(s in scores.columns for s in test_scores) # type: ignore @pytest.mark.parametrize( @@ -1174,7 +1229,10 @@ def test_api_stacking_models() -> None: # The final model should be a stacking model im which the first estimator # is a grid search - assert isinstance(final.steps[1][1].model.estimators[0][1], GridSearchCV) + assert isinstance( + final.steps[1][1].model.estimators[0][1], # type: ignore + GridSearchCV, + ) def test_inspection_error(df_iris: pd.DataFrame) -> None: @@ -1253,10 +1311,12 @@ def test_inspector_picklable(tmp_path: Path, df_iris: pd.DataFrame) -> None: X = ["sepal_length", "sepal_width", "petal_length"] y = "species" pickled_file = tmp_path / "inspector.joblib" + X_types = {"continuous": X} _, _, inspector = run_cross_validation( X=X, y=y, data=df_iris, + X_types=X_types, model="rf", problem_type="classification", return_estimator="all", @@ -1289,9 +1349,7 @@ def test_tune_hyperparam_target(df_iris: pd.DataFrame) -> None: } target_pipeline = TargetPipelineCreator() - model = PipelineCreator( - problem_type="regression", apply_to="continuous" - ) + model = PipelineCreator(problem_type="regression", apply_to="continuous") target_pipeline.add("confound_removal", confounds="confounds") model.add(target_pipeline, apply_to="target") model.add("svm", C=[1, 2]) diff --git a/julearn/transformers/confound_remover.py b/julearn/transformers/confound_remover.py index ce6c8d093..c7d522c83 100644 --- a/julearn/transformers/confound_remover.py +++ b/julearn/transformers/confound_remover.py @@ -107,7 +107,7 @@ def _fit( self.support_mask_ = self.support_mask_.values def fit_confound_models(X: Scalar) -> ModelLike: # noqa: N803 - _model = clone(self.model_confound) + _model = clone(self.model_confound) # type: ignore _model.fit(ser_confound.values, X) # type: ignore return _model # type: ignore @@ -256,7 +256,7 @@ def _apply_threshold(self, residuals: pd.DataFrame) -> pd.DataFrame: if self.threshold is not None: # Accounting for correlated rounding errors for very small # residuals - residuals = residuals.applymap( + residuals = residuals.map( # type: ignore lambda x: 0 if abs(x) <= self.threshold else x ) return residuals diff --git a/julearn/transformers/dataframe/tests/test_drop_columns.py b/julearn/transformers/dataframe/tests/test_drop_columns.py index 898d2059c..b8280bd40 100644 --- a/julearn/transformers/dataframe/tests/test_drop_columns.py +++ b/julearn/transformers/dataframe/tests/test_drop_columns.py @@ -45,4 +45,4 @@ def test_DropColumns() -> None: ), X_trans, ) - assert all(support == [1, 1, 0, 0, 1, 1]) + assert all(support == [1, 1, 0, 0, 1, 1]) # type: ignore diff --git a/julearn/transformers/dataframe/tests/test_filter_columns.py b/julearn/transformers/dataframe/tests/test_filter_columns.py index ff7e823ee..b0842c567 100644 --- a/julearn/transformers/dataframe/tests/test_filter_columns.py +++ b/julearn/transformers/dataframe/tests/test_filter_columns.py @@ -30,7 +30,7 @@ def test_FilterColumns() -> None: "a__:type:__continuous", "b__:type:__continuous", ] - filter.set_output(transform="pandas").fit(X_with_types) + 
filter.set_output(transform="pandas").fit(X_with_types) # type: ignore X_expected = X_with_types.copy()[kept_columns] X_trans = filter.transform(X_with_types) assert isinstance(X_expected, pd.DataFrame) diff --git a/julearn/transformers/dataframe/tests/test_set_column_types.py b/julearn/transformers/dataframe/tests/test_set_column_types.py index e2e2a6798..e2895690c 100644 --- a/julearn/transformers/dataframe/tests/test_set_column_types.py +++ b/julearn/transformers/dataframe/tests/test_set_column_types.py @@ -13,7 +13,8 @@ def test_SetColumnTypes( - X_iris: pd.DataFrame, X_types_iris: Optional[Dict] # noqa: N803 + X_iris: pd.DataFrame, # noqa: N803 + X_types_iris: Optional[Dict], # noqa: N803 ) -> None: """Test SetColumnTypes. @@ -40,8 +41,8 @@ def test_SetColumnTypes( ) ) st = SetColumnTypes(X_types_iris).set_output(transform="pandas") - Xt = st.fit_transform(X_iris) - Xt_iris_with_types = st.fit_transform(X_iris_with_types) + Xt = st.fit_transform(X_iris) # type: ignore + Xt_iris_with_types = st.fit_transform(X_iris_with_types) # type: ignore assert_frame_equal(Xt, X_iris_with_types) assert_frame_equal(Xt_iris_with_types, X_iris_with_types) @@ -64,7 +65,8 @@ def test_SetColumnTypes_input_validation( def test_SetColumnTypes_array( - X_iris: pd.DataFrame, X_types_iris: Optional[Dict] # noqa: N803 + X_iris: pd.DataFrame, # noqa: N803 + X_types_iris: Optional[Dict], # noqa: N803 ) -> None: """Test SetColumnTypes. @@ -92,6 +94,8 @@ def test_SetColumnTypes_array( } X_iris_with_types.rename(columns=to_rename) st = SetColumnTypes(X_types_iris).set_output(transform="pandas") - Xt = st.fit_transform(X_iris.values) - Xt_iris_with_types = st.fit_transform(X_iris_with_types.values) + Xt = st.fit_transform(X_iris.values) # type: ignore + Xt_iris_with_types = st.fit_transform( # type: ignore + X_iris_with_types.values + ) assert_frame_equal(Xt, Xt_iris_with_types) diff --git a/julearn/transformers/ju_column_transformer.py b/julearn/transformers/ju_column_transformer.py index 07034b916..d4c0537d3 100644 --- a/julearn/transformers/ju_column_transformer.py +++ b/julearn/transformers/ju_column_transformer.py @@ -154,7 +154,7 @@ def get_feature_names_out( klass=ValueError, exception=e, ) - if self.column_transformer_.verbose_feature_names_out: + if self.column_transformer_.verbose_feature_names_out: # type: ignore out = [ x.replace("remainder__", "") if "remainder__" in x else x for x in out diff --git a/julearn/transformers/target/ju_transformed_target_model.py b/julearn/transformers/target/ju_transformed_target_model.py index 41649038b..32c8aa79b 100644 --- a/julearn/transformers/target/ju_transformed_target_model.py +++ b/julearn/transformers/target/ju_transformed_target_model.py @@ -96,7 +96,7 @@ def fit( """ y = self.transformer.fit_transform(X, y) - self.model_ = clone(self.model) + self.model_ = clone(self.model) # type: ignore self.model_.fit(X, y, **fit_params) # type: ignore return self diff --git a/julearn/transformers/target/target_confound_remover.py b/julearn/transformers/target/target_confound_remover.py index 67a209444..8b102de60 100644 --- a/julearn/transformers/target/target_confound_remover.py +++ b/julearn/transformers/target/target_confound_remover.py @@ -71,7 +71,7 @@ def fit( The fitted target confound remover. 
""" - self.model_confounds_ = clone(self.model_confound) + self.model_confounds_ = clone(self.model_confound) # type: ignore self.detected_confounds_ = self.confounds.to_type_selector()(X) X_confounds = X.loc[:, self.detected_confounds_] self.model_confounds_.fit(X_confounds.values, y) # type: ignore diff --git a/julearn/transformers/target/tests/test_ju_transformed_target_model.py b/julearn/transformers/target/tests/test_ju_transformed_target_model.py index 727797479..15dc8a1d0 100644 --- a/julearn/transformers/target/tests/test_ju_transformed_target_model.py +++ b/julearn/transformers/target/tests/test_ju_transformed_target_model.py @@ -38,7 +38,9 @@ def test_JuTransformedTargetModel( y_scaled = scaler_sk.fit_transform(y_iris.values[:, None])[:, 0] model_sk.fit(X_iris, y_scaled) y_pred_sk = model_sk.predict(X_iris) - y_inverse_sk = scaler_sk.inverse_transform(y_pred_sk[:, None])[:, 0] + y_inverse_sk = scaler_sk.inverse_transform( + y_pred_sk[:, None] # type: ignore + )[:, 0] assert_array_equal(y_pred, y_inverse_sk) diff --git a/julearn/transformers/tests/test_cbpm.py b/julearn/transformers/tests/test_cbpm.py index ddd195af4..7126e51e2 100644 --- a/julearn/transformers/tests/test_cbpm.py +++ b/julearn/transformers/tests/test_cbpm.py @@ -278,7 +278,7 @@ def test_CBPM_set_output_posneg( trans_posneg = ( CBPM(corr_method=spearmanr, agg_method=np.mean, corr_sign="posneg") .set_output(transform="pandas") - .fit_transform(X_iris, y_iris) + .fit_transform(X_iris, y_iris) # type: ignore ) trans_man_pos = X_iris[X_pos].values.mean(axis=1) @@ -311,7 +311,7 @@ def test_CBPM_set_output_pos( trans_pos = ( CBPM(corr_method=spearmanr, agg_method=np.mean, corr_sign="pos") .set_output(transform="pandas") - .fit_transform(X_iris, y_iris) + .fit_transform(X_iris, y_iris) # type: ignore ) trans_man_pos = X_iris[X_pos].values.mean(axis=1) @@ -340,7 +340,7 @@ def test_CBPM_set_output_neg( trans_neg = ( CBPM(corr_method=spearmanr, agg_method=np.mean, corr_sign="neg") .set_output(transform="pandas") - .fit_transform(X_iris, y_iris) + .fit_transform(X_iris, y_iris) # type: ignore ) trans_man_neg = X_iris[X_neg].values.mean(axis=1) diff --git a/julearn/transformers/tests/test_confounds.py b/julearn/transformers/tests/test_confounds.py index 3de4793f2..b1bd00edb 100644 --- a/julearn/transformers/tests/test_confounds.py +++ b/julearn/transformers/tests/test_confounds.py @@ -167,10 +167,14 @@ def test_ConfoundRemover_confound_auto_find_conf( # After confound removal the confound should be removed assert ( - df_cofound_removed.columns == df_X.drop(columns=confounds).columns + df_cofound_removed.columns # type: ignore + == df_X.drop(columns=confounds).columns ).all() - assert_frame_equal(df_cofound_removed, df_confound_removed_manual) + assert_frame_equal( + df_cofound_removed, # type: ignore + df_confound_removed_manual, + ) @pytest.mark.parametrize( @@ -216,7 +220,8 @@ def test_confound_set_confounds( df_confounds = df_X_confounds.loc[:, conf_as_feat] # type: ignore confound_regressions = [ model_class().fit( - df_confounds, df_X_confounds.loc[:, feature] # type: ignore + df_confounds, + df_X_confounds.loc[:, feature], # type: ignore ) for feature in features ] @@ -244,11 +249,14 @@ def test_confound_set_confounds( ) # After confound removal the confound should be removed assert ( - df_cofound_removed.columns + df_cofound_removed.columns # type: ignore == df_X_confounds.drop(columns=confounds).columns ).all() - assert_frame_equal(df_cofound_removed, df_confound_removed_manual) + assert_frame_equal( + 
df_cofound_removed, # type: ignore + df_confound_removed_manual, + ) def test_return_confound(df_X_confounds: pd.DataFrame) -> None: # noqa: N803 @@ -264,7 +272,10 @@ def test_return_confound(df_X_confounds: pd.DataFrame) -> None: # noqa: N803 apply_to=["categorical", "continuous"], keep_confounds=True ) X_trans = remover.fit_transform(df_X_confounds) - assert_array_equal(X_trans.columns, df_X_confounds.columns) + assert_array_equal( + X_trans.columns, # type: ignore + df_X_confounds.columns, + ) def test_no_confound_found() -> None: diff --git a/julearn/transformers/tests/test_jucolumntransformers.py b/julearn/transformers/tests/test_jucolumntransformers.py index 4c4ca83b9..7bf8309fa 100644 --- a/julearn/transformers/tests/test_jucolumntransformers.py +++ b/julearn/transformers/tests/test_jucolumntransformers.py @@ -123,7 +123,7 @@ def test_JuColumnTransformer_row_select(): transformer_healthy = JuColumnTransformer( name="zscore", - transformer=StandardScaler(), + transformer=StandardScaler(), # type: ignore apply_to="continuous", row_select_col_type=["healthy"], row_select_vals=1, @@ -131,7 +131,7 @@ def test_JuColumnTransformer_row_select(): transformer_unhealthy = JuColumnTransformer( name="zscore", - transformer=StandardScaler(), + transformer=StandardScaler(), # type: ignore apply_to="continuous", row_select_col_type=["healthy"], row_select_vals=0, @@ -139,24 +139,26 @@ def test_JuColumnTransformer_row_select(): transformer_both = JuColumnTransformer( name="zscore", - transformer=StandardScaler(), + transformer=StandardScaler(), # type: ignore apply_to="continuous", row_select_col_type=["healthy"], row_select_vals=[0, 1], ) mean_healthy = ( transformer_healthy.fit(X) - .column_transformer_.transformers_[0][1] + .column_transformer_.transformers_[0][1] # type: ignore .mean_ ) mean_unhealthy = ( transformer_unhealthy.fit(X) - .column_transformer_.transformers_[0][1] + .column_transformer_.transformers_[0][1] # type: ignore .mean_ ) mean_both = ( - transformer_both.fit(X).column_transformer_.transformers_[0][1].mean_ + transformer_both.fit( + X + ).column_transformer_.transformers_[0][1].mean_ # type: ignore ) assert_almost_equal( diff --git a/julearn/utils/checks.py b/julearn/utils/checks.py index 14f7207ff..919e796dc 100644 --- a/julearn/utils/checks.py +++ b/julearn/utils/checks.py @@ -2,6 +2,8 @@ # Author: Federico Raimondo # License: BSD 3 clause +from typing import List + import numpy as np import pandas as pd @@ -10,7 +12,7 @@ def check_scores_df( *scores: pd.DataFrame, same_cv: bool = False -) -> pd.DataFrame: +) -> List[pd.DataFrame]: """Check the output of `run_cross_validation`. Parameters diff --git a/julearn/utils/logging.py b/julearn/utils/logging.py index 094d6c179..c7c7516e9 100644 --- a/julearn/utils/logging.py +++ b/julearn/utils/logging.py @@ -223,7 +223,7 @@ def raise_error( def warn_with_log( - msg: str, category: Optional[Type[Warning]] = RuntimeWarning + msg: str, category: Type[Warning] = RuntimeWarning ) -> None: """Warn, but first log it. 
diff --git a/julearn/utils/testing.py b/julearn/utils/testing.py index 5c6145098..7afd9d2ab 100644 --- a/julearn/utils/testing.py +++ b/julearn/utils/testing.py @@ -5,7 +5,7 @@ # License: AGPL import warnings -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import numpy as np import pandas as pd @@ -38,7 +38,7 @@ SGDClassifier, SGDRegressor, ) -from sklearn.model_selection import KFold, cross_validate +from sklearn.model_selection import BaseCrossValidator, KFold, cross_validate from sklearn.naive_bayes import ( BernoulliNB, CategoricalNB, @@ -51,7 +51,7 @@ from julearn import run_cross_validation from julearn.base import WrapModel -from julearn.utils.typing import DataLike, EstimatorLike +from julearn.utils.typing import DataLike, EstimatorLike, ModelLike def compare_models( # noqa: C901, pragma: no cover @@ -108,7 +108,8 @@ def compare_models( # noqa: C901, pragma: no cover assert clf1.strategy == clf2.strategy # type: ignore if hasattr(clf1, "class_prior_"): assert_array_equal( - clf1.class_prior_, clf2.class_prior_ # type: ignore + clf1.class_prior_, # type: ignore + clf2.class_prior_, # type: ignore ) if hasattr(clf1, "constant_"): assert clf1.constant_ == clf2.constant_ # type: ignore @@ -180,11 +181,11 @@ def do_scoring_test( y: str, data: pd.DataFrame, api_params: Dict[str, Any], - sklearn_model: EstimatorLike, + sklearn_model: Union[EstimatorLike, ModelLike, Any], # TODO: fix scorers: List[str], groups: Optional[str] = None, X_types: Optional[Dict[str, List[str]]] = None, # noqa: N803 - cv: int = 5, + cv: Union[int, BaseCrossValidator] = 5, sk_y: Optional[np.ndarray] = None, decimal: int = 5, ): @@ -245,7 +246,12 @@ def do_scoring_test( np.random.seed(42) expected = cross_validate( - sklearn_model, sk_X, sk_y, cv=sk_cv, scoring=scorers, groups=sk_groups + sklearn_model, # type: ignore + sk_X, + sk_y, + cv=sk_cv, + scoring=scorers, + groups=sk_groups, # type: ignore ) # Compare the models @@ -257,8 +263,8 @@ def do_scoring_test( if isinstance(sklearn_model, Pipeline): clf2 = clone(sklearn_model).fit(sk_X, sk_y).steps[-1][1] else: - clf2 = clone(sklearn_model).fit(sk_X, sk_y) - compare_models(clf1, clf2) + clf2 = clone(sklearn_model).fit(sk_X, sk_y) # type: ignore + compare_models(clf1, clf2) # type: ignore if decimal > 0: for scoring in scorers: @@ -266,7 +272,9 @@ def do_scoring_test( assert len(actual.columns) == len(expected) + 5 # type: ignore assert len(actual[s_key]) == len(expected[s_key]) # type: ignore assert_array_almost_equal( - actual[s_key], expected[s_key], decimal=decimal # type: ignore + actual[s_key], # type: ignore + expected[s_key], + decimal=decimal, # type: ignore ) @@ -277,7 +285,9 @@ def __init__(self): pass def fit( - self, X: DataLike, y: Optional[DataLike] = None # noqa: N803 + self, + X: DataLike, # noqa: N803 + y: Optional[DataLike] = None, ) -> "PassThroughTransformer": """Fit the transformer. 
diff --git a/julearn/utils/typing.py b/julearn/utils/typing.py
index eb886b865..ed0f1a191 100644
--- a/julearn/utils/typing.py
+++ b/julearn/utils/typing.py
@@ -7,7 +7,7 @@
 from typing import (
     Any,
     Dict,
-    List,
+    Iterable,
     Optional,
     Protocol,
     Union,
@@ -16,9 +16,16 @@
 import numpy as np
 import pandas as pd
+from numpy.typing import ArrayLike
+from sklearn.model_selection import BaseCrossValidator, BaseShuffleSplit
+from sklearn.model_selection._split import _RepeatedSplits
 
-try:  # sklearn < 1.4.0
+try:  # sklearn >= 1.4.0
+    from sklearn.metrics._scorer import _Scorer  # type: ignore
+
+    ScorerLike = _Scorer
+except ImportError:  # sklearn < 1.4.0
     from sklearn.metrics._scorer import (
         _PredictScorer,
         _ProbaScorer,
@@ -26,10 +33,6 @@
     )
 
     ScorerLike = Union[_ProbaScorer, _ThresholdScorer, _PredictScorer]
-except ImportError:  # sklearn >= 1.4.0
-    from sklearn.metrics._scorer import _Scorer
-
-    ScorerLike = _Scorer
 
 from ..base import ColumnTypes
@@ -43,15 +46,18 @@ class EstimatorLikeFit1(Protocol):
     """Class for estimator-like fit 1."""
 
     def fit(
-        self, X: List[str], y: str, **kwargs: Any  # noqa: N803
+        self,
+        X: DataLike,  # noqa: N803
+        y: pd.Series,
+        **kwargs: Any,
     ) -> "EstimatorLikeFit1":
         """Fit estimator.
 
         Parameters
         ----------
-        X : list of str
+        X : DataLike
             The features to use.
-        y : str
+        y : pd.Series
             The target to use.
         **kwargs : dict
             Extra keyword arguments.
@@ -101,15 +107,22 @@ def set_params(self, **params: Any) -> "EstimatorLikeFit1":
 class EstimatorLikeFit2(Protocol):
     """Class for estimator-like fit 2."""
 
-    def fit(self, X: List[str], y: str) -> "EstimatorLikeFit2":  # noqa: N803
+    def fit(
+        self,
+        X: DataLike,  # noqa: N803
+        y: ArrayLike,
+        **kwargs: Any,
+    ) -> "EstimatorLikeFit2":
         """Fit estimator.
 
         Parameters
         ----------
-        X : list of str
+        X : DataLike
             The features to use.
-        y : str
+        y : ArrayLike
             The target to use.
+        **kwargs : dict
+            Extra keyword arguments.
 
         Returns
         -------
@@ -156,12 +169,12 @@ def set_params(self, **params: Any) -> "EstimatorLikeFit2":
 class EstimatorLikeFity(Protocol):
     """Class for estimator-like fit y."""
 
-    def fit(self, y: str) -> "EstimatorLikeFity":
+    def fit(self, y: DataLike) -> "EstimatorLikeFity":
         """Fit estimator.
 
         Parameters
         ----------
-        y : str
+        y : DataLike
             The target to use.
 
         Returns
@@ -214,17 +227,17 @@ class TransformerLike(EstimatorLikeFit1, Protocol):
 
     def fit(
         self,
-        X: List[str],  # noqa: N803
-        y: Optional[str] = None,
+        X: DataLike,  # noqa: N803
+        y: DataLike,
         **fit_params: Any,
     ) -> None:
         """Fit transformer.
 
         Parameters
         ----------
-        X : list of str
+        X : DataLike
             The features to use.
-        y : str, optional
+        y : DataLike
-            The target to use (default None).
+            The target to use.
         **fit_params : dict
             Fit parameters.
@@ -249,7 +262,9 @@ def transform(self, X: DataLike) -> DataLike:  # noqa: N803
         return X
 
     def fit_transform(
-        self, X: DataLike, y: Optional[DataLike] = None  # noqa: N803
+        self,
+        X: DataLike,  # noqa: N803
+        y: Optional[DataLike] = None,
     ) -> DataLike:
         """Fit and transform.
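The reworked aliases above lean on structural typing: a `Protocol` matches any object whose `fit` has a compatible signature, so third-party estimators type-check against julearn without inheriting from any base class. A minimal self-contained sketch of the mechanism (simplified signatures, not the actual julearn protocols):

    from typing import Any, Protocol

    import pandas as pd


    class SupportsFit(Protocol):
        # Structural contract: any object with this fit signature conforms.
        def fit(
            self, X: pd.DataFrame, y: pd.Series, **kwargs: Any
        ) -> "SupportsFit":
            ...


    class MeanModel:
        # No base class needed; the matching fit signature is enough.
        def fit(
            self, X: pd.DataFrame, y: pd.Series, **kwargs: Any
        ) -> "MeanModel":
            self.mean_ = float(y.mean())
            return self


    def train(est: SupportsFit, X: pd.DataFrame, y: pd.Series) -> SupportsFit:
        # A static checker accepts MeanModel here purely by structure.
        return est.fit(X, y)


    train(MeanModel(), pd.DataFrame({"a": [1.0, 2.0]}), pd.Series([0.0, 1.0]))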
@@ -369,3 +384,8 @@ def get_apply_to(self) -> ColumnTypes:
 
         """
         return ColumnTypes("placeholder")
+
+
+CVLike = Union[
+    int, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit, Iterable
+]
diff --git a/julearn/viz/_scores.py b/julearn/viz/_scores.py
index d155d6caa..a1a99020f 100644
--- a/julearn/viz/_scores.py
+++ b/julearn/viz/_scores.py
@@ -2,6 +2,7 @@
 # Sami Hamdan
 # License: AGPL
 from pathlib import Path
+from typing import Sequence
 
 import pandas as pd
 import panel as pn
@@ -43,10 +44,10 @@ class _JulearnScoresViewer(param.Parameterized):
 
     """
 
-    metric = param.Selector([], default=None)
-    models = param.ListSelector(default=None, objects=[])
-    sets = param.ListSelector(default=None, objects=[])
-    show_stats = param.Boolean(False)
+    metric = param.Selector([], default=None)  # type: ignore
+    models: Sequence = param.ListSelector(default=None, objects=[])
+    sets: Sequence = param.ListSelector(default=None, objects=[])
+    show_stats: bool = param.Boolean(False)
     group_repeats = param.Selector(
         objects=["mean", "median", "no"], default="no"
     )
diff --git a/pyproject.toml b/pyproject.toml
index b6215fac8..320d256d0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -222,4 +222,18 @@ showcontent = true
 
-## Configure pyright to ignore assignment types until scikit-learn stubs are updated
+## Configure pyright to ignore assignment types until scikit-learn stubs are updated
 [tool.pyright]
-reportAssignmentType = "none"
\ No newline at end of file
+reportAssignmentType = "none"
+exclude = [
+    "docs/auto_examples/",
+    "*.html",
+    ".git/",
+    "*.pyc",
+    "*/_build/*",
+    "*/api/generated/*.examples",
+    "build/",
+    "examples/XX_disabled/",
+    ".tox",
+    ".eggs",
+    "examples/",  # Lots of problems due to bad stubs; avoid filling the examples with "# type: ignore"
+    "scratch/",  # place to prototype, not to be checked
+]
\ No newline at end of file

From e27eb1fbdc0e71491e5223eb89716e92a9af0362 Mon Sep 17 00:00:00 2001
From: Fede
Date: Mon, 29 Apr 2024 18:15:45 +0300
Subject: [PATCH 2/2] Roll back pandas map to applymap until we remove support for py3.8

---
 julearn/transformers/confound_remover.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/julearn/transformers/confound_remover.py b/julearn/transformers/confound_remover.py
index c7d522c83..4a9327ebe 100644
--- a/julearn/transformers/confound_remover.py
+++ b/julearn/transformers/confound_remover.py
@@ -256,7 +256,7 @@ def _apply_threshold(self, residuals: pd.DataFrame) -> pd.DataFrame:
         if self.threshold is not None:
             # Accounting for correlated rounding errors for very small
             # residuals
-            residuals = residuals.map(  # type: ignore
+            residuals = residuals.applymap(  # type: ignore
                 lambda x: 0 if abs(x) <= self.threshold else x
            )
         return residuals
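The rollback in PATCH 2/2 is a compatibility constraint, not a style choice: `DataFrame.map` only appeared in pandas 2.1, and pandas 2.1 already requires Python >= 3.9, so the last pandas releases that run on Python 3.8 expose only `applymap` (which newer pandas merely deprecates). A version-agnostic sketch of the same thresholding, assuming both spellings behave identically elementwise:

    import pandas as pd


    def apply_threshold(residuals: pd.DataFrame, threshold: float) -> pd.DataFrame:
        # Zero out residuals whose magnitude is at or below the threshold,
        # picking whichever elementwise API this pandas version provides.
        def zero_small(value: float) -> float:
            return 0.0 if abs(value) <= threshold else value

        if hasattr(pd.DataFrame, "map"):  # pandas >= 2.1 (Python >= 3.9)
            return residuals.map(zero_small)
        return residuals.applymap(zero_small)  # pandas < 2.1 (Python 3.8 era)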