Fix typing and warnings as much as we can #261

Merged 3 commits on Apr 29, 2024
Changes from all commits
25 changes: 19 additions & 6 deletions julearn/api.py
@@ -4,13 +4,13 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import Dict, Iterable, List, Optional, Union
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
BaseCrossValidator,
check_cv,
cross_validate,
)
@@ -23,6 +23,7 @@
from .prepare import check_consistency, prepare_input_data
from .scoring import check_scoring
from .utils import _compute_cvmdsum, logger, raise_error
from .utils.typing import CVLike


def run_cross_validation( # noqa: C901
@@ -36,7 +37,7 @@ def run_cross_validation( # noqa: C901
return_estimator: Optional[str] = None,
return_inspector: bool = False,
return_train_score: bool = False,
cv: Optional[Union[int, BaseCrossValidator, Iterable]] = None,
cv: Optional[CVLike] = None,
groups: Optional[str] = None,
scoring: Union[str, List[str], None] = None,
pos_labels: Union[str, List[str], None] = None,
@@ -357,20 +358,32 @@

# Prepare cross validation
cv_outer = check_cv(
cv, classifier=problem_type == "classification" # type: ignore
cv, # type: ignore
classifier=problem_type == "classification",
)
logger.info(f"Using outer CV scheme {cv_outer}")

check_consistency(df_y, cv, groups, problem_type) # type: ignore

cv_return_estimator = return_estimator in ["cv", "all"]
scoring = check_scoring(pipeline, scoring, wrap_score=wrap_score)
scoring = check_scoring(
pipeline, # type: ignore
scoring,
wrap_score=wrap_score,
)

cv_mdsum = _compute_cvmdsum(cv_outer)
fit_params = {}
if df_groups is not None:
if isinstance(pipeline, BaseSearchCV):
fit_params["groups"] = df_groups.values

_sklearn_deprec_fit_params = {}
if sklearn.__version__ >= "1.4.0":
_sklearn_deprec_fit_params["params"] = fit_params
else:
_sklearn_deprec_fit_params["fit_params"] = fit_params

scores = cross_validate(
pipeline,
df_X,
@@ -382,7 +395,7 @@ def run_cross_validation( # noqa: C901
n_jobs=n_jobs,
return_train_score=return_train_score,
verbose=verbose, # type: ignore
fit_params=fit_params,
**_sklearn_deprec_fit_params,
)

n_repeats = getattr(cv_outer, "n_repeats", 1)
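
Note on the new _sklearn_deprec_fit_params block above: scikit-learn 1.4 renamed the fit_params argument of cross_validate to params, so the dictionary is unpacked under whichever keyword the installed version accepts. A minimal sketch of the same switch with a proper version comparison; the helper name and the packaging import are illustrative assumptions, not part of this PR (a plain string comparison can misorder releases such as "1.10.0" vs "1.4.0"):

from typing import Any, Dict

import sklearn
from packaging.version import Version  # assumed available; not a stated julearn dependency


def _cross_validate_fit_kwargs(fit_params: Dict[str, Any]) -> Dict[str, Any]:
    """Return the fit parameters under the keyword the installed sklearn expects."""
    if Version(sklearn.__version__) >= Version("1.4.0"):
        # sklearn >= 1.4: cross_validate(..., params=...)
        return {"params": fit_params}
    # older sklearn: cross_validate(..., fit_params=...)
    return {"fit_params": fit_params}

# Usage mirroring the diff:
#   scores = cross_validate(pipeline, df_X, df_y, cv=cv_outer,
#                           **_cross_validate_fit_kwargs(fit_params))
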
34 changes: 26 additions & 8 deletions julearn/base/estimators.py
@@ -13,11 +13,11 @@


try: # sklearn < 1.4.0
from sklearn.utils.validation import _check_fit_params
from sklearn.utils.validation import _check_fit_params # type: ignore

fit_params_checker = _check_fit_params
except ImportError: # sklearn >= 1.4.0
from sklearn.utils.validation import _check_method_params
from sklearn.utils.validation import _check_method_params # type: ignore

fit_params_checker = _check_method_params

@@ -180,7 +180,12 @@ def __init__(
self.row_select_col_type = row_select_col_type
self.row_select_vals = row_select_vals

def fit(self, X, y=None, **fit_params): # noqa: N803
def fit(
self,
X: pd.DataFrame, # noqa: N803
y: Optional[pd.Series] = None,
**fit_params,
):
"""Fit the model.

This method will fit the model using only the columns selected by
@@ -217,8 +222,21 @@ def fit(self, X, y=None, **fit_params): # noqa: N803
self.row_select_vals = [self.row_select_vals]
return self._fit(**self._select_rows(X, y, **fit_params))

def _fit(
self,
X: pd.DataFrame, # noqa: N803,
y: Optional[pd.Series],
**kwargs,
) -> None:
raise_error(
"This method should be implemented in the concrete class",
klass=NotImplementedError,
)

def _add_backed_filtered(
self, X: pd.DataFrame, X_trans: pd.DataFrame # noqa: N803
self,
X: pd.DataFrame, # noqa: N803
X_trans: pd.DataFrame, # noqa: N803
) -> pd.DataFrame:
"""Add the left-out columns back to the transformed data.

@@ -301,7 +319,7 @@ def __init__(

def fit(
self,
X: pd.DataFrame, # noqa: N803
X: DataLike, # noqa: N803
y: Optional[DataLike] = None,
**fit_params: Any,
) -> "WrapModel":
@@ -312,7 +330,7 @@ def fit(

Parameters
----------
X : pd.DataFrame
X : DataLike
The data to fit the model on.
y : DataLike, optional
The target data (default is None).
@@ -329,9 +347,9 @@ def fit(
if self.needed_types is not None:
self.needed_types = ensure_column_types(self.needed_types)

Xt = self.filter_columns(X)
Xt = self.filter_columns(X) # type: ignore
self.model_ = self.model
self.model_.fit(Xt, y, **fit_params)
self.model_.fit(Xt, y, **fit_params) # type: ignore
return self

def predict(self, X: pd.DataFrame) -> DataLike: # noqa: N803
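
The newly typed _fit stub above makes the selection base class an explicit template method: the public fit performs the row/column selection and then delegates to _fit, which each concrete subclass must override. A self-contained sketch of that pattern, with illustrative class names that are not julearn's:

from typing import Optional

import pandas as pd


class RowSelectingEstimator:
    """Base class: ``fit`` prepares the data, ``_fit`` does the real work."""

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        # ... row/column selection would happen here ...
        return self._fit(X, y)

    def _fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
        raise NotImplementedError(
            "This method should be implemented in the concrete class"
        )


class MeanEstimator(RowSelectingEstimator):
    """Concrete subclass overriding the hook."""

    def _fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
        self.means_ = X.mean()
        return self
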
2 changes: 1 addition & 1 deletion julearn/base/tests/test_base_estimators.py
@@ -110,7 +110,7 @@ def test_WrapModel(

np.random.seed(42)
lr = model()
lr.fit(X_iris_selected, y_iris)
lr.fit(X_iris_selected, y_iris) # type: ignore
pred_sk = lr.predict(X_iris_selected)

np.random.seed(42)
26 changes: 16 additions & 10 deletions julearn/inspect/_cv.py
@@ -4,13 +4,14 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import List, Optional, Union
from typing import Optional, Union

import pandas as pd
from sklearn.model_selection import BaseCrossValidator, check_cv
from sklearn.utils.metaestimators import available_if

from ..utils import _compute_cvmdsum, is_nonoverlapping_cv, raise_error
from ..utils.typing import DataLike
from ._pipeline import PipelineInspector


@@ -60,14 +61,13 @@ class FoldsInspector:
def __init__(
self,
scores: pd.DataFrame,
cv: BaseCrossValidator,
X: Union[str, List[str]], # noqa: N803
y: str,
cv: Union[BaseCrossValidator, int],
X: DataLike, # noqa: N803
y: pd.Series,
func: str = "predict",
groups: Optional[str] = None,
groups: Optional[pd.Series] = None,
):
self._scores = scores
self._cv = cv
self._X = X
self._y = y
self._func = func
@@ -92,7 +92,7 @@ def __init__(
)

cv = check_cv(cv)

self._cv = cv
t_cv_mdsum = _compute_cvmdsum(cv)
if t_cv_mdsum != cv_mdsums[0]:
raise_error(
@@ -120,10 +120,16 @@ def _get_predictions(self, func):

predictions = []
for i_fold, (_, test) in enumerate(
self._cv.split(self._X, self._y, groups=self._groups)
self._cv.split(
self._X, # type: ignore
self._y,
groups=self._groups,
)
):
t_model = self._scores["estimator"][i_fold]
t_values = getattr(t_model, func)(self._X.iloc[test])
t_values = getattr(t_model, func)(
self._X.iloc[test] # type: ignore
)
if t_values.ndim == 1:
t_values = t_values[:, None]
column_names = [f"p{i}" for i in range(t_values.shape[1])]
@@ -152,7 +158,7 @@ def _get_predictions(self, func):
t_df.columns = [f"fold{i_fold}_{x}" for x in t_df.columns]
predictions = pd.concat(predictions, axis=1)
predictions = predictions.sort_index()
predictions["target"] = self._y.values
predictions["target"] = self._y.values # type: ignore
return predictions

def __getitem__(self, key):
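
FoldsInspector now accepts cv as either an int or a BaseCrossValidator and stores the result of check_cv, which normalizes both forms into a concrete splitter before .split() is called. A standalone sketch of that normalization (not julearn code):

import numpy as np
from sklearn.model_selection import KFold, check_cv

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

cv_from_int = check_cv(5)                  # int -> KFold(n_splits=5)
cv_from_obj = check_cv(KFold(n_splits=3))  # splitter objects pass through unchanged

for train_idx, test_idx in cv_from_int.split(X, y):
    # both normalized forms expose the same ``.split()`` API
    pass
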
2 changes: 1 addition & 1 deletion julearn/inspect/_preprocess.py
@@ -53,7 +53,7 @@ def preprocess(
else:
raise_error(f"No step named {until} found.")
df_out = pipeline[:i].transform(_X)

df_out = df_out.copy()
if not isinstance(df_out, pd.DataFrame) and with_column_types is False:
raise_error(
"The output of the pipeline is not a DataFrame. Cannot remove "
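
The added df_out.copy() presumably hands the caller an independent DataFrame rather than a potential view on the pipeline's output, so that later column edits cannot raise pandas' SettingWithCopyWarning; this rationale is an assumption, the PR itself does not state it. A tiny standalone illustration of the difference:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

maybe_view = df[df["a"] > 1.0]          # may be flagged as a view on ``df``
maybe_view["b"] = 0.0                   # can emit SettingWithCopyWarning

independent = df[df["a"] > 1.0].copy()  # defensive copy
independent["b"] = 0.0                  # safe: clearly mutates only the copy
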
6 changes: 4 additions & 2 deletions julearn/inspect/tests/test_cv.py
@@ -3,7 +3,6 @@
# Authors: Federico Raimondo <[email protected]>
# Sami Hamdan <[email protected]>
# License: AGPL

import numpy as np
import pandas as pd
import pytest
@@ -70,7 +69,10 @@ def scores(df_typed_iris, n_iters=5, mock_model=None):
if mock_model is None:
mock_model = MockModelReturnsIndex

estimators = [WrapModel(mock_model()).fit(X, y) for _ in range(n_iters)]
estimators = [
WrapModel(mock_model()).fit(X, y) # type: ignore
for _ in range(n_iters)
]

return pd.DataFrame(
{
21 changes: 12 additions & 9 deletions julearn/inspect/tests/test_inspector.py
@@ -18,28 +18,28 @@

def test_no_cv() -> None:
"""Test inspector with no cross-validation."""
inspector = Inspector({})
inspector = Inspector({}) # type: ignore
with pytest.raises(ValueError, match="No cv"):
_ = inspector.folds


def test_no_X() -> None:
"""Test inspector with no features."""
inspector = Inspector({}, cv=5)
inspector = Inspector({}, cv=5) # type: ignore
with pytest.raises(ValueError, match="No X"):
_ = inspector.folds


def test_no_y() -> None:
"""Test inspector with no targets."""
inspector = Inspector({}, cv=5, X=[1, 2, 3])
inspector = Inspector({}, cv=5, X=[1, 2, 3]) # type: ignore
with pytest.raises(ValueError, match="No y"):
_ = inspector.folds


def test_no_model() -> None:
"""Test inspector with no model."""
inspector = Inspector({})
inspector = Inspector({}) # type: ignore
with pytest.raises(ValueError, match="No model"):
_ = inspector.model

@@ -63,8 +63,11 @@ def test_normal_usage(df_iris: "pd.DataFrame") -> None:
return_inspector=True,
problem_type="classification",
)
assert pipe == inspect.model._model
for (_, score), inspect_fold in zip(scores.iterrows(), inspect.folds):
assert pipe == inspect.model._model # type: ignore
for (_, score), inspect_fold in zip(
scores.iterrows(), # type: ignore
inspect.folds, # type: ignore
):
assert score["estimator"] == inspect_fold.model._model


@@ -88,6 +91,6 @@ def test_normal_usage_with_search(df_iris: "pd.DataFrame") -> None:
return_estimator="all",
return_inspector=True,
)
assert pipe == inspect.model._model
inspect.model.get_fitted_params()
inspect.model.get_params()
assert pipe == inspect.model._model # type: ignore
inspect.model.get_fitted_params() # type: ignore
inspect.model.get_params() # type: ignore
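
These tests add # type: ignore because they deliberately construct the inspector with wrong or incomplete arguments to assert the runtime errors, which a static checker would otherwise flag. Outside of tests, typing.cast is a narrower way to quiet the checker when the author knows more than the annotations do; a small illustrative sketch, not julearn code:

from typing import cast

import pandas as pd


def as_series(obj: object) -> pd.Series:
    # ``cast`` is a no-op at runtime; it only narrows the type for the checker,
    # and unlike ``# type: ignore`` it keeps the rest of the line type-checked.
    return cast(pd.Series, obj)


target = as_series(pd.Series([0, 1, 2]))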