Fix typing and warnings as much as we can #261

Merged 3 commits on Apr 29, 2024
Changes from all commits
25 changes: 19 additions & 6 deletions julearn/api.py
@@ -4,13 +4,13 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import Dict, Iterable, List, Optional, Union
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
BaseCrossValidator,
check_cv,
cross_validate,
)
@@ -23,6 +23,7 @@
from .prepare import check_consistency, prepare_input_data
from .scoring import check_scoring
from .utils import _compute_cvmdsum, logger, raise_error
from .utils.typing import CVLike


def run_cross_validation( # noqa: C901
@@ -36,7 +37,7 @@ def run_cross_validation( # noqa: C901
return_estimator: Optional[str] = None,
return_inspector: bool = False,
return_train_score: bool = False,
cv: Optional[Union[int, BaseCrossValidator, Iterable]] = None,
cv: Optional[CVLike] = None,
groups: Optional[str] = None,
scoring: Union[str, List[str], None] = None,
pos_labels: Union[str, List[str], None] = None,
@@ -357,20 +358,32 @@

# Prepare cross validation
cv_outer = check_cv(
cv, classifier=problem_type == "classification" # type: ignore
cv, # type: ignore
classifier=problem_type == "classification",
)
logger.info(f"Using outer CV scheme {cv_outer}")

check_consistency(df_y, cv, groups, problem_type) # type: ignore

cv_return_estimator = return_estimator in ["cv", "all"]
scoring = check_scoring(pipeline, scoring, wrap_score=wrap_score)
scoring = check_scoring(
pipeline, # type: ignore
scoring,
wrap_score=wrap_score,
)

cv_mdsum = _compute_cvmdsum(cv_outer)
fit_params = {}
if df_groups is not None:
if isinstance(pipeline, BaseSearchCV):
fit_params["groups"] = df_groups.values

_sklearn_deprec_fit_params = {}
if sklearn.__version__ >= "1.4.0":
_sklearn_deprec_fit_params["params"] = fit_params
else:
_sklearn_deprec_fit_params["fit_params"] = fit_params

scores = cross_validate(
pipeline,
df_X,
@@ -382,7 +395,7 @@ def run_cross_validation( # noqa: C901
n_jobs=n_jobs,
return_train_score=return_train_score,
verbose=verbose, # type: ignore
fit_params=fit_params,
**_sklearn_deprec_fit_params,
)

n_repeats = getattr(cv_outer, "n_repeats", 1)
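
Note on the new _sklearn_deprec_fit_params block above: scikit-learn 1.4 renamed the fit_params argument of cross_validate to params, so the dictionary is unpacked under whichever keyword the installed version accepts. A minimal sketch of the same switch with a proper version comparison; the helper name and the packaging import are illustrative assumptions, not part of this PR (a plain string comparison can misorder releases such as "1.10.0" vs "1.4.0"):

from typing import Any, Dict

import sklearn
from packaging.version import Version  # assumed available; not a stated julearn dependency


def _cross_validate_fit_kwargs(fit_params: Dict[str, Any]) -> Dict[str, Any]:
    """Return the fit parameters under the keyword the installed sklearn expects."""
    if Version(sklearn.__version__) >= Version("1.4.0"):
        # sklearn >= 1.4: cross_validate(..., params=...)
        return {"params": fit_params}
    # older sklearn: cross_validate(..., fit_params=...)
    return {"fit_params": fit_params}

# Usage mirroring the diff:
#   scores = cross_validate(pipeline, df_X, df_y, cv=cv_outer,
#                           **_cross_validate_fit_kwargs(fit_params))
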
34 changes: 26 additions & 8 deletions julearn/base/estimators.py
@@ -13,11 +13,11 @@


try: # sklearn < 1.4.0
from sklearn.utils.validation import _check_fit_params
from sklearn.utils.validation import _check_fit_params # type: ignore

fit_params_checker = _check_fit_params
except ImportError: # sklearn >= 1.4.0
from sklearn.utils.validation import _check_method_params
from sklearn.utils.validation import _check_method_params # type: ignore

fit_params_checker = _check_method_params

@@ -180,7 +180,12 @@ def __init__(
self.row_select_col_type = row_select_col_type
self.row_select_vals = row_select_vals

def fit(self, X, y=None, **fit_params): # noqa: N803
def fit(
self,
X: pd.DataFrame, # noqa: N803
y: Optional[pd.Series] = None,
**fit_params,
):
"""Fit the model.

This method will fit the model using only the columns selected by
@@ -217,8 +222,21 @@ def fit(self, X, y=None, **fit_params): # noqa: N803
self.row_select_vals = [self.row_select_vals]
return self._fit(**self._select_rows(X, y, **fit_params))

def _fit(
self,
X: pd.DataFrame, # noqa: N803,
y: Optional[pd.Series],
**kwargs,
) -> None:
raise_error(
"This method should be implemented in the concrete class",
klass=NotImplementedError,
)

def _add_backed_filtered(
self, X: pd.DataFrame, X_trans: pd.DataFrame # noqa: N803
self,
X: pd.DataFrame, # noqa: N803
X_trans: pd.DataFrame, # noqa: N803
) -> pd.DataFrame:
"""Add the left-out columns back to the transformed data.

@@ -301,7 +319,7 @@ def __init__(

def fit(
self,
X: pd.DataFrame, # noqa: N803
X: DataLike, # noqa: N803
y: Optional[DataLike] = None,
**fit_params: Any,
) -> "WrapModel":
@@ -312,7 +330,7 @@ def fit(

Parameters
----------
X : pd.DataFrame
X : DataLike
The data to fit the model on.
y : DataLike, optional
The target data (default is None).
@@ -329,9 +347,9 @@ def fit(
if self.needed_types is not None:
self.needed_types = ensure_column_types(self.needed_types)

Xt = self.filter_columns(X)
Xt = self.filter_columns(X) # type: ignore
self.model_ = self.model
self.model_.fit(Xt, y, **fit_params)
self.model_.fit(Xt, y, **fit_params) # type: ignore
return self

def predict(self, X: pd.DataFrame) -> DataLike: # noqa: N803
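
The newly typed _fit stub above makes the selection base class an explicit template method: the public fit performs the row/column selection and then delegates to _fit, which each concrete subclass must override. A self-contained sketch of that pattern, with illustrative class names that are not julearn's:

from typing import Optional

import pandas as pd


class RowSelectingEstimator:
    """Base class: ``fit`` prepares the data, ``_fit`` does the real work."""

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        # ... row/column selection would happen here ...
        return self._fit(X, y)

    def _fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
        raise NotImplementedError(
            "This method should be implemented in the concrete class"
        )


class MeanEstimator(RowSelectingEstimator):
    """Concrete subclass overriding the hook."""

    def _fit(self, X: pd.DataFrame, y: Optional[pd.Series]):
        self.means_ = X.mean()
        return self
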
2 changes: 1 addition & 1 deletion julearn/base/tests/test_base_estimators.py
@@ -110,7 +110,7 @@ def test_WrapModel(

np.random.seed(42)
lr = model()
lr.fit(X_iris_selected, y_iris)
lr.fit(X_iris_selected, y_iris) # type: ignore
pred_sk = lr.predict(X_iris_selected)

np.random.seed(42)
26 changes: 16 additions & 10 deletions julearn/inspect/_cv.py
@@ -4,13 +4,14 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import List, Optional, Union
from typing import Optional, Union

import pandas as pd
from sklearn.model_selection import BaseCrossValidator, check_cv
from sklearn.utils.metaestimators import available_if

from ..utils import _compute_cvmdsum, is_nonoverlapping_cv, raise_error
from ..utils.typing import DataLike
from ._pipeline import PipelineInspector


@@ -60,14 +61,13 @@ class FoldsInspector:
def __init__(
self,
scores: pd.DataFrame,
cv: BaseCrossValidator,
X: Union[str, List[str]], # noqa: N803
y: str,
cv: Union[BaseCrossValidator, int],
X: DataLike, # noqa: N803
y: pd.Series,
func: str = "predict",
groups: Optional[str] = None,
groups: Optional[pd.Series] = None,
):
self._scores = scores
self._cv = cv
self._X = X
self._y = y
self._func = func
@@ -92,7 +92,7 @@ def __init__(
)

cv = check_cv(cv)

self._cv = cv
t_cv_mdsum = _compute_cvmdsum(cv)
if t_cv_mdsum != cv_mdsums[0]:
raise_error(
@@ -120,10 +120,16 @@ def _get_predictions(self, func):

predictions = []
for i_fold, (_, test) in enumerate(
self._cv.split(self._X, self._y, groups=self._groups)
self._cv.split(
self._X, # type: ignore
self._y,
groups=self._groups,
)
):
t_model = self._scores["estimator"][i_fold]
t_values = getattr(t_model, func)(self._X.iloc[test])
t_values = getattr(t_model, func)(
self._X.iloc[test] # type: ignore
)
if t_values.ndim == 1:
t_values = t_values[:, None]
column_names = [f"p{i}" for i in range(t_values.shape[1])]
@@ -152,7 +158,7 @@ def _get_predictions(self, func):
t_df.columns = [f"fold{i_fold}_{x}" for x in t_df.columns]
predictions = pd.concat(predictions, axis=1)
predictions = predictions.sort_index()
predictions["target"] = self._y.values
predictions["target"] = self._y.values # type: ignore
return predictions

def __getitem__(self, key):
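
FoldsInspector now accepts cv as either an int or a BaseCrossValidator and stores the result of check_cv, which normalizes both forms into a concrete splitter before .split() is called. A standalone sketch of that normalization (not julearn code):

import numpy as np
from sklearn.model_selection import KFold, check_cv

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

cv_from_int = check_cv(5)                  # int -> KFold(n_splits=5)
cv_from_obj = check_cv(KFold(n_splits=3))  # splitter objects pass through unchanged

for train_idx, test_idx in cv_from_int.split(X, y):
    # both normalized forms expose the same ``.split()`` API
    pass
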
2 changes: 1 addition & 1 deletion julearn/inspect/_preprocess.py
@@ -53,7 +53,7 @@ def preprocess(
else:
raise_error(f"No step named {until} found.")
df_out = pipeline[:i].transform(_X)

df_out = df_out.copy()
if not isinstance(df_out, pd.DataFrame) and with_column_types is False:
raise_error(
"The output of the pipeline is not a DataFrame. Cannot remove "
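
The added df_out.copy() presumably hands the caller an independent DataFrame rather than a potential view on the pipeline's output, so that later column edits cannot raise pandas' SettingWithCopyWarning; this rationale is an assumption, the PR itself does not state it. A tiny standalone illustration of the difference:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

maybe_view = df[df["a"] > 1.0]          # may be flagged as a view on ``df``
maybe_view["b"] = 0.0                   # can emit SettingWithCopyWarning

independent = df[df["a"] > 1.0].copy()  # defensive copy
independent["b"] = 0.0                  # safe: clearly mutates only the copy
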
6 changes: 4 additions & 2 deletions julearn/inspect/tests/test_cv.py
@@ -3,7 +3,6 @@
# Authors: Federico Raimondo <[email protected]>
# Sami Hamdan <[email protected]>
# License: AGPL

import numpy as np
import pandas as pd
import pytest
@@ -70,7 +69,10 @@ def scores(df_typed_iris, n_iters=5, mock_model=None):
if mock_model is None:
mock_model = MockModelReturnsIndex

estimators = [WrapModel(mock_model()).fit(X, y) for _ in range(n_iters)]
estimators = [
WrapModel(mock_model()).fit(X, y) # type: ignore
for _ in range(n_iters)
]

return pd.DataFrame(
{
21 changes: 12 additions & 9 deletions julearn/inspect/tests/test_inspector.py
@@ -18,28 +18,28 @@

def test_no_cv() -> None:
"""Test inspector with no cross-validation."""
inspector = Inspector({})
inspector = Inspector({}) # type: ignore
with pytest.raises(ValueError, match="No cv"):
_ = inspector.folds


def test_no_X() -> None:
"""Test inspector with no features."""
inspector = Inspector({}, cv=5)
inspector = Inspector({}, cv=5) # type: ignore
with pytest.raises(ValueError, match="No X"):
_ = inspector.folds


def test_no_y() -> None:
"""Test inspector with no targets."""
inspector = Inspector({}, cv=5, X=[1, 2, 3])
inspector = Inspector({}, cv=5, X=[1, 2, 3]) # type: ignore
with pytest.raises(ValueError, match="No y"):
_ = inspector.folds


def test_no_model() -> None:
"""Test inspector with no model."""
inspector = Inspector({})
inspector = Inspector({}) # type: ignore
with pytest.raises(ValueError, match="No model"):
_ = inspector.model

@@ -63,8 +63,11 @@ def test_normal_usage(df_iris: "pd.DataFrame") -> None:
return_inspector=True,
problem_type="classification",
)
assert pipe == inspect.model._model
for (_, score), inspect_fold in zip(scores.iterrows(), inspect.folds):
assert pipe == inspect.model._model # type: ignore
for (_, score), inspect_fold in zip(
scores.iterrows(), # type: ignore
inspect.folds, # type: ignore
):
assert score["estimator"] == inspect_fold.model._model


@@ -88,6 +91,6 @@ def test_normal_usage_with_search(df_iris: "pd.DataFrame") -> None:
return_estimator="all",
return_inspector=True,
)
assert pipe == inspect.model._model
inspect.model.get_fitted_params()
inspect.model.get_params()
assert pipe == inspect.model._model # type: ignore
inspect.model.get_fitted_params() # type: ignore
inspect.model.get_params() # type: ignore
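
These tests add # type: ignore because they deliberately construct the inspector with wrong or incomplete arguments to assert the runtime errors, which a static checker would otherwise flag. Outside of tests, typing.cast is a narrower way to quiet the checker when the author knows more than the annotations do; a small illustrative sketch, not julearn code:

from typing import cast

import pandas as pd


def as_series(obj: object) -> pd.Series:
    # ``cast`` is a no-op at runtime; it only narrows the type for the checker,
    # and unlike ``# type: ignore`` it keeps the rest of the line type-checked.
    return cast(pd.Series, obj)


target = as_series(pd.Series([0, 1, 2]))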