Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify logic for final model training #273

Merged
merged 5 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changes/newsfragments/293.enh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Change the internal logic of :func:`.run_cross_validation` to optimise joblib calls by `Fede Raimondo`_
1 change: 0 additions & 1 deletion examples/02_inspection/run_binary_inspect_folds.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
creator.add("zscore")
creator.add("svm")

cv = ShuffleSplit(n_splits=5, train_size=0.7, random_state=200)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=4, random_state=200)

scores, model, inspector = run_cross_validation(
Expand Down
38 changes: 25 additions & 13 deletions julearn/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
import sklearn
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
check_cv,
cross_validate,
)
from sklearn.model_selection._search import BaseSearchCV
from sklearn.pipeline import Pipeline

from .inspect import Inspector
from .model_selection.utils import check_cv
from .pipeline import PipelineCreator
from .pipeline.merger import merge_pipelines
from .prepare import check_consistency, prepare_input_data
Expand Down Expand Up @@ -541,16 +541,19 @@ def run_cross_validation(
seed=seed,
)

include_final_model = return_estimator in ["final", "all"]
cv_return_estimator = return_estimator in ["cv", "all", "final"]

# Prepare cross validation
cv_outer = check_cv(
cv, # type: ignore
classifier=problem_type == "classification",
include_final_model=include_final_model,
)
logger.info(f"Using outer CV scheme {cv_outer}")

check_consistency(df_y, cv, groups, problem_type) # type: ignore

cv_return_estimator = return_estimator in ["cv", "all"]
scoring = check_scoring(
pipeline, # type: ignore
scoring,
Expand Down Expand Up @@ -583,30 +586,39 @@ def run_cross_validation(
**_sklearn_deprec_fit_params,
)

n_repeats = getattr(cv_outer, "n_repeats", 1)
n_folds = len(scores["fit_time"]) // n_repeats

repeats = np.repeat(np.arange(n_repeats), n_folds)
folds = np.tile(np.arange(n_folds), n_repeats)

fold_sizes = np.array(
[
list(map(len, x))
for x in cv_outer.split(df_X, df_y, groups=df_groups)
]
)

if include_final_model:
# If we include the final model, we need to remove the last item in
# the scores as this is the final model
pipeline = scores["estimator"][-1]
if return_estimator == "final":
scores.pop("estimator")
scores = {k: v[:-1] for k, v in scores.items()}
fold_sizes = fold_sizes[:-1]

n_repeats = getattr(cv_outer, "n_repeats", 1)
n_folds = len(scores["fit_time"]) // n_repeats

repeats = np.repeat(np.arange(n_repeats), n_folds)
folds = np.tile(np.arange(n_folds), n_repeats)

scores["n_train"] = fold_sizes[:, 0]
scores["n_test"] = fold_sizes[:, 1]
scores["repeat"] = repeats
scores["fold"] = folds
scores["cv_mdsum"] = cv_mdsum

scores_df = pd.DataFrame(scores)

out = scores_df
if return_estimator in ["final", "all"]:
logger.info("Fitting final model")
pipeline.fit(df_X, df_y, **fit_params)
out = scores_df, pipeline
if include_final_model:
out = out, pipeline

if return_inspector:
inspector = Inspector(
Expand All @@ -615,7 +627,7 @@ def run_cross_validation(
X=df_X,
y=df_y,
groups=df_groups,
cv=cv_outer,
cv=cv_outer.cv if include_final_model else cv_outer,
)
if isinstance(out, tuple):
out = (*out, inspector)
Expand Down
96 changes: 96 additions & 0 deletions julearn/model_selection/final_model_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""CV Wrapper that includes a fold with all the data."""

# Authors: Federico Raimondo <[email protected]>
# License: AGPL
fraimondo marked this conversation as resolved.
Show resolved Hide resolved

from typing import TYPE_CHECKING, Generator, Optional, Tuple

import numpy as np


if TYPE_CHECKING:
from sklearn.model_selection import BaseCrossValidator


class _JulearnFinalModelCV:
    """Final model cross-validation iterator.

    Wraps any CV iterator to provide an extra iteration in which the model is
    trained on the full dataset.

    Parameters
    ----------
    cv : BaseCrossValidator
        The cross-validation iterator to wrap.

    Attributes
    ----------
    cv : BaseCrossValidator
        The wrapped cross-validation iterator.
    n_repeats : int
        Copied from the wrapped iterator. Only set when the wrapped iterator
        defines ``n_repeats``.

    """

    def __init__(self, cv: "BaseCrossValidator") -> None:
        self.cv = cv
        # Mirror ``n_repeats`` so that code reading it from the wrapper gets
        # the same value it would get from the wrapped iterator.
        if hasattr(cv, "n_repeats"):
            self.n_repeats = cv.n_repeats

    def split(
        self,
        X: np.ndarray,  # noqa: N803
        y: np.ndarray,
        groups: Optional[np.ndarray] = None,
    ) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        After all the folds of the wrapped iterator, this CV splitter yields
        one extra fold in which every sample is used for training and only
        the first two samples are used for testing. This allows the final
        model to be trained on the full dataset as part of the
        cross-validation itself, profiting from the same joblib calls.

        """
        yield from self.cv.split(X, y, groups)
        all_inds = np.arange(len(X))
        # For the last fold, train on all samples and return only 2 for
        # testing. The scores of this fold are meant to be discarded.
        # NOTE(review): presumably 2 (and not 1) test samples are used so
        # that scorers needing more than one sample do not fail -- confirm.
        yield all_inds, all_inds[:2]

    def get_n_splits(
        self,
        X: Optional[np.ndarray] = None,  # noqa: N803
        y: Optional[np.ndarray] = None,
        groups: Optional[np.ndarray] = None,
    ) -> int:
        """Get the number of splits.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), default=None
            Forwarded to the wrapped iterator.
        y : array-like of shape (n_samples,), default=None
            Forwarded to the wrapped iterator.
        groups : array-like of shape (n_samples,), default=None
            Forwarded to the wrapped iterator.

        Returns
        -------
        int
            The number of splits of the wrapped iterator plus one (the extra
            fold that trains on the full dataset).

        """
        if X is None and y is None and groups is None:
            # Keep the argument-less call so that wrapped iterators whose
            # ``get_n_splits`` takes no parameters keep working.
            return self.cv.get_n_splits() + 1
        return self.cv.get_n_splits(X, y, groups) + 1

    def __repr__(self) -> str:
        """Return the representation of the object.

        Returns
        -------
        str
            The representation of the wrapped iterator followed by
            ``" (incl. final model)"``.

        """
        return f"{self.cv} (incl. final model)"
53 changes: 53 additions & 0 deletions julearn/model_selection/tests/test_final_model_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Provides tests for the final model CV."""

# Authors: Federico Raimondo <[email protected]>
# License: AGPL
fraimondo marked this conversation as resolved.
Show resolved Hide resolved

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.model_selection import RepeatedStratifiedKFold

from julearn.model_selection.final_model_cv import _JulearnFinalModelCV
from julearn.utils import _compute_cvmdsum


def test_final_model_cv() -> None:
    """Test that the final model CV replays the wrapped folds plus one."""
    n_samples = 123
    n_features = 10
    X = np.zeros((n_samples, n_features))
    y = np.zeros(n_samples)

    sklearn_cv = RepeatedStratifiedKFold(
        n_repeats=2, n_splits=5, random_state=42
    )
    julearn_cv = _JulearnFinalModelCV(sklearn_cv)

    # 2 repeats x 5 splits from the wrapped CV, plus the final model fold
    assert julearn_cv.get_n_splits() == 11
    assert julearn_cv.n_repeats == 2

    folds_ju = list(julearn_cv.split(X, y))
    folds_sk = list(sklearn_cv.split(X, y))
    assert len(folds_ju) == len(folds_sk) + 1

    # The leading folds must be exactly the ones of the wrapped CV
    for (train_ju, test_ju), (train_sk, test_sk) in zip(folds_ju, folds_sk):
        assert_array_equal(train_ju, train_sk)
        assert_array_equal(test_ju, test_sk)

    # The extra fold trains on every sample and tests on two of them
    final_train, final_test = folds_ju[-1]
    assert final_train.shape[0] == n_samples
    assert final_test.shape[0] == 2
    assert_array_equal(final_train, np.arange(n_samples))


def test_final_model_cv_mdsum() -> None:
    """Test that wrapping a CV does not change its mdsum."""
    sklearn_cv = RepeatedStratifiedKFold(
        n_repeats=2, n_splits=5, random_state=42
    )
    julearn_cv = _JulearnFinalModelCV(sklearn_cv)

    # The wrapper must yield the same mdsum as the CV it wraps
    expected = _compute_cvmdsum(sklearn_cv)
    assert _compute_cvmdsum(julearn_cv) == expected
55 changes: 55 additions & 0 deletions julearn/model_selection/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""Utility functions for model selection in julearn."""

# Authors: Federico Raimondo <[email protected]>
# License: AGPL

from typing import TYPE_CHECKING

from sklearn.model_selection import check_cv as sk_check_cv

from .final_model_cv import _JulearnFinalModelCV


if TYPE_CHECKING:
from ..utils.typing import CVLike


def check_cv(
    cv: "CVLike", classifier: bool = False, include_final_model: bool = False
) -> "CVLike":
    """Check the CV instance and return the proper CV for julearn.

    The input is first validated with scikit-learn's ``check_cv`` and then,
    if requested, wrapped so that it also yields a fold for the final model.

    Parameters
    ----------
    cv : int, cross-validation generator, iterable or None
        Cross-validation splitting strategy to use for model evaluation.

        Options are:

        * None: defaults to 5-fold
        * int: the number of folds in a `(Stratified)KFold`
        * CV Splitter (see scikit-learn documentation on CV)
        * An iterable yielding (train, test) splits as arrays of indices.

    classifier : bool, default=False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    include_final_model : bool, default=False
        Whether to include the final model in the cross-validation. If true,
        the checked CV is wrapped in ``_JulearnFinalModelCV``, which adds one
        more fold to the cross-validation, where the full dataset is used for
        training.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.

    """
    checked_cv = sk_check_cv(cv, classifier=classifier)
    if not include_final_model:
        return checked_cv
    return _JulearnFinalModelCV(checked_cv)
10 changes: 6 additions & 4 deletions julearn/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
scoring = "accuracy"

np.random.seed(42)
cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)

model_params = {"svm__C": [0.01, 0.001]}
search_params = {"cv": cv_inner}
Expand All @@ -434,10 +434,12 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
problem_type="classification",
)

assert len(actual["repeat"].unique()) == 2

# Now do the same with scikit-learn
np.random.seed(42)
cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)

clf = make_pipeline(SVC())
gs = GridSearchCV(
Expand Down
7 changes: 7 additions & 0 deletions julearn/utils/_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ContinuousStratifiedGroupKFold,
RepeatedContinuousStratifiedGroupKFold,
)
from ..model_selection.final_model_cv import _JulearnFinalModelCV


def _recurse_to_list(a):
Expand All @@ -40,6 +41,9 @@ def _recurse_to_list(a):

def _compute_cvmdsum(cv):
"""Compute the sum of the CV generator."""
if isinstance(cv, _JulearnFinalModelCV):
return _compute_cvmdsum(cv.cv)

params = dict(vars(cv).items())
params["class"] = cv.__class__.__name__

Expand All @@ -59,6 +63,7 @@ def _compute_cvmdsum(cv):
params["test_fold"] = params["test_fold"].tolist()
params["unique_folds"] = params["unique_folds"].tolist()


if "cv" in params:
if inspect.isclass(params["cv"]):
params["cv"] = params["cv"].__class__.__name__
Expand All @@ -72,6 +77,8 @@ def _compute_cvmdsum(cv):


def is_nonoverlapping_cv(cv) -> bool:
if isinstance(cv, _JulearnFinalModelCV):
return is_nonoverlapping_cv(cv.cv)
_valid_instances = (
KFold,
GroupKFold,
Expand Down
9 changes: 8 additions & 1 deletion julearn/utils/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from sklearn.model_selection import BaseCrossValidator, BaseShuffleSplit
from sklearn.model_selection._split import _RepeatedSplits

from ..model_selection.final_model_cv import _JulearnFinalModelCV


try: # sklearn >= 1.4.0
from sklearn.metrics._scorer import _Scorer # type: ignore
Expand Down Expand Up @@ -387,5 +389,10 @@ def get_apply_to(self) -> ColumnTypes:


CVLike = Union[
int, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit, Iterable
int,
BaseCrossValidator,
_RepeatedSplits,
BaseShuffleSplit,
Iterable,
_JulearnFinalModelCV,
]
Loading