From 9f6bb2262e2ad3373ee7c73a5fe0c60a8903448c Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 25 Sep 2024 09:22:45 +0200 Subject: [PATCH 1/5] Add fit method to API --- julearn/__init__.py | 2 +- julearn/api.py | 470 +++++++++++++++++++++++++++++++------- julearn/prepare.py | 6 +- julearn/tests/test_api.py | 49 +++- 4 files changed, 447 insertions(+), 80 deletions(-) diff --git a/julearn/__init__.py b/julearn/__init__.py index 7ff461303..30478c205 100644 --- a/julearn/__init__.py +++ b/julearn/__init__.py @@ -14,5 +14,5 @@ from . import prepare from . import api from . import stats -from .api import run_cross_validation +from .api import run_cross_validation, run_fit from .pipeline import PipelineCreator, TargetPipelineCreator diff --git a/julearn/api.py b/julearn/api.py index c86086d36..aa5ec06bb 100644 --- a/julearn/api.py +++ b/julearn/api.py @@ -4,7 +4,7 @@ # Sami Hamdan # License: AGPL -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd @@ -26,7 +26,7 @@ from .utils.typing import CVLike -def run_cross_validation( # noqa: C901 +def _validata_api_params( X: List[str], # noqa: N803 y: str, model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]], @@ -36,18 +36,22 @@ def run_cross_validation( # noqa: C901 preprocess: Union[None, str, List[str]] = None, return_estimator: Optional[str] = None, return_inspector: bool = False, - return_train_score: bool = False, - cv: Optional[CVLike] = None, groups: Optional[str] = None, - scoring: Union[str, List[str], None] = None, pos_labels: Union[str, List[str], None] = None, model_params: Optional[Dict] = None, search_params: Optional[Dict] = None, seed: Optional[int] = None, - n_jobs: Optional[int] = None, - verbose: Optional[int] = 0, -): - """Run cross validation and score. 
+) -> Tuple[ + pd.DataFrame, + pd.Series, + Optional[pd.Series], + Union[Pipeline, BaseSearchCV], + Optional[str], + bool, + bool, + str, +]: + """Validate the parameters passed to the API functions. Parameters ---------- @@ -95,28 +99,9 @@ def run_cross_validation( # noqa: C901 return_inspector : bool Whether to return the inspector object (default is False) - - return_train_score : bool - Whether to return the training score with the test scores - (default is False). - cv : int, str or cross-validation generator | None - Cross-validation splitting strategy to use for model evaluation. - - Options are: - - * None: defaults to 5-fold - * int: the number of folds in a `(Stratified)KFold` - * CV Splitter (see scikit-learn documentation on CV) - * An iterable yielding (train, test) splits as arrays of indices. - groups : str | None The grouping labels in case a Group CV is used. See :ref:`data_usage` for details. - scoring : ScorerLike, optional - The scoring metric to use. - See https://scikit-learn.org/stable/modules/model_evaluation.html for - a comprehensive list of options. If None, use the model's default - scorer. pos_labels : str, int, float or list | None The labels to interpret as positive. If not None, every element from y will be converted to 1 if is equal or in pos_labels and to 0 if not. @@ -131,62 +116,29 @@ def run_cross_validation( # noqa: C901 is provided for at least one hyperparameter, a search will be performed. 
- search_params : dict | None - Additional parameters in case Hyperparameter Tuning is performed, with - the following keys: - - * 'kind': The kind of search algorithm to use, Valid options are: - - * ``"grid"`` : :class:`~sklearn.model_selection.GridSearchCV` - * ``"random"`` : - :class:`~sklearn.model_selection.RandomizedSearchCV` - * ``"bayes"`` : :class:`~skopt.BayesSearchCV` - * ``"optuna"`` : - :class:`~optuna_integration.OptunaSearchCV` - * user-registered searcher name : see - :func:`~julearn.model_selection.register_searcher` - * ``scikit-learn``-compatible searcher - - * 'cv': If a searcher is going to be used, the cross-validation - splitting strategy to use. Defaults to same CV as for the model - evaluation. - * 'scoring': If a searcher is going to be used, the scoring metric to - evaluate the performance. - - See :ref:`hp_tuning` for details. seed : int | None If not None, set the random seed before any operation. Useful for reproducibility. - n_jobs : int, optional - Number of jobs to run in parallel. Training the estimator and computing - the score are parallelized over the cross-validation splits. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. - ``-1`` means using all processors (default None). - verbose: int - Verbosity level of outer cross-validation. - Follows scikit-learn/joblib converntions. - 0 means no additional information is printed. - Larger number generally mean more information is printed. - Note: verbosity up to 50 will print into standard error, - while larger than 50 will print in standrad output. Returns ------- - scores : pd.DataFrame - The resulting scores (one column for each score specified). - Additionally, a 'fit_time' column will be added. - And, if ``return_estimator='all'`` or - ``return_estimator='cv'``, an 'estimator' columns with the - corresponding estimators fitted for each CV split. 
- final_estimator : object - The final estimator, fitted on all the data (only if - ``return_estimator='all'`` or ``return_estimator='final'``) - inspector : Inspector | None - The inspector object (only if ``return_inspector=True``) - + df_X : pd.DataFrame + The features DataFrame. + df_y : pd.Series + The target Series. + df_groups : pd.Series | None + The groups Series. + pipeline : Pipeline | BaseSearchCV + The pipeline to use. + return_estimator : str | None + The validated return_estimator parameter. + return_inspector : bool + The validated return_inspector parameter. + wrap_score : bool + Whether to wrap the score or not. + problem_type : str + The problem type. """ - - # Validate parameters if return_estimator not in [None, "final", "cv", "all"]: raise_error( f"return_estimator must be one of None, 'final', 'cv', 'all'. " @@ -365,6 +317,206 @@ def run_cross_validation( # noqa: C901 elif problem_type == "regression": logger.info(f"\tTarget type: {df_y.dtype}") + out = ( + df_X, + df_y, + df_groups, + pipeline, + return_estimator, + return_inspector, + wrap_score, + problem_type, + ) + return out + + +def run_cross_validation( + X: List[str], # noqa: N803 + y: str, + model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]], + data: pd.DataFrame, + X_types: Optional[Dict] = None, # noqa: N803 + problem_type: Optional[str] = None, + preprocess: Union[None, str, List[str]] = None, + return_estimator: Optional[str] = None, + return_inspector: bool = False, + return_train_score: bool = False, + cv: Optional[CVLike] = None, + groups: Optional[str] = None, + scoring: Union[str, List[str], None] = None, + pos_labels: Union[str, List[str], None] = None, + model_params: Optional[Dict] = None, + search_params: Optional[Dict] = None, + seed: Optional[int] = None, + n_jobs: Optional[int] = None, + verbose: Optional[int] = 0, +): + """Run cross validation and score. + + Parameters + ---------- + X : list of str + The features to use. 
+ See :ref:`data_usage` for details. + y : str + The targets to predict. + See :ref:`data_usage` for details. + model : str or scikit-learn compatible model. + If string, it will use one of the available models. + data : pandas.DataFrame + DataFrame with the data. See :ref:`data_usage` for details. + X_types : dict[str, list of str] + A dictionary containing keys with column type as a str and the + columns of this column type as a list of str. + problem_type : str + The kind of problem to model. + + Options are: + + * "classification": Perform a classification + in which the target (y) has categorical classes (default). + The parameter pos_labels can be used to convert a target with + multiple_classes into binary. + * "regression". Perform a regression. The target (y) has to be + ordinal at least. + + preprocess : str, TransformerLike or list or PipelineCreator | None + Transformer to apply to the features. If string, use one of the + available transformers. If list, each element can be a string or + scikit-learn compatible transformer. If None (default), no + transformation is applied. + + See documentation for details. + + return_estimator : str | None + Return the fitted estimator(s). + Options are: + + * 'final': Return the estimator fitted on all the data. + * 'cv': Return all the estimators from each CV split, fitted on the + training data. + * 'all': Return all the estimators (final and cv). + + return_inspector : bool + Whether to return the inspector object (default is False) + + return_train_score : bool + Whether to return the training score with the test scores + (default is False). + cv : int, str or cross-validation generator | None + Cross-validation splitting strategy to use for model evaluation. + + Options are: + + * None: defaults to 5-fold + * int: the number of folds in a `(Stratified)KFold` + * CV Splitter (see scikit-learn documentation on CV) + * An iterable yielding (train, test) splits as arrays of indices.
+ + groups : str | None + The grouping labels in case a Group CV is used. + See :ref:`data_usage` for details. + scoring : ScorerLike, optional + The scoring metric to use. + See https://scikit-learn.org/stable/modules/model_evaluation.html for + a comprehensive list of options. If None, use the model's default + scorer. + pos_labels : str, int, float or list | None + The labels to interpret as positive. If not None, every element from y + will be converted to 1 if is equal or in pos_labels and to 0 if not. + model_params : dict | None + If not None, this dictionary specifies the model parameters to use + + The dictionary can define the following keys: + + * 'STEP__PARAMETER': A value (or several) to be used as PARAMETER for + STEP in the pipeline. Example: 'svm__probability': True will set + the parameter 'probability' of the 'svm' model. If more than one + option is provided for at least one hyperparameter, a search will be + performed. + + search_params : dict | None + Additional parameters in case Hyperparameter Tuning is performed, with + the following keys: + + * 'kind': The kind of search algorithm to use, Valid options are: + + * ``"grid"`` : :class:`~sklearn.model_selection.GridSearchCV` + * ``"random"`` : + :class:`~sklearn.model_selection.RandomizedSearchCV` + * ``"bayes"`` : :class:`~skopt.BayesSearchCV` + * ``"optuna"`` : + :class:`~optuna_integration.OptunaSearchCV` + * user-registered searcher name : see + :func:`~julearn.model_selection.register_searcher` + * ``scikit-learn``-compatible searcher + + * 'cv': If a searcher is going to be used, the cross-validation + splitting strategy to use. Defaults to same CV as for the model + evaluation. + * 'scoring': If a searcher is going to be used, the scoring metric to + evaluate the performance. + + See :ref:`hp_tuning` for details. + seed : int | None + If not None, set the random seed before any operation. Useful for + reproducibility. + n_jobs : int, optional + Number of jobs to run in parallel.
Training the estimator and computing + the score are parallelized over the cross-validation splits. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors (default None). + verbose : int + Verbosity level of outer cross-validation. + Follows scikit-learn/joblib conventions. + 0 means no additional information is printed. + Larger numbers generally mean more information is printed. + Note: verbosity up to 50 will print into standard error, + while larger than 50 will print in standard output. + + Returns + ------- + scores : pd.DataFrame + The resulting scores (one column for each score specified). + Additionally, a 'fit_time' column will be added. + And, if ``return_estimator='all'`` or + ``return_estimator='cv'``, an 'estimator' column with the + corresponding estimators fitted for each CV split. + final_estimator : object + The final estimator, fitted on all the data (only if + ``return_estimator='all'`` or ``return_estimator='final'``) + inspector : Inspector | None + The inspector object (only if ``return_inspector=True``) + + """ + + # Validate parameters + ( + df_X, + df_y, + df_groups, + pipeline, + return_estimator, + return_inspector, + wrap_score, + problem_type, + ) = _validata_api_params( + X=X, + y=y, + model=model, + data=data, + X_types=X_types, + problem_type=problem_type, + preprocess=preprocess, + return_estimator=return_estimator, + return_inspector=return_inspector, + groups=groups, + pos_labels=pos_labels, + model_params=model_params, + search_params=search_params, + seed=seed, + ) + # Prepare cross validation cv_outer = check_cv( cv, # type: ignore @@ -447,3 +599,167 @@ def run_cross_validation( # noqa: C901 out = out, inspector return out + + +def run_fit( + X: List[str], # noqa: N803 + y: str, + model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]], + data: pd.DataFrame, + X_types: Optional[Dict] = None, # noqa: N803 + problem_type: Optional[str] = None, + preprocess:
Union[None, str, List[str]] = None, + groups: Optional[str] = None, + pos_labels: Union[str, List[str], None] = None, + model_params: Optional[Dict] = None, + search_params: Optional[Dict] = None, + seed: Optional[int] = None, + verbose: Optional[int] = 0, +): + """Run cross validation and score. + + Parameters + ---------- + X : list of str + The features to use. + See :ref:`data_usage` for details. + y : str + The targets to predict. + See :ref:`data_usage` for details. + model : str or scikit-learn compatible model. + If string, it will use one of the available models. + data : pandas.DataFrame + DataFrame with the data. See :ref:`data_usage` for details. + X_types : dict[str, list of str] + A dictionary containing keys with column type as a str and the + columns of this column type as a list of str. + problem_type : str + The kind of problem to model. + + Options are: + + * "classification": Perform a classification + in which the target (y) has categorical classes (default). + The parameter pos_labels can be used to convert a target with + multiple_classes into binary. + * "regression". Perform a regression. The target (y) has to be + ordinal at least. + + preprocess : str, TransformerLike or list or PipelineCreator | None + Transformer to apply to the features. If string, use one of the + available transformers. If list, each element can be a string or + scikit-learn compatible transformer. If None (default), no + transformation is applied. + + See documentation for details. + + groups : str | None + The grouping labels in case a Group CV is used. + See :ref:`data_usage` for details. + pos_labels : str, int, float or list | None + The labels to interpret as positive. If not None, every element from y + will be converted to 1 if is equal or in pos_labels and to 0 if not. 
+ model_params : dict | None + If not None, this dictionary specifies the model parameters to use + + The dictionary can define the following keys: + + * 'STEP__PARAMETER': A value (or several) to be used as PARAMETER for + STEP in the pipeline. Example: 'svm__probability': True will set + the parameter 'probability' of the 'svm' model. If more than one + option is provided for at least one hyperparameter, a search will be + performed. + + search_params : dict | None + Additional parameters in case Hyperparameter Tuning is performed, with + the following keys: + + * 'kind': The kind of search algorithm to use, Valid options are: + + * ``"grid"`` : :class:`~sklearn.model_selection.GridSearchCV` + * ``"random"`` : + :class:`~sklearn.model_selection.RandomizedSearchCV` + * ``"bayes"`` : :class:`~skopt.BayesSearchCV` + * ``"optuna"`` : + :class:`~optuna_integration.OptunaSearchCV` + * user-registered searcher name : see + :func:`~julearn.model_selection.register_searcher` + * ``scikit-learn``-compatible searcher + + * 'cv': If a searcher is going to be used, the cross-validation + splitting strategy to use. Defaults to same CV as for the model + evaluation. + * 'scoring': If a searcher is going to be used, the scoring metric to + evaluate the performance. + + See :ref:`hp_tuning` for details. + + seed : int | None + If not None, set the random seed before any operation. Useful for + reproducibility. + verbose : int + Verbosity level of outer cross-validation. + Follows scikit-learn/joblib conventions. + 0 means no additional information is printed. + Larger numbers generally mean more information is printed. + Note: verbosity up to 50 will print into standard error, + while larger than 50 will print in standard output. + + Returns + ------- + scores : pd.DataFrame + The resulting scores (one column for each score specified). + Additionally, a 'fit_time' column will be added.
+ And, if ``return_estimator='all'`` or + ``return_estimator='cv'``, an 'estimator' columns with the + corresponding estimators fitted for each CV split. + final_estimator : object + The final estimator, fitted on all the data (only if + ``return_estimator='all'`` or ``return_estimator='final'``) + inspector : Inspector | None + The inspector object (only if ``return_inspector=True``) + + """ + + # Validate parameters + ( + df_X, + df_y, + df_groups, + pipeline, + _, + _, + _, + problem_type, + ) = _validata_api_params( + X=X, + y=y, + model=model, + data=data, + X_types=X_types, + problem_type=problem_type, + preprocess=preprocess, + return_estimator=None, + return_inspector=False, + groups=groups, + pos_labels=pos_labels, + model_params=model_params, + search_params=search_params, + seed=seed, + ) + + fit_params = {} + if df_groups is not None: + if isinstance(pipeline, BaseSearchCV): + fit_params["groups"] = df_groups.values + + _sklearn_deprec_fit_params = {} + if sklearn.__version__ >= "1.4.0": + _sklearn_deprec_fit_params["params"] = fit_params + else: + _sklearn_deprec_fit_params["fit_params"] = fit_params + + logger.info("Fitting final model") + pipeline.fit(df_X, df_y, **fit_params) + + return pipeline diff --git a/julearn/prepare.py b/julearn/prepare.py index 5001520f2..181aafae5 100644 --- a/julearn/prepare.py +++ b/julearn/prepare.py @@ -386,7 +386,7 @@ def check_consistency( ) else: logger.info("Binary classification problem detected.") - else: + elif problem_type == "regression": # Regression is_numeric = np.issubdtype(y.values.dtype, np.number) # type: ignore if not is_numeric: @@ -401,6 +401,10 @@ def check_consistency( "A regression will be performed but only 2 " "distinct values are defined in y." ) + else: + raise_error( + "The problem type must be either 'classification' or 'regression'." 
+ ) # Check groups and CV scheme if groups is not None: valid_instances = ( diff --git a/julearn/tests/test_api.py b/julearn/tests/test_api.py index 0d05f7dec..beba0994b 100644 --- a/julearn/tests/test_api.py +++ b/julearn/tests/test_api.py @@ -37,7 +37,7 @@ from sklearn.svm import SVC from julearn import run_cross_validation -from julearn.api import _compute_cvmdsum +from julearn.api import _compute_cvmdsum, run_fit from julearn.model_selection import ( ContinuousStratifiedGroupKFold, RepeatedContinuousStratifiedGroupKFold, @@ -1368,3 +1368,50 @@ def test_tune_hyperparam_target(df_iris: pd.DataFrame) -> None: return_inspector=True, ) # TODO: add assertions + + +def test_run_cv_fit(df_binary: pd.DataFrame) -> None: + """Test a simple binary classification problem. + + Parameters + ---------- + df_binary : pd.DataFrame + The iris dataset as a binary classification problem. + df_iris : pd.DataFrame + The iris dataset as a multiclass classification problem. + + """ + X = ["sepal_length", "sepal_width", "petal_length"] + y = "species" + X_types = {"features": X} + + scorers = ["accuracy", "balanced_accuracy"] + + creator = PipelineCreator( + apply_to="features", problem_type="classification" + ) + creator.add("zscore") + creator.add("svm") + + _, model = run_cross_validation( + X=X, + y=y, + data=df_binary, + X_types=X_types, + scoring=scorers, + model=creator, + return_estimator="final", + ) + + # now let's do the same with the fit method + + model2 = run_fit( + X=X, + y=y, + data=df_binary, + X_types=X_types, + model=creator, + ) + + # compare the models + compare_models(model.steps[-1][1], model2.steps[-1][1]) From 36354ca650f1b2d2132325a0648b23b0e482c964 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 25 Sep 2024 09:25:23 +0200 Subject: [PATCH 2/5] Add doc changes --- docs/changes/newsfragments/271.enh | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/changes/newsfragments/271.enh diff --git a/docs/changes/newsfragments/271.enh
b/docs/changes/newsfragments/271.enh new file mode 100644 index 000000000..00d7c240f --- /dev/null +++ b/docs/changes/newsfragments/271.enh @@ -0,0 +1 @@ +Add :func:`.run_fit` that implements a model fitting procedure with the same API as :func:`.run_cross_validation` by `Fede Raimondo`_. \ No newline at end of file From 0839181e7909e40ad83e26d29c67633d5f8130d1 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 25 Sep 2024 09:27:52 +0200 Subject: [PATCH 3/5] Fix linting --- julearn/api.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/julearn/api.py b/julearn/api.py index aa5ec06bb..3f00a82d5 100644 --- a/julearn/api.py +++ b/julearn/api.py @@ -26,7 +26,7 @@ from .utils.typing import CVLike -def _validata_api_params( +def _validata_api_params( # noqa: C901 X: List[str], # noqa: N803 y: str, model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]], @@ -116,6 +116,29 @@ def _validata_api_params( is provided for at least one hyperparameter, a search will be performed. + search_params : dict | None + Additional parameters in case Hyperparameter Tuning is performed, with + the following keys: + + * 'kind': The kind of search algorithm to use, Valid options are: + + * ``"grid"`` : :class:`~sklearn.model_selection.GridSearchCV` + * ``"random"`` : + :class:`~sklearn.model_selection.RandomizedSearchCV` + * ``"bayes"`` : :class:`~skopt.BayesSearchCV` + * ``"optuna"`` : + :class:`~optuna_integration.OptunaSearchCV` + * user-registered searcher name : see + :func:`~julearn.model_selection.register_searcher` + * ``scikit-learn``-compatible searcher + + * 'cv': If a searcher is going to be used, the cross-validation + splitting strategy to use. Defaults to same CV as for the model + evaluation. + * 'scoring': If a searcher is going to be used, the scoring metric to + evaluate the performance. + + See :ref:`hp_tuning` for details. seed : int | None If not None, set the random seed before any operation. 
Useful for reproducibility. @@ -138,6 +161,7 @@ def _validata_api_params( Whether to wrap the score or not. problem_type : str The problem type. + """ if return_estimator not in [None, "final", "cv", "all"]: raise_error( From 1d9f477668bceda944da63e5c7c7afe7b661078a Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 25 Sep 2024 09:29:31 +0200 Subject: [PATCH 4/5] Fix docstrings for run_fit --- julearn/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/julearn/api.py b/julearn/api.py index 3f00a82d5..1b0dde11d 100644 --- a/julearn/api.py +++ b/julearn/api.py @@ -640,7 +640,7 @@ def run_fit( seed: Optional[int] = None, verbose: Optional[int] = 0, ): - """Run cross validation and score. + """Fit the model on all the data. Parameters ---------- From 30e21acc5ffe4d1a511347269e2f7f7b60387126 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Wed, 25 Sep 2024 11:04:31 +0200 Subject: [PATCH 5/5] Update docs to include run_fit --- docs/api/main.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api/main.rst b/docs/api/main.rst index def63a04b..7b2a5a9a6 100644 --- a/docs/api/main.rst +++ b/docs/api/main.rst @@ -15,3 +15,4 @@ Functions :template: function.rst run_cross_validation + run_fit