diff --git a/openstef/pipeline/optimize_hyperparameters.py b/openstef/pipeline/optimize_hyperparameters.py index 6bce42bd6..90f5786b5 100644 --- a/openstef/pipeline/optimize_hyperparameters.py +++ b/openstef/pipeline/optimize_hyperparameters.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: MPL-2.0 import os -from typing import Any +from typing import Any, Union import optuna import pandas as pd @@ -22,7 +22,7 @@ from openstef.model.regressors.regressor import OpenstfRegressor from openstef.model.serializer import MLflowSerializer from openstef.pipeline.train_model import ( - DEFAULT_TRAIN_HORIZONS, + DEFAULT_TRAIN_HORIZONS_HOURS, train_model_pipeline_core, ) from openstef.validation import validation @@ -43,7 +43,6 @@ def optimize_hyperparameters_pipeline( input_data: pd.DataFrame, mlflow_tracking_uri: str, artifact_folder: str, - horizons: list[float] = DEFAULT_TRAIN_HORIZONS, n_trials: int = N_TRIALS, ) -> dict: """Optimize hyperparameters pipeline. @@ -65,6 +64,12 @@ def optimize_hyperparameters_pipeline( Optimized hyperparameters. """ + if pj.train_horizons_minutes is None: + horizons = DEFAULT_TRAIN_HORIZONS_HOURS + else: + horizons = [ + horizon_minutes / 60 for horizon_minutes in pj.train_horizons_minutes + ] ( best_model, model_specs, @@ -97,7 +102,7 @@ def optimize_hyperparameters_pipeline( def optimize_hyperparameters_pipeline_core( pj: PredictionJobDataClass, input_data: pd.DataFrame, - horizons: list[float] = DEFAULT_TRAIN_HORIZONS, + horizons: Union[list[float], str] = DEFAULT_TRAIN_HORIZONS_HOURS, n_trials: int = N_TRIALS, ) -> tuple[ OpenstfRegressor, ModelSpecificationDataClass, Report, dict, int, dict[str, Any] @@ -109,7 +114,7 @@ def optimize_hyperparameters_pipeline_core( Args: pj: Prediction job input_data: Raw training input data - horizons: horizons for feature engineering. + horizons: horizons for feature engineering in hours. n_trials: The number of trials. Defaults to N_TRIALS. Raises: @@ -124,9 +129,6 @@ def optimize_hyperparameters_pipeline_core( - Optimized hyperparameters. """ - if pj.train_horizons_minutes is not None: - horizons = pj.train_horizons_minutes - if input_data.empty: raise InputDataInsufficientError("Input dataframe is empty") elif "load" not in input_data.columns: diff --git a/openstef/pipeline/train_model.py b/openstef/pipeline/train_model.py index 676cc8325..db7d72608 100644 --- a/openstef/pipeline/train_model.py +++ b/openstef/pipeline/train_model.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: MPL-2.0 import logging import os -from typing import Optional, Union, Tuple +from typing import Optional, Union import pandas as pd import structlog @@ -25,7 +25,7 @@ from openstef.model_selection.model_selection import split_data_train_validation_test from openstef.validation import validation -DEFAULT_TRAIN_HORIZONS: list[float] = [0.25, 47.0] +DEFAULT_TRAIN_HORIZONS_HOURS: list[float] = [0.25, 47.0] MAXIMUM_MODEL_AGE: int = 7 DEFAULT_EARLY_STOPPING_ROUNDS: int = 10 @@ -80,12 +80,19 @@ def train_model_pipeline( # Train model with core pipeline try: + if pj.train_horizons_minutes is None: + horizons = DEFAULT_TRAIN_HORIZONS_HOURS + else: + horizons = [ + horizon_minutes / 60 for horizon_minutes in pj.train_horizons_minutes + ] + model, report, model_specs_updated, data_sets = train_model_pipeline_core( pj, model_specs, input_data, old_model, - horizons=pj.train_horizons_minutes, + horizons=horizons, ) except OldModelHigherScoreError as OMHSE: logger.error("Old model is better than new model", pid=pj["id"], exc_info=OMHSE) @@ -134,7 +141,7 @@ def train_model_pipeline_core( model_specs: ModelSpecificationDataClass, input_data: pd.DataFrame, old_model: OpenstfRegressor = None, - horizons: Union[list[float], str] = None, + horizons: Union[list[float], str] = DEFAULT_TRAIN_HORIZONS_HOURS, ) -> Union[ OpenstfRegressor, Report, @@ -151,7 +158,7 @@ def train_model_pipeline_core( model_specs: Dataclass containing model specifications input_data: Input data old_model: Old model to compare to. Defaults to None. - horizons: horizons to train on in hours. + horizons: Horizons to train on in hours, relevant for feature engineering. Raises: InputDataInsufficientError: when input data is insufficient. @@ -165,12 +172,6 @@ def train_model_pipeline_core( - Datasets (tuple[pd.DataFrmae, pd.DataFrame, pd.Dataframe): The train, validation and test sets """ - if horizons is None: - if pj.train_horizons_minutes is None: - horizons = DEFAULT_TRAIN_HORIZONS - else: - horizons = pj.train_horizons_minutes - logger = structlog.get_logger(__name__) # Call common pipeline