diff --git a/julearn/base/column_types.py b/julearn/base/column_types.py
index f6f2a64f6..350a16699 100644
--- a/julearn/base/column_types.py
+++ b/julearn/base/column_types.py
@@ -240,6 +240,44 @@ def __eq__(self, other: Union["ColumnTypes", str]):
         other = other if isinstance(other, ColumnTypes) else ColumnTypes(other)
         return self._column_types == other._column_types
 
+    def __and__(self, other: Union["ColumnTypes", str]) -> "ColumnTypes":
+        """Get the intersection of the column_types.
+
+        Parameters
+        ----------
+        other : ColumnTypes or str
+            The other column_types to get the intersection with.
+
+        Returns
+        -------
+        ColumnTypes
+            The intersection of the column_types.
+
+        """
+        other = other if isinstance(other, ColumnTypes) else ColumnTypes(other)
+        return ColumnTypes(self._column_types & other._column_types)
+
+    def __or__(self, other: Union["ColumnTypes", str]) -> "ColumnTypes":
+        """Get the union of the column_types.
+
+        Parameters
+        ----------
+        other : ColumnTypes or str
+            The other column_types to get the union with.
+
+        Returns
+        -------
+        ColumnTypes
+            The union of the column_types.
+
+        """
+        other = other if isinstance(other, ColumnTypes) else ColumnTypes(other)
+        return ColumnTypes(self._column_types | other._column_types)
+
+    def __len__(self) -> int:
+        """Get the number of column_types."""
+        return len(self._column_types)
+
     def __iter__(self):
         """Iterate over the column_types."""
 
diff --git a/julearn/conftest.py b/julearn/conftest.py
index b4b27e69c..de9c985a7 100644
--- a/julearn/conftest.py
+++ b/julearn/conftest.py
@@ -215,7 +215,9 @@ def model(request: FixtureRequest) -> str:
     return request.param
 
 
-@fixture(params=["regression", "classification"], scope="function")
+@fixture(
+    params=["regression", "classification", "transformer"], scope="function"
+)
 def problem_type(request: FixtureRequest) -> str:
     """Return different problem types.
 
diff --git a/julearn/pipeline/pipeline_creator.py b/julearn/pipeline/pipeline_creator.py
index 8a8b4bf30..98f7c8150 100644
--- a/julearn/pipeline/pipeline_creator.py
+++ b/julearn/pipeline/pipeline_creator.py
@@ -228,6 +228,37 @@ def add(
                 " and only to the PipelineCreator like this"
                 " PipelineCreator(problem_type=problem_type)"
             )
+
+        if self._added_target_generator:
+            # If a target generator was added, we need to make sure that
+            # the apply_to parameter is set.
+            if apply_to is None:
+                raise_error(
+                    "A target generator was added. To prevent training on "
+                    "the features used to generate the target, you need to "
+                    "explicitly set the apply_to parameter."
+                )
+            else:
+                # and it should also be set to exclude what is used to
+                # generate the target.
+                apply_to = ColumnTypes(apply_to)
+                if len(apply_to & ColumnTypes("*")) > 0:
+                    raise_error(
+                        "A target generator was added. The apply_to parameter "
+                        "of subsequent steps cannot include the wildcard type "
+                        "'*'."
+                    )
+                else:
+                    target_gen_step = next(
+                        x for x in self._steps if x.name == "generate_target"
+                    )
+                    if len(apply_to & target_gen_step.apply_to) > 0:
+                        raise_error(
+                            "A target generator was added. The apply_to "
+                            "parameter of subsequent steps should exclude the "
+                            "types used to generate the target."
+                        )
+
         apply_to = self.apply_to if apply_to is None else apply_to
         apply_to = ColumnTypes(apply_to)
 
@@ -240,7 +271,6 @@ def add(
             step = typing.cast(JuTargetPipeline, step)
 
         # The name "generate_target" is reserved for the target generator step
-        # TODO: Add CI TEST
         if name == "generate_target" and step != "generate_target":
             raise_error(
                 "The name 'generate_target' is reserved for the target "
@@ -303,13 +333,21 @@ def add(
             else:
                 logger.debug(f"Special step is {step}")
                 name = "generate_target"
+                if len(apply_to & ColumnTypes("*")) > 0:
+                    raise_error(
+                        "The 'generate_target' step cannot apply to all types."
+                    )
                 if "transformer" not in params_to_set:
-                    # TODO: CI TEST
                     raise_error(
                         "The 'generate_target' step should have a "
                         "transformer parameter."
                     )
                 step = params_to_set["transformer"]
+                if not isinstance(step, PipelineCreator):
+                    raise_error(
+                        "The transformer parameter in the generate_target "
+                        "step should be a PipelineCreator."
+                    )
             elif len(params_to_set) > 0:
                 step.set_params(**params_to_set)  # type: ignore
 
diff --git a/julearn/pipeline/tests/test_pipeline_creator.py b/julearn/pipeline/tests/test_pipeline_creator.py
index b2b8e7253..5c88db7c5 100644
--- a/julearn/pipeline/tests/test_pipeline_creator.py
+++ b/julearn/pipeline/tests/test_pipeline_creator.py
@@ -9,8 +9,11 @@
 
 import pandas as pd
 import pytest
+from numpy.testing import assert_array_equal
+from sklearn.decomposition import PCA
 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 from sklearn.preprocessing import RobustScaler, StandardScaler
 from sklearn.svm import SVC
@@ -45,7 +48,8 @@ def test_construction_working(
     preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
     for step in preprocess:
         creator.add(step, apply_to="categorical")
-    creator.add(model)
+    if problem_type in ["classification", "regression"]:
+        creator.add(model)
     X_types = {"categorical": ["A"]}
     pipeline = creator.to_pipeline(X_types=X_types)
 
@@ -60,16 +64,19 @@ def test_construction_working(
         )
 
     # check model step
-    model_name, model = pipeline.steps[-1]
-    assert isinstance(model, WrapModel)
-    assert isinstance(
-        model.model,
-        get_model(
-            model_name,
-            problem_type=problem_type,
-        ).__class__,
-    )
-    assert len(preprocess) + 2 == len(pipeline.steps)
+    if problem_type in ["classification", "regression"]:
+        model_name, model = pipeline.steps[-1]
+        assert isinstance(model, WrapModel)
+        assert isinstance(
+            model.model,
+            get_model(
+                model_name,
+                problem_type=problem_type,
+            ).__class__,
+        )
+        assert len(preprocess) + 2 == len(pipeline.steps)
+    else:
+        assert len(preprocess) + 1 == len(pipeline.steps)
 
 
 def test_fit_and_transform_no_error(
@@ -98,10 +105,14 @@ def test_fit_and_transform_no_error(
     creator = PipelineCreator.from_list(
         preprocess, model_params={}, problem_type=problem_type
     )
-    creator.add(model)
+    if problem_type in ["classification", "regression"]:
+        creator.add(model)
     pipeline = creator.to_pipeline({})
     pipeline.fit(X_iris, y_iris)
-    pipeline[:-1].transform(X_iris)
+    if problem_type in ["classification", "regression"]:
+        pipeline[:-1].transform(X_iris)
+    else:
+        pipeline.transform(X_iris)
 
 
 def _hyperparam_tuning_base_test(
@@ -138,6 +149,8 @@ def _hyperparam_tuning_base_test(
         convention.
 
     """
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     if isinstance(preprocess, str):
         preprocess = [preprocess]
 
@@ -197,7 +210,8 @@ def test_hyperparameter_tuning(
 
 
     """
-
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     pipeline, param_grid = _hyperparam_tuning_base_test(
         X_types_iris,
         model,
@@ -245,6 +259,8 @@ def test_hyperparameter_tuning_bayes(
         The parameters for the search.
 
     """
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     BayesSearchCV = pytest.importorskip("skopt.BayesSearchCV")
 
     pipeline, param_grid = _hyperparam_tuning_base_test(
@@ -292,6 +308,8 @@ def test_hyperparameter_tuning_optuna(
     # OptunaSearchCV = optuna_integration.OptunaSearchCV
     from julearn.external.optuna_searchcv import OptunaSearchCV
 
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     pipeline, param_grid = _hyperparam_tuning_base_test(
         X_types_iris,
         model,
@@ -363,6 +381,8 @@ def test_hyperparameter_tuning_distributions(
         The parameters for the search.
 
     """
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     kind = "grid"
     if search_params is not None:
         kind = search_params.get("kind", "grid")
@@ -411,6 +431,8 @@ def test_hyperparameter_tuning_distributions_bayes(
         The parameters for the search.
 
     """
+    if problem_type == "transformer":
+        pytest.skip("Transformers can't be tuned")
     BayesSearchCV = pytest.importorskip("skopt.BayesSearchCV")
 
     pipeline, param_grid = _hyperparam_tuning_base_test(
@@ -858,3 +880,88 @@ def test_PipelineCreator_set_hyperparameter() -> None:
 
     model3 = creator3.to_pipeline()
     assert model3.steps[-1][1].get_params()["strategy"] == "uniform"
+
+
+def test_PipelineCreator_generated_target(
+    X_iris: pd.DataFrame,  # noqa: N803
+) -> None:
+    """Test the pipeline creator with a generated target."""
+
+    # Create a transformer that will apply to the petal features
+    transformer_creator = PipelineCreator(
+        problem_type="transformer", apply_to="petal"
+    )
+    transformer_creator.add("pca", n_components=2, random_state=42)
+    transformer_creator.add("pick_columns", keep="pca__pca0")
+
+    # Create a model that uses the previous transformer to generate the target
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    creator.add(
+        "generate_target", apply_to="petal", transformer=transformer_creator
+    )
+    creator.add("linreg", apply_to="sepal")  # sepal only
+
+    X_types = {
+        "sepal": ["sepal_length", "sepal_width"],
+        "petal": ["petal_length", "petal_width"],
+    }
+    fake_y = pd.Series([0] * len(X_iris))
+
+    model = creator.to_pipeline(X_types)
+    assert len(model.steps) == 2
+    assert model.steps[0][0] == "set_column_types"
+    assert model.steps[1][0] == "linreg_target_generate"
+
+    model.fit(X_iris.copy(), fake_y)
+
+    # Get the in sample predictions
+    ju_pred = model.predict(X_iris.copy())
+
+    pca = PCA(n_components=2, random_state=42)
+    linreg = LinearRegression()
+
+    X_iris_petal_vals = X_iris[["petal_length", "petal_width"]].values
+    X_iris_sepal_vals = X_iris[["sepal_length", "sepal_width"]].values
+    y_gen = pca.fit(X_iris_petal_vals).transform(X_iris_petal_vals)[:, 0]
+
+    linreg.fit(X_iris_sepal_vals, y_gen)
+    sk_pred = linreg.predict(X_iris_sepal_vals)
+
+    assert_array_equal(ju_pred, sk_pred)
+
+
+def test_PipelineCreator_generated_target_errors() -> None:
+    """Test errors with the generated target."""
+    transformer_creator = PipelineCreator(
+        problem_type="transformer", apply_to="petal"
+    )
+    transformer_creator.add("pca")
+
+    # Only the add() call expected to fail is guarded by pytest.raises
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    with pytest.raises(ValueError, match="reserved for the target"):
+        creator.add("pca", name="generate_target")
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    with pytest.raises(ValueError, match="have a transformer parameter"):
+        creator.add("generate_target", apply_to="petal")
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    with pytest.raises(ValueError, match="should be a PipelineCreator"):
+        creator.add("generate_target", apply_to="petal", transformer="pca")
+
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    with pytest.raises(ValueError, match="all types"):
+        creator.add(
+            "generate_target", apply_to="*", transformer=transformer_creator
+        )
+
+    creator = PipelineCreator(problem_type="regression", apply_to="*")
+    creator.add(
+        "generate_target", apply_to="petal", transformer=transformer_creator
+    )
+    with pytest.raises(ValueError, match="explicitly set the apply_to"):
+        creator.add("linreg")
+    with pytest.raises(ValueError, match="exclude the types"):
+        creator.add("linreg", apply_to=["sepal", "petal"])
+    with pytest.raises(ValueError, match="wildcard"):
+        creator.add("linreg", apply_to="*")
+    creator.add("linreg", apply_to=["sepal"])
diff --git a/julearn/transformers/dataframe/pick_columns.py b/julearn/transformers/dataframe/pick_columns.py
index 77b677d1f..76ac76a33 100644
--- a/julearn/transformers/dataframe/pick_columns.py
+++ b/julearn/transformers/dataframe/pick_columns.py
@@ -50,7 +50,9 @@ def __init__(
         )
 
     def _fit(
-        self, X: pd.DataFrame, y: Optional[DataLike] = None  # noqa: N803
+        self,
+        X: pd.DataFrame,  # noqa: N803
+        y: Optional[DataLike] = None,
     ) -> "PickColumns":
         """Fit the transformer.
 
@@ -82,9 +84,8 @@ def _fit(
             self.keep_columns_ = []
         self.support_mask_ = self.support_mask_.values
         return self
-        return self
 
-    def transform(self, X: pd.DataFrame) -> pd.DataFrame:  # noqa: N803
+    def transform(self, X: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:  # noqa: N803
         """Pick the columns.
 
         Parameters
@@ -99,7 +100,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:  # noqa: N803
 
         """
         logger.debug(f"Picking columns: {self.keep_columns_}")
-        return X[self.keep_columns_]
+        if len(self.keep_columns_) == 1:
+            out = X[self.keep_columns_[0]]
+        else:
+            out = X[self.keep_columns_]
+        return out
 
     def get_support(
         self, indices: bool = False
@@ -118,9 +123,7 @@ def get_support(
 
         """
         if indices:
-            return np.arange(len(self.support_mask_))[
-                self.support_mask_
-            ]  # type: ignore
+            return np.arange(len(self.support_mask_))[self.support_mask_]  # type: ignore
         else:
             return self.support_mask_  # type: ignore
 
diff --git a/julearn/transformers/target/ju_generated_target_model.py b/julearn/transformers/target/ju_generated_target_model.py
index be4ff9aed..2a699d3f1 100644
--- a/julearn/transformers/target/ju_generated_target_model.py
+++ b/julearn/transformers/target/ju_generated_target_model.py
@@ -237,7 +237,16 @@ def generate_target(
         """
         logger.debug("Generating target")
         gen_y = self.transformer.transform(X)  # type: ignore
-        logger.debug(f"Target generated: {gen_y.columns}")  # type: ignore
+
+        # The transformer may already yield a 1-D output (e.g. PickColumns
+        # returns a series when a single column is kept); only a 2-D,
+        # single-column dataframe needs to be converted to a series.
+        if gen_y.ndim == 2 and gen_y.shape[1] == 1:
+            gen_y = gen_y.iloc[:, 0]
+        if gen_y.ndim == 1:
+            logger.debug(f"Target generated: {gen_y.name}")
+        else:
+            logger.debug(f"Target generated: {gen_y.columns}")
         return gen_y
 
     @property
diff --git 
a/julearn/transformers/target/tests/test_ju_generated_target_model.py b/julearn/transformers/target/tests/test_ju_generated_target_model.py
new file mode 100644
index 000000000..0b8c95e31
--- /dev/null
+++ b/julearn/transformers/target/tests/test_ju_generated_target_model.py
@@ -0,0 +1,40 @@
+"""Provides tests for the JuGeneratedTargetModel class."""
+
+# Authors: Federico Raimondo
+# License: AGPL
+
+import numpy as np
+import pandas as pd
+from numpy.testing import assert_array_equal
+from sklearn.decomposition import PCA
+from sklearn.svm import SVR
+
+from julearn.transformers.target.ju_generated_target_model import (
+    JuGeneratedTargetModel,
+)
+
+
+def test_JuGeneratedTargetModel(
+    X_iris: pd.DataFrame,  # noqa: N803
+) -> None:
+    """Test JuGeneratedTargetModel."""
+    model = SVR()
+    transformer = PCA(n_components=1, random_state=42)
+    transformer.set_output(transform="pandas")
+
+    ju_generated_target_model = JuGeneratedTargetModel(
+        model=model,  # type: ignore
+        transformer=transformer,  # type: ignore
+    )
+
+    fake_y = pd.Series(np.zeros(X_iris.shape[0]))
+
+    ju_generated_target_model.fit(X_iris, y=fake_y)
+    y_pred = ju_generated_target_model.predict(X_iris)
+
+    # Reference: plain scikit-learn fit on the same generated 1-D target
+    model_sk = SVR()
+    y_gen = transformer.fit(X_iris).transform(X_iris).iloc[:, 0]
+    model_sk.fit(X_iris, y_gen)
+    y_pred_sk = model_sk.predict(X_iris)
+    assert_array_equal(y_pred, y_pred_sk)