Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Generate the target from features #269

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions examples/03_complex_models/run_generate_target.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Target Generation
=================

This example uses the ``iris`` dataset and tests a regression model in which
the target variable is generated from some features within the cross-validation
procedure. We will use the Iris dataset and generate a target variable using
PCA on the petal features. Then, we will evaluate if a regression model can
predict the generated target from the sepal features

.. include:: ../../links.inc
"""
# Authors: Federico Raimondo <[email protected]>
# License: AGPL

from seaborn import load_dataset
from julearn import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.utils import configure_logging

###############################################################################
# Set the logging level to info to see extra information.
configure_logging(level="DEBUG")

###############################################################################
df_iris = load_dataset("iris")


###############################################################################
# As features, we will use the sepal length, width and petal length.
# We will try to predict the species.

X = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
y = "__generated__" # to indicate to julearn that the target will be generated


# Define our feature types
X_types = {
"sepal": ["sepal_length", "sepal_width"],
"petal": ["petal_length", "petal_width"],
}

###############################################################################
# We now use a Pipeline Creator to create the pipeline that will generate the
# features. This special pipeline should be configured to be a "transformer"
# and apply to the "petal" feature types.

target_creator = PipelineCreator(problem_type="transformer", apply_to="petal")
target_creator.add("pca", n_components=2)
# Select only the first component
target_creator.add("pick_columns", keep="pca__pca0")


###############################################################################
# We now create the pipeline that will be used to predict the target. This
# pipeline will be a regression pipeline. The step previous to the model should
# be the the `generate_target`, applying to the "petal" features and using the
# target_creator pipeline as the transformer.
creator = PipelineCreator(problem_type="regression")
creator.add("zscore", apply_to="*")
creator.add("generate_target", apply_to="petal", transformer=target_creator)
creator.add("linreg", apply_to="sepal")

###############################################################################
# We finally evaluate the model within the cross validation.
scores, model = run_cross_validation(
X=X,
y=y,
X_types=X_types,
data=df_iris,
model=creator,
return_estimator="final",
cv=2,
)

print(scores["test_score"]) # type: ignore

11 changes: 11 additions & 0 deletions julearn/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ def run_cross_validation( # noqa: C901

wrap_score = False
if isinstance(model, (PipelineCreator, list)):
logger.debug(
"Generating pipeline from PipelineCreator or list of them"
)
if preprocess is not None:
raise_error(
"If model is a PipelineCreator (or list of), "
Expand Down Expand Up @@ -266,6 +269,7 @@ def run_cross_validation( # noqa: C901
expanded_models.extend(m.split())

has_target_transformer = expanded_models[-1]._added_target_transformer
has_target_generator = expanded_models[-1]._added_target_generator
all_pipelines = [
model.to_pipeline(X_types=X_types, search_params=search_params)
for model in expanded_models
Expand All @@ -279,12 +283,16 @@ def run_cross_validation( # noqa: C901
pipeline = all_pipelines[0]

if has_target_transformer:
logger.debug("Pipeline has target transformer")
if isinstance(pipeline, BaseSearchCV):
last_step = pipeline.estimator[-1] # type: ignore
else:
last_step = pipeline[-1]
if not last_step.can_inverse_transform():
wrap_score = True
if has_target_generator:
logger.debug("Pipeline has target generator")
wrap_score = True
problem_type = model[0].problem_type

elif not isinstance(model, (str, BaseEstimator)):
Expand Down Expand Up @@ -343,12 +351,15 @@ def run_cross_validation( # noqa: C901
"The following model_params are incorrect: " f"{unused_params}"
)
has_target_transformer = pipeline_creator._added_target_transformer
has_target_generator = pipeline_creator._added_target_generator
pipeline = pipeline_creator.to_pipeline(
X_types=X_types, search_params=search_params
)

if has_target_transformer and not pipeline[-1].can_inverse_transform():
wrap_score = True
if has_target_generator:
wrap_score = True

# Log some information
logger.info("= Data Information =")
Expand Down
54 changes: 53 additions & 1 deletion julearn/base/column_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import Callable, List, Set, Union
from typing import Any, Callable, Dict, List, Set, Union

from sklearn.compose import make_column_selector

Expand Down Expand Up @@ -240,6 +240,42 @@ def __eq__(self, other: Union["ColumnTypes", str]):
other = other if isinstance(other, ColumnTypes) else ColumnTypes(other)
return self._column_types == other._column_types

def __and__(self, other: "ColumnTypes"):
"""Get the intersection of the column_types.

Parameters
----------
other : ColumnTypes
The other column_types to get the intersection with.

Returns
-------
ColumnTypes
The intersection of the column_types.

"""
return ColumnTypes(self._column_types & other._column_types)

def __or__(self, other: "ColumnTypes"):
"""Get the union of the column_types.

Parameters
----------
other : ColumnTypes
The other column_types to get the union with.

Returns
-------
ColumnTypes
The union of the column_types.

"""
return ColumnTypes(self._column_types | other._column_types)

def __len__(self):
    """Return the number of stored column types."""
    n_types = len(self._column_types)
    return n_types

def __iter__(self):
"""Iterate over the column_types."""

Expand All @@ -251,6 +287,22 @@ def __repr__(self):
f"ColumnTypes<types={self._column_types}; pattern={self.pattern}>"
)

def filter(self, X_types: Dict[str, Any]) -> Dict[str, Any]:  # noqa: N803
    """Filter the X_types based on the column_types.

    Only entries whose key is one of this object's column types are kept;
    all other entries are dropped.

    Parameters
    ----------
    X_types : dict
        The types of the columns.

    Returns
    -------
    dict:
        The filtered X_types.

    """
    known_types = self._column_types
    filtered: Dict[str, Any] = {}
    for type_name, columns in X_types.items():
        if type_name in known_types:
            filtered[type_name] = columns
    return filtered

def copy(self) -> "ColumnTypes":
"""Get a copy of the ColumnTypes.

Expand Down
4 changes: 3 additions & 1 deletion julearn/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def model(request: FixtureRequest) -> str:
return request.param


@fixture(params=["regression", "classification"], scope="function")
@fixture(
params=["regression", "classification", "transformer"], scope="function"
)
def problem_type(request: FixtureRequest) -> str:
"""Return different problem types.

Expand Down
Loading
Loading