Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Generate the target from features #269

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions examples/03_complex_models/run_generate_target.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Target Generation
=================

This example uses the ``iris`` dataset and tests a regression model in which
the target variable is generated from some features within the cross-validation
procedure. We will use the Iris dataset and generate a target variable using
PCA on the petal features. Then, we will evaluate if a regression model can
predict the generated target from the sepal features

.. include:: ../../links.inc
"""
# Authors: Federico Raimondo <[email protected]>
# License: AGPL

from seaborn import load_dataset
from julearn import run_cross_validation
from julearn.pipeline import PipelineCreator
from julearn.utils import configure_logging

###############################################################################
# Set the logging level to info to see extra information.
configure_logging(level="DEBUG")

###############################################################################
df_iris = load_dataset("iris")


###############################################################################
# As features, we will use the sepal length, width and petal length.
# We will try to predict the species.

X = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
y = "__generated__" # to indicate to julearn that the target will be generated


# Define our feature types
X_types = {
"sepal": ["sepal_length", "sepal_width"],
"petal": ["petal_length", "petal_width"],
}

###############################################################################
# We now use a Pipeline Creator to create the pipeline that will generate the
# features. This special pipeline should be configured to be a "transformer"
# and apply to the "petal" feature types.

target_creator = PipelineCreator(problem_type="transformer", apply_to="petal")
target_creator.add("pca", n_components=2)
# Select only the first component
target_creator.add("pick_columns", keep="pca__pca0")


###############################################################################
# We now create the pipeline that will be used to predict the target. This
# pipeline will be a regression pipeline. The step previous to the model should
# be the the `generate_target`, applying to the "petal" features and using the
# target_creator pipeline as the transformer.
creator = PipelineCreator(problem_type="regression")
creator.add("zscore", apply_to="*")
creator.add("generate_target", apply_to="petal", transformer=target_creator)
creator.add("linreg", apply_to="sepal")

###############################################################################
# We finally evaluate the model within the cross validation.
scores, model = run_cross_validation(
X=X,
y=y,
X_types=X_types,
data=df_iris,
model=creator,
return_estimator="final",
cv=2,
)

print(scores["test_score"]) # type: ignore

11 changes: 11 additions & 0 deletions julearn/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ def run_cross_validation( # noqa: C901

wrap_score = False
if isinstance(model, (PipelineCreator, list)):
logger.debug(
"Generating pipeline from PipelineCreator or list of them"
)
if preprocess is not None:
raise_error(
"If model is a PipelineCreator (or list of), "
Expand Down Expand Up @@ -266,6 +269,7 @@ def run_cross_validation( # noqa: C901
expanded_models.extend(m.split())

has_target_transformer = expanded_models[-1]._added_target_transformer
has_target_generator = expanded_models[-1]._added_target_generator
all_pipelines = [
model.to_pipeline(X_types=X_types, search_params=search_params)
for model in expanded_models
Expand All @@ -279,12 +283,16 @@ def run_cross_validation( # noqa: C901
pipeline = all_pipelines[0]

if has_target_transformer:
logger.debug("Pipeline has target transformer")
if isinstance(pipeline, BaseSearchCV):
last_step = pipeline.estimator[-1] # type: ignore
else:
last_step = pipeline[-1]
if not last_step.can_inverse_transform():
wrap_score = True
if has_target_generator:
logger.debug("Pipeline has target generator")
wrap_score = True
problem_type = model[0].problem_type

elif not isinstance(model, (str, BaseEstimator)):
Expand Down Expand Up @@ -343,12 +351,15 @@ def run_cross_validation( # noqa: C901
"The following model_params are incorrect: " f"{unused_params}"
)
has_target_transformer = pipeline_creator._added_target_transformer
has_target_generator = pipeline_creator._added_target_generator
pipeline = pipeline_creator.to_pipeline(
X_types=X_types, search_params=search_params
)

if has_target_transformer and not pipeline[-1].can_inverse_transform():
wrap_score = True
if has_target_generator:
wrap_score = True

# Log some information
logger.info("= Data Information =")
Expand Down
54 changes: 53 additions & 1 deletion julearn/base/column_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Sami Hamdan <[email protected]>
# License: AGPL

from typing import Callable, List, Set, Union
from typing import Any, Callable, Dict, List, Set, Union

from sklearn.compose import make_column_selector

Expand Down Expand Up @@ -240,6 +240,42 @@ def __eq__(self, other: Union["ColumnTypes", str]):
other = other if isinstance(other, ColumnTypes) else ColumnTypes(other)
return self._column_types == other._column_types

def __and__(self, other: "ColumnTypes"):
"""Get the intersection of the column_types.

Parameters
----------
other : ColumnTypes
The other column_types to get the intersection with.

Returns
-------
ColumnTypes
The intersection of the column_types.

"""
return ColumnTypes(self._column_types & other._column_types)

def __or__(self, other: "ColumnTypes"):
"""Get the union of the column_types.

Parameters
----------
other : ColumnTypes
The other column_types to get the union with.

Returns
-------
ColumnTypes
The union of the column_types.

"""
return ColumnTypes(self._column_types | other._column_types)

def __len__(self):
    """Return the number of stored column types."""
    n_types = len(self._column_types)
    return n_types

def __iter__(self):
"""Iterate over the column_types."""

Expand All @@ -251,6 +287,22 @@ def __repr__(self):
f"ColumnTypes<types={self._column_types}; pattern={self.pattern}>"
)

def filter(self, X_types: Dict[str, Any]) -> Dict[str, Any]:  # noqa: N803
    """Filter the X_types based on the column_types.

    Only entries whose key is one of this object's column types are kept;
    all other entries are dropped.

    Parameters
    ----------
    X_types : dict
        The types of the columns.

    Returns
    -------
    dict:
        The filtered X_types.

    """
    known_types = self._column_types
    filtered: Dict[str, Any] = {}
    for type_name, columns in X_types.items():
        if type_name in known_types:
            filtered[type_name] = columns
    return filtered

def copy(self) -> "ColumnTypes":
"""Get a copy of the ColumnTypes.

Expand Down
4 changes: 3 additions & 1 deletion julearn/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def model(request: FixtureRequest) -> str:
return request.param


@fixture(params=["regression", "classification"], scope="function")
@fixture(
params=["regression", "classification", "transformer"], scope="function"
)
def problem_type(request: FixtureRequest) -> str:
"""Return different problem types.

Expand Down
Loading
Loading