Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve pipeline 'from_config' method #191

Merged
merged 7 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ repos:
- id: flake8
additional_dependencies: [ flake8-docstrings, "flake8-bugbear==22.8.23" ]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.11
hooks:
- id: ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.11
hooks:
- id: ruff

- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
Expand Down
16 changes: 3 additions & 13 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,15 @@ Next, the Melusine team, or the community, will give you a feedback on whether y

## Fork to code in your personal Melusine repo

The first step is to get our MAIF repository on your personal GitHub repositories. To do so, use the "Fork" button.
The first step is to get our MAIF repository on your personal GitHub repositories. To do so, use the "Fork" button on Github landing page of melusine project.

<img src="https://raw.githubusercontent.com/MAIF/melusine/master/docs/assets/images/contributing/fork_melusine.PNG" alt="fork this repository" />

## Clone your forked repository

<img align="right" width="300" src="https://raw.githubusercontent.com/MAIF/melusine/master/docs/assets/images/contributing/clone_melusine.PNG" alt="clone your forked repository" />

Click on the "Code" button to copy the url of your repository, and next, you can paste this url to clone your forked repository.

```
git clone https://github.com/YOUR_GITHUB_PROFILE/melusine.git
git clone https://github.com/<YOUR_GITHUB_PROFILE>/melusine.git
```

## Make sure that your repository is up-to-date
Expand Down Expand Up @@ -137,9 +134,7 @@ Your branch is now available on your remote forked repository, with your changes

A pull request allows you to ask the Melusine team to review your changes, and merge your changes into the master branch of the official repository.

To create one, on the top of your forked repository, you will find a button "Compare & pull request"

<img src="https://raw.githubusercontent.com/MAIF/melusine/master/docs/assets/images/contributing/melusine-compare-pr.png" alt="pull request" />
To create one, on the top of your forked repository, you will find a button "Compare & pull request".

As you can see, you can select on the right side which branch of your forked repository you want to associate to the pull request.

Expand All @@ -150,12 +145,7 @@ On the left side, you will find the official Melusine repository. Due to increas
- Head repository: your-github-username/melusine
- Head branch: your-contribution-branch

<img src="https://raw.githubusercontent.com/MAIF/melusine/master/docs/assets/images/contributing/melusine-pr-branch.png" alt="clone your forked repository" />

Once you have selected the right branch, let's create the pull request with the green button "Create pull request".

<img src="https://raw.githubusercontent.com/MAIF/melusine/master/docs/assets/images/contributing/melusine-pr-description.png" alt="clone your forked repository" />

In the description, a template is initialized with all information you have to give about what you are doing on what your PR is doing.

Please follow this to write your PR content.
Expand Down
2 changes: 1 addition & 1 deletion melusine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

__all__ = ["config"]

VERSION = (3, 1, 0)
VERSION = (3, 1, 1)
__version__ = ".".join(map(str, VERSION))

# ------------------------------- #
Expand Down
53 changes: 39 additions & 14 deletions melusine/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import copy
import importlib
import warnings
from typing import Iterable, TypeVar

from sklearn.pipeline import Pipeline
Expand Down Expand Up @@ -35,7 +36,7 @@ class MelusinePipeline(Pipeline):
"""

OBJ_NAME: str = "name"
OBJ_KEY: str = "config_key"
OBJ_CONFIG_KEY: str = "config_key"
OBJ_PARAMS: str = "parameters"
STEPS_KEY: str = "steps"
OBJ_CLASS: str = "class_name"
Expand Down Expand Up @@ -194,10 +195,10 @@ def from_config(
# Get config dict
if config_key and not config_dict:
raw_config_dict = config[config_key]
config_dict = cls.parse_pipeline_config(raw_config_dict)
config_dict = cls.parse_pipeline_config(config_dict=raw_config_dict)

elif config_dict and not config_key:
config_dict = cls.parse_pipeline_config(config_dict)
config_dict = cls.parse_pipeline_config(config_dict=config_dict)
else:
raise ValueError("You should specify one and only one of 'config_key' and 'config_dict'")

Expand All @@ -218,10 +219,33 @@ def from_config(
# Step arguments
obj_params = obj_meta[cls.OBJ_PARAMS]

if issubclass(obj_class, IoMixin):
# Verbose option
suppress_warnings: Any = kwargs.pop("suppress_warnings", False)

try:
obj = obj_class.from_config(config_dict=obj_params)
else:
raise TypeError(f"Object {obj_class} does not inherit from the SaverMixin class") # pragma: no cover
if not issubclass(obj_class, IoMixin) and not suppress_warnings:
type_warn_msg: str = f"""
It seems you are not using a melusine object in your melusine pipeline, but object '{obj_class}' (class {type(obj_class)}) for step '{step_name}'.
The expected behavior is not guaranteed and can break in future version of melusine.

Recommended usage:
- Exclusive usage of melusine class Pipeline with melusine objects (MelusineTransformer, MelusineDetector...)

To suppress this warning, instanciate melusine Pipeline with suppress_warnings argument at True (MelusinePipeline.from_config(..., suppress_warnings=True)).

Visit Melusine Open-Source project: https://github.com/MAIF/melusine and the documentation for more information.
"""
warnings.warn(message=type_warn_msg, category=DeprecationWarning, stacklevel=2)
except AttributeError as err:
if not hasattr(obj_class, "from_config"):
raise AttributeError(
f"Object '{obj_class}' does not implement 'from_config' method, maybe consider to inherit it from the Melusine IoMixin class or use a MelusineTransformer class."
) from None
else:
raise AttributeError(f"Error in loading class: '{obj_class}'\\n{str(err)}").with_traceback(
err.__traceback__
) from err

# Add step to pipeline
steps.append((step_name, obj))
Expand Down Expand Up @@ -251,17 +275,17 @@ def validate_step_config(cls, step: dict[str, Any]) -> dict[str, Any]:
f"Pipeline step conf should have a {cls.OBJ_MODULE} key and a {cls.OBJ_CLASS} key."
)

if step.get(cls.OBJ_KEY):
if step.get(cls.OBJ_CONFIG_KEY):
return {
cls.OBJ_CLASS: step[cls.OBJ_CLASS],
cls.OBJ_MODULE: step[cls.OBJ_MODULE],
cls.OBJ_KEY: step[cls.OBJ_KEY],
cls.OBJ_CONFIG_KEY: step[cls.OBJ_CONFIG_KEY],
}

if not step.get(cls.OBJ_NAME) or not step.get(cls.OBJ_PARAMS):
raise PipelineConfigurationError(
f"Pipeline step conf should have a {cls.OBJ_NAME} key and a {cls.OBJ_KEY} key "
f"(unless a {cls.OBJ_KEY} is specified)."
f"Pipeline step conf should have a {cls.OBJ_NAME} key and a {cls.OBJ_CONFIG_KEY} key "
f"(unless a {cls.OBJ_CONFIG_KEY} is specified)."
)

if not isinstance(step[cls.OBJ_PARAMS], dict):
Expand Down Expand Up @@ -325,11 +349,12 @@ def parse_pipeline_config(cls, config_dict: dict[str, Any]) -> dict[str, Any]:
steps = []
for step in config_dict[cls.STEPS_KEY]:
# Step defined from the config
config_key = step.get(cls.OBJ_KEY)
config_key = step.get(cls.OBJ_CONFIG_KEY)
config_name = step.get(cls.OBJ_NAME)
if config_key:
# Use config key as step name
step[cls.OBJ_NAME] = config_key
_ = step.pop(cls.OBJ_KEY)
# Use config key as step name if no name is provided in config
step[cls.OBJ_NAME] = config_name or config_key
_ = step.pop(cls.OBJ_CONFIG_KEY)
# Update step parameters
step[cls.OBJ_PARAMS] = config[config_key]

Expand Down
126 changes: 121 additions & 5 deletions tests/pipeline/test_pipeline_basic.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
Example script to fit a minimal preprocessing pipeline
"""
from contextlib import nullcontext as does_not_raise
from unittest import mock

import pandas as pd
import pytest
Expand Down Expand Up @@ -34,7 +36,7 @@ def test_pipeline_basic(dataframe_basic):
@pytest.mark.usefixtures("use_test_config")
def test_pipeline_from_config(dataframe_basic):
"""
Train a pipeline using transformers defined in a pipeline config file.
Train a pipeline using scikit-learn transformers defined in a pipeline config file.
"""
# Input data
df = dataframe_basic.copy()
Expand Down Expand Up @@ -79,7 +81,7 @@ def test_pipeline_from_config(dataframe_basic):
@pytest.mark.usefixtures("use_test_config")
def test_pipeline_from_dict(dataframe_basic):
"""
Train a pipeline using transformers defined in a pipeline config file.
Train a pipeline using scikit-learn transformers defined in a pipeline config file.
"""
# Input data
df = dataframe_basic.copy()
Expand Down Expand Up @@ -125,7 +127,7 @@ def test_pipeline_from_dict(dataframe_basic):
@pytest.mark.usefixtures("use_test_config")
def test_missing_config_key():
"""
Train a pipeline using transformers defined in a pipeline config file.
Train a pipeline using scikit-learn transformers defined in a pipeline config file.
"""
# Set config keys
normalizer_name = "normalizer"
Expand Down Expand Up @@ -159,7 +161,7 @@ def test_missing_config_key():
@pytest.mark.usefixtures("use_test_config")
def test_invalid_config_key():
"""
Train a pipeline using transformers defined in a pipeline config file.
Train a pipeline using scikit-learn transformers defined in a pipeline config file.
"""
incorrect_config_key = "INCORRECT_CONFIG_KEY"

Expand Down Expand Up @@ -263,7 +265,7 @@ def test_invalid_config_key():
)
def test_pipeline_config_error(pipeline_conf):
"""
Train a pipeline using transformers defined in a pipeline config file.
Train a pipeline using scikit-learn transformers defined in a pipeline config file.
"""
# Create pipeline from a json config file (using config key "my_pipeline")
with pytest.raises(PipelineConfigurationError):
Expand Down Expand Up @@ -313,3 +315,117 @@ def test_missing_input_field(dataframe_basic):
# Fit the pipeline and transform the data
with pytest.raises(ValueError, match=rf"(?s){tokenizer_name}.*{missing_field_name}"):
pipe.transform(df)


@pytest.mark.usefixtures("use_test_config")
def test_pipeline_from_config_with_error():
"""
Instanciate a Melusine Pipeline not using scikit-learn transformers defined in a pipeline config file (raising errors).
"""
# Set config keys
normalizer_key = "test_normalizer"
pipeline_key = "test_pipeline"

# Pipeline configuration
conf_pipeline_basic = {
"steps": [
{
"class_name": "Normalizer",
"module": "melusine.processors",
"config_key": normalizer_key,
},
]
}

test_conf_dict = config.dict()
test_conf_dict[pipeline_key] = conf_pipeline_basic
config.reset(config_dict=test_conf_dict)

with mock.patch("melusine.pipeline.MelusinePipeline.import_class") as mock_import_class, pytest.raises(
AttributeError, match=r"Object '.+' does not implement 'from_config'.+to inherit it from the Melusine IoMixin.+"
):

class WrongNormalizerWithoutMethodFromConfig:
def __init__(self, form, input_columns, lowercase, output_columns) -> None:
"""Mock class without from_config method so we raise error."""
return None

mock_import_class.return_value = WrongNormalizerWithoutMethodFromConfig

# Create pipeline from a json config file
_ = MelusinePipeline.from_config(config_key=pipeline_key, verbose=True)

with mock.patch("melusine.pipeline.MelusinePipeline.import_class") as mock_import_class, pytest.raises(
AttributeError
) as e:

class WrongNormalizerWithMethodFromConfig:
def __init__(self) -> None:
return None

@classmethod
def from_config(cls, config_dict: dict):
"""Mock method from_config so we have a valid 'duck typed' class but with attribute error."""

return cls.class_constant_not_defined_raising_attribute_error

mock_import_class.return_value = WrongNormalizerWithMethodFromConfig

# Create pipeline from a json config file
_ = MelusinePipeline.from_config(config_key=pipeline_key, verbose=True)

assert "does not implement 'from_config'" not in str(e)


@pytest.mark.usefixtures("use_test_config")
def test_pipeline_from_config_with_warning(recwarn):
"""
Instanciate a Melusine Pipeline not using scikit-learn transformers defined in a pipeline config file (raising warnings).
"""
# Set config keys
normalizer_key = "test_normalizer"
pipeline_key = "test_pipeline"

# Pipeline configuration
conf_pipeline_basic = {
"steps": [
{
"class_name": "Normalizer",
"module": "melusine.processors",
"config_key": normalizer_key,
},
]
}

test_conf_dict = config.dict()
test_conf_dict[pipeline_key] = conf_pipeline_basic
config.reset(config_dict=test_conf_dict)

with mock.patch("melusine.pipeline.MelusinePipeline.import_class") as mock_import_class:

class WrongNormalizerWithMethodFromConfig:
def __init__(self) -> None:
return None

@classmethod
def from_config(cls, config_dict: dict):
"""Mock method from_config so we have a valid 'duck typed' class but with invalid functional behavior."""
return cls

mock_import_class.return_value = WrongNormalizerWithMethodFromConfig

with does_not_raise():
# Create pipeline from a json config file and suppress warning
_ = MelusinePipeline.from_config(config_key=pipeline_key, verbose=True, suppress_warnings=True)

assert len(recwarn) == 0

_ = MelusinePipeline.from_config(config_key=pipeline_key, verbose=True, suppress_warnings=False)

assert len(recwarn) == 1

with pytest.warns(
DeprecationWarning, match=r".+It seems you are not using a melusine object in your melusine pipeline.+"
):
# Create pipeline from a json config file
_ = MelusinePipeline.from_config(config_key=pipeline_key, verbose=True, suppress_warnings=False)
Loading