From eb0a020055fa8da51a66a506fe7b645e58db4ba4 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Sat, 10 Jun 2023 15:03:36 +0200 Subject: [PATCH 01/47] fall back to pandas if no datetime format is found --- skrub/_table_vectorizer.py | 29 +++++++++++++++++++++++++--- skrub/tests/test_table_vectorizer.py | 5 +++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 42574546b..48136b741 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -75,7 +75,7 @@ def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> Optional[ warnings.warn( f""" Both {date_format_monthfirst.iloc[0]} and {date_format_dayfirst.iloc[0]} are valid - formats for the dates in column {date_column.name}. + formats for the dates in column '{date_column.name}'. Format {date_format_monthfirst.iloc[0]} will be used. """, UserWarning, @@ -516,9 +516,32 @@ def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: except (ValueError, TypeError): # Only try to convert to datetime # if the variable isn't numeric. 
+ # try to find the best format format = _infer_date_format(X[col]) - if format is not None: - X[col] = pd.to_datetime(X[col], errors="raise", format=format) + # if a format is found, try to apply to the whole column + # if no format is found, pandas will try to parse each row + # with a different engine, which can understand weirder formats + try: + # catch the warnings raised by pandas + # in case the conversion fails + with warnings.catch_warnings(record=True) as w: + X[col] = pd.to_datetime( + X[col], errors="raise", format=format + ) + # if the conversion worked, raise pandas warnings + for warning in w: + with warnings.catch_warnings(): + # otherwise the warning is considered a duplicate + warnings.simplefilter("always") + warnings.warn( + "Warning raised by pandas when converting column" + f" '{col}' to datetime: " + + str(warning.message), + UserWarning, + stacklevel=2, + ) + except (ValueError, TypeError): + pass # Cast pandas dtypes to numpy dtypes # for earlier versions of sklearn. FIXME: which ? 
if issubclass(X[col].dtype.__class__, ExtensionDtype): diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index a8d448573..4bb4a6336 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -142,6 +142,10 @@ def _get_datetimes_dataframe() -> pd.DataFrame: "2015/12/31 01:31:34", "2014/01/31 00:32:45", ], + # this date format is not found by pandas guess_datetime_format + # so shoulnd't be found by our _infer_datetime_format + # but pandas.to_datetime can still parse it + "mm/dd/yy": ["12/1/22", "2/3/05", "2/1/20", "10/7/99", "1/23/04"], } ) @@ -255,6 +259,7 @@ def test_auto_cast() -> None: "dmy-": "datetime64[ns]", "ymd/": "datetime64[ns]", "ymd/_hms:": "datetime64[ns]", + "mm/dd/yy": "datetime64[ns]", } X_trans = vectorizer._auto_cast(X) for col in X_trans.columns: From f813fbb6a17273e3f825124136deb8ac151d7c1b Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Sat, 10 Jun 2023 15:18:04 +0200 Subject: [PATCH 02/47] change changelog --- CHANGES.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9305f1381..02f151663 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -39,9 +39,10 @@ Major changes Minor changes ------------- * Improvement of date column detection and date format inference in :class:`TableVectorizer`. The - format inference now finds a format which works for all non-missing values of the column, instead - of relying on pandas behavior. If no such format exists, the column is not casted to a date column. + format inference now tries to find a format which works for all non-missing values of the column, and only + tries pandas default inference if it fails. 
:pr:`543` by :user:`Leo Grinsztajn ` + :pr:`587` by :user:`Leo Grinsztajn ` Release 0.4.0 ============= From d9a374f82e089c2fcab6aabe57076c825915bd90 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 12 Jun 2023 17:14:56 +0200 Subject: [PATCH 03/47] first working version --- skrub/_table_vectorizer.py | 111 ++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 48136b741..1506c3fde 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -5,6 +5,7 @@ """ import warnings +from itertools import chain from typing import Dict, List, Literal, Optional, Tuple, Union from warnings import warn @@ -16,16 +17,39 @@ from sklearn import __version__ as sklearn_version from sklearn.base import TransformerMixin, clone from sklearn.compose import ColumnTransformer -from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.utils._set_output import _get_output_config from sklearn.utils.deprecation import deprecated from sklearn.utils.validation import check_is_fitted -from skrub import DatetimeEncoder, GapEncoder +from skrub import DatetimeEncoder, GapEncoder, MinHashEncoder from skrub._utils import parse_version # Required for ignoring lines too long in the docstrings # flake8: noqa: E501 +# transformers which can be applied column-wise +# TODO: add SimilarityEncoder? It was slower on a quick test +UNIVARIATE_TRANSFORMERS = (GapEncoder, MinHashEncoder) + + +def _is_empty_column_selection(column): + """ + Return True if the column selection is empty (empty list or all-False + boolean array). 
+ + """ + if hasattr(column, "dtype") and np.issubdtype(column.dtype, np.bool_): + return not column.any() + elif hasattr(column, "__len__"): + return ( + len(column) == 0 + or all(isinstance(col, bool) for col in column) + and not any(column) + ) + else: + return False + def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> Optional[str]: """Infer the date format of a date column, @@ -399,6 +423,79 @@ def __init__( self.transformer_weights = transformer_weights self.verbose = verbose + # override _iter to parallelize on the columns instead of the transformers + # when possible + def _iter(self, fitted=False, replace_strings=False, column_as_strings=False): + """ + Generate (name, trans, column, weight) tuples. + + If fitted=True, use the fitted transformers, else use the + user specified transformers updated with converted column names + and potentially appended with transformer for remainder. + + """ + if fitted: + if replace_strings: + # Replace "passthrough" with the fitted version in + # _name_to_fitted_passthrough + def replace_passthrough(name, trans, columns): + if name not in self._name_to_fitted_passthrough: + return name, trans, columns + return name, self._name_to_fitted_passthrough[name], columns + + transformers = [ + replace_passthrough(*trans) for trans in self.transformers_ + ] + else: + transformers = self.transformers_ + else: + # interleave the validated column specifiers + transformers = [ + (name, trans, column) + for (name, trans, _), column in zip(self.transformers, self._columns) + ] + # add transformer tuple for remainder + if self._remainder[2]: + transformers = chain(transformers, [self._remainder]) + get_weight = (self.transformer_weights or {}).get + + output_config = _get_output_config("transform", self) + for name, trans, columns in transformers: + ######################################## + # This is where we deviate from the original implementation + if isinstance(trans, UNIVARIATE_TRANSFORMERS): + columns_ = [[col] 
for col in columns] + else: + columns_ = [columns] + ######################################## + for cols in columns_: + if replace_strings: + # replace 'passthrough' with identity transformer and + # skip in case of 'drop' + if trans == "passthrough": + trans = FunctionTransformer( + accept_sparse=True, + check_inverse=False, + feature_names_out="one-to-one", + ).set_output(transform=output_config["dense"]) + elif trans == "drop": + continue + elif _is_empty_column_selection(cols): + continue + + if column_as_strings: + # Convert all columns to using their string labels + columns_is_scalar = np.isscalar(cols) + + indices = self._transformer_to_input_indices[name] + columns = self.feature_names_in_[indices] + + if columns_is_scalar: + # selection is done with one dimension + cols = cols[0] + + yield (name, trans, cols, get_weight(name)) + def _more_tags(self): """ Used internally by sklearn to ease the estimator checks. @@ -726,6 +823,16 @@ def fit_transform(self, X, y=None): X_enc = super().fit_transform(X, y) + # if the same transformer is used on several columns, merge the columns in self.transformers_ + # this is needed because we override the _iter method of ColumnTransformer + transformers_merged = {} + for name, enc, cols in self.transformers_: + if name in transformers_merged: + transformers_merged[name][2].extend(cols) + else: + transformers_merged[name] = (name, enc, cols) + self.transformers_ = list(transformers_merged.values()) + # For the "remainder" columns, the `ColumnTransformer` `transformers_` # attribute contains the index instead of the column name, # so we convert the values to the appropriate column names From ffa9755428a13f2eabcfcee44ff2aee7b9e1a7e6 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 12 Jun 2023 18:40:38 +0200 Subject: [PATCH 04/47] add tests --- skrub/_table_vectorizer.py | 3 +- skrub/tests/test_table_vectorizer.py | 60 +++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git 
a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 1506c3fde..26beb1345 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -29,7 +29,8 @@ # flake8: noqa: E501 # transformers which can be applied column-wise -# TODO: add SimilarityEncoder? It was slower on a quick test +# and which are slow enough to be worth parallelizing over columns +# TODO: add SimilarityEncoder? It was slower on a quick test. UNIVARIATE_TRANSFORMERS = (GapEncoder, MinHashEncoder) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 4bb4a6336..54da569f2 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -1,11 +1,13 @@ from typing import Any, Tuple +import joblib import numpy as np import pandas as pd import pytest import sklearn from sklearn.exceptions import NotFittedError -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted from skrub import GapEncoder, SuperVectorizer, TableVectorizer @@ -608,3 +610,59 @@ def test__infer_date_format() -> None: # Test with a column containing more than two date formats date_column = pd.Series(["2022-01-01", "01/02/2022", "20220103", "2022-Jan-04"]) assert _infer_date_format(date_column) is None + + +@skip_if_no_parallel +def test_parallelism() -> None: + # Test that parallelism works + X = _get_clean_dataframe() + # the gap encoder should be parallelized on all columns + # the one hot encoder should not be parallelized + for high_card_cat_transformer in [ + lambda: GapEncoder(n_components=2, random_state=0), + OneHotEncoder, + ]: + table_vec_no_parallel = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer(), + cardinality_threshold=4, + ) + X_trans = table_vec_no_parallel.fit_transform(X) + for joblib_backend in ["loky", 
"threading", "multiprocessing"]: + with joblib.parallel_backend(joblib_backend): + for n_jobs in [None, 2, -1]: + table_vec = TableVectorizer( + n_jobs=n_jobs, + high_card_cat_transformer=high_card_cat_transformer(), + cardinality_threshold=4, + ) + X_trans_parallel = table_vec.fit_transform(X) + # print(table_vec.transform(X)) + assert_array_equal(X_trans, X_trans_parallel) + assert table_vec.n_jobs == n_jobs + # assert that all attributes are equal except for + # the n_jobs attribute + for attr in [ + "transformers_", + "columns_", + "types_", + "imputed_columns_", + ]: + assert str(getattr(table_vec, attr)) == str( + getattr(table_vec_no_parallel, attr) + ) + # assert that get_feature_names_out gives the same result + assert_array_equal( + table_vec.get_feature_names_out(), + table_vec_no_parallel.get_feature_names_out(), + ) + # assert that get_params gives the same result expect for n_jobs + # remove n_jobs from the dict + params = table_vec.get_params() + params.pop("n_jobs") + params_no_parallel = table_vec_no_parallel.get_params() + params_no_parallel.pop("n_jobs") + assert str(params) == str(params_no_parallel) + # assert that transform gives the same result + assert_array_equal( + table_vec.transform(X), table_vec_no_parallel.transform(X) + ) From 5bcb6dda4ee2935f65bc3f54f475595e3ff130ba Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 12 Jun 2023 18:46:27 +0200 Subject: [PATCH 05/47] update changelog --- CHANGES.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 3aa57016c..9bdc773c0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -47,6 +47,9 @@ Minor changes :pr:`543` by :user:`Leo Grinsztajn ` :pr:`587` by :user:`Leo Grinsztajn ` +* When possible, parallelism in done at the column level rather than the transformer level in :class:`TableVectorizer`. + This is the case for :class:`MinHashEncoder` and :class:`GapEncoder`. 
:pr:`592` by :user:`Leo Grinsztajn ` + Release 0.4.0 ============= From c005a4999bcdece1767e689fa2ce0fb046136a1b Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 12 Jun 2023 18:47:14 +0200 Subject: [PATCH 06/47] typo --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9bdc773c0..e883016f3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -47,7 +47,7 @@ Minor changes :pr:`543` by :user:`Leo Grinsztajn ` :pr:`587` by :user:`Leo Grinsztajn ` -* When possible, parallelism in done at the column level rather than the transformer level in :class:`TableVectorizer`. +* When possible, parallelism is done at the column level rather than the transformer level in :class:`TableVectorizer`. This is the case for :class:`MinHashEncoder` and :class:`GapEncoder`. :pr:`592` by :user:`Leo Grinsztajn ` Release 0.4.0 From 85b7f9c2c2e61c2b3fc3a0bea11e685514d25cc5 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 13 Jun 2023 10:27:57 +0200 Subject: [PATCH 07/47] copy transformer when split between columns to avoid conflict --- skrub/_table_vectorizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 26beb1345..e8bde5c7d 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -4,6 +4,7 @@ manually categorize them beforehand, or construct complex Pipelines. 
""" +import copy import warnings from itertools import chain from typing import Dict, List, Literal, Optional, Tuple, Union @@ -495,7 +496,7 @@ def replace_passthrough(name, trans, columns): # selection is done with one dimension cols = cols[0] - yield (name, trans, cols, get_weight(name)) + yield (name, copy.deepcopy(trans), cols, get_weight(name)) def _more_tags(self): """ From c3706c727ee9e27f6f261706af53dd1d50aa65bb Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 12 Jun 2023 18:46:27 +0200 Subject: [PATCH 08/47] update changelog --- CHANGES.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9a7e17648..275c71162 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -25,7 +25,7 @@ Major changes Minor changes ------------- - + * When possible, parallelism is done at the column level rather than the transformer level in :class:`TableVectorizer`. This is the case for :class:`MinHashEncoder` and :class:`GapEncoder`. :pr:`592` by :user:`Leo Grinsztajn ` @@ -66,7 +66,8 @@ Minor changes :pr:`543` by :user:`Leo Grinsztajn ` :pr:`587` by :user:`Leo Grinsztajn ` -======= + + Dirty-cat Release 0.4.0 ========================= From 41475493e367fdb5a912d6ad2585776bf0413b5e Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 13 Jun 2023 13:48:04 +0200 Subject: [PATCH 09/47] fix bug with repeated transformer --- skrub/_table_vectorizer.py | 22 ++++------ skrub/tests/test_table_vectorizer.py | 66 ++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index e8bde5c7d..0ccbec01f 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -4,7 +4,6 @@ manually categorize them beforehand, or construct complex Pipelines. 
""" -import copy import warnings from itertools import chain from typing import Dict, List, Literal, Optional, Tuple, Union @@ -464,7 +463,7 @@ def replace_passthrough(name, trans, columns): output_config = _get_output_config("transform", self) for name, trans, columns in transformers: ######################################## - # This is where we deviate from the original implementation + # We deviate from the original implementation if isinstance(trans, UNIVARIATE_TRANSFORMERS): columns_ = [[col] for col in columns] else: @@ -495,8 +494,13 @@ def replace_passthrough(name, trans, columns): if columns_is_scalar: # selection is done with one dimension cols = cols[0] - - yield (name, copy.deepcopy(trans), cols, get_weight(name)) + # Another deviation from the original implementation + if isinstance(trans, UNIVARIATE_TRANSFORMERS) and len(columns_) > 1: + # this should only happen before fitting + assert not fitted + yield (name, clone(trans), cols, get_weight(name)) + else: + yield (name, trans, cols, get_weight(name)) def _more_tags(self): """ @@ -825,16 +829,6 @@ def fit_transform(self, X, y=None): X_enc = super().fit_transform(X, y) - # if the same transformer is used on several columns, merge the columns in self.transformers_ - # this is needed because we override the _iter method of ColumnTransformer - transformers_merged = {} - for name, enc, cols in self.transformers_: - if name in transformers_merged: - transformers_merged[name][2].extend(cols) - else: - transformers_merged[name] = (name, enc, cols) - self.transformers_ = list(transformers_merged.values()) - # For the "remainder" columns, the `ColumnTransformer` `transformers_` # attribute contains the index instead of the column name, # so we convert the values to the appropriate column names diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 54da569f2..aae4bb20e 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -5,6 
+5,7 @@ import pandas as pd import pytest import sklearn +from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel @@ -612,6 +613,26 @@ def test__infer_date_format() -> None: assert _infer_date_format(date_column) is None +def test_column_by_column(): + X = _get_clean_dataframe() + table_vec_all_cols = TableVectorizer( + high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + cardinality_threshold=4, + ) + table_vec_all_cols.fit(X) + for col in X.columns: + table_vec_one_col = TableVectorizer( + high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + cardinality_threshold=4, + ) + table_vec_one_col.fit(X[[col]]) + assert table_vec_one_col.get_feature_names_out() == [ + feat + for feat in table_vec_all_cols.get_feature_names_out() + if feat.startswith(col) + ] + + @skip_if_no_parallel def test_parallelism() -> None: # Test that parallelism works @@ -666,3 +687,48 @@ def test_parallelism() -> None: assert_array_equal( table_vec.transform(X), table_vec_no_parallel.transform(X) ) + + +def test_original_iter() -> None: + # Test if the behavior changed when we overrided the _ iter + # method from the ColumnTransformer + # only the transformers_ attribute should be different + + X = _get_clean_dataframe() + table_vec_old_iter = TableVectorizer( + high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + cardinality_threshold=4, + ) + table_vec_old_iter._iter = lambda *args, **kwargs: ColumnTransformer._iter( + table_vec_old_iter, *args, **kwargs + ) + table_vec_new_iter = TableVectorizer( + high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + cardinality_threshold=4, + ) + assert_array_equal( + table_vec_old_iter.fit_transform(X), table_vec_new_iter.fit_transform(X) + ) + # assert that all attributes are equal except for + # the 
transformers_ attribute + for attr in [ + "n_jobs", + "columns_", + "types_", + "imputed_columns_", + ]: + assert str(getattr(table_vec_old_iter, attr)) == str( + getattr(table_vec_new_iter, attr) + ) + # assert that get_feature_names_out gives the same result + assert_array_equal( + table_vec_old_iter.get_feature_names_out(), + table_vec_new_iter.get_feature_names_out(), + ) + # assert that get_params gives the same result expect for n_jobs + # remove n_jobs from the dict + params_old_iter = table_vec_old_iter.get_params() + params_new_iter = table_vec_new_iter.get_params() + assert str(params_old_iter) == str(params_new_iter) + # assert that transform gives the same result + assert_array_equal(table_vec_old_iter.transform(X), table_vec_new_iter.transform(X)) From 7771eb6d8ceb3dab4f24a170a091660458f5efe8 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 22 Jun 2023 18:28:14 +0200 Subject: [PATCH 10/47] split and merge --- skrub/_gap_encoder.py | 21 +++- skrub/_minhash_encoder.py | 18 ++- skrub/_table_vectorizer.py | 171 +++++++++++---------------- skrub/_utils.py | 8 ++ skrub/tests/test_gap_encoder.py | 33 ++++++ skrub/tests/test_table_vectorizer.py | 52 ++------ 6 files changed, 157 insertions(+), 146 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index be8be5af1..dae26d90b 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -12,7 +12,7 @@ from numpy.random import RandomState from scipy import sparse from sklearn import __version__ as sklearn_version -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.cluster import KMeans from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer from sklearn.neighbors import NearestNeighbors @@ -718,6 +718,25 @@ class GapEncoder(BaseEstimator, TransformerMixin): fitted_models_: List[GapEncoderColumn] column_names_: List[str] + @classmethod + def _merge(cls, transformers_list): + # merge 
GapEncoder fitted on different columns + # into a single GapEncoder + full_transformer = clone(transformers_list[0]) + # assert rho_ is the same for all transformers + print(transformers_list[0]) + rho_ = transformers_list[0].rho_ + assert all([transformers.rho_ == rho_ for transformers in transformers_list]) + full_transformer.rho_ = rho_ + full_transformer.fitted_models_ = [] + for transformers in transformers_list: + full_transformer.fitted_models_.extend(transformers.fitted_models_) + if hasattr(transformers_list[0], "column_names_"): + full_transformer.column_names_ = [] + for transformers in transformers_list: + full_transformer.column_names_.extend(transformers.column_names_) + return full_transformer + def __init__( self, *, diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 386e1cb37..a55a6a389 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -7,13 +7,13 @@ import numpy as np from joblib import Parallel, delayed, effective_n_jobs -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.utils import gen_even_slices, murmurhash3_32 from sklearn.utils.validation import check_is_fitted from ._fast_hash import ngram_min_hash from ._string_distances import get_unique_ngrams -from ._utils import LRUDict, check_input +from ._utils import LRUDict, check_input, combine_LRUDicts NoneType = type(None) @@ -114,6 +114,20 @@ class MinHashEncoder(BaseEstimator, TransformerMixin): _capacity: int = 2**10 + @classmethod + def _merge(cls, transformers_list): + # merge MinHashEncoder fitted on different columns + # into a single MinHashEncoder + full_transformer = clone(transformers_list[0]) + capacity = transformers_list[0]._capacity + assert all( + transformer._capacity == capacity for transformer in transformers_list + ) + full_transformer.hash_dict_ = combine_LRUDicts( + capacity, *[transformer.hash_dict_ for transformer in transformers_list] + ) + 
return full_transformer + def __init__( self, *, diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 0ccbec01f..3e2675a7b 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -30,26 +30,7 @@ # transformers which can be applied column-wise # and which are slow enough to be worth parallelizing over columns -# TODO: add SimilarityEncoder? It was slower on a quick test. -UNIVARIATE_TRANSFORMERS = (GapEncoder, MinHashEncoder) - - -def _is_empty_column_selection(column): - """ - Return True if the column selection is empty (empty list or all-False - boolean array). - - """ - if hasattr(column, "dtype") and np.issubdtype(column.dtype, np.bool_): - return not column.any() - elif hasattr(column, "__len__"): - return ( - len(column) == 0 - or all(isinstance(col, bool) for col in column) - and not any(column) - ) - else: - return False +UNIVARIATE_TRANSFORMERS = ["GapEncoder", "MinHashEncoder"] def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> Optional[str]: @@ -424,84 +405,6 @@ def __init__( self.transformer_weights = transformer_weights self.verbose = verbose - # override _iter to parallelize on the columns instead of the transformers - # when possible - def _iter(self, fitted=False, replace_strings=False, column_as_strings=False): - """ - Generate (name, trans, column, weight) tuples. - - If fitted=True, use the fitted transformers, else use the - user specified transformers updated with converted column names - and potentially appended with transformer for remainder. 
- - """ - if fitted: - if replace_strings: - # Replace "passthrough" with the fitted version in - # _name_to_fitted_passthrough - def replace_passthrough(name, trans, columns): - if name not in self._name_to_fitted_passthrough: - return name, trans, columns - return name, self._name_to_fitted_passthrough[name], columns - - transformers = [ - replace_passthrough(*trans) for trans in self.transformers_ - ] - else: - transformers = self.transformers_ - else: - # interleave the validated column specifiers - transformers = [ - (name, trans, column) - for (name, trans, _), column in zip(self.transformers, self._columns) - ] - # add transformer tuple for remainder - if self._remainder[2]: - transformers = chain(transformers, [self._remainder]) - get_weight = (self.transformer_weights or {}).get - - output_config = _get_output_config("transform", self) - for name, trans, columns in transformers: - ######################################## - # We deviate from the original implementation - if isinstance(trans, UNIVARIATE_TRANSFORMERS): - columns_ = [[col] for col in columns] - else: - columns_ = [columns] - ######################################## - for cols in columns_: - if replace_strings: - # replace 'passthrough' with identity transformer and - # skip in case of 'drop' - if trans == "passthrough": - trans = FunctionTransformer( - accept_sparse=True, - check_inverse=False, - feature_names_out="one-to-one", - ).set_output(transform=output_config["dense"]) - elif trans == "drop": - continue - elif _is_empty_column_selection(cols): - continue - - if column_as_strings: - # Convert all columns to using their string labels - columns_is_scalar = np.isscalar(cols) - - indices = self._transformer_to_input_indices[name] - columns = self.feature_names_in_[indices] - - if columns_is_scalar: - # selection is done with one dimension - cols = cols[0] - # Another deviation from the original implementation - if isinstance(trans, UNIVARIATE_TRANSFORMERS) and len(columns_) > 1: - # this 
should only happen before fitting - assert not fitted - yield (name, clone(trans), cols, get_weight(name)) - else: - yield (name, trans, cols, get_weight(name)) - def _more_tags(self): """ Used internally by sklearn to ease the estimator checks. @@ -578,6 +481,58 @@ def _clone_transformers(self): # TODO: check that the provided transformers are valid + def _split_univariate_transformers(self): + self._transformers_original = self.transformers + new_transformers = [] + for name, trans, cols in self.transformers: + if trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS: + for i, col in enumerate(cols): + new_transformers.append( + (name + "_split_" + str(i), clone(trans), [col]) + ) + else: + new_transformers.append((name, trans, cols)) + self.transformers = new_transformers + + def _merge_univariate_transformers(self): + self.transformers = self._transformers_original + new_transformers_ = [] + new_transformer_to_input_indices = {} + # find all base names + base_names = [] + for name, _, _ in self.transformers_: + base_name = name.split("_split_")[0] + if base_name not in base_names: + base_names.append(base_name) + # merge all transformers with the same base name + for base_name in base_names: + transformers = [] + columns = [] + names = [] + for name, trans, cols in self.transformers_: + if name.startswith(base_name): + columns.extend(cols) + transformers.append(trans) + names.append(name) + if len(transformers) == 1: + new_transformers_.append((base_name, transformers[0], columns)) + new_transformer_to_input_indices[base_name] = list( + self._transformer_to_input_indices[base_name] + ) + else: + # merge transformers + new_transformers_.append( + (base_name, transformers[0].__class__._merge(transformers), columns) + ) + new_transformer_to_input_indices[base_name] = list( + np.concatenate( + [self._transformer_to_input_indices[name] for name in names] + ) + ) + + self.transformers_ = new_transformers_ + self._transformer_to_input_indices = 
new_transformer_to_input_indices + def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: """Takes a dataframe and tries to convert its columns to their best possible data type. @@ -827,6 +782,11 @@ def fit_transform(self, X, y=None): if self.verbose: print(f"[TableVectorizer] Assigned transformers: {self.transformers}") + # split the univariate transformers on each column + # to be able to parallelize the encoding + if self.n_jobs not in (None, 1): + self._split_univariate_transformers() + X_enc = super().fit_transform(X, y) # For the "remainder" columns, the `ColumnTransformer` `transformers_` @@ -839,6 +799,9 @@ def fit_transform(self, X, y=None): cols: List[int] self.transformers_[i] = (name, enc, [self.columns_[j] for j in cols]) + if self.n_jobs not in (None, 1): + self._merge_univariate_transformers() + return X_enc def transform(self, X) -> np.ndarray: @@ -877,7 +840,17 @@ def transform(self, X) -> np.ndarray: if self.auto_cast: X = self._apply_cast(X) - return super().transform(X) + # split the univariate transformers on each column + # to be able to parallelize the encoding + if self.n_jobs not in (None, 1): + self._split_univariate_transformers() + + res = super().transform(X) + + if self.n_jobs not in (None, 1): + self._merge_univariate_transformers() + + return res def get_feature_names_out(self, input_features=None) -> List[str]: """Return clean feature names. diff --git a/skrub/_utils.py b/skrub/_utils.py index d5783037f..2919c50be 100644 --- a/skrub/_utils.py +++ b/skrub/_utils.py @@ -41,6 +41,14 @@ def __contains__(self, key: Hashable): return key in self.cache +def combine_LRUDicts(capacity: int, *LRUDicts: LRUDict) -> LRUDict: + combined_LRUDict = LRUDict(capacity) + for lru_dict in LRUDicts: + for key, value in lru_dict.cache.items(): + combined_LRUDict[key] = value + return combined_LRUDict + + def check_input(X) -> np.ndarray: """ Check input with sklearn standards. 
diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index ae6ad19ac..772b3ade2 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -271,3 +271,36 @@ def test_transform_deterministic() -> None: enc.transform(X_test) topics2 = enc.get_feature_names_out() assert topics1 == topics2 + + +def test_merge_transformers() -> None: + # test whether fitting on each column separately and then merging the + # transformers gives the same result as fitting on the whole dataset + + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) + + # fit on each column separately + enc_list = [] + for i in range(3): + enc = GapEncoder(random_state=42) + enc.fit(X[[f"col{i}"]]) + enc_list.append(enc) + enc_merged = GapEncoder._merge(enc_list) + + # fit on the whole dataset + enc = GapEncoder(random_state=42) + enc.fit(X) + + # check that the results are the same + # check transform + assert np.allclose(enc_merged.transform(X), enc.transform(X)) + # check get_feature_names_out + assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() + # check score + assert enc_merged.score(X) == enc.score(X) + # check all attributes + attrs = ["rho_", "column_names_"] + for attr in attrs: + assert getattr(enc_merged, attr) == getattr(enc, attr) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index aae4bb20e..c5c3e363a 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -5,13 +5,12 @@ import pandas as pd import pytest import sklearn -from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted -from skrub import GapEncoder, 
SuperVectorizer, TableVectorizer +from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer from skrub._table_vectorizer import _infer_date_format from skrub._utils import parse_version @@ -642,6 +641,7 @@ def test_parallelism() -> None: for high_card_cat_transformer in [ lambda: GapEncoder(n_components=2, random_state=0), OneHotEncoder, + lambda: MinHashEncoder(n_components=2), ]: table_vec_no_parallel = TableVectorizer( high_card_cat_transformer=high_card_cat_transformer(), @@ -689,46 +689,10 @@ def test_parallelism() -> None: ) -def test_original_iter() -> None: - # Test if the behavior changed when we overrided the _ iter - # method from the ColumnTransformer - # only the transformers_ attribute should be different +# def test_split_and_merge_univariate_transformers(): +# UNIVARIATE_TRANSFORMERS = ["GapEncoder", "MinHashEncoder"] +# X = _get_clean_dataframe() - X = _get_clean_dataframe() - table_vec_old_iter = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), - cardinality_threshold=4, - ) - table_vec_old_iter._iter = lambda *args, **kwargs: ColumnTransformer._iter( - table_vec_old_iter, *args, **kwargs - ) - table_vec_new_iter = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), - cardinality_threshold=4, - ) - assert_array_equal( - table_vec_old_iter.fit_transform(X), table_vec_new_iter.fit_transform(X) - ) - # assert that all attributes are equal except for - # the transformers_ attribute - for attr in [ - "n_jobs", - "columns_", - "types_", - "imputed_columns_", - ]: - assert str(getattr(table_vec_old_iter, attr)) == str( - getattr(table_vec_new_iter, attr) - ) - # assert that get_feature_names_out gives the same result - assert_array_equal( - table_vec_old_iter.get_feature_names_out(), - table_vec_new_iter.get_feature_names_out(), - ) - # assert that get_params gives the same result expect for n_jobs - # remove n_jobs from the dict - params_old_iter = 
table_vec_old_iter.get_params() - params_new_iter = table_vec_new_iter.get_params() - assert str(params_old_iter) == str(params_new_iter) - # assert that transform gives the same result - assert_array_equal(table_vec_old_iter.transform(X), table_vec_new_iter.transform(X)) +# # With GapEncoder +# high_card_cat_transformer = GapEncoder(n_components=2, random_state=0) +# hard_card_cat_cols = ["str_2", "cat_2"] From 8acca83bc436f495349a9e27d9669be5c6ffc98c Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 22 Jun 2023 23:36:52 +0200 Subject: [PATCH 11/47] more tests --- skrub/_gap_encoder.py | 1 + skrub/_minhash_encoder.py | 1 + skrub/tests/test_minhash_encoder.py | 40 ++++++++++++++----- skrub/tests/test_table_vectorizer.py | 57 ++++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 18 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index dae26d90b..350e2d4d5 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -722,6 +722,7 @@ class GapEncoder(BaseEstimator, TransformerMixin): def _merge(cls, transformers_list): # merge GapEncoder fitted on different columns # into a single GapEncoder + # useful for parallelization in the TableVectorizer full_transformer = clone(transformers_list[0]) # assert rho_ is the same for all transformers print(transformers_list[0]) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index a55a6a389..bc7adf043 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -118,6 +118,7 @@ class MinHashEncoder(BaseEstimator, TransformerMixin): def _merge(cls, transformers_list): # merge MinHashEncoder fitted on different columns # into a single MinHashEncoder + # useful for parallelization in the TableVectorizer full_transformer = clone(transformers_list[0]) capacity = transformers_list[0]._capacity assert all( diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 9bcd7a16d..ea6add7f8 100644 --- a/skrub/tests/test_minhash_encoder.py 
+++ b/skrub/tests/test_minhash_encoder.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd import pytest -from sklearn.exceptions import NotFittedError from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from skrub import MinHashEncoder @@ -251,13 +250,34 @@ def test_correct_arguments() -> None: encoder.fit_transform(X) -def test_check_fitted_minhash_encoder() -> None: - """Test that calling transform before fit raises an error""" - encoder = MinHashEncoder(n_components=3) - X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] - with pytest.raises(NotFittedError): - encoder.transform(X) +def test_merge_transformers() -> None: + # test whether fitting on each column separately and then merging the + # transformers gives the same result as fitting on the whole dataset - # Check that it works after fitting - encoder.fit(X) - encoder.transform(X) + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) + + # fit on each column separately + enc_list = [] + for i in range(3): + enc = MinHashEncoder() + enc.fit(X[[f"col{i}"]]) + enc_list.append(enc) + enc_merged = MinHashEncoder._merge(enc_list) + + # fit on the whole dataset + enc = MinHashEncoder() + enc.fit(X) + + # check that the results are the same + # check transform + assert np.allclose(enc_merged.transform(X), enc.transform(X)) + # check get_feature_names_out + # assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() + # check that the hash_dict_ attribute is the same + assert enc.hash_dict_.cache.keys() == enc_merged.hash_dict_.cache.keys() + for key in enc.hash_dict_.cache.keys(): + assert np.array_equal( + enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key] + ) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index c5c3e363a..6a5d3ec78 100644 --- a/skrub/tests/test_table_vectorizer.py +++ 
b/skrub/tests/test_table_vectorizer.py @@ -636,12 +636,13 @@ def test_column_by_column(): def test_parallelism() -> None: # Test that parallelism works X = _get_clean_dataframe() - # the gap encoder should be parallelized on all columns + # the gap encoder and the minhashencoder + # should be parallelized on all columns # the one hot encoder should not be parallelized for high_card_cat_transformer in [ lambda: GapEncoder(n_components=2, random_state=0), OneHotEncoder, - lambda: MinHashEncoder(n_components=2), + # lambda: MinHashEncoder(n_components=2), ]: table_vec_no_parallel = TableVectorizer( high_card_cat_transformer=high_card_cat_transformer(), @@ -689,10 +690,50 @@ def test_parallelism() -> None: ) -# def test_split_and_merge_univariate_transformers(): -# UNIVARIATE_TRANSFORMERS = ["GapEncoder", "MinHashEncoder"] -# X = _get_clean_dataframe() +def test_split_and_merge_univariate_transformers(): + X = _get_clean_dataframe() + for high_card_cat_transformer in [ + lambda: GapEncoder(n_components=2, random_state=0), + lambda: MinHashEncoder(n_components=2), + ]: + enc = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer(), + cardinality_threshold=4, + n_jobs=None, # should disable the default splitting and merging + ) + + enc.fit(X) + assert len(enc.transformers) == 2 -# # With GapEncoder -# high_card_cat_transformer = GapEncoder(n_components=2, random_state=0) -# hard_card_cat_cols = ["str_2", "cat_2"] + enc_split = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer(), + cardinality_threshold=4, + n_jobs=None, + ) + enc_split.fit(X) + # during actual use, this is done during fit + enc_split._split_univariate_transformers() + # check that the GapEncoder is split into 2 transformers + assert len(enc_split.transformers) == 3 + assert np.allclose(enc.transform(X), enc_split.transform(X)) + + enc_split._merge_univariate_transformers() + # check that the GapEncoder is merged into 1 transformer + assert 
len(enc_split.transformers) == 2 + assert np.allclose(enc.transform(X), enc_split.transform(X)) + + # assert that the transformers attribute is the same as + # the one before splitting and merging + assert str(enc.transformers) == str(enc_split.transformers) + + # check that a OneHotEncoder is not split + enc_one_hot = TableVectorizer( + high_card_cat_transformer=OneHotEncoder(), + low_card_cat_transformer=OneHotEncoder( + handle_unknown="ignore" + ), # change the default to have a different transformer + cardinality_threshold=4, + n_jobs=None, + ) + enc_one_hot.fit(X) + assert len(enc_one_hot.transformers) == 2 From b997d287baa0fb2cda6ac44e20a72fc63a44270c Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 23 Jun 2023 10:32:14 +0200 Subject: [PATCH 12/47] fix test --- skrub/tests/test_table_vectorizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 877ce18ad..4454ccbc0 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -690,7 +690,7 @@ def test_split_and_merge_univariate_transformers(): ) enc.fit(X) - assert len(enc.transformers) == 2 + assert len(enc.transformers) == 3 enc_split = TableVectorizer( high_card_cat_transformer=high_card_cat_transformer(), @@ -701,12 +701,12 @@ def test_split_and_merge_univariate_transformers(): # during actual use, this is done during fit enc_split._split_univariate_transformers() # check that the GapEncoder is split into 2 transformers - assert len(enc_split.transformers) == 3 + assert len(enc_split.transformers) == 4 assert np.allclose(enc.transform(X), enc_split.transform(X)) enc_split._merge_univariate_transformers() # check that the GapEncoder is merged into 1 transformer - assert len(enc_split.transformers) == 2 + assert len(enc_split.transformers) == 3 assert np.allclose(enc.transform(X), enc_split.transform(X)) # assert that the transformers attribute is the same as @@ 
-723,4 +723,4 @@ def test_split_and_merge_univariate_transformers(): n_jobs=None, ) enc_one_hot.fit(X) - assert len(enc_one_hot.transformers) == 2 + assert len(enc_one_hot.transformers) == 3 From f1ec2dd0d354a046dfb4dcb176dfaa5dc2c89015 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 23 Jun 2023 15:58:24 +0200 Subject: [PATCH 13/47] also split self.transformers_ to parallelize transform --- skrub/_gap_encoder.py | 12 ++++- skrub/_table_vectorizer.py | 79 ++++++++++++++++++++++------ skrub/tests/test_table_vectorizer.py | 1 - 3 files changed, 75 insertions(+), 17 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 000baca8b..eb4dfc7ed 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -657,7 +657,6 @@ def _merge(cls, transformers_list): # useful for parallelization in the TableVectorizer full_transformer = clone(transformers_list[0]) # assert rho_ is the same for all transformers - print(transformers_list[0]) rho_ = transformers_list[0].rho_ assert all([transformers.rho_ == rho_ for transformers in transformers_list]) full_transformer.rho_ = rho_ @@ -670,6 +669,17 @@ def _merge(cls, transformers_list): full_transformer.column_names_.extend(transformers.column_names_) return full_transformer + def _split(self): + check_is_fitted(self) + transformers_list = [] + for i, model in enumerate(self.fitted_models_): + transformer = clone(self) + transformer.rho_ = model.rho_ + transformer.fitted_models_ = [model] + transformer.column_names_ = [self.column_names_[i]] + transformers_list.append(transformer) + return transformers_list + def __init__( self, *, diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 214b69eb4..3b6b52e13 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -455,21 +455,70 @@ def _clone_transformers(self): # TODO: check that the provided transformers are valid - def _split_univariate_transformers(self): - self._transformers_original = self.transformers - 
new_transformers = [] - for name, trans, cols in self.transformers: - if trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS: - for i, col in enumerate(cols): - new_transformers.append( - (name + "_split_" + str(i), clone(trans), [col]) - ) - else: - new_transformers.append((name, trans, cols)) - self.transformers = new_transformers + def _split_univariate_transformers(self, split_fitted=False): + """ + Split univariate transformers into multiple transformers, one for each + column. This is useful to use the inherited `ColumnTransformer` class + parallelism. + + Parameters + ---------- + split_fitted : bool, default=False + Whether to split the self.transformers_ attribute (True) or the + self.transformers attribute (False). The former is used when + calling `transform` and the latter when calling `fit_transform`. + """ + if split_fitted: + check_is_fitted(self, attributes=["transformers_"]) + self._transformers_fitted_original = self.transformers_ + new_transformers_ = [] + new_transformer_to_input_indices = {} + for name, trans, cols in self.transformers_: + if ( + trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS + and len(cols) > 1 + ): + splitted_transformers_ = trans._split() + assert len(splitted_transformers_) == len(cols) + for i, col in enumerate(cols): + new_transformers_.append( + ( + name + "_split_" + str(i), + splitted_transformers_[i], + [col], + ) + ) + new_transformer_to_input_indices[name + "_split_" + str(i)] = [ + self._transformer_to_input_indices[name][i] + ] + else: + new_transformers_.append((name, trans, cols)) + new_transformer_to_input_indices[ + name + ] = self._transformer_to_input_indices[name] + self.transformers_ = new_transformers_ + self._transformer_to_input_indices = new_transformer_to_input_indices + else: + self._transformers_original = self.transformers + new_transformers = [] + for name, trans, cols in self.transformers: + if ( + trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS + and len(cols) > 1 + ): + for i, col in 
enumerate(cols): + new_transformers.append( + (name + "_split_" + str(i), clone(trans), [col]) + ) + else: + new_transformers.append((name, trans, cols)) + self.transformers = new_transformers def _merge_univariate_transformers(self): - self.transformers = self._transformers_original + # merge back self.transformers if it was split + if hasattr(self, "_transformers_original"): + self.transformers = self._transformers_original + # merge self.transformers_ new_transformers_ = [] new_transformer_to_input_indices = {} # find all base names @@ -719,7 +768,7 @@ def fit_transform(self, X, y=None): # split the univariate transformers on each column # to be able to parallelize the encoding if self.n_jobs not in (None, 1): - self._split_univariate_transformers() + self._split_univariate_transformers(split_fitted=False) X_enc = super().fit_transform(X, y) @@ -777,7 +826,7 @@ def transform(self, X) -> np.ndarray: # split the univariate transformers on each column # to be able to parallelize the encoding if self.n_jobs not in (None, 1): - self._split_univariate_transformers() + self._split_univariate_transformers(split_fitted=True) res = super().transform(X) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 4454ccbc0..651358e7b 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -645,7 +645,6 @@ def test_parallelism() -> None: cardinality_threshold=4, ) X_trans_parallel = table_vec.fit_transform(X) - # print(table_vec.transform(X)) assert_array_equal(X_trans, X_trans_parallel) assert table_vec.n_jobs == n_jobs # assert that all attributes are equal except for From 4780dfc534b52fa5ed2a4677fd727886d7a28dbc Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 27 Jun 2023 01:00:47 +0200 Subject: [PATCH 14/47] add _split to minhashencoder --- skrub/_minhash_encoder.py | 21 ++++ skrub/tests/test_gap_encoder.py | 75 +++++++++++++ skrub/tests/test_minhash_encoder.py | 151 ++++++++++++++++++++++----- 
skrub/tests/test_table_vectorizer.py | 19 +++- 4 files changed, 234 insertions(+), 32 deletions(-) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index b90adb26f..f18199697 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -132,8 +132,29 @@ def _merge(cls, transformers_list): full_transformer.hash_dict_ = combine_LRUDicts( capacity, *[transformer.hash_dict_ for transformer in transformers_list] ) + full_transformer.n_features_in_ = sum( + transformer.n_features_in_ for transformer in transformers_list + ) + full_transformer.feature_names_in_ = np.concatenate( + [transformer.feature_names_in_ for transformer in transformers_list] + ) return full_transformer + def _split(self): + check_is_fitted(self) + transformer_list = [] + for i in range(self.n_features_in_): + trans = clone(self) + attributes = ["hash_dict_", "_capacity"] + for a in attributes: + if hasattr(self, a): + setattr(trans, a, getattr(self, a)) + # TODO; do we want to deepcopy hash_dict_ + trans.n_features_in_ = 1 + trans.feature_names_in_ = np.array([self.feature_names_in_[i]]) + transformer_list.append(trans) + return transformer_list + def __init__( self, *, diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index d2f0142c9..50ed8ed0e 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -1,6 +1,9 @@ +import copy + import numpy as np import pandas as pd import pytest +from numpy.testing import assert_array_equal from sklearn.exceptions import NotFittedError from sklearn.model_selection import train_test_split @@ -297,3 +300,75 @@ def test_merge_transformers() -> None: attrs = ["rho_", "column_names_"] for attr in attrs: assert getattr(enc_merged, attr) == getattr(enc, attr) + + +def test_split_transformers() -> None: + # test whether splitting the transformer after fitting + # change the output of transform + + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in 
range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) + + # fit on the whole dataset + enc = GapEncoder(random_state=42) + enc.fit(X) + + # split the transformer + enc_list = copy.deepcopy(enc)._split() + + # fit on each column separately + index = 0 + for i in range(3): + # check that the results are the same + # check transform + transformed_X_i = enc_list[i].transform(X[[f"col{i}"]]) + assert np.allclose( + transformed_X_i, + enc.transform(X)[:, index : index + transformed_X_i.shape[1]], + ) + # check get_feature_names_out + assert_array_equal( + np.array(enc_list[i].get_feature_names_out()), + np.array(enc.get_feature_names_out())[ + index : index + transformed_X_i.shape[1] + ], + ) + index += transformed_X_i.shape[1] + # check all attributes + attrs = ["rho_"] + for attr in attrs: + assert getattr(enc_list[i], attr) == getattr(enc, attr) + assert enc_list[i].column_names_ == [f"col{i}"] + + +def test_split_and_merge_transformers() -> None: + # test whether splitting the transformer after fitting + # and then merging the transformers gives the same result + # as fitting on the whole dataset + + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) + + # fit on the whole dataset + enc = GapEncoder(random_state=42) + enc.fit(X) + + # split the transformer + enc_list = copy.deepcopy(enc)._split() + + # merge the transformers + enc_merged = GapEncoder._merge(enc_list) + + # check that the results are the same + # check transform + assert np.allclose(enc_merged.transform(X), enc.transform(X)) + # check get_feature_names_out + assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() + # check score + assert enc_merged.score(X) == enc.score(X) + # check all attributes + attrs = ["rho_", "column_names_"] + for attr in attrs: + assert getattr(enc_merged, attr) == getattr(enc, attr) diff --git a/skrub/tests/test_minhash_encoder.py 
b/skrub/tests/test_minhash_encoder.py index 012bb9429..5dbd25aef 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -1,3 +1,4 @@ +import copy import random from string import ascii_lowercase @@ -249,6 +250,40 @@ def test_correct_arguments() -> None: encoder.fit_transform(X) +def test_deterministic(): + """Test that the encoder is deterministic""" + # TODO: add random state to encoder + encoder1 = MinHashEncoder(n_components=4) + encoder2 = MinHashEncoder(n_components=4) + X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] + encoded1 = encoder1.fit_transform(X) + encoded2 = encoder2.fit_transform(X) + assert_array_equal(encoded1, encoded2) + + +def test_get_feature_names_out(): + """Test that get_feature_names_out returns the correct feature names""" + encoder = MinHashEncoder(n_components=4) + X = pd.DataFrame( + { + "col1": ["a", "b", "c", "d", "e", "f", "g", "h"], + "col2": ["a", "b", "c", "d", "e", "f", "g", "h"], + } + ) + encoder.fit(X) + expected_columns = np.array( + ["col1_0", "col1_1", "col1_2", "col1_3", "col2_0", "col2_1", "col2_2", "col2_3"] + ) + assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns) + + # Test that it works with a list of strings + encoder = MinHashEncoder(n_components=4) + X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] + encoder.fit(X) + expected_columns = np.array(["x0_0", "x0_1", "x0_2", "x0_3"]) + assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns) + + def test_merge_transformers() -> None: # test whether fitting on each column separately and then merging the # transformers gives the same result as fitting on the whole dataset @@ -280,37 +315,95 @@ def test_merge_transformers() -> None: assert np.array_equal( enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key] ) + # check all attributes + attrs = ["_capacity", "n_features_in_"] + for attr in attrs: + assert getattr(enc_merged, attr) == getattr(enc, 
attr) + # check feature_names_in_ + assert (enc_merged.feature_names_in_ == enc.feature_names_in_).all() -def test_deterministic(): - """Test that the encoder is deterministic""" - # TODO: add random state to encoder - encoder1 = MinHashEncoder(n_components=4) - encoder2 = MinHashEncoder(n_components=4) - X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] - encoded1 = encoder1.fit_transform(X) - encoded2 = encoder2.fit_transform(X) - assert_array_equal(encoded1, encoded2) +def test_split_transformers() -> None: + # test whether splitting the transformer after fitting + # change the output of transform + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) -def test_get_feature_names_out(): - """Test that get_feature_names_out returns the correct feature names""" - encoder = MinHashEncoder(n_components=4) - X = pd.DataFrame( - { - "col1": ["a", "b", "c", "d", "e", "f", "g", "h"], - "col2": ["a", "b", "c", "d", "e", "f", "g", "h"], - } - ) - encoder.fit(X) - expected_columns = np.array( - ["col1_0", "col1_1", "col1_2", "col1_3", "col2_0", "col2_1", "col2_2", "col2_3"] - ) - assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns) + # fit on the whole dataset + enc = MinHashEncoder() + enc.fit_transform(X) - # Test that it works with a list of strings - encoder = MinHashEncoder(n_components=4) - X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] - encoder.fit(X) - expected_columns = np.array(["x0_0", "x0_1", "x0_2", "x0_3"]) - assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns) + # split the transformer + enc_list = copy.deepcopy(enc)._split() + + # fit on each column separately + index = 0 + for i in range(3): + # check that the results are the same + # check transform + transformed_X_i = enc_list[i].transform(X[[f"col{i}"]]) + assert np.allclose( + transformed_X_i, + enc.transform(X)[:, 
index : index + transformed_X_i.shape[1]], + ) + # check get_feature_names_out + assert_array_equal( + np.array(enc_list[i].get_feature_names_out()), + np.array(enc.get_feature_names_out())[ + index : index + transformed_X_i.shape[1] + ], + ) + # check self.feature_names_in_ + assert enc_list[i].feature_names_in_ == [f"col{i}"] + # check self.n_features_in_ + assert enc_list[i].n_features_in_ == 1 + index += transformed_X_i.shape[1] + # check all attributes + attrs = ["_capacity"] + # TODO: do we want the hash_dict_ to be the same? + assert enc.hash_dict_.cache.keys() == enc_list[i].hash_dict_.cache.keys() + for key in enc.hash_dict_.cache.keys(): + assert np.array_equal( + enc.hash_dict_.cache[key], enc_list[i].hash_dict_.cache[key] + ) + for attr in attrs: + assert getattr(enc_list[i], attr) == getattr(enc, attr) + + +def test_split_and_merge_transformers() -> None: + # test whether splitting the transformer after fitting + # and then merging the transformers gives the same result + # as fitting on the whole dataset + + # generate data + X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) + X = pd.DataFrame(X, columns=["col0", "col1", "col2"]) + + # fit on the whole dataset + enc = MinHashEncoder() + enc.fit(X) + + # split the transformer + enc_list = copy.deepcopy(enc)._split() + + # merge the transformers + enc_merged = MinHashEncoder._merge(enc_list) + + # check that the results are the same + # check transform + assert np.allclose(enc_merged.transform(X), enc.transform(X)) + # check get_feature_names_out + assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() + assert enc.hash_dict_.cache.keys() == enc_merged.hash_dict_.cache.keys() + for key in enc.hash_dict_.cache.keys(): + assert np.array_equal( + enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key] + ) + # check all attributes + attrs = ["_capacity", "n_features_in_"] + for attr in attrs: + assert getattr(enc_merged, attr) == getattr(enc, attr) + # 
check feature_names_in_ + assert (enc_merged.feature_names_in_ == enc.feature_names_in_).all() diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 651358e7b..6f509eaa3 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -629,7 +629,7 @@ def test_parallelism() -> None: for high_card_cat_transformer in [ lambda: GapEncoder(n_components=2, random_state=0), OneHotEncoder, - # lambda: MinHashEncoder(n_components=2), + lambda: MinHashEncoder(n_components=2), ]: table_vec_no_parallel = TableVectorizer( high_card_cat_transformer=high_card_cat_transformer(), @@ -698,9 +698,22 @@ def test_split_and_merge_univariate_transformers(): ) enc_split.fit(X) # during actual use, this is done during fit - enc_split._split_univariate_transformers() + enc_split._split_univariate_transformers(split_fitted=False) # check that the GapEncoder is split into 2 transformers + # the transformers_ attribute should not be modified + # because split_fitted is False assert len(enc_split.transformers) == 4 + assert len(enc_split.transformers_) == 3 + enc_split._merge_univariate_transformers() + # check that the GapEncoder is merged into 1 transformer + assert len(enc_split.transformers) == 3 + assert np.allclose(enc.transform(X), enc_split.transform(X)) + + # Now split the transformers_ attribute (split_fitted=True) + enc._split_univariate_transformers(split_fitted=True) + assert len(enc.transformers) == 3 + assert len(enc.transformers_) == 4 + # the fitted transformers should still work assert np.allclose(enc.transform(X), enc_split.transform(X)) enc_split._merge_univariate_transformers() @@ -714,7 +727,7 @@ def test_split_and_merge_univariate_transformers(): # check that a OneHotEncoder is not split enc_one_hot = TableVectorizer( - high_card_cat_transformer=OneHotEncoder(), + high_card_cat_transformer=OneHotEncoder(handle_unknown="error"), low_card_cat_transformer=OneHotEncoder( handle_unknown="ignore" ), # 
change the default to have a different transformer From 744e881f510c60eb14fc5d4533373f88bb7223e1 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 27 Jun 2023 15:30:52 +0200 Subject: [PATCH 15/47] cleaning --- skrub/tests/test_gap_encoder.py | 9 ++++----- skrub/tests/test_minhash_encoder.py | 15 ++++++--------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 50ed8ed0e..8257c0097 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -303,8 +303,8 @@ def test_merge_transformers() -> None: def test_split_transformers() -> None: - # test whether splitting the transformer after fitting - # change the output of transform + # check that splitting the transformer after fitting + # doesn't change the output of transform # generate data X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) @@ -343,9 +343,8 @@ def test_split_transformers() -> None: def test_split_and_merge_transformers() -> None: - # test whether splitting the transformer after fitting - # and then merging the transformers gives the same result - # as fitting on the whole dataset + # check that splitting the transformer after fitting + # and then merging the transformers doesn't change the result # generate data X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 5dbd25aef..f7edaded8 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -285,7 +285,7 @@ def test_get_feature_names_out(): def test_merge_transformers() -> None: - # test whether fitting on each column separately and then merging the + # check that fitting on each column separately and then merging the # transformers gives the same result as fitting on the whole dataset @@ -324,8 +324,8 @@ def test_merge_transformers() -> 
None: def test_split_transformers() -> None: - # test whether splitting the transformer after fitting - # change the output of transform + # check that splitting the transformer after fitting + # doesn't change the output of transform # generate data X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) @@ -373,9 +373,8 @@ def test_split_transformers() -> None: def test_split_and_merge_transformers() -> None: - # test whether splitting the transformer after fitting - # and then merging the transformers gives the same result - # as fitting on the whole dataset + # check that splitting the transformer after fitting + # and then merging the transformers doesn't change the result # generate data X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) @@ -398,9 +397,7 @@ def test_split_and_merge_transformers() -> None: assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() assert enc.hash_dict_.cache.keys() == enc_merged.hash_dict_.cache.keys() for key in enc.hash_dict_.cache.keys(): - assert np.array_equal( - enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key] - ) + assert_array_equal(enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key]) # check all attributes attrs = ["_capacity", "n_features_in_"] for attr in attrs: From 856892b915f46cf76c7cc0a6e8c03ae375014e6d Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 27 Jun 2023 19:24:15 +0200 Subject: [PATCH 16/47] clean test --- skrub/tests/test_gap_encoder.py | 2 +- skrub/tests/test_minhash_encoder.py | 22 +++++++++++----------- skrub/tests/test_table_vectorizer.py | 22 ++++++++++++++-------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 8257c0097..b64e770b6 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -362,7 +362,7 @@ def test_split_and_merge_transformers() -> None: # check that the results are the same 
# check transform - assert np.allclose(enc_merged.transform(X), enc.transform(X)) + assert_array_equal(enc_merged.transform(X), enc.transform(X)) # check get_feature_names_out assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() # check score diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index f7edaded8..5513da3f8 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -306,21 +306,19 @@ def test_merge_transformers() -> None: # check that the results are the same # check transform - assert np.allclose(enc_merged.transform(X), enc.transform(X)) + assert_array_equal(enc_merged.transform(X), enc.transform(X)) # check get_feature_names_out # assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() # check that the hash_dict_ attribute is the same assert enc.hash_dict_.cache.keys() == enc_merged.hash_dict_.cache.keys() for key in enc.hash_dict_.cache.keys(): - assert np.array_equal( - enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key] - ) + assert_array_equal(enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key]) # check all attributes attrs = ["_capacity", "n_features_in_"] for attr in attrs: assert getattr(enc_merged, attr) == getattr(enc, attr) # check feature_names_in_ - assert (enc_merged.feature_names_in_ == enc.feature_names_in_).all() + assert_array_equal(enc_merged.feature_names_in_, enc.feature_names_in_) def test_split_transformers() -> None: @@ -344,7 +342,7 @@ def test_split_transformers() -> None: # check that the results are the same # check transform transformed_X_i = enc_list[i].transform(X[[f"col{i}"]]) - assert np.allclose( + assert_array_equal( transformed_X_i, enc.transform(X)[:, index : index + transformed_X_i.shape[1]], ) @@ -362,14 +360,15 @@ def test_split_transformers() -> None: index += transformed_X_i.shape[1] # check all attributes attrs = ["_capacity"] + for attr in attrs: + assert getattr(enc_list[i], attr) == 
getattr(enc, attr) + # check hash_dict_ # TODO: do we want the hash_dict_ to be the same? assert enc.hash_dict_.cache.keys() == enc_list[i].hash_dict_.cache.keys() for key in enc.hash_dict_.cache.keys(): - assert np.array_equal( + assert_array_equal( enc.hash_dict_.cache[key], enc_list[i].hash_dict_.cache[key] ) - for attr in attrs: - assert getattr(enc_list[i], attr) == getattr(enc, attr) def test_split_and_merge_transformers() -> None: @@ -392,9 +391,10 @@ def test_split_and_merge_transformers() -> None: # check that the results are the same # check transform - assert np.allclose(enc_merged.transform(X), enc.transform(X)) + assert_array_equal(enc_merged.transform(X), enc.transform(X)) # check get_feature_names_out assert enc_merged.get_feature_names_out() == enc.get_feature_names_out() + # check hash_dict_ assert enc.hash_dict_.cache.keys() == enc_merged.hash_dict_.cache.keys() for key in enc.hash_dict_.cache.keys(): assert_array_equal(enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key]) @@ -403,4 +403,4 @@ def test_split_and_merge_transformers() -> None: for attr in attrs: assert getattr(enc_merged, attr) == getattr(enc, attr) # check feature_names_in_ - assert (enc_merged.feature_names_in_ == enc.feature_names_in_).all() + assert_array_equal(enc_merged.feature_names_in_, enc.feature_names_in_) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 6f509eaa3..f7da6d2d8 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -699,7 +699,8 @@ def test_split_and_merge_univariate_transformers(): enc_split.fit(X) # during actual use, this is done during fit enc_split._split_univariate_transformers(split_fitted=False) - # check that the GapEncoder is split into 2 transformers + # check that the high_card_cat_transformer + # is split into 2 transformers # the transformers_ attribute should not be modified # because split_fitted is False assert len(enc_split.transformers) == 4 @@ 
-708,22 +709,27 @@ def test_split_and_merge_univariate_transformers(): # check that the GapEncoder is merged into 1 transformer assert len(enc_split.transformers) == 3 assert np.allclose(enc.transform(X), enc_split.transform(X)) + # assert that the transformers attribute is the same as + # the one before splitting and merging + assert str(enc.transformers) == str(enc_split.transformers) + # check that you can refit the transformer + enc_split.fit(X) # Now split the transformers_ attribute (split_fitted=True) - enc._split_univariate_transformers(split_fitted=True) - assert len(enc.transformers) == 3 - assert len(enc.transformers_) == 4 + enc_split._split_univariate_transformers(split_fitted=True) + assert len(enc_split.transformers) == 3 + assert len(enc_split.transformers_) == 4 # the fitted transformers should still work - assert np.allclose(enc.transform(X), enc_split.transform(X)) + assert_array_equal(enc.transform(X), enc_split.transform(X)) enc_split._merge_univariate_transformers() # check that the GapEncoder is merged into 1 transformer - assert len(enc_split.transformers) == 3 - assert np.allclose(enc.transform(X), enc_split.transform(X)) + assert len(enc_split.transformers_) == 3 + assert_array_equal(enc.transform(X), enc_split.transform(X)) # assert that the transformers attribute is the same as # the one before splitting and merging - assert str(enc.transformers) == str(enc_split.transformers) + assert str(enc.transformers_) == str(enc_split.transformers_) # check that a OneHotEncoder is not split enc_one_hot = TableVectorizer( From e50323a238c95ba0958bc6409f2a0271a6476160 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 18 Jul 2023 00:21:44 +0200 Subject: [PATCH 17/47] fix wrong merge --- skrub/_minhash_encoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 9d02cc98b..40ed6f7ed 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -8,6 +8,7 @@ import numpy as np from 
joblib import Parallel, delayed, effective_n_jobs +from numpy.typing import ArrayLike, NDArray from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.utils import gen_even_slices, murmurhash3_32 from sklearn.utils.validation import _check_feature_names_in, check_is_fitted From 6d007cf9cfab0d164c4512d8cd52bd0c8b2bde28 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 18 Jul 2023 00:26:00 +0200 Subject: [PATCH 18/47] fix wrong merge --- skrub/tests/test_minhash_encoder.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 5513da3f8..832747e19 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -7,6 +7,7 @@ import pandas as pd import pytest from numpy.testing import assert_array_equal +from sklearn.exceptions import NotFittedError from sklearn.utils._testing import skip_if_no_parallel from skrub import MinHashEncoder @@ -250,6 +251,18 @@ def test_correct_arguments() -> None: encoder.fit_transform(X) +def test_check_fitted_minhash_encoder() -> None: + """Test that calling transform before fit raises an error""" + encoder = MinHashEncoder(n_components=3) + X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None] + with pytest.raises(NotFittedError): + encoder.transform(X) + + # Check that it works after fitting + encoder.fit(X) + encoder.transform(X) + + def test_deterministic(): """Test that the encoder is deterministic""" # TODO: add random state to encoder From 970a73a74e50b8545d5bee580258adc0a933646c Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Wed, 19 Jul 2023 18:34:26 +0200 Subject: [PATCH 19/47] Apply suggestions from code review type hints Co-authored-by: Lilian --- skrub/_gap_encoder.py | 2 +- skrub/_minhash_encoder.py | 2 +- skrub/_table_vectorizer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/skrub/_gap_encoder.py 
b/skrub/_gap_encoder.py index 5045c18b7..838ea6e41 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -652,7 +652,7 @@ class GapEncoder(BaseEstimator, TransformerMixin): column_names_: list[str] @classmethod - def _merge(cls, transformers_list): + def _merge(cls, transformers_list: list[GapEncoder]): # merge GapEncoder fitted on different columns # into a single GapEncoder # useful for parallelization in the TableVectorizer diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 40ed6f7ed..d2f82b3e7 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -121,7 +121,7 @@ class MinHashEncoder(BaseEstimator, TransformerMixin): _capacity: int = 2**10 @classmethod - def _merge(cls, transformers_list): + def _merge(cls, transformers_list: list[MinHashEncoder]): # merge MinHashEncoder fitted on different columns # into a single MinHashEncoder # useful for parallelization in the TableVectorizer diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index b6f200b09..45b65d738 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -457,7 +457,7 @@ def _clone_transformers(self): # TODO: check that the provided transformers are valid - def _split_univariate_transformers(self, split_fitted=False): + def _split_univariate_transformers(self, split_fitted: bool = False): """ Split univariate transformers into multiple transformers, one for each column. 
This is useful to use the inherited `ColumnTransformer` class From d15e7e5548bdcc3f6ad80d3951498dc27b43e54b Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Wed, 19 Jul 2023 18:35:08 +0200 Subject: [PATCH 20/47] Update skrub/_utils.py Co-authored-by: Lilian --- skrub/_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/skrub/_utils.py b/skrub/_utils.py index fb6e8be75..7f753c842 100644 --- a/skrub/_utils.py +++ b/skrub/_utils.py @@ -37,12 +37,12 @@ def __contains__(self, key: Hashable): return key in self.cache -def combine_LRUDicts(capacity: int, *LRUDicts: LRUDict) -> LRUDict: - combined_LRUDict = LRUDict(capacity) - for lru_dict in LRUDicts: +def combine_LRUDicts(capacity: int, *lru_dicts: LRUDict) -> LRUDict: + combined_lru_dict = LRUDict(capacity) + for lru_dict in lru_dicts: for key, value in lru_dict.cache.items(): - combined_LRUDict[key] = value - return combined_LRUDict + combined_lru_dict[key] = value + return combined_lru_dict def check_input(X) -> NDArray: From 4ae9ae2a7baa6c1c7eb8e815f8039ee0657e8f1f Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 19 Jul 2023 18:46:57 +0200 Subject: [PATCH 21/47] apply Lilian's suggestions --- skrub/_minhash_encoder.py | 4 ++-- skrub/_table_vectorizer.py | 12 ++++++++---- skrub/_utils.py | 2 +- skrub/tests/test_gap_encoder.py | 2 +- skrub/tests/test_table_vectorizer.py | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index d2f82b3e7..3f63baae7 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -15,7 +15,7 @@ from ._fast_hash import ngram_min_hash from ._string_distances import get_unique_ngrams -from ._utils import LRUDict, check_input, combine_LRUDicts +from ._utils import LRUDict, check_input, combine_lru_dicts NoneType = type(None) @@ -130,7 +130,7 @@ def _merge(cls, transformers_list: list[MinHashEncoder]): assert all( transformer._capacity == 
capacity for transformer in transformers_list ) - full_transformer.hash_dict_ = combine_LRUDicts( + full_transformer.hash_dict_ = combine_lru_dicts( capacity, *[transformer.hash_dict_ for transformer in transformers_list] ) full_transformer.n_features_in_ = sum( diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 45b65d738..1909688ab 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -404,6 +404,10 @@ def _more_tags(self): """ return {"allow_nan": [True]} + @property + def is_parallelized(self) -> bool: + return self.n_jobs not in (None, 1) + def _clone_transformers(self): """ For each of the different transformers that can be passed, @@ -785,7 +789,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: # split the univariate transformers on each column # to be able to parallelize the encoding - if self.n_jobs not in (None, 1): + if self.is_parallelized: self._split_univariate_transformers(split_fitted=False) X_enc = super().fit_transform(X, y) @@ -800,7 +804,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: cols: list[int] self.transformers_[i] = (name, enc, [self.columns_[j] for j in cols]) - if self.n_jobs not in (None, 1): + if self.is_parallelized: self._merge_univariate_transformers() return X_enc @@ -843,12 +847,12 @@ def transform(self, X: ArrayLike) -> ArrayLike: # split the univariate transformers on each column # to be able to parallelize the encoding - if self.n_jobs not in (None, 1): + if self.is_parallelized: self._split_univariate_transformers(split_fitted=True) res = super().transform(X) - if self.n_jobs not in (None, 1): + if self.is_parallelized: self._merge_univariate_transformers() return res diff --git a/skrub/_utils.py b/skrub/_utils.py index 7f753c842..9d1a4ca14 100644 --- a/skrub/_utils.py +++ b/skrub/_utils.py @@ -37,7 +37,7 @@ def __contains__(self, key: Hashable): return key in self.cache -def combine_LRUDicts(capacity: int, *lru_dicts: 
LRUDict) -> LRUDict: +def combine_lru_dicts(capacity: int, *lru_dicts: LRUDict) -> LRUDict: combined_lru_dict = LRUDict(capacity) for lru_dict in lru_dicts: for key, value in lru_dict.cache.items(): diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index b64e770b6..054c76c64 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -344,7 +344,7 @@ def test_split_transformers() -> None: def test_split_and_merge_transformers() -> None: # check that splitting the transformer after fitting - # and then merging the transformers doesn't the same result + # and then merging the transformers doesn't change the result # generate data X = np.concatenate([generate_data(100, random_state=i) for i in range(3)], axis=1) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 88739e5eb..204308396 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -686,7 +686,7 @@ def test_parallelism() -> None: cardinality_threshold=4, ) X_trans = table_vec_no_parallel.fit_transform(X) - for joblib_backend in ["loky", "threading", "multiprocessing"]: + for joblib_backend in ["loky"]: with joblib.parallel_backend(joblib_backend): for n_jobs in [None, 2, -1]: table_vec = TableVectorizer( From b5848e7bbc7c6de1997cac94b57ce3db174e1c8f Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 19 Jul 2023 18:56:05 +0200 Subject: [PATCH 22/47] add future annotation to avoid circular import in type hints --- skrub/_gap_encoder.py | 1 + skrub/_minhash_encoder.py | 1 + 2 files changed, 2 insertions(+) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 838ea6e41..1e27dea83 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -1,6 +1,7 @@ """ Implements the GapEncoder: a probabilistic encoder for categorical variables. 
""" +from __future__ import annotations from collections.abc import Generator from copy import deepcopy diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 3f63baae7..352e46f30 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -2,6 +2,7 @@ Implements the MinHashEncoder, which encodes string categorical features by applying the MinHash method to n-gram decompositions of strings. """ +from __future__ import annotations from collections.abc import Callable, Collection from typing import Literal From 6f645fc3079b1c46dba989bfbaa465bb84a836aa Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 19 Jul 2023 19:55:09 +0200 Subject: [PATCH 23/47] add docstring --- skrub/_table_vectorizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 1909688ab..8f13a8f8a 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -406,6 +406,9 @@ def _more_tags(self): @property def is_parallelized(self) -> bool: + """ + Returns True if the transformers are parallelized over columns, False otherwise. 
+ """ return self.n_jobs not in (None, 1) def _clone_transformers(self): From 9753bd0fb75cd28290322efb28ab80b8d85c1328 Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Thu, 20 Jul 2023 12:18:48 +0200 Subject: [PATCH 24/47] Update skrub/_table_vectorizer.py Co-authored-by: Lilian --- skrub/_table_vectorizer.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 8f13a8f8a..ff0cd25f6 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -538,14 +538,11 @@ def _merge_univariate_transformers(self): base_names.append(base_name) # merge all transformers with the same base name for base_name in base_names: - transformers = [] - columns = [] - names = [] for name, trans, cols in self.transformers_: if name.startswith(base_name): - columns.extend(cols) - transformers.append(trans) - names.append(name) + names, transformers, nested_columns = zip(*self.transformers_) + # Flatten list[list[str]] to list[str] + columns = [column for column in columns for columns in nested_columns] if len(transformers) == 1: new_transformers_.append((base_name, transformers[0], columns)) new_transformer_to_input_indices[base_name] = list( From 6e48fab29f322e599e73c15df7d4dd66f8b2f562 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 20 Jul 2023 12:21:48 +0200 Subject: [PATCH 25/47] type hint --- skrub/_table_vectorizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 8f13a8f8a..58d4263f0 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -362,6 +362,8 @@ class TableVectorizer(ColumnTransformer): types_: dict[str, type] imputed_columns_: list[str] + _transformer_to_input_indices: dict[str, list[int]] + # Override required parameters _required_parameters = [] From 63e5afd6dc243b0474bebd99189112c01d2d884d Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Thu, 20 Jul 
2023 15:05:26 +0200 Subject: [PATCH 26/47] revert change --- skrub/_table_vectorizer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 0064e44df..2f6001a91 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -538,13 +538,14 @@ def _merge_univariate_transformers(self): base_name = name.split("_split_")[0] if base_name not in base_names: base_names.append(base_name) - # merge all transformers with the same base name for base_name in base_names: + # merge all transformers with the same base name + transformers, columns, names = [], [], [] for name, trans, cols in self.transformers_: if name.startswith(base_name): - names, transformers, nested_columns = zip(*self.transformers_) - # Flatten list[list[str]] to list[str] - columns = [column for column in columns for columns in nested_columns] + columns.extend(cols) + transformers.append(trans) + names.append(name) if len(transformers) == 1: new_transformers_.append((base_name, transformers[0], columns)) new_transformer_to_input_indices[base_name] = list( From 5e6966accb4f3baaa6891495236d21b931c4e744 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 4 Aug 2023 17:48:20 +0200 Subject: [PATCH 27/47] first batch of Vincent's suggestions --- skrub/_gap_encoder.py | 1 - skrub/_minhash_encoder.py | 3 - skrub/_table_vectorizer.py | 80 +++++----- skrub/tests/test_table_vectorizer.py | 227 ++++++++++++++------------- 4 files changed, 157 insertions(+), 154 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 4ff908cf5..a199138d6 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -654,7 +654,6 @@ def _merge(cls, transformers_list: list[GapEncoder]): full_transformer = clone(transformers_list[0]) # assert rho_ is the same for all transformers rho_ = transformers_list[0].rho_ - assert all([transformers.rho_ == rho_ for transformers in transformers_list]) full_transformer.rho_ = rho_ 
full_transformer.fitted_models_ = [] for transformers in transformers_list: diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 638546407..d46db12d2 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -128,9 +128,6 @@ def _merge(cls, transformers_list: list[MinHashEncoder]): # useful for parallelization in the TableVectorizer full_transformer = clone(transformers_list[0]) capacity = transformers_list[0]._capacity - assert all( - transformer._capacity == capacity for transformer in transformers_list - ) full_transformer.hash_dict_ = combine_lru_dicts( capacity, *[transformer.hash_dict_ for transformer in transformers_list] ) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index b1d14022e..769aac7d0 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -471,7 +471,7 @@ def _clone_transformers(self): # TODO: check that the provided transformers are valid - def _split_univariate_transformers(self, split_fitted: bool = False): + def _split_univariate_transformers(self, during_fit: bool = False): """ Split univariate transformers into multiple transformers, one for each column. This is useful to use the inherited `ColumnTransformer` class @@ -479,12 +479,28 @@ def _split_univariate_transformers(self, split_fitted: bool = False): Parameters ---------- - split_fitted : bool, default=False - Whether to split the self.transformers_ attribute (True) or the - self.transformers attribute (False). The former is used when - calling `transform` and the latter when calling `fit_transform`. + during_fit : bool, default=False + Whether the method is called during `fit_transform` (True) or + during `transform` (False). This is used to determine whether + to split the self.transformers_ attribute (when False) or the + self.transformers attribute (when True). 
""" - if split_fitted: + if during_fit: + self._transformers_original = self.transformers + new_transformers = [] + for name, trans, cols in self.transformers: + if ( + trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS + and len(cols) > 1 + ): + for i, col in enumerate(cols): + new_transformers.append( + (name + "_split_" + str(i), clone(trans), [col]) + ) + else: + new_transformers.append((name, trans, cols)) + self.transformers = new_transformers + else: check_is_fitted(self, attributes=["transformers_"]) self._transformers_fitted_original = self.transformers_ new_transformers_ = [] @@ -495,17 +511,17 @@ def _split_univariate_transformers(self, split_fitted: bool = False): and len(cols) > 1 ): splitted_transformers_ = trans._split() - assert len(splitted_transformers_) == len(cols) - for i, col in enumerate(cols): - new_transformers_.append( - ( - name + "_split_" + str(i), - splitted_transformers_[i], - [col], - ) + for i, (col, trans, trans_to_mapping) in enumerate( + zip( + cols, + splitted_transformers_, + self._transformer_to_input_indices[name], ) - new_transformer_to_input_indices[name + "_split_" + str(i)] = [ - self._transformer_to_input_indices[name][i] + ): + name_split = f"{name}_split_{i}" + new_transformers_.append((name_split, trans, [col])) + new_transformer_to_input_indices[name_split] = [ + trans_to_mapping ] else: new_transformers_.append((name, trans, cols)) @@ -514,35 +530,17 @@ def _split_univariate_transformers(self, split_fitted: bool = False): ] = self._transformer_to_input_indices[name] self.transformers_ = new_transformers_ self._transformer_to_input_indices = new_transformer_to_input_indices - else: - self._transformers_original = self.transformers - new_transformers = [] - for name, trans, cols in self.transformers: - if ( - trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS - and len(cols) > 1 - ): - for i, col in enumerate(cols): - new_transformers.append( - (name + "_split_" + str(i), clone(trans), [col]) - ) - else: - 
new_transformers.append((name, trans, cols)) - self.transformers = new_transformers def _merge_univariate_transformers(self): # merge back self.transformers if it was split - if hasattr(self, "_transformers_original"): - self.transformers = self._transformers_original + check_is_fitted(self, attributes=["transformers_"]) + self.transformers = self._transformers_original # merge self.transformers_ new_transformers_ = [] new_transformer_to_input_indices = {} - # find all base names - base_names = [] - for name, _, _ in self.transformers_: - base_name = name.split("_split_")[0] - if base_name not in base_names: - base_names.append(base_name) + base_names = pd.unique( + [name.split("_split_")[0] for name, _, _ in self.transformers_] + ) for base_name in base_names: # merge all transformers with the same base name transformers, columns, names = [], [], [] @@ -876,7 +874,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: # split the univariate transformers on each column # to be able to parallelize the encoding if self.is_parallelized: - self._split_univariate_transformers(split_fitted=False) + self._split_univariate_transformers(during_fit=True) X_enc = super().fit_transform(X, y) @@ -924,7 +922,7 @@ def transform(self, X: ArrayLike) -> ArrayLike: # split the univariate transformers on each column # to be able to parallelize the encoding if self.is_parallelized: - self._split_univariate_transformers(split_fitted=True) + self._split_univariate_transformers(during_fit=False) res = super().transform(X) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 1e43e2420..1563b3a2d 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -11,7 +11,9 @@ from skrub._table_vectorizer import _infer_date_format -def check_same_transformers(expected_transformers: dict, actual_transformers: list): +def check_same_transformers( + expected_transformers: dict, actual_transformers: 
list +) -> None: # Construct the dict from the actual transformers actual_transformers_dict = {name: cols for name, trans, cols in actual_transformers} assert actual_transformers_dict == expected_transformers @@ -644,7 +646,7 @@ def test__infer_date_format() -> None: assert _infer_date_format(date_column) is None -def test_mixed_types(): +def test_mixed_types() -> None: # TODO: datetime/str mixed types # don't work df = _get_mixed_types_dataframe() @@ -717,7 +719,9 @@ def test_mixed_types(): ), ], ) -def test_changing_types(X_fit, X_transform_original, X_transform_with_missing_original): +def test_changing_types( + X_fit, X_transform_original, X_transform_with_missing_original +) -> None: """ Test that the TableVectorizer performs properly when the type inferred during fit does not match the type of the @@ -747,7 +751,7 @@ def test_changing_types(X_fit, X_transform_original, X_transform_with_missing_or assert np.allclose(res, res_missing, equal_nan=True) -def test_changing_types_int_float(): +def test_changing_types_int_float() -> None: # The TableVectorizer shouldn't cast floats to ints # even if only ints were seen during fit X_fit, X_transform = ( @@ -760,7 +764,7 @@ def test_changing_types_int_float(): assert np.allclose(res, np.array([[1.0], [2.0], [3.3]])) -def test_column_by_column(): +def test_column_by_column() -> None: X = _get_clean_dataframe() table_vec_all_cols = TableVectorizer( high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), @@ -781,118 +785,123 @@ def test_column_by_column(): @skip_if_no_parallel -def test_parallelism() -> None: - # Test that parallelism works - X = _get_clean_dataframe() +@pytest.mark.parametrize( + "high_card_cat_transformer", # the gap encoder and the minhashencoder # should be parallelized on all columns # the one hot encoder should not be parallelized - for high_card_cat_transformer in [ - lambda: GapEncoder(n_components=2, random_state=0), - OneHotEncoder, - lambda: MinHashEncoder(n_components=2), - ]: - 
table_vec_no_parallel = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer(), - cardinality_threshold=4, - ) - X_trans = table_vec_no_parallel.fit_transform(X) - for joblib_backend in ["loky"]: - with joblib.parallel_backend(joblib_backend): - for n_jobs in [None, 2, -1]: - table_vec = TableVectorizer( - n_jobs=n_jobs, - high_card_cat_transformer=high_card_cat_transformer(), - cardinality_threshold=4, - ) - X_trans_parallel = table_vec.fit_transform(X) - assert_array_equal(X_trans, X_trans_parallel) - assert table_vec.n_jobs == n_jobs - # assert that all attributes are equal except for - # the n_jobs attribute - for attr in [ - "transformers_", - "columns_", - "types_", - "imputed_columns_", - ]: - assert str(getattr(table_vec, attr)) == str( - getattr(table_vec_no_parallel, attr) - ) - # assert that get_feature_names_out gives the same result - assert_array_equal( - table_vec.get_feature_names_out(), - table_vec_no_parallel.get_feature_names_out(), - ) - # assert that get_params gives the same result expect for n_jobs - # remove n_jobs from the dict - params = table_vec.get_params() - params.pop("n_jobs") - params_no_parallel = table_vec_no_parallel.get_params() - params_no_parallel.pop("n_jobs") - assert str(params) == str(params_no_parallel) - # assert that transform gives the same result - assert_array_equal( - table_vec.transform(X), table_vec_no_parallel.transform(X) - ) - - -def test_split_and_merge_univariate_transformers(): + [ + GapEncoder(n_components=2, random_state=0), + OneHotEncoder(), + MinHashEncoder(n_components=2), + ], +) +def test_parallelism(high_card_cat_transformer) -> None: + # Test that parallelism works X = _get_clean_dataframe() - for high_card_cat_transformer in [ - lambda: GapEncoder(n_components=2, random_state=0), - lambda: MinHashEncoder(n_components=2), - ]: - enc = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer(), - cardinality_threshold=4, - n_jobs=None, # should disable the default 
splitting and merging - ) + table_vec_no_parallel = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer, + cardinality_threshold=4, + ) + X_trans = table_vec_no_parallel.fit_transform(X) + with joblib.parallel_backend("loky"): + for n_jobs in [None, 2, -1]: + table_vec = TableVectorizer( + n_jobs=n_jobs, + high_card_cat_transformer=high_card_cat_transformer, + cardinality_threshold=4, + ) + X_trans_parallel = table_vec.fit_transform(X) + assert_array_equal(X_trans, X_trans_parallel) + assert table_vec.n_jobs == n_jobs + # assert that all attributes are equal except for + # the n_jobs attribute + assert str(table_vec.transformers_) == str( + table_vec_no_parallel.transformers_ + ) + assert (table_vec.columns_ == table_vec_no_parallel.columns_).all() + assert table_vec.types_ == table_vec_no_parallel.types_ + assert table_vec.imputed_columns_ == table_vec_no_parallel.imputed_columns_ + # assert that get_feature_names_out gives the same result + assert_array_equal( + table_vec.get_feature_names_out(), + table_vec_no_parallel.get_feature_names_out(), + ) + # assert that get_params gives the same result expect for n_jobs + # remove n_jobs from the dict + params = table_vec.get_params() + params.pop("n_jobs") + params_no_parallel = table_vec_no_parallel.get_params() + params_no_parallel.pop("n_jobs") + assert str(params) == str(params_no_parallel) + # assert that transform gives the same result + assert_array_equal( + table_vec.transform(X), table_vec_no_parallel.transform(X) + ) - enc.fit(X) - assert len(enc.transformers) == 3 - enc_split = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer(), - cardinality_threshold=4, - n_jobs=None, - ) - enc_split.fit(X) - # during actual use, this is done during fit - enc_split._split_univariate_transformers(split_fitted=False) - # check that the high_card_cat_transformer - # is split into 2 transformers - # the transformers_ attribute should not be modified - # because split_fitted is False - 
assert len(enc_split.transformers) == 4 - assert len(enc_split.transformers_) == 3 - enc_split._merge_univariate_transformers() - # check that the GapEncoder is merged into 1 transformer - assert len(enc_split.transformers) == 3 - assert np.allclose(enc.transform(X), enc_split.transform(X)) - # assert that the transformers attribute is the same as - # the one before splitting and merging - assert str(enc.transformers) == str(enc_split.transformers) - # check that you can refit the transformer - enc_split.fit(X) - - # Now split the transformers_ attribute (split_fitted=True) - enc_split._split_univariate_transformers(split_fitted=True) - assert len(enc_split.transformers) == 3 - assert len(enc_split.transformers_) == 4 - # the fitted transformers should still work - assert_array_equal(enc.transform(X), enc_split.transform(X)) - - enc_split._merge_univariate_transformers() - # check that the GapEncoder is merged into 1 transformer - assert len(enc_split.transformers_) == 3 - assert_array_equal(enc.transform(X), enc_split.transform(X)) - - # assert that the transformers attribute is the same as - # the one before splitting and merging - assert str(enc.transformers_) == str(enc_split.transformers_) +@pytest.mark.parametrize( + "high_card_cat_transformer", + [ + GapEncoder(n_components=2, random_state=0), + MinHashEncoder(n_components=2), + ], +) +def test_split_and_merge_univariate_transformers(high_card_cat_transformer) -> None: + X = _get_clean_dataframe() + enc = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer, + cardinality_threshold=4, + n_jobs=None, # should disable the default splitting and merging + ) + enc.fit(X) + assert len(enc.transformers) == 3 + + enc_split = TableVectorizer( + high_card_cat_transformer=high_card_cat_transformer, + cardinality_threshold=4, + n_jobs=None, + ) + enc_split.fit(X) + # during actual use, this is done during fit + enc_split._split_univariate_transformers(during_fit=True) + # check that the 
high_card_cat_transformer + # is split into 2 transformers + # the transformers_ attribute should not be modified + # because during_fit is True + assert len(enc_split.transformers) == 4 + assert len(enc_split.transformers_) == 3 + enc_split._merge_univariate_transformers() + # check that the GapEncoder is merged into 1 transformer + assert len(enc_split.transformers) == 3 + assert np.allclose(enc.transform(X), enc_split.transform(X)) + # assert that the transformers attribute is the same as + # the one before splitting and merging + assert str(enc.transformers) == str(enc_split.transformers) + # check that you can refit the transformer + enc_split.fit(X) + + # Now split the transformers_ attribute (during_fit=False) + enc_split._split_univariate_transformers(during_fit=False) + assert len(enc_split.transformers) == 3 + assert len(enc_split.transformers_) == 4 + # the fitted transformers should still work + assert_array_equal(enc.transform(X), enc_split.transform(X)) + + enc_split._merge_univariate_transformers() + # check that the GapEncoder is merged into 1 transformer + assert len(enc_split.transformers_) == 3 + assert_array_equal(enc.transform(X), enc_split.transform(X)) + + # assert that the transformers attribute is the same as + # the one before splitting and merging + assert str(enc.transformers_) == str(enc_split.transformers_) + + +def test_split_one_hot_encoder() -> None: # check that a OneHotEncoder is not split + X = _get_clean_dataframe() enc_one_hot = TableVectorizer( high_card_cat_transformer=OneHotEncoder(handle_unknown="error"), low_card_cat_transformer=OneHotEncoder( From da06783e7df9d59948c6c3ba51d89aad96c4bba7 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 4 Aug 2023 18:20:52 +0200 Subject: [PATCH 28/47] use tags --- skrub/_gap_encoder.py | 4 ++++ skrub/_minhash_encoder.py | 4 ++++ skrub/_table_vectorizer.py | 10 ++++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 
a199138d6..dab71c6d3 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -962,6 +962,10 @@ def _more_tags(self): ), "check_estimators_dtypes": "We only support string dtypes.", }, + "univariate": True, # whether the estimator is univariate and can be + # applied column by column. This is useful for the TableVectorizer, + # to decide whether to apply the transformer on each column separately + # and thus improve the parallelization when the transformer is slow enough. } diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index d46db12d2..66a69870d 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -429,4 +429,8 @@ def _more_tags(self): ), "check_estimators_dtypes": "We only support string dtypes.", }, + "univariate": True, # whether the estimator is univariate and can be + # applied column by column. This is useful for the TableVectorizer, + # to decide whether to apply the transformer on each column separately + # and thus improve the parallelization when the transformer is slow enough. 
} diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 769aac7d0..8fcd92932 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -28,10 +28,6 @@ # Required for ignoring lines too long in the docstrings # flake8: noqa: E501 -# transformers which can be applied column-wise -# and which are slow enough to be worth parallelizing over columns -UNIVARIATE_TRANSFORMERS = ["GapEncoder", "MinHashEncoder"] - def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> str | None: """Infer the date format of a date column, @@ -490,7 +486,8 @@ def _split_univariate_transformers(self, during_fit: bool = False): new_transformers = [] for name, trans, cols in self.transformers: if ( - trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS + (not type(trans) == str) + and trans._get_tags().get("univariate", False) and len(cols) > 1 ): for i, col in enumerate(cols): @@ -507,7 +504,8 @@ def _split_univariate_transformers(self, during_fit: bool = False): new_transformer_to_input_indices = {} for name, trans, cols in self.transformers_: if ( - trans.__class__.__name__ in UNIVARIATE_TRANSFORMERS + (not type(trans) == str) + and trans._get_tags().get("univariate", False) and len(cols) > 1 ): splitted_transformers_ = trans._split() From f8b177a00e331599d56b7b651ed0a37fa8623f43 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 4 Aug 2023 18:55:09 +0200 Subject: [PATCH 29/47] compare fitted transformers better --- skrub/tests/test_table_vectorizer.py | 5 +- skrub/tests/utils.py | 76 ++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 1563b3a2d..926f60045 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -9,6 +9,7 @@ from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer from skrub._table_vectorizer import _infer_date_format +from skrub.tests.utils 
import transformers_list_equal def check_same_transformers( @@ -816,8 +817,8 @@ def test_parallelism(high_card_cat_transformer) -> None: assert table_vec.n_jobs == n_jobs # assert that all attributes are equal except for # the n_jobs attribute - assert str(table_vec.transformers_) == str( - table_vec_no_parallel.transformers_ + assert transformers_list_equal( + table_vec.transformers_, table_vec_no_parallel.transformers_ ) assert (table_vec.columns_ == table_vec_no_parallel.columns_).all() assert table_vec.types_ == table_vec_no_parallel.types_ diff --git a/skrub/tests/utils.py b/skrub/tests/utils.py index 5572cf4e5..d99c8b6e3 100644 --- a/skrub/tests/utils.py +++ b/skrub/tests/utils.py @@ -27,3 +27,79 @@ def generate_data( else: X = np.array(str_list).reshape(n_samples, 1) return X + + +def is_valid_attribute(attribute): + # check that the type is not too weird + # so we can check equality + valid_types = ( + int, + float, + np.ndarray, + str, + bool, + type(None), + list, + tuple, + dict, + set, + ) + + if isinstance(attribute, (list, tuple, set)): + return all(is_valid_attribute(item) for item in attribute) + elif isinstance(attribute, dict): + return all( + is_valid_attribute(key) and is_valid_attribute(value) + for key, value in attribute.items() + ) + else: + return isinstance(attribute, valid_types) + + +def transformers_equal(transformer1, transformer2): + # Check if the transformers are of the same type + if type(transformer1) != type(transformer2): + return False + + # if string transformers, check if they are the same + if isinstance(transformer1, str): + return transformer1 == transformer2 + + # Compare hyperparameters + if transformer1.get_params() != transformer2.get_params(): + return False + + # Compare fitted attributes + for attribute in transformer1.__dict__: + if attribute.endswith("_"): + if not is_valid_attribute(getattr(transformer1, attribute)): + # check that the type is the same + if not isinstance( + getattr(transformer1, attribute), + 
type(getattr(transformer2, attribute)), + ): + return False + else: + if not np.array_equal( + getattr(transformer1, attribute), getattr(transformer2, attribute) + ): + return False + + return True + + +def transformers_list_equal(transformers_list1, transformers_list2): + # check equaility for list of 3-tuples (name, transformer, columns) + # used in the TableVectorizer + if len(transformers_list1) != len(transformers_list2): + return False + for (name1, transformer1, columns1), (name2, transformer2, columns2) in zip( + transformers_list1, transformers_list2 + ): + if name1 != name2: + return False + if columns1 != columns2: + return False + if not transformers_equal(transformer1, transformer2): + return False + return True From 02bf26eccc9bff8304c373e176031671f6fe9c75 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 4 Aug 2023 18:59:16 +0200 Subject: [PATCH 30/47] talk about tags in changelog --- CHANGES.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index b7857833d..aa8ac9b07 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -31,7 +31,8 @@ Minor changes ------------- * When possible, parallelism is done at the column level rather than the transformer level in :class:`TableVectorizer`. - This is the case for :class:`MinHashEncoder` and :class:`GapEncoder`. :pr:`592` by :user:`Leo Grinsztajn ` + This is the case for transformers with the tag `univariate` (right now :class:`MinHashEncoder` and :class:`GapEncoder`). + :pr:`592` by :user:`Leo Grinsztajn ` * Parallelized the :func:`deduplicate` function. Parameter `n_jobs` added to the signature. 
:pr:`618` by :user:`Jovan Stojanovic ` From bb7ad811d774e5beb0fe3edfeb31d1b36e7afba6 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 4 Aug 2023 19:32:57 +0200 Subject: [PATCH 31/47] run precommit checks --- skrub/tests/test_table_vectorizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 7d9cc7e45..2a3123d07 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -3,7 +3,7 @@ import pandas as pd import pytest from sklearn.exceptions import NotFittedError -from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted @@ -914,6 +914,7 @@ def test_split_one_hot_encoder() -> None: enc_one_hot.fit(X) assert len(enc_one_hot.transformers) == 3 + def test_table_vectorizer_remainder_cloning(): """Check that remainder is cloned when used.""" df1 = _get_clean_dataframe() From 9dd62bb0c5c4415954b25264c9c4b110da88ea15 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 8 Aug 2023 12:17:51 +0200 Subject: [PATCH 32/47] get rid of _transformers_original --- skrub/_table_vectorizer.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index ae6094fe8..fb60fa36f 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -498,7 +498,6 @@ def _split_univariate_transformers(self, during_fit: bool = False): self.transformers attribute (when True). 
""" if during_fit: - self._transformers_original = self.transformers new_transformers = [] for name, trans, cols in self.transformers: if ( @@ -546,9 +545,31 @@ def _split_univariate_transformers(self, during_fit: bool = False): self._transformer_to_input_indices = new_transformer_to_input_indices def _merge_univariate_transformers(self): - # merge back self.transformers if it was split + # merge back self.transformers and self.transformers_ check_is_fitted(self, attributes=["transformers_"]) - self.transformers = self._transformers_original + # merge self.transformers + new_transformers = [] + base_names = pd.unique( + [name.split("_split_")[0] for name, _, _ in self.transformers] + ) + for base_name in base_names: + for name, trans, _ in self.transformers: + if name.startswith(base_name): + new_trans = trans if isinstance(trans, str) else clone(trans) + break + new_transformers.append( + ( + base_name, + new_trans, + [ + col + for name, _, cols in self.transformers + for col in cols + if name.startswith(base_name) + ], + ) + ) + self.transformers = new_transformers # merge self.transformers_ new_transformers_ = [] new_transformer_to_input_indices = {} From d6939f572065e0323c8541d0daf7a79a620aa6c1 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 18 Aug 2023 16:17:07 +0200 Subject: [PATCH 33/47] clean tests --- skrub/tests/test_gap_encoder.py | 14 +++++--------- skrub/tests/test_minhash_encoder.py | 14 +++++--------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/skrub/tests/test_gap_encoder.py b/skrub/tests/test_gap_encoder.py index 054c76c64..1b7c04355 100644 --- a/skrub/tests/test_gap_encoder.py +++ b/skrub/tests/test_gap_encoder.py @@ -297,9 +297,8 @@ def test_merge_transformers() -> None: # check score assert enc_merged.score(X) == enc.score(X) # check all attributes - attrs = ["rho_", "column_names_"] - for attr in attrs: - assert getattr(enc_merged, attr) == getattr(enc, attr) + enc_merged.rho_ == enc.rho_ + enc_merged.column_names_ == 
enc.column_names_ def test_split_transformers() -> None: @@ -336,9 +335,7 @@ def test_split_transformers() -> None: ) index += transformed_X_i.shape[1] # check all attributes - attrs = ["rho_"] - for attr in attrs: - assert getattr(enc_list[i], attr) == getattr(enc, attr) + assert enc_list[i].rho_ == enc.rho_ assert enc_list[i].column_names_ == [f"col{i}"] @@ -368,6 +365,5 @@ def test_split_and_merge_transformers() -> None: # check score assert enc_merged.score(X) == enc.score(X) # check all attributes - attrs = ["rho_", "column_names_"] - for attr in attrs: - assert getattr(enc_merged, attr) == getattr(enc, attr) + enc_merged.rho_ == enc.rho_ + enc_merged.column_names_ == enc.column_names_ diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py index 832747e19..63ce8c148 100644 --- a/skrub/tests/test_minhash_encoder.py +++ b/skrub/tests/test_minhash_encoder.py @@ -327,9 +327,8 @@ def test_merge_transformers() -> None: for key in enc.hash_dict_.cache.keys(): assert_array_equal(enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key]) # check all attributes - attrs = ["_capacity", "n_features_in_"] - for attr in attrs: - assert getattr(enc_merged, attr) == getattr(enc, attr) + assert enc_merged._capacity == enc._capacity + assert enc_merged.n_features_in_ == enc.n_features_in_ # check feature_names_in_ assert_array_equal(enc_merged.feature_names_in_, enc.feature_names_in_) @@ -372,9 +371,7 @@ def test_split_transformers() -> None: assert enc_list[i].n_features_in_ == 1 index += transformed_X_i.shape[1] # check all attributes - attrs = ["_capacity"] - for attr in attrs: - assert getattr(enc_list[i], attr) == getattr(enc, attr) + assert enc_list[i]._capacity == enc._capacity # check hash_dict_ # TODO: do we want the hash_dict_ to be the same? 
assert enc.hash_dict_.cache.keys() == enc_list[i].hash_dict_.cache.keys() @@ -412,8 +409,7 @@ def test_split_and_merge_transformers() -> None: for key in enc.hash_dict_.cache.keys(): assert_array_equal(enc.hash_dict_.cache[key], enc_merged.hash_dict_.cache[key]) # check all attributes - attrs = ["_capacity", "n_features_in_"] - for attr in attrs: - assert getattr(enc_merged, attr) == getattr(enc, attr) + assert enc_merged._capacity == enc._capacity + assert enc_merged.n_features_in_ == enc.n_features_in_ # check feature_names_in_ assert_array_equal(enc_merged.feature_names_in_, enc.feature_names_in_) From 99b3a9b00dc7fe835fd3d4a81c87e967949b8d10 Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Sat, 26 Aug 2023 18:08:41 +0200 Subject: [PATCH 34/47] Apply suggestions from code review Co-authored-by: Vincent M --- skrub/_table_vectorizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index fb60fa36f..8c9d0182e 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -501,13 +501,13 @@ def _split_univariate_transformers(self, during_fit: bool = False): new_transformers = [] for name, trans, cols in self.transformers: if ( - (not type(trans) == str) + (not isinstance(trans, str)) and trans._get_tags().get("univariate", False) and len(cols) > 1 ): for i, col in enumerate(cols): new_transformers.append( - (name + "_split_" + str(i), clone(trans), [col]) + (f"{name}_split_{i}", clone(trans), [col]) ) else: new_transformers.append((name, trans, cols)) From 769df3a08ee929858c1a49fa2edace166001fe37 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Mon, 28 Aug 2023 12:24:58 +0200 Subject: [PATCH 35/47] _parallel_on_columns --- skrub/_table_vectorizer.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index fb60fa36f..d5b3f5db1 100644 --- 
a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -146,6 +146,19 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se return ser +def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): + """ + Assert whether we want to parallelize the transformer over + the columns or not. We only want to parallelize if the transformer + is "univariate" (i.e. it can be duplicated for each column). + """ + return ( + (not type(trans) == str) + and trans._get_tags().get("univariate", False) + and len(cols) > 1 + ) + + OptionalTransformer = ( TransformerMixin | Literal["drop", "remainder", "passthrough"] | None ) @@ -500,11 +513,7 @@ def _split_univariate_transformers(self, during_fit: bool = False): if during_fit: new_transformers = [] for name, trans, cols in self.transformers: - if ( - (not type(trans) == str) - and trans._get_tags().get("univariate", False) - and len(cols) > 1 - ): + if _parallel_on_columns(trans, cols): for i, col in enumerate(cols): new_transformers.append( (name + "_split_" + str(i), clone(trans), [col]) @@ -518,11 +527,7 @@ def _split_univariate_transformers(self, during_fit: bool = False): new_transformers_ = [] new_transformer_to_input_indices = {} for name, trans, cols in self.transformers_: - if ( - (not type(trans) == str) - and trans._get_tags().get("univariate", False) - and len(cols) > 1 - ): + if _parallel_on_columns(trans, cols): splitted_transformers_ = trans._split() for i, (col, trans, trans_to_mapping) in enumerate( zip( From 28cfb4c77bf30be6991f1cf820e168e3da764788 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 29 Aug 2023 10:33:38 +0200 Subject: [PATCH 36/47] explain transformers vs transformers_ --- skrub/_table_vectorizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index d5b3f5db1..b2aab486c 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -511,6 +511,10 @@ def 
_split_univariate_transformers(self, during_fit: bool = False): self.transformers attribute (when True). """ if during_fit: + # split self.transformers, a list of 3-tuples (name, transformer, columns) + # containing the unfitted transformers (or strings) and the columns + # to be fitted on. This attribute is used by the `ColumnTransformer` + # when calling `fit` and `fit_transform`. new_transformers = [] for name, trans, cols in self.transformers: if _parallel_on_columns(trans, cols): @@ -522,6 +526,10 @@ def _split_univariate_transformers(self, during_fit: bool = False): new_transformers.append((name, trans, cols)) self.transformers = new_transformers else: + # split self.transformers_, a list of 3-tuples (name, transformer, columns) + # containing the fitted transformers (or strings) and the columns + # they were fitted on. This attribute is used by the `ColumnTransformer` + # when calling `transform`. check_is_fitted(self, attributes=["transformers_"]) self._transformers_fitted_original = self.transformers_ new_transformers_ = [] From 9aec05853ab474edee4db9c47cdd1c22029e95f6 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 29 Aug 2023 11:04:18 +0200 Subject: [PATCH 37/47] split merge into two functions --- skrub/_table_vectorizer.py | 117 ++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 54 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 70977919b..10ed58333 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -159,6 +159,63 @@ def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): ) +def _merge_unfitted_transformers( + transformers: list[tuple[str, str | TransformerMixin, list[str]]] +): + new_transformers = [] + base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers]) + for base_name in base_names: + for name, trans, _ in transformers: + if name.startswith(base_name): + new_trans = trans if isinstance(trans, str) else clone(trans) + break 
+ new_transformers.append( + ( + base_name, + new_trans, + [ + col + for name, _, cols in transformers + for col in cols + if name.startswith(base_name) + ], + ) + ) + return new_transformers + + +def _merge_fitted_transformers( + transformers_: list[tuple[str, str | TransformerMixin, list[str]]], + _transformer_to_input_indices: dict[str, list[int]], +): + new_transformers_ = [] + new_transformer_to_input_indices = {} + base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers_]) + for base_name in base_names: + # merge all transformers with the same base name + transformers, columns, names = [], [], [] + for name, trans, cols in transformers_: + if name.startswith(base_name): + columns.extend(cols) + transformers.append(trans) + names.append(name) + if len(transformers) == 1: + new_transformers_.append((base_name, transformers[0], columns)) + new_transformer_to_input_indices[base_name] = list( + _transformer_to_input_indices[base_name] + ) + else: + # merge transformers + new_transformers_.append( + (base_name, transformers[0].__class__._merge(transformers), columns) + ) + new_transformer_to_input_indices[base_name] = list( + np.concatenate([_transformer_to_input_indices[name] for name in names]) + ) + + return new_transformers_, new_transformer_to_input_indices + + OptionalTransformer = ( TransformerMixin | Literal["drop", "remainder", "passthrough"] | None ) @@ -560,61 +617,13 @@ def _split_univariate_transformers(self, during_fit: bool = False): def _merge_univariate_transformers(self): # merge back self.transformers and self.transformers_ check_is_fitted(self, attributes=["transformers_"]) - # merge self.transformers - new_transformers = [] - base_names = pd.unique( - [name.split("_split_")[0] for name, _, _ in self.transformers] + self.transformers = _merge_unfitted_transformers(self.transformers) + ( + self.transformers_, + self._transformer_to_input_indices, + ) = _merge_fitted_transformers( + self.transformers_, 
self._transformer_to_input_indices ) - for base_name in base_names: - for name, trans, _ in self.transformers: - if name.startswith(base_name): - new_trans = trans if isinstance(trans, str) else clone(trans) - break - new_transformers.append( - ( - base_name, - new_trans, - [ - col - for name, _, cols in self.transformers - for col in cols - if name.startswith(base_name) - ], - ) - ) - self.transformers = new_transformers - # merge self.transformers_ - new_transformers_ = [] - new_transformer_to_input_indices = {} - base_names = pd.unique( - [name.split("_split_")[0] for name, _, _ in self.transformers_] - ) - for base_name in base_names: - # merge all transformers with the same base name - transformers, columns, names = [], [], [] - for name, trans, cols in self.transformers_: - if name.startswith(base_name): - columns.extend(cols) - transformers.append(trans) - names.append(name) - if len(transformers) == 1: - new_transformers_.append((base_name, transformers[0], columns)) - new_transformer_to_input_indices[base_name] = list( - self._transformer_to_input_indices[base_name] - ) - else: - # merge transformers - new_transformers_.append( - (base_name, transformers[0].__class__._merge(transformers), columns) - ) - new_transformer_to_input_indices[base_name] = list( - np.concatenate( - [self._transformer_to_input_indices[name] for name in names] - ) - ) - - self.transformers_ = new_transformers_ - self._transformer_to_input_indices = new_transformer_to_input_indices def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: """Takes a dataframe and tries to convert its columns to their best possible data type. 
From 72dd3d4147780a733f36308a704d917a68237eef Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Tue, 29 Aug 2023 11:45:03 +0200 Subject: [PATCH 38/47] add tests to check that splitting doesn't prevent resetting transformers --- skrub/tests/test_table_vectorizer.py | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 2a3123d07..2437429d1 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -915,6 +915,50 @@ def test_split_one_hot_encoder() -> None: assert len(enc_one_hot.transformers) == 3 +@pytest.mark.parametrize( + "low_card_cat_transformer", + [ + # transformers that should be split + MinHashEncoder(n_components=2), + GapEncoder(n_components=2), + ], +) +@skip_if_no_parallel +def test_modying_transformers(low_card_cat_transformer): + """Check that the splitting/merging mecanism doesn't + prevent resetting the transformers""" + + # test that modifying a transformer before refitting works + # https://github.com/skrub-data/skrub/pull/592#discussion_r1284531301 + tb = TableVectorizer(low_card_cat_transformer=low_card_cat_transformer, n_jobs=2) + X = _get_clean_dataframe() + tb.fit_transform(X) + tb.low_card_cat_transformer = "passthrough" + tb.fit_transform(X) + assert tb.low_card_cat_transformer_ == "passthrough" + assert tb.transformers[0][1] == "passthrough" + assert tb.transformers_[0][1] == "passthrough" + assert tb.transform(X).shape == (5, 6) + + # test a failed fit_transform doesn't break the following fit_transform + # https://github.com/skrub-data/skrub/pull/592#discussion_r1278591301 + tb = TableVectorizer( + low_card_cat_transformer=low_card_cat_transformer, + # to make ColumnTransformer fit_transform fail + numerical_transformer="not_applicable", + n_jobs=2, + ) + with pytest.raises(TypeError): + tb.fit_transform(X) + assert len(tb.transformers) == 5 # the transformers should have been splitted + # but not 
merged + tb.numerical_transformer = "passthrough" + tb.fit_transform(X) + assert len(tb.transformers) == 2 # the transformers should have been splitted + # and merged correctly + assert len(tb.transformers_) == 2 + + def test_table_vectorizer_remainder_cloning(): """Check that remainder is cloned when used.""" df1 = _get_clean_dataframe() From 2d5fd3a5be64f057d3aaf104f74e687cff567aa2 Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Wed, 30 Aug 2023 14:24:20 +0200 Subject: [PATCH 39/47] Apply suggestions from code review Co-authored-by: Vincent M --- CHANGES.rst | 3 +-- skrub/_table_vectorizer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 68d08403b..39905f1e3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -36,8 +36,7 @@ Major changes Minor changes ------------- -* When possible, parallelism is done at the column level rather than the transformer level in :class:`TableVectorizer`. - This is the case for transformers with the tag `univariate` (right now :class:`MinHashEncoder` and :class:`GapEncoder`). +* :class:`TableVectorizer` is now able to apply parallelism at the column level rather than the transformer level. This is the default for univariate transformers, like :class:`MinHashEncoder`, and :class:`GapEncoder`. :pr:`592` by :user:`Leo Grinsztajn ` * Parallelized the :func:`deduplicate` function. Parameter `n_jobs` diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 10ed58333..6cfdbb425 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -149,7 +149,7 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): """ Assert whether we want to parallelize the transformer over - the columns or not. We only want to parallelize if the transformer + the columns or not. 
We only want to parallelize over columns if the transformer is "univariate" (i.e. it can be duplicated for each column). """ return ( From 2eb6f5ba6d882137841ea547cbfde25a32843dd4 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 30 Aug 2023 15:41:32 +0200 Subject: [PATCH 40/47] combine merge_unfitted and merge_fitted and move _split outside of class --- skrub/_table_vectorizer.py | 204 ++++++++++++++++++++++--------------- 1 file changed, 122 insertions(+), 82 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 6cfdbb425..87a0f7b1a 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -6,6 +6,7 @@ import re import warnings +from itertools import chain from typing import Literal from warnings import warn @@ -159,61 +160,121 @@ def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): ) -def _merge_unfitted_transformers( - transformers: list[tuple[str, str | TransformerMixin, list[str]]] +def _split_transformers( + transformers: list[tuple[str, str | TransformerMixin, list[str]]], + transformers_to_input_indices: dict[str, list[int]] | None = None, + during_fit: bool = False, ): + """ + Split univariate transformers into multiple transformers, one for each + column. This is useful to use the inherited `ColumnTransformer` class + parallelism. + + Parameters + ---------- + transformers : list of 3-tuples (str, Transformer or str, list of str) + The collection of transformers to split, as tuples of + (name, transformer, column). + transformers_to_input_indices : dict of str to list of int, optional + The mapping of transformer names to the indices of the columns they were + fitted on. Should correspond to the `self._transformer_to_input_indices` attribute. + Only used when `during_fit` is False. + during_fit : bool, default=False + Whether the method is called during `fit_transform` (True) or + during `transform` (False). 
This is used to determine if the + transformers in `transformers` are fitted or not, i.e. whether + the `transformers` argument corresponds to the `self.transformers_` attribute + (when False) or the `self.transformers` attribute (when True). + """ new_transformers = [] - base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers]) - for base_name in base_names: - for name, trans, _ in transformers: - if name.startswith(base_name): - new_trans = trans if isinstance(trans, str) else clone(trans) - break - new_transformers.append( - ( - base_name, - new_trans, - [ - col - for name, _, cols in transformers - for col in cols - if name.startswith(base_name) - ], - ) - ) - return new_transformers + new_transformer_to_input_indices = {} + if during_fit: + # split a list of 3-tuples (name, transformer, columns) + # containing the unfitted transformers (or strings) and the columns + # to be fitted on. + for name, trans, cols in transformers: + if _parallel_on_columns(trans, cols): + for i, col in enumerate(cols): + new_transformers.append((f"{name}_split_{i}", clone(trans), [col])) + else: + new_transformers.append((name, trans, cols)) + else: + # split a list of 3-tuples (name, transformer, columns) + # containing the fitted transformers (or strings) and the columns + # they were fitted on. 
+ for name, trans, cols in transformers: + if _parallel_on_columns(trans, cols): + splitted_transformers_ = trans._split() + for i, (col, trans, trans_to_mapping) in enumerate( + zip( + cols, + splitted_transformers_, + transformers_to_input_indices[name], + ) + ): + name_split = f"{name}_split_{i}" + new_transformers.append((name_split, trans, [col])) + new_transformer_to_input_indices[name_split] = [trans_to_mapping] + else: + new_transformers.append((name, trans, cols)) + new_transformer_to_input_indices[name] = transformers_to_input_indices[ + name + ] + return new_transformers, new_transformer_to_input_indices -def _merge_fitted_transformers( - transformers_: list[tuple[str, str | TransformerMixin, list[str]]], - _transformer_to_input_indices: dict[str, list[int]], -): - new_transformers_ = [] + +def _merge_transformers( + transformers: list[tuple[str, str | TransformerMixin, list[str]]], + is_fitted: bool, + transformer_to_input_indices: dict[str, list[int]] | None = None, +) -> tuple[list[TransformerMixin], dict[str, list[int]] | None]: + """ + Merge splitted transformers into a single transformer. + + Parameters + ---------- + transformers : list of 3-tuples (str, Transformer or str, list of str) + The collection of transformers to merge, as tuples of + (name, transformer, column). + is_fitted : bool + Whether the transformers are fitted or not, i.e. whether the + `transformers` argument corresponds to the `self.transformers_` attribute + (when True) or the `self.transformers` attribute (when False). + transformer_to_input_indices : dict of str to list of int, optional + The mapping of transformer names to the indices of the columns they were + fitted on. Should correspond to the `self._transformer_to_input_indices` attribute. + Only used when `is_fitted` is True. 
+ """ + new_transformers = [] new_transformer_to_input_indices = {} - base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers_]) + base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers]) + for base_name in base_names: # merge all transformers with the same base name - transformers, columns, names = [], [], [] - for name, trans, cols in transformers_: + transformers_base_name, names, columns = [], [], [] + for name, trans, cols in transformers: if name.startswith(base_name): columns.extend(cols) - transformers.append(trans) + transformers_base_name.append(trans) names.append(name) - if len(transformers) == 1: - new_transformers_.append((base_name, transformers[0], columns)) - new_transformer_to_input_indices[base_name] = list( - _transformer_to_input_indices[base_name] - ) + + new_trans = transformers_base_name[0] + if not is_fitted: + if isinstance(new_trans, TransformerMixin): + new_trans = clone(new_trans) else: - # merge transformers - new_transformers_.append( - (base_name, transformers[0].__class__._merge(transformers), columns) - ) + if len(transformers_base_name) > 1: + # merge transformers + new_trans = new_trans.__class__._merge(transformers_base_name) new_transformer_to_input_indices[base_name] = list( - np.concatenate([_transformer_to_input_indices[name] for name in names]) + chain.from_iterable( + [transformer_to_input_indices[name] for name in names] + ) ) + new_transformers.append((base_name, new_trans, columns)) - return new_transformers_, new_transformer_to_input_indices + return new_transformers, new_transformer_to_input_indices OptionalTransformer = ( @@ -572,57 +633,36 @@ def _split_univariate_transformers(self, during_fit: bool = False): # containing the unfitted transformers (or strings) and the columns # to be fitted on. This attribute is used by the `ColumnTransformer` # when calling `fit` and `fit_transform`. 
- new_transformers = [] - for name, trans, cols in self.transformers: - if _parallel_on_columns(trans, cols): - for i, col in enumerate(cols): - new_transformers.append( - (f"{name}_split_{i}", clone(trans), [col]) - ) - else: - new_transformers.append((name, trans, cols)) - self.transformers = new_transformers + self.transformers, _ = _split_transformers( + self.transformers, during_fit=True + ) else: # split self.transformers_, a list of 3-tuples (name, transformer, columns) # containing the fitted transformers (or strings) and the columns # they were fitted on. This attribute is used by the `ColumnTransformer` # when calling `transform`. check_is_fitted(self, attributes=["transformers_"]) - self._transformers_fitted_original = self.transformers_ - new_transformers_ = [] - new_transformer_to_input_indices = {} - for name, trans, cols in self.transformers_: - if _parallel_on_columns(trans, cols): - splitted_transformers_ = trans._split() - for i, (col, trans, trans_to_mapping) in enumerate( - zip( - cols, - splitted_transformers_, - self._transformer_to_input_indices[name], - ) - ): - name_split = f"{name}_split_{i}" - new_transformers_.append((name_split, trans, [col])) - new_transformer_to_input_indices[name_split] = [ - trans_to_mapping - ] - else: - new_transformers_.append((name, trans, cols)) - new_transformer_to_input_indices[ - name - ] = self._transformer_to_input_indices[name] - self.transformers_ = new_transformers_ - self._transformer_to_input_indices = new_transformer_to_input_indices + ( + self.transformers_, + self._transformer_to_input_indices, + ) = _split_transformers( + self.transformers_, + during_fit=False, + transformers_to_input_indices=self._transformer_to_input_indices, + ) def _merge_univariate_transformers(self): - # merge back self.transformers and self.transformers_ + """ + Merge splitted transformers into a single transformer. + To be used after `_split_univariate_transformers`. 
+ """ + # merge self.transformers and self.transformers_ check_is_fitted(self, attributes=["transformers_"]) - self.transformers = _merge_unfitted_transformers(self.transformers) - ( + self.transformers, _ = _merge_transformers(self.transformers, is_fitted=False) + self.transformers_, self._transformer_to_input_indices = _merge_transformers( self.transformers_, - self._transformer_to_input_indices, - ) = _merge_fitted_transformers( - self.transformers_, self._transformer_to_input_indices + is_fitted=True, + transformer_to_input_indices=self._transformer_to_input_indices, ) def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: From 7f7b5249615ccea619e5ded8c58f7131ff1c08c1 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 30 Aug 2023 15:45:34 +0200 Subject: [PATCH 41/47] don't return empty new_transformer_to_input_indices --- skrub/_table_vectorizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 87a0f7b1a..e1b343b33 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -187,7 +187,9 @@ def _split_transformers( (when False) or the `self.transformers` attribute (when True). """ new_transformers = [] - new_transformer_to_input_indices = {} + new_transformer_to_input_indices = ( + {} if not during_fit else transformers_to_input_indices + ) if during_fit: # split a list of 3-tuples (name, transformer, columns) # containing the unfitted transformers (or strings) and the columns @@ -247,7 +249,7 @@ def _merge_transformers( Only used when `is_fitted` is True. 
""" new_transformers = [] - new_transformer_to_input_indices = {} + new_transformer_to_input_indices = {} if is_fitted else transformer_to_input_indices base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers]) for base_name in base_names: From ebcca2080dccbe2b79677d3a09d794df2513ee45 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Wed, 30 Aug 2023 15:56:49 +0200 Subject: [PATCH 42/47] remove future warning --- skrub/_table_vectorizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index e1b343b33..c7db0ac79 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -250,7 +250,9 @@ def _merge_transformers( """ new_transformers = [] new_transformer_to_input_indices = {} if is_fitted else transformer_to_input_indices - base_names = pd.unique([name.split("_split_")[0] for name, _, _ in transformers]) + base_names = pd.unique( + pd.Series([name.split("_split_")[0] for name, _, _ in transformers]) + ) for base_name in base_names: # merge all transformers with the same base name From 25d854b12bfe5bdabab6f1d2e8f5da7d0637f4a0 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 1 Sep 2023 13:28:06 +0200 Subject: [PATCH 43/47] add docstrings --- skrub/_gap_encoder.py | 14 +++++++++++--- skrub/_minhash_encoder.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index f31b6e449..2c78b98fc 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -742,9 +742,11 @@ class GapEncoder(TransformerMixin, BaseEstimator): @classmethod def _merge(cls, transformers_list: list[GapEncoder]): - # merge GapEncoder fitted on different columns - # into a single GapEncoder - # useful for parallelization in the TableVectorizer + """ + Merge GapEncoders fitted on different columns + into a single GapEncoder. This is useful for parallelization + over columns in the TableVectorizer. 
+ """ full_transformer = clone(transformers_list[0]) # assert rho_ is the same for all transformers rho_ = transformers_list[0].rho_ @@ -759,6 +761,12 @@ def _merge(cls, transformers_list: list[GapEncoder]): return full_transformer def _split(self): + """ + Split a GapEncoder fitted on multiple columns + into a list of GapEncoders fitted on one column each. + This is useful for parallelizing transform over columns + in the TableVectorizer. + """ check_is_fitted(self) transformers_list = [] for i, model in enumerate(self.fitted_models_): diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py index 9ba9e9f84..1c5b9d7be 100644 --- a/skrub/_minhash_encoder.py +++ b/skrub/_minhash_encoder.py @@ -123,9 +123,11 @@ class MinHashEncoder(TransformerMixin, BaseEstimator): @classmethod def _merge(cls, transformers_list: list[MinHashEncoder]): - # merge MinHashEncoder fitted on different columns - # into a single MinHashEncoder - # useful for parallelization in the TableVectorizer + """ + Merge MinHashEncoders fitted on different columns + into a single MinHashEncoder. This is useful for parallelization + over columns in the TableVectorizer. + """ full_transformer = clone(transformers_list[0]) capacity = transformers_list[0]._capacity full_transformer.hash_dict_ = combine_lru_dicts( @@ -140,6 +142,12 @@ def _merge(cls, transformers_list: list[MinHashEncoder]): return full_transformer def _split(self): + """ + Split a MinHashEncoder fitted on multiple columns + into a list of MinHashEncoders (one for each column). + This is useful for parallelizing transform over columns + in the TableVectorizer. 
+ """ check_is_fitted(self) transformer_list = [] for i in range(self.n_features_in_): From 0d381d01235eb5995833db60f32ef71e130e9b1d Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 1 Sep 2023 13:37:04 +0200 Subject: [PATCH 44/47] fix merge --- skrub/_table_vectorizer.py | 143 +++---------------------------------- 1 file changed, 8 insertions(+), 135 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index abf6c550f..3f0bef932 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -282,140 +282,6 @@ def _merge_transformers( return new_transformers, new_transformer_to_input_indices -def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): - """ - Assert whether we want to parallelize the transformer over - the columns or not. We only want to parallelize over columns if the transformer - is "univariate" (i.e. it can be duplicated for each column). - """ - return ( - (not isinstance(trans, str)) - and trans._get_tags().get("univariate", False) - and len(cols) > 1 - ) - - -def _split_transformers( - transformers: list[tuple[str, str | TransformerMixin, list[str]]], - transformers_to_input_indices: dict[str, list[int]] | None = None, - during_fit: bool = False, -): - """ - Split univariate transformers into multiple transformers, one for each - column. This is useful to use the inherited `ColumnTransformer` class - parallelism. - - Parameters - ---------- - transformers : list of 3-tuples (str, Transformer or str, list of str) - The collection of transformers to split, as tuples of - (name, transformer, column). - transformers_to_input_indices : dict of str to list of int, optional - The mapping of transformer names to the indices of the columns they were - fitted on. Should correspond to the `self._transformer_to_input_indices` attribute. - Only used when `during_fit` is False. - during_fit : bool, default=False - Whether the method is called during `fit_transform` (True) or - during `transform` (False). 
This is used to determine if the - transformers in `transformers` are fitted or not, i.e. whether - the `transformers` argument corresponds to the `self.transformers_` attribute - (when False) or the `self.transformers` attribute (when True). - """ - new_transformers = [] - new_transformer_to_input_indices = ( - {} if not during_fit else transformers_to_input_indices - ) - if during_fit: - # split a list of 3-tuples (name, transformer, columns) - # containing the unfitted transformers (or strings) and the columns - # to be fitted on. - for name, trans, cols in transformers: - if _parallel_on_columns(trans, cols): - for i, col in enumerate(cols): - new_transformers.append((f"{name}_split_{i}", clone(trans), [col])) - else: - new_transformers.append((name, trans, cols)) - else: - # split a list of 3-tuples (name, transformer, columns) - # containing the fitted transformers (or strings) and the columns - # they were fitted on. - for name, trans, cols in transformers: - if _parallel_on_columns(trans, cols): - splitted_transformers_ = trans._split() - for i, (col, trans, trans_to_mapping) in enumerate( - zip( - cols, - splitted_transformers_, - transformers_to_input_indices[name], - ) - ): - name_split = f"{name}_split_{i}" - new_transformers.append((name_split, trans, [col])) - new_transformer_to_input_indices[name_split] = [trans_to_mapping] - else: - new_transformers.append((name, trans, cols)) - new_transformer_to_input_indices[name] = transformers_to_input_indices[ - name - ] - - return new_transformers, new_transformer_to_input_indices - - -def _merge_transformers( - transformers: list[tuple[str, str | TransformerMixin, list[str]]], - is_fitted: bool, - transformer_to_input_indices: dict[str, list[int]] | None = None, -) -> tuple[list[TransformerMixin], dict[str, list[int]] | None]: - """ - Merge splitted transformers into a single transformer. 
- - Parameters - ---------- - transformers : list of 3-tuples (str, Transformer or str, list of str) - The collection of transformers to merge, as tuples of - (name, transformer, column). - is_fitted : bool - Whether the transformers are fitted or not, i.e. whether the - `transformers` argument corresponds to the `self.transformers_` attribute - (when True) or the `self.transformers` attribute (when False). - transformer_to_input_indices : dict of str to list of int, optional - The mapping of transformer names to the indices of the columns they were - fitted on. Should correspond to the `self._transformer_to_input_indices` attribute. - Only used when `is_fitted` is True. - """ - new_transformers = [] - new_transformer_to_input_indices = {} if is_fitted else transformer_to_input_indices - base_names = pd.unique( - pd.Series([name.split("_split_")[0] for name, _, _ in transformers]) - ) - - for base_name in base_names: - # merge all transformers with the same base name - transformers_base_name, names, columns = [], [], [] - for name, trans, cols in transformers: - if name.startswith(base_name): - columns.extend(cols) - transformers_base_name.append(trans) - names.append(name) - - new_trans = transformers_base_name[0] - if not is_fitted: - if isinstance(new_trans, TransformerMixin): - new_trans = clone(new_trans) - else: - if len(transformers_base_name) > 1: - # merge transformers - new_trans = new_trans.__class__._merge(transformers_base_name) - new_transformer_to_input_indices[base_name] = list( - chain.from_iterable( - [transformer_to_input_indices[name] for name in names] - ) - ) - new_transformers.append((base_name, new_trans, columns)) - - return new_transformers, new_transformer_to_input_indices - - Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"] @@ -699,7 +565,14 @@ def _more_tags(self) -> dict: }, } - def _clone_transformers(self): + @property + def is_parallelized(self) -> bool: + """ + Returns True if the transformers are 
parallelized over columns, False otherwise. + """ + return self.n_jobs not in (None, 1) + + def _clone_transformers(self) -> None: """ For each of the different transformers that can be passed, create the corresponding variable name with a trailing underscore, From 1cef43d6b74eda990349fea7db5fb051f99fe19b Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 1 Sep 2023 14:05:09 +0200 Subject: [PATCH 45/47] fix test --- skrub/tests/test_table_vectorizer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index b733c3b2a..e978f3242 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -855,6 +855,8 @@ def test_changing_types_int_float() -> None: def test_column_by_column() -> None: + # Test that the TableVectorizer gives the same result + # when applied column by column X = _get_clean_dataframe() table_vec_all_cols = TableVectorizer( high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), @@ -867,11 +869,21 @@ def test_column_by_column() -> None: cardinality_threshold=4, ) table_vec_one_col.fit(X[[col]]) - assert table_vec_one_col.get_feature_names_out() == [ + features_from_col = [ feat for feat in table_vec_all_cols.get_feature_names_out() if feat.startswith(col) ] + assert table_vec_one_col.get_feature_names_out() == features_from_col + indices_features_from_col = [ + table_vec_all_cols.get_feature_names_out().index(feat) + for feat in features_from_col + ] + excepted_result = table_vec_all_cols.transform(X)[:, indices_features_from_col] + assert np.allclose( + table_vec_one_col.transform(X[[col]]), + excepted_result, + ) @skip_if_no_parallel From eabdb8b64e1e2e25b6780893c2fdb9532602f239 Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Fri, 1 Sep 2023 15:26:20 +0200 Subject: [PATCH 46/47] Update skrub/_table_vectorizer.py Co-authored-by: Vincent M --- 
skrub/_table_vectorizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 3f0bef932..9ae0f6d98 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -148,7 +148,7 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se return ser -def _parallel_on_columns(trans: TransformerMixin, cols: list[str]): +def _parallel_on_columns(trans: TransformerMixin, cols: list[str]) -> bool: """ Assert whether we want to parallelize the transformer over the columns or not. We only want to parallelize over columns if the transformer From 134a4132c4d11b02ec08923178fccb100215dc36 Mon Sep 17 00:00:00 2001 From: LeoGrin Date: Fri, 1 Sep 2023 15:32:13 +0200 Subject: [PATCH 47/47] fix type hints --- skrub/_table_vectorizer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 9ae0f6d98..751a49272 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -148,7 +148,10 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se return ser -def _parallel_on_columns(trans: TransformerMixin, cols: list[str]) -> bool: +Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"] + + +def _parallel_on_columns(trans: Transformer, cols: list[str]) -> bool: """ Assert whether we want to parallelize the transformer over the columns or not. 
We only want to parallelize over columns if the transformer @@ -162,10 +165,10 @@ def _parallel_on_columns(trans: TransformerMixin, cols: list[str]) -> bool: def _split_transformers( - transformers: list[tuple[str, str | TransformerMixin, list[str]]], + transformers: list[tuple[str, Transformer, list[str]]], transformers_to_input_indices: dict[str, list[int]] | None = None, during_fit: bool = False, -): +) -> tuple[list[tuple[str, Transformer, list[str]]], dict[str, list[int]]]: """ Split univariate transformers into multiple transformers, one for each column. This is useful to use the inherited `ColumnTransformer` class @@ -228,10 +231,10 @@ def _split_transformers( def _merge_transformers( - transformers: list[tuple[str, str | TransformerMixin, list[str]]], + transformers: list[tuple[str, Transformer, list[str]]], is_fitted: bool, transformer_to_input_indices: dict[str, list[int]] | None = None, -) -> tuple[list[TransformerMixin], dict[str, list[int]] | None]: +) -> tuple[list[tuple[str, Transformer, list[str]]], dict[str, list[int]]]: """ Merge splitted transformers into a single transformer. @@ -282,9 +285,6 @@ def _merge_transformers( return new_transformers, new_transformer_to_input_indices -Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"] - - class TableVectorizer(ColumnTransformer): """Automatically transform a heterogeneous dataframe to a numerical array. @@ -494,7 +494,7 @@ class TableVectorizer(ColumnTransformer): ] """ - transformers_: list[tuple[str, str | TransformerMixin, list[str]]] + transformers_: list[tuple[str, Transformer, list[str]]] columns_: pd.Index types_: dict[str, type] imputed_columns_: list[str]