Skip to content

Commit

Permalink
add get_feature_names_out to MinHashEncoder (#616)
Browse files Browse the repository at this point in the history
* add get_feature_names_out to MinHashEncoder and add a test for determinism

* changelog + fix doc

* changelog

* Apply suggestions from code review

replace ` by ``

Co-authored-by: Vincent M <[email protected]>

* fix Vincent's comments

* Update skrub/tests/test_minhash_encoder.py

Co-authored-by: Jovan Stojanovic <[email protected]>

---------

Co-authored-by: Vincent M <[email protected]>
Co-authored-by: Jovan Stojanovic <[email protected]>
  • Loading branch information
3 people authored Jun 26, 2023
1 parent 3a83501 commit a7e0e41
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 8 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ Minor changes
and types.
:pr:`601` by :user:`Jovan Stojanovic <jovan-stojanovic>`

* Add ``get_feature_names_out`` method to :class:`MinHashEncoder`.
:pr:`616` by :user:`Leo Grinsztajn <LeoGrin>`

Before skrub: dirty_cat
========================

Expand Down
48 changes: 47 additions & 1 deletion skrub/_minhash_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from joblib import Parallel, delayed, effective_n_jobs
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import gen_even_slices, murmurhash3_32
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted

from ._fast_hash import ngram_min_hash
from ._string_distances import get_unique_ngrams
Expand Down Expand Up @@ -71,6 +71,10 @@ class MinHashEncoder(BaseEstimator, TransformerMixin):
----------
hash_dict_ : LRUDict
Computed hashes.
n_features_in_ : int
Number of features seen during :term:`fit`.
feature_names_in_ : ndarray of shape (n_features_in,)
Names of features seen during :term:`fit`.
See Also
--------
Expand Down Expand Up @@ -244,6 +248,10 @@ def fit(self, X, y=None) -> "MinHashEncoder":
:obj:`MinHashEncoder`
The fitted :class:`MinHashEncoder` instance (self).
"""
self._check_n_features(X, reset=True)
self._check_feature_names(X, reset=True)
X = check_input(X)

if self.hashing not in ["fast", "murmur"]:
raise ValueError(
f"Got hashing={self.hashing!r}, "
Expand Down Expand Up @@ -272,6 +280,8 @@ def transform(self, X) -> np.ndarray:
Transformed input.
"""
check_is_fitted(self, "hash_dict_")
self._check_n_features(X, reset=False)
self._check_feature_names(X, reset=False)
X = check_input(X)
if self.minmax_hash:
if self.n_components % 2 != 0:
Expand Down Expand Up @@ -337,3 +347,39 @@ def transform(self, X) -> np.ndarray:
)

return X_out.astype(np.float64) # The output is an int32 before conversion

def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Every input feature produces ``n_components`` output names, built by
    suffixing the input feature name with the component index. The output
    feature names look like:
    ``["x0_0", "x0_1", ..., "x0_(n_components - 1)",
    "x1_0", ..., "x1_(n_components - 1)", ...,
    "x(n_features_in_ - 1)_(n_components - 1)"]``

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If ``input_features`` is ``None``, then ``feature_names_in_`` is
          used as feature names in. If ``feature_names_in_`` is not defined,
          then the following input feature names are generated:
          ``["x0", "x1", ..., "x(n_features_in_ - 1)"]``.
        - If ``input_features`` is an array-like, then ``input_features`` must
          match ``feature_names_in_`` if ``feature_names_in_`` is defined.

    Returns
    -------
    feature_names_out : list of str
        Transformed feature names, ordered by input feature first and by
        component index second.
    """
    check_is_fitted(self)
    # Resolve the input names (validated against / defaulted from the
    # attributes recorded during fit) before deriving the output names.
    input_features = _check_feature_names_in(self, input_features)

    return [
        f"{feature}_{i}"
        for feature in input_features
        for i in range(self.n_components)
    ]
47 changes: 40 additions & 7 deletions skrub/tests/test_minhash_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_array_equal
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
from sklearn.utils._testing import skip_if_no_parallel

from skrub import MinHashEncoder

Expand Down Expand Up @@ -34,7 +35,7 @@ def test_minhash_encoder(hashing, minmax_hash) -> None:
encoder2 = MinHashEncoder(n_components=2, hashing=hashing)
encoder2.fit(X)
y2 = encoder2.transform(X)
np.testing.assert_array_equal(y, y2)
assert_array_equal(y, y2)

# Test min property
if not minmax_hash:
Expand Down Expand Up @@ -66,9 +67,7 @@ def test_multiple_columns() -> None:
fit1 = MinHashEncoder(n_components=30).fit_transform(X1)
fit2 = MinHashEncoder(n_components=30).fit_transform(X2)
fit = MinHashEncoder(n_components=30).fit_transform(X)
assert np.array_equal(
np.array([fit[:, :30], fit[:, 30:60]]), np.array([fit1, fit2])
)
assert_array_equal(np.array([fit[:, :30], fit[:, 30:60]]), np.array([fit1, fit2]))


def test_input_type() -> None:
Expand Down Expand Up @@ -146,11 +145,11 @@ def test_missing_values_none() -> None:

enc = MinHashEncoder()
d = enc.fit_transform(a)
np.testing.assert_array_equal(d[2], 0)
assert_array_equal(d[2], 0)

e = np.array([["a", "b", "", "c"]], dtype=object).T
f = enc.fit_transform(e)
np.testing.assert_array_equal(f[2], 0)
assert_array_equal(f[2], 0)


def test_cache_overflow() -> None:
Expand Down Expand Up @@ -261,3 +260,37 @@ def test_check_fitted_minhash_encoder() -> None:
# Check that it works after fitting
encoder.fit(X)
encoder.transform(X)


def test_deterministic():
    """Check that two separately built encoders give identical encodings."""
    # TODO: add random state to encoder
    X = np.array(list("abcdefgh")).reshape(-1, 1)
    first, second = (
        MinHashEncoder(n_components=4).fit_transform(X) for _ in range(2)
    )
    assert_array_equal(first, second)


def test_get_feature_names_out():
    """Check the names produced by ``get_feature_names_out`` after fitting."""
    letters = ["a", "b", "c", "d", "e", "f", "g", "h"]

    # DataFrame input: the column names are used as prefixes.
    frame = pd.DataFrame({"col1": list(letters), "col2": list(letters)})
    df_encoder = MinHashEncoder(n_components=4).fit(frame)
    assert_array_equal(
        np.array(df_encoder.get_feature_names_out()),
        np.array(
            [
                "col1_0",
                "col1_1",
                "col1_2",
                "col1_3",
                "col2_0",
                "col2_1",
                "col2_2",
                "col2_3",
            ]
        ),
    )

    # Array input without column names: the expected prefix is "x0".
    column = np.array(letters)[:, None]
    array_encoder = MinHashEncoder(n_components=4).fit(column)
    assert_array_equal(
        np.array(array_encoder.get_feature_names_out()),
        np.array(["x0_0", "x0_1", "x0_2", "x0_3"]),
    )

0 comments on commit a7e0e41

Please sign in to comment.