add get_feature_names_out to MinHashEncoder (#616)

* add get_feature_names_out to MinHashEncoder and add a test for determinism
* changelog + fix doc
* changelog
* Apply suggestions from code review: replace ` by ``

  Co-authored-by: Vincent M <[email protected]>
* fix Vincent's comments
* Update skrub/tests/test_minhash_encoder.py

  Co-authored-by: Jovan Stojanovic <[email protected]>

---------

Co-authored-by: Vincent M <[email protected]>
Co-authored-by: Jovan Stojanovic <[email protected]>
skrub-data · Jun 26, 2023 · a7e0e41 · a7e0e41
1 parent 3a83501
commit a7e0e41
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 8 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -65,6 +65,9 @@ Minor changes
   and types.
   :pr:`601` by :user:`Jovan Stojanovic <jovan-stojanovic>`
 
+* Add ``get_feature_names_out`` method to :class:`MinHashEncoder`.
+  :pr:`616` by :user:`Leo Grinsztajn <LeoGrin>`
+
 Before skrub: dirty_cat
 ========================
 

diff --git a/skrub/_minhash_encoder.py b/skrub/_minhash_encoder.py
@@ -10,7 +10,7 @@
 from joblib import Parallel, delayed, effective_n_jobs
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import gen_even_slices, murmurhash3_32
-from sklearn.utils.validation import check_is_fitted
+from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
 from ._fast_hash import ngram_min_hash
 from ._string_distances import get_unique_ngrams
@@ -71,6 +71,10 @@ class MinHashEncoder(BaseEstimator, TransformerMixin):
     ----------
     hash_dict_ : LRUDict
         Computed hashes.
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+    feature_names_in_ : ndarray of shape (n_features_in,)
+        Names of features seen during :term:`fit`.
 
     See Also
     --------
@@ -244,6 +248,10 @@ def fit(self, X, y=None) -> "MinHashEncoder":
         :obj:`MinHashEncoder`
             The fitted :class:`MinHashEncoder` instance (self).
         """
+        self._check_n_features(X, reset=True)
+        self._check_feature_names(X, reset=True)
+        X = check_input(X)
+
         if self.hashing not in ["fast", "murmur"]:
             raise ValueError(
                 f"Got hashing={self.hashing!r}, "
@@ -272,6 +280,8 @@ def transform(self, X) -> np.ndarray:
             Transformed input.
         """
         check_is_fitted(self, "hash_dict_")
+        self._check_n_features(X, reset=False)
+        self._check_feature_names(X, reset=False)
         X = check_input(X)
         if self.minmax_hash:
             if self.n_components % 2 != 0:
@@ -337,3 +347,39 @@ def transform(self, X) -> np.ndarray:
         )
 
         return X_out.astype(np.float64)  # The output is an int32 before conversion
+
+    def get_feature_names_out(self, input_features=None):
+        """Get output feature names for transformation.
+
+        The output feature names look like:
+        ``["x0_0", "x0_1", ..., "x0_(n_components - 1)",
+        "x1_0", ..., "x1_(n_components - 1)", ...,
+        "x(n_features_in_ - 1)_(n_components - 1)"]``
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features.
+
+            - If ``input_features`` is ``None``, then ``feature_names_in_`` is
+              used as feature names in. If ``feature_names_in_`` is not defined,
+              then the following input feature names are generated:
+              ``["x0", "x1", ..., "x(n_features_in_ - 1)"]``.
+            - If ``input_features`` is an array-like, then ``input_features`` must
+              match ``feature_names_in_`` if ``feature_names_in_`` is defined.
+
+        Returns
+        -------
+        feature_names_out : list of str
+            Transformed feature names: ``n_components`` names per input feature.
+        """
+
+        check_is_fitted(self)
+        input_features = _check_feature_names_in(self, input_features)
+
+        feature_names = []
+        for feature in input_features:
+            for i in range(self.n_components):
+                feature_names.append(f"{feature}_{i}")
+
+        return feature_names
diff --git a/skrub/tests/test_minhash_encoder.py b/skrub/tests/test_minhash_encoder.py
@@ -5,8 +5,9 @@
 import numpy as np
 import pandas as pd
 import pytest
+from numpy.testing import assert_array_equal
 from sklearn.exceptions import NotFittedError
-from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
+from sklearn.utils._testing import skip_if_no_parallel
 
 from skrub import MinHashEncoder
 
@@ -34,7 +35,7 @@ def test_minhash_encoder(hashing, minmax_hash) -> None:
     encoder2 = MinHashEncoder(n_components=2, hashing=hashing)
     encoder2.fit(X)
     y2 = encoder2.transform(X)
-    np.testing.assert_array_equal(y, y2)
+    assert_array_equal(y, y2)
 
     # Test min property
     if not minmax_hash:
@@ -66,9 +67,7 @@ def test_multiple_columns() -> None:
     fit1 = MinHashEncoder(n_components=30).fit_transform(X1)
     fit2 = MinHashEncoder(n_components=30).fit_transform(X2)
     fit = MinHashEncoder(n_components=30).fit_transform(X)
-    assert np.array_equal(
-        np.array([fit[:, :30], fit[:, 30:60]]), np.array([fit1, fit2])
-    )
+    assert_array_equal(np.array([fit[:, :30], fit[:, 30:60]]), np.array([fit1, fit2]))
 
 
 def test_input_type() -> None:
@@ -146,11 +145,11 @@ def test_missing_values_none() -> None:
 
     enc = MinHashEncoder()
     d = enc.fit_transform(a)
-    np.testing.assert_array_equal(d[2], 0)
+    assert_array_equal(d[2], 0)
 
     e = np.array([["a", "b", "", "c"]], dtype=object).T
     f = enc.fit_transform(e)
-    np.testing.assert_array_equal(f[2], 0)
+    assert_array_equal(f[2], 0)
 
 
 def test_cache_overflow() -> None:
@@ -261,3 +260,37 @@ def test_check_fitted_minhash_encoder() -> None:
     # Check that it works after fitting
     encoder.fit(X)
     encoder.transform(X)
+
+
+def test_deterministic():
+    """Test that two identically configured encoders give identical output"""
+    # TODO: add random state to encoder
+    encoder1 = MinHashEncoder(n_components=4)
+    encoder2 = MinHashEncoder(n_components=4)
+    X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None]
+    encoded1 = encoder1.fit_transform(X)
+    encoded2 = encoder2.fit_transform(X)
+    assert_array_equal(encoded1, encoded2)
+
+
+def test_get_feature_names_out():
+    """Test that get_feature_names_out returns the correct feature names"""
+    encoder = MinHashEncoder(n_components=4)
+    X = pd.DataFrame(
+        {
+            "col1": ["a", "b", "c", "d", "e", "f", "g", "h"],
+            "col2": ["a", "b", "c", "d", "e", "f", "g", "h"],
+        }
+    )
+    encoder.fit(X)
+    expected_columns = np.array(
+        ["col1_0", "col1_1", "col1_2", "col1_3", "col2_0", "col2_1", "col2_2", "col2_3"]
+    )
+    assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns)
+
+    # Test that default names ("x0", ...) are used for input without column names
+    encoder = MinHashEncoder(n_components=4)
+    X = np.array(["a", "b", "c", "d", "e", "f", "g", "h"])[:, None]
+    encoder.fit(X)
+    expected_columns = np.array(["x0_0", "x0_1", "x0_2", "x0_3"])
+    assert_array_equal(np.array(encoder.get_feature_names_out()), expected_columns)