Reference implementation for TfIdfVectorizer supports string inputs (onnx#5650)

### Description
The current reference implementation of TfIdfVectorizer does not support string
inputs. This change implements that supported behaviour and adds unit tests.

---------

Signed-off-by: Xavier Dupre <[email protected]>
xadupre authored Oct 18, 2023
1 parent 59995e5 commit 42b0662
Showing 2 changed files with 86 additions and 5 deletions.
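For orientation, here is a minimal sketch (not part of the commit) of how the attributes used by the new string unit test can be read, assuming the standard TfIdfVectorizer attribute semantics: `ngram_counts` marks where each n-gram length starts in the pool, `ngram_indexes` assigns an output column to each pooled n-gram, and with `mode="TF"` and gram lengths fixed to 2 only contiguous bigrams are counted. The decoding loop is illustrative only.

```python
# Illustrative sketch only - reproduces by hand the second output row of the
# string unit test added in this commit.
pool_strings = ["i2", "i3", "i5", "i4", "i5", "i6", "i7", "i8", "i6", "i7"]
ngram_counts = [0, 4]                   # pool[0:4] are unigrams, pool[4:] are bigrams
ngram_indexes = [0, 1, 2, 3, 4, 5, 6]   # output column of each pooled n-gram, in pool order

# Group the bigram part of the pool into pairs: [("i5","i6"), ("i7","i8"), ("i6","i7")]
bigrams = list(zip(pool_strings[4::2], pool_strings[5::2]))

row = ["i8", "i6", "i7", "i5", "i6", "i8"]      # second input row of the test
counts = [0.0] * (max(ngram_indexes) + 1)
for a, b in zip(row, row[1:]):                  # contiguous bigrams (max_skip_count=0)
    if (a, b) in bigrams:
        counts[ngram_indexes[4 + bigrams.index((a, b))]] += 1.0

print(counts)  # [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0] - matches the expected output row
```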
11 changes: 6 additions & 5 deletions onnx/reference/ops/op_tfidf_vectorizer.py
@@ -17,8 +17,8 @@ def __init__(self):
         self.added_keys = []
 
     def emplace(self, key, value):
-        if not isinstance(key, int):
-            raise TypeError(f"key must be a NGramPart not {type(key)}.")
+        if not isinstance(key, (int, str)):
+            raise TypeError(f"key must be a int or str not {type(key)}.")
         if not isinstance(value, NgramPart):
             raise TypeError(f"value must be a NGramPart not {type(value)}.")
         if key not in self:
@@ -147,11 +147,12 @@ def __init__(self, onnx_node, run_params):  # type: ignore
         self.output_size_ = max(self.ngram_indexes_) + 1
         self.weights_ = self.weights  # type: ignore
         self.pool_int64s_ = self.pool_int64s  # type: ignore
+        self.pool_strings_ = self.pool_strings  # type: ignore
 
         self.int64_map_ = NgramPart(-10)
         self.int64_map_.init()
 
-        total_items = len(self.pool_int64s_)
+        total_items = len(self.pool_int64s_ or self.pool_strings_)
         ngram_id = 1  # start with 1, 0 - means no n-gram
         # Load into dictionary only required gram sizes
         ngram_size = 1
@@ -170,7 +171,7 @@
                     and ngram_size <= self.max_gram_length_
                 ):
                     ngram_id = populate_grams(
-                        self.pool_int64s_,
+                        self.pool_int64s_ or self.pool_strings_,
                         start_idx,
                         ngrams,
                         ngram_size,
@@ -359,7 +360,7 @@ def _run(  # type: ignore
             # TfidfVectorizer returns a zero tensor of shape
             # {b_dim, output_size} when b_dim is the number of received observations
             # and output_size the is the maximum value in ngram_indexes attribute plus 1.
-            return self.output_result(B, frequencies)  # type: ignore[arg-type]
+            return (self.output_result(B, frequencies),)  # type: ignore[arg-type]
 
         def fn(row_num):
             self.compute_impl(
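The string pool is wired in through the same code path as the integer pool: wherever the implementation previously read `self.pool_int64s_`, it now reads `self.pool_int64s_ or self.pool_strings_`, so whichever attribute is populated is used. A minimal sketch of that fallback, assuming a node that carries only `pool_strings` (the plain variables below are illustrative, not the operator's actual attribute handling):

```python
# Illustrative only: the "or" fallback selects whichever pool attribute is set.
pool_int64s = None                       # attribute absent on a string model
pool_strings = ["i2", "i3", "i5", "i4"]  # string pool taken from the node

active_pool = pool_int64s or pool_strings
print(len(active_pool))  # 4 - same shape of expression as len(self.pool_int64s_ or self.pool_strings_)
```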
80 changes: 80 additions & 0 deletions onnx/test/reference_evaluator_ml_test.py
@@ -1802,6 +1802,86 @@ def test_svm_regressor_linear_one_class(self):
        got = sess.run(None, {"X": x})
        assert_allclose(expected[0], got[0], atol=1e-6)

    def test_onnxrt_tfidf_vectorizer_ints(self):
        inputi = np.array([[1, 1, 3, 3, 3, 7], [8, 6, 7, 5, 6, 8]]).astype(np.int64)
        output = np.array(
            [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]]
        ).astype(np.float32)

        ngram_counts = np.array([0, 4]).astype(np.int64)
        ngram_indexes = np.array([0, 1, 2, 3, 4, 5, 6]).astype(np.int64)
        pool_int64s = np.array([2, 3, 5, 4, 5, 6, 7, 8, 6, 7]).astype(  # unigrams
            np.int64
        )  # bigrams

        model = make_model_gen_version(
            make_graph(
                [
                    make_node(
                        "TfIdfVectorizer",
                        ["tokens"],
                        ["out"],
                        mode="TF",
                        min_gram_length=2,
                        max_gram_length=2,
                        max_skip_count=0,
                        ngram_counts=ngram_counts,
                        ngram_indexes=ngram_indexes,
                        pool_int64s=pool_int64s,
                    )
                ],
                "tfidf",
                [make_tensor_value_info("tokens", TensorProto.INT64, [None, None])],
                [make_tensor_value_info("out", TensorProto.FLOAT, [None, None])],
            ),
            opset_imports=OPSETS,
        )

        oinf = ReferenceEvaluator(model)
        res = oinf.run(None, {"tokens": inputi})
        self.assertEqual(output.tolist(), res[0].tolist())

    def test_onnxrt_tfidf_vectorizer_strings(self):
        inputi = np.array(
            [["i1", "i1", "i3", "i3", "i3", "i7"], ["i8", "i6", "i7", "i5", "i6", "i8"]]
        )
        output = np.array(
            [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]]
        ).astype(np.float32)

        ngram_counts = np.array([0, 4]).astype(np.int64)
        ngram_indexes = np.array([0, 1, 2, 3, 4, 5, 6]).astype(np.int64)
        pool_strings = np.array(
            ["i2", "i3", "i5", "i4", "i5", "i6", "i7", "i8", "i6", "i7"]
        )

        model = make_model_gen_version(
            make_graph(
                [
                    make_node(
                        "TfIdfVectorizer",
                        ["tokens"],
                        ["out"],
                        mode="TF",
                        min_gram_length=2,
                        max_gram_length=2,
                        max_skip_count=0,
                        ngram_counts=ngram_counts,
                        ngram_indexes=ngram_indexes,
                        pool_strings=pool_strings,
                    )
                ],
                "tfidf",
                [make_tensor_value_info("tokens", TensorProto.INT64, [None, None])],
                [make_tensor_value_info("out", TensorProto.FLOAT, [None, None])],
            ),
            opset_imports=OPSETS,
        )

        oinf = ReferenceEvaluator(model)
        res = oinf.run(None, {"tokens": inputi})
        self.assertEqual(output.tolist(), res[0].tolist())


if __name__ == "__main__":
    unittest.main(verbosity=2)
