Reference implementation for TfIdfVectorizer supports string inputs (onnx#5650)

### Description
The current reference implementation of TfIdfVectorizer does not support string
inputs. This change implements that supported behaviour and adds unit tests.

---------

Signed-off-by: Xavier Dupre <[email protected]>
xadupre authored Oct 18, 2023
1 parent 59995e5 commit 42b0662
Showing 2 changed files with 86 additions and 5 deletions.
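For orientation, here is a minimal sketch (not part of the commit) of how the attributes used by the new string unit test can be read, assuming the standard TfIdfVectorizer attribute semantics: `ngram_counts` marks where each n-gram length starts in the pool, `ngram_indexes` assigns an output column to each pooled n-gram, and with `mode="TF"` and gram lengths fixed to 2 only contiguous bigrams are counted. The decoding loop is illustrative only.

```python
# Illustrative sketch only - reproduces by hand the second output row of the
# string unit test added in this commit.
pool_strings = ["i2", "i3", "i5", "i4", "i5", "i6", "i7", "i8", "i6", "i7"]
ngram_counts = [0, 4]                   # pool[0:4] are unigrams, pool[4:] are bigrams
ngram_indexes = [0, 1, 2, 3, 4, 5, 6]   # output column of each pooled n-gram, in pool order

# Group the bigram part of the pool into pairs: [("i5","i6"), ("i7","i8"), ("i6","i7")]
bigrams = list(zip(pool_strings[4::2], pool_strings[5::2]))

row = ["i8", "i6", "i7", "i5", "i6", "i8"]      # second input row of the test
counts = [0.0] * (max(ngram_indexes) + 1)
for a, b in zip(row, row[1:]):                  # contiguous bigrams (max_skip_count=0)
    if (a, b) in bigrams:
        counts[ngram_indexes[4 + bigrams.index((a, b))]] += 1.0

print(counts)  # [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0] - matches the expected output row
```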
11 changes: 6 additions & 5 deletions onnx/reference/ops/op_tfidf_vectorizer.py
@@ -17,8 +17,8 @@ def __init__(self):
         self.added_keys = []
 
     def emplace(self, key, value):
-        if not isinstance(key, int):
-            raise TypeError(f"key must be a NGramPart not {type(key)}.")
+        if not isinstance(key, (int, str)):
+            raise TypeError(f"key must be a int or str not {type(key)}.")
         if not isinstance(value, NgramPart):
             raise TypeError(f"value must be a NGramPart not {type(value)}.")
         if key not in self:
@@ -147,11 +147,12 @@ def __init__(self, onnx_node, run_params):  # type: ignore
         self.output_size_ = max(self.ngram_indexes_) + 1
         self.weights_ = self.weights  # type: ignore
         self.pool_int64s_ = self.pool_int64s  # type: ignore
+        self.pool_strings_ = self.pool_strings  # type: ignore
 
         self.int64_map_ = NgramPart(-10)
         self.int64_map_.init()
 
-        total_items = len(self.pool_int64s_)
+        total_items = len(self.pool_int64s_ or self.pool_strings_)
         ngram_id = 1  # start with 1, 0 - means no n-gram
         # Load into dictionary only required gram sizes
         ngram_size = 1
@@ -170,7 +171,7 @@
                     and ngram_size <= self.max_gram_length_
                 ):
                     ngram_id = populate_grams(
-                        self.pool_int64s_,
+                        self.pool_int64s_ or self.pool_strings_,
                         start_idx,
                         ngrams,
                         ngram_size,
@@ -359,7 +360,7 @@ def _run(  # type: ignore
             # TfidfVectorizer returns a zero tensor of shape
             # {b_dim, output_size} when b_dim is the number of received observations
             # and output_size the is the maximum value in ngram_indexes attribute plus 1.
-            return self.output_result(B, frequencies)  # type: ignore[arg-type]
+            return (self.output_result(B, frequencies),)  # type: ignore[arg-type]
 
         def fn(row_num):
             self.compute_impl(
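The string pool is wired in through the same code path as the integer pool: wherever the implementation previously read `self.pool_int64s_`, it now reads `self.pool_int64s_ or self.pool_strings_`, so whichever attribute is populated is used. A minimal sketch of that fallback, assuming a node that carries only `pool_strings` (the plain variables below are illustrative, not the operator's actual attribute handling):

```python
# Illustrative only: the "or" fallback selects whichever pool attribute is set.
pool_int64s = None                       # attribute absent on a string model
pool_strings = ["i2", "i3", "i5", "i4"]  # string pool taken from the node

active_pool = pool_int64s or pool_strings
print(len(active_pool))  # 4 - same shape of expression as len(self.pool_int64s_ or self.pool_strings_)
```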
80 changes: 80 additions & 0 deletions onnx/test/reference_evaluator_ml_test.py
@@ -1802,6 +1802,86 @@ def test_svm_regressor_linear_one_class(self):
        got = sess.run(None, {"X": x})
        assert_allclose(expected[0], got[0], atol=1e-6)

    def test_onnxrt_tfidf_vectorizer_ints(self):
        inputi = np.array([[1, 1, 3, 3, 3, 7], [8, 6, 7, 5, 6, 8]]).astype(np.int64)
        output = np.array(
            [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]]
        ).astype(np.float32)

        ngram_counts = np.array([0, 4]).astype(np.int64)
        ngram_indexes = np.array([0, 1, 2, 3, 4, 5, 6]).astype(np.int64)
        pool_int64s = np.array([2, 3, 5, 4, 5, 6, 7, 8, 6, 7]).astype(  # unigrams
            np.int64
        )  # bigrams

        model = make_model_gen_version(
            make_graph(
                [
                    make_node(
                        "TfIdfVectorizer",
                        ["tokens"],
                        ["out"],
                        mode="TF",
                        min_gram_length=2,
                        max_gram_length=2,
                        max_skip_count=0,
                        ngram_counts=ngram_counts,
                        ngram_indexes=ngram_indexes,
                        pool_int64s=pool_int64s,
                    )
                ],
                "tfidf",
                [make_tensor_value_info("tokens", TensorProto.INT64, [None, None])],
                [make_tensor_value_info("out", TensorProto.FLOAT, [None, None])],
            ),
            opset_imports=OPSETS,
        )

        oinf = ReferenceEvaluator(model)
        res = oinf.run(None, {"tokens": inputi})
        self.assertEqual(output.tolist(), res[0].tolist())

    def test_onnxrt_tfidf_vectorizer_strings(self):
        inputi = np.array(
            [["i1", "i1", "i3", "i3", "i3", "i7"], ["i8", "i6", "i7", "i5", "i6", "i8"]]
        )
        output = np.array(
            [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]]
        ).astype(np.float32)

        ngram_counts = np.array([0, 4]).astype(np.int64)
        ngram_indexes = np.array([0, 1, 2, 3, 4, 5, 6]).astype(np.int64)
        pool_strings = np.array(
            ["i2", "i3", "i5", "i4", "i5", "i6", "i7", "i8", "i6", "i7"]
        )

        model = make_model_gen_version(
            make_graph(
                [
                    make_node(
                        "TfIdfVectorizer",
                        ["tokens"],
                        ["out"],
                        mode="TF",
                        min_gram_length=2,
                        max_gram_length=2,
                        max_skip_count=0,
                        ngram_counts=ngram_counts,
                        ngram_indexes=ngram_indexes,
                        pool_strings=pool_strings,
                    )
                ],
                "tfidf",
                [make_tensor_value_info("tokens", TensorProto.INT64, [None, None])],
                [make_tensor_value_info("out", TensorProto.FLOAT, [None, None])],
            ),
            opset_imports=OPSETS,
        )

        oinf = ReferenceEvaluator(model)
        res = oinf.run(None, {"tokens": inputi})
        self.assertEqual(output.tolist(), res[0].tolist())


if __name__ == "__main__":
    unittest.main(verbosity=2)
