Score documents: fix word preprocessing

biolab · Aug 13, 2021 · ffc33fe · ffc33fe
1 parent 00f71d0
commit ffc33fe
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 8 deletions.
diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py
@@ -10,7 +10,6 @@
     Domain,
     StringVariable,
     ContinuousVariable,
-    DiscreteVariable,
 )
 from Orange.util import wrap_callback
 from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
@@ -116,22 +115,31 @@ def _preprocess_words(
     with words preprocessors that change words (e.g. normalization) must
     be applied to words too.
     """
+    # workaround to preprocess words
+    # TODO: preprocessors currently work only on a Corpus; if more cases like
+    #  this appear, consider implementing preprocessors that accept a list
+    #  of strings
+    words_feature = StringVariable("words")
+    words_c = Corpus(
+        Domain([], metas=[words_feature]),
+        metas=np.array([[w] for w in words]),
+        text_features=[words_feature]
+    )
     # only transformers and normalizers preprocess on the word level
     pps = [
         pp
         for pp in corpus.used_preprocessor.preprocessors
         if isinstance(pp, (BaseTransformer, BaseNormalizer))
     ]
     for i, pp in enumerate(pps):
-        # TODO: _preprocess is protected make it public
-        words = [pp._preprocess(w) for w in words]
+        words_c = pp(words_c)
         callback((i + 1) / len(pps))
-    return words
+    return [w[0] for w in words_c.tokens]
 
 
 def _run(
     corpus: Corpus,
-    words: Table,
+    words: List[str],
     scoring_methods: List[str],
     aggregation: str,
     additional_params: dict,
@@ -155,7 +163,6 @@ def _run(
     state
         TaskState for reporting the task status and giving partial results
     """
-
     def callback(i: float) -> None:
         state.set_progress_value(i * 100)
         if state.is_interruption_requested():

diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
@@ -1,10 +1,11 @@
 import unittest
 from math import isclose
-from typing import List, Union
+from typing import List
 from unittest.mock import patch
 
 import numpy as np
 from AnyQt.QtCore import Qt
+from Orange.util import dummy_callback
 from Orange.widgets.tests.base import WidgetTest
 from Orange.data import Table, StringVariable, Domain, ContinuousVariable
 from Orange.widgets.tests.utils import simulate
@@ -13,7 +14,10 @@
 from orangecontrib.text import Corpus
 from orangecontrib.text import preprocess
 from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
-from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
+from orangecontrib.text.widgets.owscoredocuments import (
+    OWScoreDocuments,
+    _preprocess_words,
+)
 
 
 def embedding_mock(_, corpus, __):
@@ -297,6 +301,42 @@ def test_sort_table(self):
         data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
         self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])
 
+    def test_preprocess_words(self):
+        corpus = Corpus.from_file("book-excerpts")
+        words = [
+            "House",
+            "dóctor",
+            "boy",
+            "way",
+            "Rum https://google.com",
+            "<p>abra<b>cadabra</b><p>",
+        ]
+
+        pp_list = [
+            preprocess.LowercaseTransformer(),
+            preprocess.StripAccentsTransformer(),
+            preprocess.UrlRemover(),
+            preprocess.HtmlTransformer(),
+        ]
+        for p in pp_list:
+            corpus = p(corpus)
+
+        self.assertListEqual(
+            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
+            _preprocess_words(corpus, words, dummy_callback),
+        )
+
+        words = ["House", "dóctor", "boys", "way", "Rum"]
+
+        pp_list = [preprocess.SnowballStemmer()]
+        for p in pp_list:
+            corpus = p(corpus)
+
+        self.assertListEqual(
+            ["hous", "doctor", "boy", "way", "rum"],
+            _preprocess_words(corpus, words, dummy_callback),
+        )
+
 
 if __name__ == "__main__":
     unittest.main()