Score documents: fix word preprocessing

biolab · Aug 26, 2021 · df9e115 · df9e115
1 parent 96c295c
commit df9e115
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 78 deletions.
diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py
@@ -1,25 +1,19 @@
 import re
 from collections import Counter
 from inspect import signature
-from typing import List, Callable, Tuple, Union
+from typing import Callable, List, Tuple, Union
 
 import numpy as np
-from pandas import isnull
-from Orange.data import (
-    Table,
-    Domain,
-    StringVariable,
-    ContinuousVariable,
-    DiscreteVariable,
-)
+from AnyQt.QtCore import QSortFilterProxyModel, Qt
+from AnyQt.QtWidgets import QHeaderView, QLineEdit, QTableView
+from Orange.data import ContinuousVariable, Domain, StringVariable, Table
 from Orange.util import wrap_callback
 from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
 from Orange.widgets.widget import Input, Msg, Output, OWWidget
 from orangewidget import gui
 from orangewidget.settings import Setting
-from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
-from AnyQt.QtWidgets import QTableView, QLineEdit, QHeaderView
-from AnyQt.QtCore import Qt, QSortFilterProxyModel
+from pandas import isnull
 from sklearn.metrics.pairwise import cosine_similarity
 
 from orangecontrib.text import Corpus
@@ -30,9 +24,7 @@
 )
 
 
-def _word_frequency(
-    corpus: Corpus, words: List[str], callback: Callable
-) -> np.ndarray:
+def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
     res = []
     tokens = corpus.tokens
     for i, t in enumerate(tokens):
@@ -116,22 +108,31 @@ def _preprocess_words(
     with words preprocessors that change words (e.g. normalization) must
     be applied to words too.
     """
+    # Workaround: wrap the words in a temporary Corpus so preprocessors can run.
+    # TODO: preprocessors currently work only on a Corpus; if more cases like
+    #  this appear, consider implementing preprocessors that accept a list
+    #  of strings.
+    words_feature = StringVariable("words")
+    words_c = Corpus(
+        Domain([], metas=[words_feature]),
+        metas=np.array([[w] for w in words]),
+        text_features=[words_feature],
+    )
     # only transformers and normalizers preprocess on the word level
     pps = [
         pp
         for pp in corpus.used_preprocessor.preprocessors
         if isinstance(pp, (BaseTransformer, BaseNormalizer))
     ]
     for i, pp in enumerate(pps):
-        # TODO: _preprocess is protected make it public
-        words = [pp._preprocess(w) for w in words]
+        words_c = pp(words_c)
         callback((i + 1) / len(pps))
-    return words
+    return [w[0] for w in words_c.tokens if len(w)]
 
 
 def _run(
     corpus: Corpus,
-    words: Table,
+    words: List[str],
     scoring_methods: List[str],
     aggregation: str,
     additional_params: dict,
@@ -163,21 +164,19 @@ def callback(i: float) -> None:
 
     cb_part = 1 / (len(scoring_methods) + 1)  # +1 for preprocessing
 
-    words = _preprocess_words(
-        corpus, words, wrap_callback(callback, end=cb_part)
-    )
+    words = _preprocess_words(corpus, words, wrap_callback(callback, end=cb_part))
+    if len(words) == 0:
+        raise Exception(
+            "Empty word list after preprocessing. Please provide a valid set of words."
+        )
     for i, sm in enumerate(scoring_methods):
         scoring_method = SCORING_METHODS[sm][1]
         sig = signature(scoring_method)
-        add_params = {
-            k: v for k, v in additional_params.items() if k in sig.parameters
-        }
+        add_params = {k: v for k, v in additional_params.items() if k in sig.parameters}
         scs = scoring_method(
             corpus,
             words,
-            wrap_callback(
-                callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part
-            ),
+            wrap_callback(callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part),
             **add_params
         )
         scs = AGGREGATIONS[aggregation](scs, axis=1)
@@ -202,7 +201,8 @@ def update_column_widths(self) -> None:
         """
         header = self.horizontalHeader()
         col_width = max(
-            [0] + [
+            [0]
+            + [
                 max(self.sizeHintForColumn(i), header.sectionSizeHint(i))
                 for i in range(1, self.model().columnCount())
             ]
@@ -223,10 +223,7 @@ def _convert(text: str) -> Union[str, int]:
 
     @staticmethod
     def _alphanum_key(key: str) -> List[Union[str, int]]:
-        return [
-            ScoreDocumentsProxyModel._convert(c)
-            for c in re.split("([0-9]+)", key)
-        ]
+        return [ScoreDocumentsProxyModel._convert(c) for c in re.split("([0-9]+)", key)]
 
     def lessThan(self, left_ind, right_ind):
         """
@@ -281,7 +278,7 @@ class Warning(OWWidget.Warning):
         corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")
 
     class Error(OWWidget.Error):
-        unknown_err = Msg("{}")
+        custom_err = Msg("{}")
 
     def __init__(self):
         OWWidget.__init__(self)
@@ -391,10 +388,7 @@ def avg_len(attr):
 
     @Inputs.words
     def set_words(self, words: Table) -> None:
-        if (
-            words is None
-            or len(words.domain.variables + words.domain.metas) == 0
-        ):
+        if words is None or len(words.domain.variables + words.domain.metas) == 0:
             self.words = None
         else:
             self.Warning.missing_words.clear()
@@ -418,11 +412,7 @@ def _gather_scores(self) -> Tuple[np.ndarray, List[str]]:
         scorers = self._get_active_scorers()
         methods = [m for m in scorers if (m, aggregation) in self.scores]
         scores = [self.scores[(m, aggregation)] for m in methods]
-        scores = (
-            np.column_stack(scores)
-            if scores
-            else np.empty((len(self.corpus), 0))
-        )
+        scores = np.column_stack(scores) if scores else np.empty((len(self.corpus), 0))
         labels = [SCORING_METHODS[m][0] for m in methods]
         return scores, labels
 
@@ -466,13 +456,13 @@ def _fill_table(self, scores: np.ndarray, labels: List[str]) -> None:
         self.view.horizontalHeader().setSortIndicator(*self.sort_column_order)
 
     def _fill_and_output(self) -> None:
-        """ Fill the table in the widget and send the output """
+        """Fill the table in the widget and send the output"""
         scores, labels = self._gather_scores()
         self._fill_table(scores, labels)
         self._send_output(scores, labels)
 
     def _clear_and_run(self) -> None:
-        """ Clear cached scores and commit """
+        """Clear cached scores and commit"""
         self.scores = {}
         self.cancel()
         self._fill_and_output()
@@ -482,7 +472,7 @@ def __setting_changed(self) -> None:
         self.commit()
 
     def commit(self) -> None:
-        self.Error.unknown_err.clear()
+        self.Error.custom_err.clear()
         self.cancel()
         if self.corpus is None and self.words is None:
             return
@@ -493,9 +483,7 @@ def commit(self) -> None:
         else:
             scorers = self._get_active_scorers()
             aggregation = self._get_active_aggregation()
-            new_scores = [
-                s for s in scorers if (s, aggregation) not in self.scores
-            ]
+            new_scores = [s for s in scorers if (s, aggregation) not in self.scores]
             if new_scores:
                 self.start(
                     _run,
@@ -522,7 +510,8 @@ def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None:
         self._fill_table(scores, labels)
 
     def on_exception(self, ex: Exception) -> None:
-        self.Error.unknown_err(ex)
+        self.Error.custom_err(ex)
+        self._fill_and_output()
 
     def _get_active_scorers(self) -> List[str]:
         """

diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
@@ -1,19 +1,22 @@
 import unittest
 from math import isclose
-from typing import List, Union
+from typing import List
 from unittest.mock import patch
 
 import numpy as np
 from AnyQt.QtCore import Qt
+from Orange.data import ContinuousVariable, Domain, StringVariable, Table
+from Orange.misc.collections import natural_sorted
+from Orange.util import dummy_callback
 from Orange.widgets.tests.base import WidgetTest
-from Orange.data import Table, StringVariable, Domain, ContinuousVariable
 from Orange.widgets.tests.utils import simulate
-from Orange.misc.collections import natural_sorted
 
-from orangecontrib.text import Corpus
-from orangecontrib.text import preprocess
+from orangecontrib.text import Corpus, preprocess
 from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
-from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
+from orangecontrib.text.widgets.owscoredocuments import (
+    OWScoreDocuments,
+    _preprocess_words,
+)
 
 
 def embedding_mock(_, corpus, __):
@@ -48,6 +51,7 @@ def setUp(self) -> None:
         pp_list = [
             preprocess.LowercaseTransformer(),
             preprocess.StripAccentsTransformer(),
+            preprocess.UrlRemover(),
             preprocess.SnowballStemmer(),
         ]
         for p in pp_list:
@@ -59,9 +63,7 @@ def setUp(self) -> None:
 
     def test_set_data(self):
         self.send_signal(self.widget.Inputs.corpus, self.corpus)
-        self.assertEqual(
-            [x[0] for x in self.widget.model], self.corpus.titles.tolist()
-        )
+        self.assertEqual([x[0] for x in self.widget.model], self.corpus.titles.tolist())
         self.assertTrue(self.widget.Warning.missing_words.is_shown())
 
         self.send_signal(self.widget.Inputs.words, self.words)
@@ -71,12 +73,8 @@ def test_set_data(self):
         self.assertTrue(all(len(x) == 2 for x in self.widget.model))
 
         output = self.get_output(self.widget.Outputs.corpus)
-        self.assertTupleEqual(
-            output.domain.variables, self.corpus.domain.variables
-        )
-        self.assertTupleEqual(
-            output.domain.metas[:-1], self.corpus.domain.metas
-        )
+        self.assertTupleEqual(output.domain.variables, self.corpus.domain.variables)
+        self.assertTupleEqual(output.domain.metas[:-1], self.corpus.domain.metas)
         self.assertEqual(str(output.domain.metas[-1]), "Word count")
         self.assertEqual(len(output), len(self.corpus))
 
@@ -101,12 +99,8 @@ def test_guess_word_attribute(self):
         w = StringVariable("Words")
         w.attributes["type"] = "words"
         w1 = StringVariable("Words 1")
-        words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape(
-            (-1, 1)
-        )
-        words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape(
-            (-1, 1)
-        )
+        words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape((-1, 1))
+        words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape((-1, 1))
 
         # guess by attribute type
         self.words = Table(
@@ -128,9 +122,7 @@ def test_guess_word_attribute(self):
 
         # guess by length
         w2 = StringVariable("Words 2")
-        words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape(
-            (-1, 1)
-        )
+        words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape((-1, 1))
         self.words = Table(
             Domain([], metas=[w2, w1]),
             np.empty((len(words), 0)),
@@ -183,7 +175,7 @@ def test_change_scorer(self):
 
     @staticmethod
     def create_corpus(texts: List[str]) -> Corpus:
-        """ Create sample corpus with texts passed """
+        """Create sample corpus with texts passed"""
         text_var = StringVariable("Text")
         domain = Domain([], metas=[text_var])
         c = Corpus(
@@ -235,9 +227,7 @@ def test_word_appearance(self):
         self.widget.controls.word_frequency.click()
         self.widget.controls.word_appearance.click()
         self.wait_until_finished()
-        self.assertListEqual(
-            [x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1]
-        )
+        self.assertListEqual([x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1])
 
         cb_aggregation = self.widget.controls.aggregation
         simulate.combobox_activate_item(cb_aggregation, "Max")
@@ -297,6 +287,65 @@ def test_sort_table(self):
         data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
         self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])
 
+    def test_preprocess_words(self):
+        corpus = Corpus.from_file("book-excerpts")
+        words = [
+            "House",
+            "dóctor",
+            "boy",
+            "way",
+            "Rum https://google.com",
+            "https://google.com",
+            "<p>abra<b>cadabra</b><p>",
+        ]
+
+        pp_list = [
+            preprocess.LowercaseTransformer(),
+            preprocess.StripAccentsTransformer(),
+            preprocess.UrlRemover(),
+            preprocess.HtmlTransformer(),
+        ]
+        for p in pp_list:
+            corpus = p(corpus)
+
+        self.assertListEqual(
+            ["house", "doctor", "boy", "way", "rum", "abracadabra"],
+            _preprocess_words(corpus, words, dummy_callback),
+        )
+
+        words = ["House", "dóctor", "boys", "way", "Rum"]
+
+        pp_list = [preprocess.SnowballStemmer()]
+        for p in pp_list:
+            corpus = p(corpus)
+
+        self.assertListEqual(
+            ["hous", "doctor", "boy", "way", "rum"],
+            _preprocess_words(corpus, words, dummy_callback),
+        )
+
+    def test_no_words_after_preprocess(self):
+        w = StringVariable("Words")
+        words = np.array(["https://google.com"]).reshape((-1, 1))
+        words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words)
+        self.send_signal(self.widget.Inputs.corpus, self.corpus)
+        self.send_signal(self.widget.Inputs.words, words)
+        self.wait_until_finished()
+
+        self.assertTrue(self.widget.Error.custom_err.is_shown())
+        self.assertEqual(
+            "Empty word list after preprocessing. Please provide a valid set of words.",
+            str(self.widget.Error.custom_err),
+        )
+
+        w = StringVariable("Words")
+        words = np.array(["https://google.com", "house"]).reshape((-1, 1))
+        words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words)
+        self.send_signal(self.widget.Inputs.words, words)
+        self.wait_until_finished()
+
+        self.assertFalse(self.widget.Error.custom_err.is_shown())
+
     def test_sort_setting(self):
         """
         Test if sorting is correctly memorized in setting and restored