From df9e11579568168313e492d532f64f60824110f0 Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Fri, 13 Aug 2021 12:04:23 +0200
Subject: [PATCH] Score documents: fix word preprocessing
---
.../text/widgets/owscoredocuments.py | 89 +++++++--------
.../widgets/tests/test_owscoredocuments.py | 105 +++++++++++++-----
2 files changed, 116 insertions(+), 78 deletions(-)
diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py
index b657d22db..d50b7b5c0 100644
--- a/orangecontrib/text/widgets/owscoredocuments.py
+++ b/orangecontrib/text/widgets/owscoredocuments.py
@@ -1,25 +1,19 @@
import re
from collections import Counter
from inspect import signature
-from typing import List, Callable, Tuple, Union
+from typing import Callable, List, Tuple, Union
import numpy as np
-from pandas import isnull
-from Orange.data import (
- Table,
- Domain,
- StringVariable,
- ContinuousVariable,
- DiscreteVariable,
-)
+from AnyQt.QtCore import QSortFilterProxyModel, Qt
+from AnyQt.QtWidgets import QHeaderView, QLineEdit, QTableView
+from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.util import wrap_callback
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
+from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
from Orange.widgets.widget import Input, Msg, Output, OWWidget
from orangewidget import gui
from orangewidget.settings import Setting
-from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
-from AnyQt.QtWidgets import QTableView, QLineEdit, QHeaderView
-from AnyQt.QtCore import Qt, QSortFilterProxyModel
+from pandas import isnull
from sklearn.metrics.pairwise import cosine_similarity
from orangecontrib.text import Corpus
@@ -30,9 +24,7 @@
)
-def _word_frequency(
- corpus: Corpus, words: List[str], callback: Callable
-) -> np.ndarray:
+def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
res = []
tokens = corpus.tokens
for i, t in enumerate(tokens):
@@ -116,6 +108,16 @@ def _preprocess_words(
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
"""
+ # workaround to preprocess words
+ # TODO: preprocessors currently work only on a Corpus; if more cases like
+ # this appear, consider implementing preprocessors that accept a list
+ # of strings
+ words_feature = StringVariable("words")
+ words_c = Corpus(
+ Domain([], metas=[words_feature]),
+ metas=np.array([[w] for w in words]),
+ text_features=[words_feature],
+ )
# only transformers and normalizers preprocess on the word level
pps = [
pp
@@ -123,15 +125,14 @@ def _preprocess_words(
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
for i, pp in enumerate(pps):
- # TODO: _preprocess is protected make it public
- words = [pp._preprocess(w) for w in words]
+ words_c = pp(words_c)
callback((i + 1) / len(pps))
- return words
+ return [w[0] for w in words_c.tokens if len(w)]
def _run(
corpus: Corpus,
- words: Table,
+ words: List[str],
scoring_methods: List[str],
aggregation: str,
additional_params: dict,
@@ -163,21 +164,19 @@ def callback(i: float) -> None:
cb_part = 1 / (len(scoring_methods) + 1) # +1 for preprocessing
- words = _preprocess_words(
- corpus, words, wrap_callback(callback, end=cb_part)
- )
+ words = _preprocess_words(corpus, words, wrap_callback(callback, end=cb_part))
+ if len(words) == 0:
+ raise Exception(
+ "Empty word list after preprocessing. Please provide a valid set of words."
+ )
for i, sm in enumerate(scoring_methods):
scoring_method = SCORING_METHODS[sm][1]
sig = signature(scoring_method)
- add_params = {
- k: v for k, v in additional_params.items() if k in sig.parameters
- }
+ add_params = {k: v for k, v in additional_params.items() if k in sig.parameters}
scs = scoring_method(
corpus,
words,
- wrap_callback(
- callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part
- ),
+ wrap_callback(callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part),
**add_params
)
scs = AGGREGATIONS[aggregation](scs, axis=1)
@@ -202,7 +201,8 @@ def update_column_widths(self) -> None:
"""
header = self.horizontalHeader()
col_width = max(
- [0] + [
+ [0]
+ + [
max(self.sizeHintForColumn(i), header.sectionSizeHint(i))
for i in range(1, self.model().columnCount())
]
@@ -223,10 +223,7 @@ def _convert(text: str) -> Union[str, int]:
@staticmethod
def _alphanum_key(key: str) -> List[Union[str, int]]:
- return [
- ScoreDocumentsProxyModel._convert(c)
- for c in re.split("([0-9]+)", key)
- ]
+ return [ScoreDocumentsProxyModel._convert(c) for c in re.split("([0-9]+)", key)]
def lessThan(self, left_ind, right_ind):
"""
@@ -281,7 +278,7 @@ class Warning(OWWidget.Warning):
corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")
class Error(OWWidget.Error):
- unknown_err = Msg("{}")
+ custom_err = Msg("{}")
def __init__(self):
OWWidget.__init__(self)
@@ -391,10 +388,7 @@ def avg_len(attr):
@Inputs.words
def set_words(self, words: Table) -> None:
- if (
- words is None
- or len(words.domain.variables + words.domain.metas) == 0
- ):
+ if words is None or len(words.domain.variables + words.domain.metas) == 0:
self.words = None
else:
self.Warning.missing_words.clear()
@@ -418,11 +412,7 @@ def _gather_scores(self) -> Tuple[np.ndarray, List[str]]:
scorers = self._get_active_scorers()
methods = [m for m in scorers if (m, aggregation) in self.scores]
scores = [self.scores[(m, aggregation)] for m in methods]
- scores = (
- np.column_stack(scores)
- if scores
- else np.empty((len(self.corpus), 0))
- )
+ scores = np.column_stack(scores) if scores else np.empty((len(self.corpus), 0))
labels = [SCORING_METHODS[m][0] for m in methods]
return scores, labels
@@ -466,13 +456,13 @@ def _fill_table(self, scores: np.ndarray, labels: List[str]) -> None:
self.view.horizontalHeader().setSortIndicator(*self.sort_column_order)
def _fill_and_output(self) -> None:
- """ Fill the table in the widget and send the output """
+ """Fill the table in the widget and send the output"""
scores, labels = self._gather_scores()
self._fill_table(scores, labels)
self._send_output(scores, labels)
def _clear_and_run(self) -> None:
- """ Clear cached scores and commit """
+ """Clear cached scores and commit"""
self.scores = {}
self.cancel()
self._fill_and_output()
@@ -482,7 +472,7 @@ def __setting_changed(self) -> None:
self.commit()
def commit(self) -> None:
- self.Error.unknown_err.clear()
+ self.Error.custom_err.clear()
self.cancel()
if self.corpus is None and self.words is None:
return
@@ -493,9 +483,7 @@ def commit(self) -> None:
else:
scorers = self._get_active_scorers()
aggregation = self._get_active_aggregation()
- new_scores = [
- s for s in scorers if (s, aggregation) not in self.scores
- ]
+ new_scores = [s for s in scorers if (s, aggregation) not in self.scores]
if new_scores:
self.start(
_run,
@@ -522,7 +510,8 @@ def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None:
self._fill_table(scores, labels)
def on_exception(self, ex: Exception) -> None:
- self.Error.unknown_err(ex)
+ self.Error.custom_err(ex)
+ self._fill_and_output()
def _get_active_scorers(self) -> List[str]:
"""
diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
index 69b31e090..9d8d4fc20 100644
--- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py
+++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py
@@ -1,19 +1,22 @@
import unittest
from math import isclose
-from typing import List, Union
+from typing import List
from unittest.mock import patch
import numpy as np
from AnyQt.QtCore import Qt
+from Orange.data import ContinuousVariable, Domain, StringVariable, Table
+from Orange.misc.collections import natural_sorted
+from Orange.util import dummy_callback
from Orange.widgets.tests.base import WidgetTest
-from Orange.data import Table, StringVariable, Domain, ContinuousVariable
from Orange.widgets.tests.utils import simulate
-from Orange.misc.collections import natural_sorted
-from orangecontrib.text import Corpus
-from orangecontrib.text import preprocess
+from orangecontrib.text import Corpus, preprocess
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
-from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
+from orangecontrib.text.widgets.owscoredocuments import (
+ OWScoreDocuments,
+ _preprocess_words,
+)
def embedding_mock(_, corpus, __):
@@ -48,6 +51,7 @@ def setUp(self) -> None:
pp_list = [
preprocess.LowercaseTransformer(),
preprocess.StripAccentsTransformer(),
+ preprocess.UrlRemover(),
preprocess.SnowballStemmer(),
]
for p in pp_list:
@@ -59,9 +63,7 @@ def setUp(self) -> None:
def test_set_data(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
- self.assertEqual(
- [x[0] for x in self.widget.model], self.corpus.titles.tolist()
- )
+ self.assertEqual([x[0] for x in self.widget.model], self.corpus.titles.tolist())
self.assertTrue(self.widget.Warning.missing_words.is_shown())
self.send_signal(self.widget.Inputs.words, self.words)
@@ -71,12 +73,8 @@ def test_set_data(self):
self.assertTrue(all(len(x) == 2 for x in self.widget.model))
output = self.get_output(self.widget.Outputs.corpus)
- self.assertTupleEqual(
- output.domain.variables, self.corpus.domain.variables
- )
- self.assertTupleEqual(
- output.domain.metas[:-1], self.corpus.domain.metas
- )
+ self.assertTupleEqual(output.domain.variables, self.corpus.domain.variables)
+ self.assertTupleEqual(output.domain.metas[:-1], self.corpus.domain.metas)
self.assertEqual(str(output.domain.metas[-1]), "Word count")
self.assertEqual(len(output), len(self.corpus))
@@ -101,12 +99,8 @@ def test_guess_word_attribute(self):
w = StringVariable("Words")
w.attributes["type"] = "words"
w1 = StringVariable("Words 1")
- words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape(
- (-1, 1)
- )
- words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape(
- (-1, 1)
- )
+ words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape((-1, 1))
+ words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape((-1, 1))
# guess by attribute type
self.words = Table(
@@ -128,9 +122,7 @@ def test_guess_word_attribute(self):
# guess by length
w2 = StringVariable("Words 2")
- words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape(
- (-1, 1)
- )
+ words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape((-1, 1))
self.words = Table(
Domain([], metas=[w2, w1]),
np.empty((len(words), 0)),
@@ -183,7 +175,7 @@ def test_change_scorer(self):
@staticmethod
def create_corpus(texts: List[str]) -> Corpus:
- """ Create sample corpus with texts passed """
+ """Create sample corpus with texts passed"""
text_var = StringVariable("Text")
domain = Domain([], metas=[text_var])
c = Corpus(
@@ -235,9 +227,7 @@ def test_word_appearance(self):
self.widget.controls.word_frequency.click()
self.widget.controls.word_appearance.click()
self.wait_until_finished()
- self.assertListEqual(
- [x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1]
- )
+ self.assertListEqual([x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1])
cb_aggregation = self.widget.controls.aggregation
simulate.combobox_activate_item(cb_aggregation, "Max")
@@ -297,6 +287,65 @@ def test_sort_table(self):
data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])
+ def test_preprocess_words(self):
+ corpus = Corpus.from_file("book-excerpts")
+ words = [
+ "House",
+ "dóctor",
+ "boy",
+ "way",
+ "Rum https://google.com",
+ "https://google.com",
+ "<p>abracadabra</p>",
+ ]
+
+ pp_list = [
+ preprocess.LowercaseTransformer(),
+ preprocess.StripAccentsTransformer(),
+ preprocess.UrlRemover(),
+ preprocess.HtmlTransformer(),
+ ]
+ for p in pp_list:
+ corpus = p(corpus)
+
+ self.assertListEqual(
+ ["house", "doctor", "boy", "way", "rum", "abracadabra"],
+ _preprocess_words(corpus, words, dummy_callback),
+ )
+
+ words = ["House", "dóctor", "boys", "way", "Rum"]
+
+ pp_list = [preprocess.SnowballStemmer()]
+ for p in pp_list:
+ corpus = p(corpus)
+
+ self.assertListEqual(
+ ["hous", "doctor", "boy", "way", "rum"],
+ _preprocess_words(corpus, words, dummy_callback),
+ )
+
+ def test_no_words_after_preprocess(self):
+ w = StringVariable("Words")
+ words = np.array(["https://google.com"]).reshape((-1, 1))
+ words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words)
+ self.send_signal(self.widget.Inputs.corpus, self.corpus)
+ self.send_signal(self.widget.Inputs.words, words)
+ self.wait_until_finished()
+
+ self.assertTrue(self.widget.Error.custom_err.is_shown())
+ self.assertEqual(
+ "Empty word list after preprocessing. Please provide a valid set of words.",
+ str(self.widget.Error.custom_err),
+ )
+
+ w = StringVariable("Words")
+ words = np.array(["https://google.com", "house"]).reshape((-1, 1))
+ words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words)
+ self.send_signal(self.widget.Inputs.words, words)
+ self.wait_until_finished()
+
+ self.assertFalse(self.widget.Error.custom_err.is_shown())
+
def test_sort_setting(self):
"""
Test if sorting is correctly memorized in setting and restored