From df9e11579568168313e492d532f64f60824110f0 Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Fri, 13 Aug 2021 12:04:23 +0200 Subject: [PATCH] Score documents: fix word preprocessing --- .../text/widgets/owscoredocuments.py | 89 +++++++-------- .../widgets/tests/test_owscoredocuments.py | 105 +++++++++++++----- 2 files changed, 116 insertions(+), 78 deletions(-) diff --git a/orangecontrib/text/widgets/owscoredocuments.py b/orangecontrib/text/widgets/owscoredocuments.py index b657d22db..d50b7b5c0 100644 --- a/orangecontrib/text/widgets/owscoredocuments.py +++ b/orangecontrib/text/widgets/owscoredocuments.py @@ -1,25 +1,19 @@ import re from collections import Counter from inspect import signature -from typing import List, Callable, Tuple, Union +from typing import Callable, List, Tuple, Union import numpy as np -from pandas import isnull -from Orange.data import ( - Table, - Domain, - StringVariable, - ContinuousVariable, - DiscreteVariable, -) +from AnyQt.QtCore import QSortFilterProxyModel, Qt +from AnyQt.QtWidgets import QHeaderView, QLineEdit, QTableView +from Orange.data import ContinuousVariable, Domain, StringVariable, Table from Orange.util import wrap_callback from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState +from Orange.widgets.utils.itemmodels import PyTableModel, TableModel from Orange.widgets.widget import Input, Msg, Output, OWWidget from orangewidget import gui from orangewidget.settings import Setting -from Orange.widgets.utils.itemmodels import PyTableModel, TableModel -from AnyQt.QtWidgets import QTableView, QLineEdit, QHeaderView -from AnyQt.QtCore import Qt, QSortFilterProxyModel +from pandas import isnull from sklearn.metrics.pairwise import cosine_similarity from orangecontrib.text import Corpus @@ -30,9 +24,7 @@ ) -def _word_frequency( - corpus: Corpus, words: List[str], callback: Callable -) -> np.ndarray: +def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray: res = [] tokens = 
corpus.tokens for i, t in enumerate(tokens): @@ -116,6 +108,16 @@ def _preprocess_words( with words preprocessors that change words (e.g. normalization) must be applied to words too. """ + # workaround to preprocess words + # TODO: currently preprocessors work only on corpus, when there will be more + # cases like this think about implementation of preprocessors for a list + # of strings + words_feature = StringVariable("words") + words_c = Corpus( + Domain([], metas=[words_feature]), + metas=np.array([[w] for w in words]), + text_features=[words_feature], + ) # only transformers and normalizers preprocess on the word level pps = [ pp @@ -123,15 +125,14 @@ def _preprocess_words( if isinstance(pp, (BaseTransformer, BaseNormalizer)) ] for i, pp in enumerate(pps): - # TODO: _preprocess is protected make it public - words = [pp._preprocess(w) for w in words] + words_c = pp(words_c) callback((i + 1) / len(pps)) - return words + return [w[0] for w in words_c.tokens if len(w)] def _run( corpus: Corpus, - words: Table, + words: List[str], scoring_methods: List[str], aggregation: str, additional_params: dict, @@ -163,21 +164,19 @@ def callback(i: float) -> None: cb_part = 1 / (len(scoring_methods) + 1) # +1 for preprocessing - words = _preprocess_words( - corpus, words, wrap_callback(callback, end=cb_part) - ) + words = _preprocess_words(corpus, words, wrap_callback(callback, end=cb_part)) + if len(words) == 0: + raise Exception( + "Empty word list after preprocessing. Please provide a valid set of words." 
+ ) for i, sm in enumerate(scoring_methods): scoring_method = SCORING_METHODS[sm][1] sig = signature(scoring_method) - add_params = { - k: v for k, v in additional_params.items() if k in sig.parameters - } + add_params = {k: v for k, v in additional_params.items() if k in sig.parameters} scs = scoring_method( corpus, words, - wrap_callback( - callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part - ), + wrap_callback(callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part), **add_params ) scs = AGGREGATIONS[aggregation](scs, axis=1) @@ -202,7 +201,8 @@ def update_column_widths(self) -> None: """ header = self.horizontalHeader() col_width = max( - [0] + [ + [0] + + [ max(self.sizeHintForColumn(i), header.sectionSizeHint(i)) for i in range(1, self.model().columnCount()) ] @@ -223,10 +223,7 @@ def _convert(text: str) -> Union[str, int]: @staticmethod def _alphanum_key(key: str) -> List[Union[str, int]]: - return [ - ScoreDocumentsProxyModel._convert(c) - for c in re.split("([0-9]+)", key) - ] + return [ScoreDocumentsProxyModel._convert(c) for c in re.split("([0-9]+)", key)] def lessThan(self, left_ind, right_ind): """ @@ -281,7 +278,7 @@ class Warning(OWWidget.Warning): corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.") class Error(OWWidget.Error): - unknown_err = Msg("{}") + custom_err = Msg("{}") def __init__(self): OWWidget.__init__(self) @@ -391,10 +388,7 @@ def avg_len(attr): @Inputs.words def set_words(self, words: Table) -> None: - if ( - words is None - or len(words.domain.variables + words.domain.metas) == 0 - ): + if words is None or len(words.domain.variables + words.domain.metas) == 0: self.words = None else: self.Warning.missing_words.clear() @@ -418,11 +412,7 @@ def _gather_scores(self) -> Tuple[np.ndarray, List[str]]: scorers = self._get_active_scorers() methods = [m for m in scorers if (m, aggregation) in self.scores] scores = [self.scores[(m, aggregation)] for m in methods] - scores = ( - np.column_stack(scores) - if scores - 
else np.empty((len(self.corpus), 0)) - ) + scores = np.column_stack(scores) if scores else np.empty((len(self.corpus), 0)) labels = [SCORING_METHODS[m][0] for m in methods] return scores, labels @@ -466,13 +456,13 @@ def _fill_table(self, scores: np.ndarray, labels: List[str]) -> None: self.view.horizontalHeader().setSortIndicator(*self.sort_column_order) def _fill_and_output(self) -> None: - """ Fill the table in the widget and send the output """ + """Fill the table in the widget and send the output""" scores, labels = self._gather_scores() self._fill_table(scores, labels) self._send_output(scores, labels) def _clear_and_run(self) -> None: - """ Clear cached scores and commit """ + """Clear cached scores and commit""" self.scores = {} self.cancel() self._fill_and_output() @@ -482,7 +472,7 @@ def __setting_changed(self) -> None: self.commit() def commit(self) -> None: - self.Error.unknown_err.clear() + self.Error.custom_err.clear() self.cancel() if self.corpus is None and self.words is None: return @@ -493,9 +483,7 @@ def commit(self) -> None: else: scorers = self._get_active_scorers() aggregation = self._get_active_aggregation() - new_scores = [ - s for s in scorers if (s, aggregation) not in self.scores - ] + new_scores = [s for s in scorers if (s, aggregation) not in self.scores] if new_scores: self.start( _run, @@ -522,7 +510,8 @@ def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None: self._fill_table(scores, labels) def on_exception(self, ex: Exception) -> None: - self.Error.unknown_err(ex) + self.Error.custom_err(ex) + self._fill_and_output() def _get_active_scorers(self) -> List[str]: """ diff --git a/orangecontrib/text/widgets/tests/test_owscoredocuments.py b/orangecontrib/text/widgets/tests/test_owscoredocuments.py index 69b31e090..9d8d4fc20 100644 --- a/orangecontrib/text/widgets/tests/test_owscoredocuments.py +++ b/orangecontrib/text/widgets/tests/test_owscoredocuments.py @@ -1,19 +1,22 @@ import unittest from math import isclose 
-from typing import List, Union +from typing import List from unittest.mock import patch import numpy as np from AnyQt.QtCore import Qt +from Orange.data import ContinuousVariable, Domain, StringVariable, Table +from Orange.misc.collections import natural_sorted +from Orange.util import dummy_callback from Orange.widgets.tests.base import WidgetTest -from Orange.data import Table, StringVariable, Domain, ContinuousVariable from Orange.widgets.tests.utils import simulate -from Orange.misc.collections import natural_sorted -from orangecontrib.text import Corpus -from orangecontrib.text import preprocess +from orangecontrib.text import Corpus, preprocess from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder -from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments +from orangecontrib.text.widgets.owscoredocuments import ( + OWScoreDocuments, + _preprocess_words, +) def embedding_mock(_, corpus, __): @@ -48,6 +51,7 @@ def setUp(self) -> None: pp_list = [ preprocess.LowercaseTransformer(), preprocess.StripAccentsTransformer(), + preprocess.UrlRemover(), preprocess.SnowballStemmer(), ] for p in pp_list: @@ -59,9 +63,7 @@ def setUp(self) -> None: def test_set_data(self): self.send_signal(self.widget.Inputs.corpus, self.corpus) - self.assertEqual( - [x[0] for x in self.widget.model], self.corpus.titles.tolist() - ) + self.assertEqual([x[0] for x in self.widget.model], self.corpus.titles.tolist()) self.assertTrue(self.widget.Warning.missing_words.is_shown()) self.send_signal(self.widget.Inputs.words, self.words) @@ -71,12 +73,8 @@ def test_set_data(self): self.assertTrue(all(len(x) == 2 for x in self.widget.model)) output = self.get_output(self.widget.Outputs.corpus) - self.assertTupleEqual( - output.domain.variables, self.corpus.domain.variables - ) - self.assertTupleEqual( - output.domain.metas[:-1], self.corpus.domain.metas - ) + self.assertTupleEqual(output.domain.variables, self.corpus.domain.variables) + 
self.assertTupleEqual(output.domain.metas[:-1], self.corpus.domain.metas) self.assertEqual(str(output.domain.metas[-1]), "Word count") self.assertEqual(len(output), len(self.corpus)) @@ -101,12 +99,8 @@ def test_guess_word_attribute(self): w = StringVariable("Words") w.attributes["type"] = "words" w1 = StringVariable("Words 1") - words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape( - (-1, 1) - ) - words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape( - (-1, 1) - ) + words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape((-1, 1)) + words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape((-1, 1)) # guess by attribute type self.words = Table( @@ -128,9 +122,7 @@ def test_guess_word_attribute(self): # guess by length w2 = StringVariable("Words 2") - words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape( - (-1, 1) - ) + words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape((-1, 1)) self.words = Table( Domain([], metas=[w2, w1]), np.empty((len(words), 0)), @@ -183,7 +175,7 @@ def test_change_scorer(self): @staticmethod def create_corpus(texts: List[str]) -> Corpus: - """ Create sample corpus with texts passed """ + """Create sample corpus with texts passed""" text_var = StringVariable("Text") domain = Domain([], metas=[text_var]) c = Corpus( @@ -235,9 +227,7 @@ def test_word_appearance(self): self.widget.controls.word_frequency.click() self.widget.controls.word_appearance.click() self.wait_until_finished() - self.assertListEqual( - [x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1] - ) + self.assertListEqual([x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1]) cb_aggregation = self.widget.controls.aggregation simulate.combobox_activate_item(cb_aggregation, "Max") @@ -297,6 +287,65 @@ def test_sort_table(self): data = [model.data(model.index(i, 0)) for i in range(model.rowCount())] self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1]) + def 
test_preprocess_words(self): + corpus = Corpus.from_file("book-excerpts") + words = [ + "House", + "dóctor", + "boy", + "way", + "Rum https://google.com", + "https://google.com", + "<p>abracadabra</p>", + ] + + pp_list = [ + preprocess.LowercaseTransformer(), + preprocess.StripAccentsTransformer(), + preprocess.UrlRemover(), + preprocess.HtmlTransformer(), + ] + for p in pp_list: + corpus = p(corpus) + + self.assertListEqual( + ["house", "doctor", "boy", "way", "rum", "abracadabra"], + _preprocess_words(corpus, words, dummy_callback), + ) + + words = ["House", "dóctor", "boys", "way", "Rum"] + + pp_list = [preprocess.SnowballStemmer()] + for p in pp_list: + corpus = p(corpus) + + self.assertListEqual( + ["hous", "doctor", "boy", "way", "rum"], + _preprocess_words(corpus, words, dummy_callback), + ) + + def test_no_words_after_preprocess(self): + w = StringVariable("Words") + words = np.array(["https://google.com"]).reshape((-1, 1)) + words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words) + self.send_signal(self.widget.Inputs.corpus, self.corpus) + self.send_signal(self.widget.Inputs.words, words) + self.wait_until_finished() + + self.assertTrue(self.widget.Error.custom_err.is_shown()) + self.assertEqual( + "Empty word list after preprocessing. Please provide a valid set of words.", + str(self.widget.Error.custom_err), + ) + + w = StringVariable("Words") + words = np.array(["https://google.com", "house"]).reshape((-1, 1)) + words = Table(Domain([], metas=[w]), np.empty((len(words), 0)), metas=words) + self.send_signal(self.widget.Inputs.words, words) + self.wait_until_finished() + + self.assertFalse(self.widget.Error.custom_err.is_shown()) + def test_sort_setting(self): """ Test if sorting is correctly memorized in setting and restored