Skip to content

Commit

Permalink
Merge pull request #707 from PrimozGodec/score-documents-preprocessors
Browse files: browse the repository at this point in the history
Score documents: fix word preprocessing
  • Loading branch information
VesnaT authored Aug 27, 2021
2 parents 3203eb8 + df9e115 commit 7a5f5fe
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 78 deletions.
89 changes: 39 additions & 50 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,19 @@
import re
from collections import Counter
from inspect import signature
from typing import List, Callable, Tuple, Union
from typing import Callable, List, Tuple, Union

import numpy as np
from pandas import isnull
from Orange.data import (
Table,
Domain,
StringVariable,
ContinuousVariable,
DiscreteVariable,
)
from AnyQt.QtCore import QSortFilterProxyModel, Qt
from AnyQt.QtWidgets import QHeaderView, QLineEdit, QTableView
from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.util import wrap_callback
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
from Orange.widgets.widget import Input, Msg, Output, OWWidget
from orangewidget import gui
from orangewidget.settings import Setting
from Orange.widgets.utils.itemmodels import PyTableModel, TableModel
from AnyQt.QtWidgets import QTableView, QLineEdit, QHeaderView
from AnyQt.QtCore import Qt, QSortFilterProxyModel
from pandas import isnull
from sklearn.metrics.pairwise import cosine_similarity

from orangecontrib.text import Corpus
Expand All @@ -30,9 +24,7 @@
)


def _word_frequency(
corpus: Corpus, words: List[str], callback: Callable
) -> np.ndarray:
def _word_frequency(corpus: Corpus, words: List[str], callback: Callable) -> np.ndarray:
res = []
tokens = corpus.tokens
for i, t in enumerate(tokens):
Expand Down Expand Up @@ -116,22 +108,31 @@ def _preprocess_words(
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
"""
# workaround to preprocess words
# TODO: preprocessors currently work only on a corpus; if more cases like
# this appear, consider implementing preprocessors that accept a list
# of strings
words_feature = StringVariable("words")
words_c = Corpus(
Domain([], metas=[words_feature]),
metas=np.array([[w] for w in words]),
text_features=[words_feature],
)
# only transformers and normalizers preprocess on the word level
pps = [
pp
for pp in corpus.used_preprocessor.preprocessors
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
for i, pp in enumerate(pps):
# TODO: _preprocess is protected make it public
words = [pp._preprocess(w) for w in words]
words_c = pp(words_c)
callback((i + 1) / len(pps))
return words
return [w[0] for w in words_c.tokens if len(w)]


def _run(
corpus: Corpus,
words: Table,
words: List[str],
scoring_methods: List[str],
aggregation: str,
additional_params: dict,
Expand Down Expand Up @@ -163,21 +164,19 @@ def callback(i: float) -> None:

cb_part = 1 / (len(scoring_methods) + 1) # +1 for preprocessing

words = _preprocess_words(
corpus, words, wrap_callback(callback, end=cb_part)
)
words = _preprocess_words(corpus, words, wrap_callback(callback, end=cb_part))
if len(words) == 0:
raise Exception(
"Empty word list after preprocessing. Please provide a valid set of words."
)
for i, sm in enumerate(scoring_methods):
scoring_method = SCORING_METHODS[sm][1]
sig = signature(scoring_method)
add_params = {
k: v for k, v in additional_params.items() if k in sig.parameters
}
add_params = {k: v for k, v in additional_params.items() if k in sig.parameters}
scs = scoring_method(
corpus,
words,
wrap_callback(
callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part
),
wrap_callback(callback, start=(i + 1) * cb_part, end=(i + 2) * cb_part),
**add_params
)
scs = AGGREGATIONS[aggregation](scs, axis=1)
Expand All @@ -202,7 +201,8 @@ def update_column_widths(self) -> None:
"""
header = self.horizontalHeader()
col_width = max(
[0] + [
[0]
+ [
max(self.sizeHintForColumn(i), header.sectionSizeHint(i))
for i in range(1, self.model().columnCount())
]
Expand All @@ -223,10 +223,7 @@ def _convert(text: str) -> Union[str, int]:

@staticmethod
def _alphanum_key(key: str) -> List[Union[str, int]]:
return [
ScoreDocumentsProxyModel._convert(c)
for c in re.split("([0-9]+)", key)
]
return [ScoreDocumentsProxyModel._convert(c) for c in re.split("([0-9]+)", key)]

def lessThan(self, left_ind, right_ind):
"""
Expand Down Expand Up @@ -281,7 +278,7 @@ class Warning(OWWidget.Warning):
corpus_not_normalized = Msg("Use Preprocess Text to normalize corpus.")

class Error(OWWidget.Error):
unknown_err = Msg("{}")
custom_err = Msg("{}")

def __init__(self):
OWWidget.__init__(self)
Expand Down Expand Up @@ -391,10 +388,7 @@ def avg_len(attr):

@Inputs.words
def set_words(self, words: Table) -> None:
if (
words is None
or len(words.domain.variables + words.domain.metas) == 0
):
if words is None or len(words.domain.variables + words.domain.metas) == 0:
self.words = None
else:
self.Warning.missing_words.clear()
Expand All @@ -418,11 +412,7 @@ def _gather_scores(self) -> Tuple[np.ndarray, List[str]]:
scorers = self._get_active_scorers()
methods = [m for m in scorers if (m, aggregation) in self.scores]
scores = [self.scores[(m, aggregation)] for m in methods]
scores = (
np.column_stack(scores)
if scores
else np.empty((len(self.corpus), 0))
)
scores = np.column_stack(scores) if scores else np.empty((len(self.corpus), 0))
labels = [SCORING_METHODS[m][0] for m in methods]
return scores, labels

Expand Down Expand Up @@ -466,13 +456,13 @@ def _fill_table(self, scores: np.ndarray, labels: List[str]) -> None:
self.view.horizontalHeader().setSortIndicator(*self.sort_column_order)

def _fill_and_output(self) -> None:
""" Fill the table in the widget and send the output """
"""Fill the table in the widget and send the output"""
scores, labels = self._gather_scores()
self._fill_table(scores, labels)
self._send_output(scores, labels)

def _clear_and_run(self) -> None:
""" Clear cached scores and commit """
"""Clear cached scores and commit"""
self.scores = {}
self.cancel()
self._fill_and_output()
Expand All @@ -482,7 +472,7 @@ def __setting_changed(self) -> None:
self.commit()

def commit(self) -> None:
self.Error.unknown_err.clear()
self.Error.custom_err.clear()
self.cancel()
if self.corpus is None and self.words is None:
return
Expand All @@ -493,9 +483,7 @@ def commit(self) -> None:
else:
scorers = self._get_active_scorers()
aggregation = self._get_active_aggregation()
new_scores = [
s for s in scorers if (s, aggregation) not in self.scores
]
new_scores = [s for s in scorers if (s, aggregation) not in self.scores]
if new_scores:
self.start(
_run,
Expand All @@ -522,7 +510,8 @@ def on_partial_result(self, result: Tuple[str, str, np.ndarray]) -> None:
self._fill_table(scores, labels)

def on_exception(self, ex: Exception) -> None:
self.Error.unknown_err(ex)
self.Error.custom_err(ex)
self._fill_and_output()

def _get_active_scorers(self) -> List[str]:
"""
Expand Down
105 changes: 77 additions & 28 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
import unittest
from math import isclose
from typing import List, Union
from typing import List
from unittest.mock import patch

import numpy as np
from AnyQt.QtCore import Qt
from Orange.data import ContinuousVariable, Domain, StringVariable, Table
from Orange.misc.collections import natural_sorted
from Orange.util import dummy_callback
from Orange.widgets.tests.base import WidgetTest
from Orange.data import Table, StringVariable, Domain, ContinuousVariable
from Orange.widgets.tests.utils import simulate
from Orange.misc.collections import natural_sorted

from orangecontrib.text import Corpus
from orangecontrib.text import preprocess
from orangecontrib.text import Corpus, preprocess
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
from orangecontrib.text.widgets.owscoredocuments import (
OWScoreDocuments,
_preprocess_words,
)


def embedding_mock(_, corpus, __):
Expand Down Expand Up @@ -48,6 +51,7 @@ def setUp(self) -> None:
pp_list = [
preprocess.LowercaseTransformer(),
preprocess.StripAccentsTransformer(),
preprocess.UrlRemover(),
preprocess.SnowballStemmer(),
]
for p in pp_list:
Expand All @@ -59,9 +63,7 @@ def setUp(self) -> None:

def test_set_data(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.assertEqual(
[x[0] for x in self.widget.model], self.corpus.titles.tolist()
)
self.assertEqual([x[0] for x in self.widget.model], self.corpus.titles.tolist())
self.assertTrue(self.widget.Warning.missing_words.is_shown())

self.send_signal(self.widget.Inputs.words, self.words)
Expand All @@ -71,12 +73,8 @@ def test_set_data(self):
self.assertTrue(all(len(x) == 2 for x in self.widget.model))

output = self.get_output(self.widget.Outputs.corpus)
self.assertTupleEqual(
output.domain.variables, self.corpus.domain.variables
)
self.assertTupleEqual(
output.domain.metas[:-1], self.corpus.domain.metas
)
self.assertTupleEqual(output.domain.variables, self.corpus.domain.variables)
self.assertTupleEqual(output.domain.metas[:-1], self.corpus.domain.metas)
self.assertEqual(str(output.domain.metas[-1]), "Word count")
self.assertEqual(len(output), len(self.corpus))

Expand All @@ -101,12 +99,8 @@ def test_guess_word_attribute(self):
w = StringVariable("Words")
w.attributes["type"] = "words"
w1 = StringVariable("Words 1")
words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape(
(-1, 1)
)
words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape(
(-1, 1)
)
words = np.array(["house", "doctor", "boy", "way", "Rum"]).reshape((-1, 1))
words1 = np.array(["house", "doctor1", "boy", "way", "Rum"]).reshape((-1, 1))

# guess by attribute type
self.words = Table(
Expand All @@ -128,9 +122,7 @@ def test_guess_word_attribute(self):

# guess by length
w2 = StringVariable("Words 2")
words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape(
(-1, 1)
)
words2 = np.array(["house 1", "doctor 1", "boy", "way", "Rum"]).reshape((-1, 1))
self.words = Table(
Domain([], metas=[w2, w1]),
np.empty((len(words), 0)),
Expand Down Expand Up @@ -183,7 +175,7 @@ def test_change_scorer(self):

@staticmethod
def create_corpus(texts: List[str]) -> Corpus:
""" Create sample corpus with texts passed """
"""Create sample corpus with texts passed"""
text_var = StringVariable("Text")
domain = Domain([], metas=[text_var])
c = Corpus(
Expand Down Expand Up @@ -235,9 +227,7 @@ def test_word_appearance(self):
self.widget.controls.word_frequency.click()
self.widget.controls.word_appearance.click()
self.wait_until_finished()
self.assertListEqual(
[x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1]
)
self.assertListEqual([x[1] for x in self.widget.model], [2 / 3, 2 / 3, 1])

cb_aggregation = self.widget.controls.aggregation
simulate.combobox_activate_item(cb_aggregation, "Max")
Expand Down Expand Up @@ -297,6 +287,65 @@ def test_sort_table(self):
data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])

def test_preprocess_words(self):
    """Check `_preprocess_words` against a corpus with known preprocessors.

    Runs in two stages on the same corpus: first with lowercase, accent
    stripping, URL removal and HTML stripping applied, then with a
    Snowball stemmer applied on top of those.
    """
    corpus = Corpus.from_file("book-excerpts")

    # Stage 1: text transformers only.
    transformers = (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    )
    for transformer in transformers:
        corpus = transformer(corpus)

    raw_words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    # The URL-only entry is expected to be missing from the result, so the
    # output is one item shorter than the input.
    expected = ["house", "doctor", "boy", "way", "rum", "abracadabra"]
    self.assertListEqual(
        expected, _preprocess_words(corpus, raw_words, dummy_callback)
    )

    # Stage 2: additionally stem the (already transformed) corpus.
    stemmer = preprocess.SnowballStemmer()
    corpus = stemmer(corpus)

    raw_words = ["House", "dóctor", "boys", "way", "Rum"]
    expected = ["hous", "doctor", "boy", "way", "rum"]
    self.assertListEqual(
        expected, _preprocess_words(corpus, raw_words, dummy_callback)
    )

def test_no_words_after_preprocess(self):
    """Check the widget error shown when preprocessing leaves no words.

    A word list holding only a URL must raise the custom error with the
    "empty word list" message; once a regular word is added to the input,
    the error must no longer be shown.
    """

    def words_table(values):
        # Single meta column named "Words", one row per value, no features.
        column = np.array(values).reshape((-1, 1))
        domain = Domain([], metas=[StringVariable("Words")])
        return Table(domain, np.empty((len(column), 0)), metas=column)

    widget = self.widget
    error = widget.Error.custom_err

    # Only a URL on the words input.
    url_only = words_table(["https://google.com"])
    self.send_signal(widget.Inputs.corpus, self.corpus)
    self.send_signal(widget.Inputs.words, url_only)
    self.wait_until_finished()

    self.assertTrue(error.is_shown())
    self.assertEqual(
        "Empty word list after preprocessing. Please provide a valid set of words.",
        str(error),
    )

    # Replace the words input with one that also contains a regular word.
    url_and_word = words_table(["https://google.com", "house"])
    self.send_signal(widget.Inputs.words, url_and_word)
    self.wait_until_finished()

    self.assertFalse(error.is_shown())

def test_sort_setting(self):
"""
Test if sorting is correctly memorized in setting and restored
Expand Down

0 comments on commit 7a5f5fe

Please sign in to comment.