Skip to content

Commit

Permalink
Score documents: fix word preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 13, 2021
1 parent 00f71d0 commit ffc33fe
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 8 deletions.
19 changes: 13 additions & 6 deletions orangecontrib/text/widgets/owscoredocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
Domain,
StringVariable,
ContinuousVariable,
DiscreteVariable,
)
from Orange.util import wrap_callback
from Orange.widgets.utils.concurrent import ConcurrentWidgetMixin, TaskState
Expand Down Expand Up @@ -116,22 +115,31 @@ def _preprocess_words(
with words preprocessors that change words (e.g. normalization) must
be applied to words too.
"""
# workaround to preprocess words
# TODO: preprocessors currently work only on a corpus; if more cases like
# this appear, consider implementing preprocessors that accept a list
# of strings
words_feature = StringVariable("words")
words_c = Corpus(
Domain([], metas=[words_feature]),
metas=np.array([[w] for w in words]),
text_features=[words_feature]
)
# only transformers and normalizers preprocess on the word level
pps = [
pp
for pp in corpus.used_preprocessor.preprocessors
if isinstance(pp, (BaseTransformer, BaseNormalizer))
]
for i, pp in enumerate(pps):
# TODO: _preprocess is protected make it public
words = [pp._preprocess(w) for w in words]
words_c = pp(words_c)
callback((i + 1) / len(pps))
return words
return [w[0] for w in words_c.tokens]


def _run(
corpus: Corpus,
words: Table,
words: List[str],
scoring_methods: List[str],
aggregation: str,
additional_params: dict,
Expand All @@ -155,7 +163,6 @@ def _run(
state
TaskState for reporting the task status and giving partial results
"""

def callback(i: float) -> None:
state.set_progress_value(i * 100)
if state.is_interruption_requested():
Expand Down
44 changes: 42 additions & 2 deletions orangecontrib/text/widgets/tests/test_owscoredocuments.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import unittest
from math import isclose
from typing import List, Union
from typing import List
from unittest.mock import patch

import numpy as np
from AnyQt.QtCore import Qt
from Orange.util import dummy_callback
from Orange.widgets.tests.base import WidgetTest
from Orange.data import Table, StringVariable, Domain, ContinuousVariable
from Orange.widgets.tests.utils import simulate
Expand All @@ -13,7 +14,10 @@
from orangecontrib.text import Corpus
from orangecontrib.text import preprocess
from orangecontrib.text.vectorization.document_embedder import DocumentEmbedder
from orangecontrib.text.widgets.owscoredocuments import OWScoreDocuments
from orangecontrib.text.widgets.owscoredocuments import (
OWScoreDocuments,
_preprocess_words,
)


def embedding_mock(_, corpus, __):
Expand Down Expand Up @@ -297,6 +301,42 @@ def test_sort_table(self):
data = [model.data(model.index(i, 0)) for i in range(model.rowCount())]
self.assertListEqual(data, natural_sorted(self.corpus.titles)[::-1])

def test_preprocess_words(self):
    """
    Check that `_preprocess_words` transforms a list of words according to
    the preprocessors that were applied to the corpus.

    The test runs in two stages on the same corpus: first with a set of
    transformers (lowercase, accent stripping, URL and HTML removal), then
    with a Snowball stemmer applied on top of the already transformed corpus.
    """
    corpus = Corpus.from_file("book-excerpts")

    # stage 1: transformers only
    transformers = [
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    ]
    for transformer in transformers:
        corpus = transformer(corpus)

    raw_words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    expected = ["house", "doctor", "boy", "way", "rum", "abracadabra"]
    result = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(expected, result)

    # stage 2: add a normalizer on top of the transformers used above
    normalizers = [preprocess.SnowballStemmer()]
    for normalizer in normalizers:
        corpus = normalizer(corpus)

    raw_words = ["House", "dóctor", "boys", "way", "Rum"]
    expected = ["hous", "doctor", "boy", "way", "rum"]
    result = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(expected, result)


if __name__ == "__main__":
unittest.main()

0 comments on commit ffc33fe

Please sign in to comment.