Skip to content

Commit

Permalink
[FIX] Corpus widget - preserve corpus's preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Feb 28, 2023
1 parent ef1dd29 commit 5dbdc5c
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 10 deletions.
26 changes: 16 additions & 10 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,21 +321,27 @@ def remove_duplicates(l):
return unique

if self.corpus is not None:
self.corpus.set_text_features(
remove_duplicates(self.used_attrs_model))
# The corpus must be copied so that the original's properties are preserved.
# Example: if the user selects different text features, set_text_features
# would reset the preprocessing in place; when the user selects the initial
# features again, we want the preprocessing to still be preserved.
corpus = self.corpus.copy()
corpus.set_text_features(remove_duplicates(self.used_attrs_model))
self.used_attrs = list(self.used_attrs_model)

if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
if len(self.unused_attrs_model) > 0 and not corpus.text_features:
self.Error.no_text_features_used()

self.corpus.set_title_variable(self.title_variable)
self.corpus.attributes["language"] = LANG2ISO[self.language]
corpus.set_title_variable(self.title_variable)
corpus.attributes["language"] = LANG2ISO[self.language]
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)
dom = corpus.domain
empty = (
not (dom.variables or dom.metas)
or len(corpus) == 0
or not corpus.text_features
)
self.Outputs.corpus.send(corpus if not empty else None)

def send_report(self):
def describe(features):
Expand Down
46 changes: 46 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tempfile
import unittest

import numpy as np
Expand All @@ -7,6 +8,7 @@
from Orange.widgets.tests.utils import simulate

from orangecontrib.text import Corpus
from orangecontrib.text.preprocess import RegexpTokenizer
from orangecontrib.text.widgets.owcorpus import OWCorpus


Expand Down Expand Up @@ -335,6 +337,50 @@ def test_language_unpickle(self):
self.wait_until_finished()
self.assertEqual(self.widget.language, "English")

def test_preserve_preprocessing(self):
    """Tokens of a preprocessed input corpus are kept, dropped, then kept again.

    The corpus is tokenized, sent to the widget's data input, and the
    widget's corpus output is inspected after each change made to the
    model of used text attributes.
    """
    tokenized = RegexpTokenizer()(Corpus.from_file("andersen"))
    widget = self.widget
    corpus_output = widget.Outputs.corpus
    extra_feature = tokenized.domain.metas[0]

    # initial feature selection: the output is expected to still have tokens
    self.send_signal(widget.Inputs.data, tokenized)
    self.assertTrue(self.get_output(corpus_output).has_tokens())

    # one more text feature in use: preprocessing is expected to be reset
    widget.used_attrs_model.append(extra_feature)
    self.assertFalse(self.get_output(corpus_output).has_tokens())

    # the added feature is taken out again: tokens are expected to be back
    widget.used_attrs_model.remove(extra_feature)
    self.assertTrue(self.get_output(corpus_output).has_tokens())

def test_preserve_preprocessing_from_file(self):
    """Tokens of a preprocessed corpus saved to and opened from a file are kept.

    A tokenized corpus is saved as a pickle in a temporary directory and
    opened through the widget's file widget; the corpus output is then
    inspected after each change made to the model of used text attributes.
    """
    tokenized = RegexpTokenizer()(Corpus.from_file("andersen"))
    widget = self.widget
    corpus_output = widget.Outputs.corpus
    extra_feature = tokenized.domain.metas[0]

    with tempfile.TemporaryDirectory() as scratch_dir:
        pickle_path = os.path.join(scratch_dir, "andersen.pkl")
        tokenized.save(pickle_path)
        widget.file_widget.open_file(pickle_path)

    # initial feature selection: the output is expected to still have tokens
    # NOTE(review): the same corpus is also sent through the data input here;
    # confirm whether the output then comes from the opened file or the input.
    self.send_signal(widget.Inputs.data, tokenized)
    self.assertTrue(self.get_output(corpus_output).has_tokens())

    # one more text feature in use: preprocessing is expected to be reset
    widget.used_attrs_model.append(extra_feature)
    self.assertFalse(self.get_output(corpus_output).has_tokens())

    # the added feature is taken out again: tokens are expected to be back
    widget.used_attrs_model.remove(extra_feature)
    self.assertTrue(self.get_output(corpus_output).has_tokens())


# Run this module's tests through unittest when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 5dbdc5c

Please sign in to comment.