diff --git a/orangecontrib/text/tests/test_bowvectorizer.py b/orangecontrib/text/tests/test_bowvectorizer.py
index cdc38d550..5eb0aa6f2 100644
--- a/orangecontrib/text/tests/test_bowvectorizer.py
+++ b/orangecontrib/text/tests/test_bowvectorizer.py
@@ -31,6 +31,13 @@ def test_empty_tokens(self):
 
         self.assertIs(corpus, bag_of_words)
 
+    def test_store_tokens(self):
+        corpus = Corpus.from_file('deerwester')
+        self.assertFalse(corpus.has_tokens())
+
+        bag_of_words = BowVectorizer().transform(corpus, copy=False)
+        self.assertTrue(bag_of_words.has_tokens())
+
     def test_domain(self):
         vect = BowVectorizer()
         corpus = Corpus.from_file('deerwester')
diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index ed51ed2eb..f44ca2883 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -73,7 +73,11 @@ def _transform(self, corpus, source_dict=None, callback=dummy_callback):
         if len(corpus) == 0:
             return corpus
         temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
-        dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
+        if not source_dict:
+            corpus.store_tokens(temp_corpus)
+            dic = corpora.Dictionary(temp_corpus, prune_at=None)
+        else:
+            dic = source_dict
         if len(dic) == 0:
             return corpus
         callback(0.3)