Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

OWPreprocess Text: Add Lemmagen normalizer #708

Merged
merged 5 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/widgets/preprocesstext.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Preprocesses corpus with selected methods.
- [Snowball Stemmer](http://snowballstem.org/) applies an improved version of Porter stemmer (Porter2). Set the language for normalization, default is English.
- [WordNet Lemmatizer](http://wordnet.princeton.edu/) applies a network of cognitive synonyms to tokens based on a large lexical database of English.
- [UDPipe](http://ufal.mff.cuni.cz/udpipe/1) applies a [pre-trained model](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2998?show=full) for normalizing data.
- [Lemmagen](https://github.com/vpodpecan/lemmagen3) applies a pre-trained model for normalizing data.
5. **Filtering** removes or keeps a selection of words.
- *Stopwords* removes stopwords from text (e.g. removes 'and', 'or', 'in'...). Select the language to filter by, English is set as default. You can also load your own list of stopwords provided in a simple \*.txt file with one stopword per line.
![](images/stopwords.png)
Expand Down
42 changes: 39 additions & 3 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import json
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
import serverfiles
from nltk import stem
from requests.exceptions import ConnectionError
Expand All @@ -14,7 +15,7 @@
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor

__all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer',
'SnowballStemmer', 'UDPipeLemmatizer']
'SnowballStemmer', 'UDPipeLemmatizer', 'LemmagenLemmatizer']


class BaseNormalizer(TokenizedPreprocessor):
Expand Down Expand Up @@ -52,7 +53,8 @@ class PorterStemmer(BaseNormalizer):

class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
supported_languages = [l.capitalize() for l in stem.SnowballStemmer.languages]
supported_languages = [l.capitalize() for l in
stem.SnowballStemmer.languages]

def __init__(self, language='English'):
self.normalizer = stem.SnowballStemmer(language.lower())
Expand All @@ -70,7 +72,7 @@ def file_to_name(file):


def file_to_language(file):
return file[:file.find('ud')-1]\
return file[:file.find('ud') - 1] \
.replace('-', ' ').replace('_', ' ').capitalize()


Expand Down Expand Up @@ -184,3 +186,37 @@ def __getstate__(self):
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
return state


class LemmagenLemmatizer(BaseNormalizer):
    """Token normalizer backed by a pre-trained Lemmagen (lemmagen3) model.

    The model for the requested language is loaded once at construction
    time; ``normalizer`` then lemmatizes one token at a time.
    """
    name = 'Lemmagen Lemmatizer'
    # User-facing language name -> two-letter lemmagen3 model code.
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk"
    }

    def __init__(self, language='English'):
        # Resolve the model code and load the matching model eagerly.
        self.lemmatizer = Lemmatizer(self.lemmagen_languages[language])

    def normalizer(self, token):
        """Return the lemma of ``token``; fall back to ``token`` itself
        when Lemmagen yields an empty string."""
        return self.lemmatizer.lemmatize(token) or token
21 changes: 19 additions & 2 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import nltk
from gensim import corpora
from lemmagen3 import Lemmatizer
from requests.exceptions import ConnectionError
import numpy as np

Expand Down Expand Up @@ -78,7 +79,8 @@ def _preprocess(cls, string):
return string.split()

p = SpaceTokenizer()
array = np.array([sent.split() for sent in self.corpus.documents])
array = np.array([sent.split() for sent in self.corpus.documents],
dtype=object)
np.testing.assert_equal(p(self.corpus).tokens, array)

def test_token_normalizer(self):
Expand All @@ -101,7 +103,7 @@ def _check(self, token):

p = LengthFilter()
tokens = np.array([[token for token in doc.split() if len(token) < 4]
for doc in self.corpus.documents])
for doc in self.corpus.documents], dtype=object)
np.testing.assert_equal(p(self.corpus).tokens, tokens)

def test_inplace(self):
Expand Down Expand Up @@ -238,6 +240,13 @@ def test_call_UDPipe(self):
self.assertTrue(corpus.has_tokens())
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_call_lemmagen(self):
    """Applying the Lemmagen preprocessor tokenizes the corpus and
    registers itself in the used-preprocessor chain."""
    lemmatizer = preprocess.LemmagenLemmatizer()
    self.assertFalse(self.corpus.has_tokens())
    processed = lemmatizer(self.corpus)
    self.assertTrue(processed.has_tokens())
    self.assertEqual(len(processed.used_preprocessor.preprocessors), 2)

def test_function(self):
stemmer = preprocess.BaseNormalizer()
stemmer.normalizer = lambda x: x[:-1]
Expand Down Expand Up @@ -291,6 +300,14 @@ def test_udpipe_deepcopy(self):
self.assertEqual(list(copied(self.corpus).tokens[0]),
['gora', 'na', 'gora', 'hiša', 'goreti'])

def test_lemmagen(self):
    """Slovenian lemmatization must agree with lemmagen3 used directly."""
    word = 'veselja'
    expected = Lemmatizer("sl").lemmatize(word)
    normalizer = preprocess.LemmagenLemmatizer('Slovenian')
    self.assertEqual(normalizer._preprocess(word), expected)


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down
32 changes: 29 additions & 3 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,11 +384,12 @@ def __repr__(self):


class NormalizationModule(SingleMethodModule):
Porter, Snowball, WordNet, UDPipe = range(4)
Porter, Snowball, WordNet, UDPipe, Lemmagen = range(5)
Methods = {Porter: PorterStemmer,
Snowball: SnowballStemmer,
WordNet: WordNetLemmatizer,
UDPipe: UDPipeLemmatizer}
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_USE_TOKE = False
Expand All @@ -397,6 +398,7 @@ def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
Expand All @@ -410,6 +412,10 @@ def __init__(self, parent=None, **kwargs):
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
)

label = QLabel("Language:")
label.setAlignment(Qt.AlignRight | Qt.AlignVCenter)
Expand All @@ -425,6 +431,11 @@ def __init__(self, parent=None, **kwargs):
self.layout().setColumnStretch(2, 1)
self.__enable_udpipe()

label = QLabel("Language:")
label.setAlignment(Qt.AlignRight | Qt.AlignVCenter)
self.layout().addWidget(label, self.Lemmagen, 1)
self.layout().addWidget(self.__combo_lemm, self.Lemmagen, 2)

def __enable_udpipe(self):
enable = bool(self.__combo_udl.items)
layout = self.layout() # type: QGridLayout
Expand All @@ -441,6 +452,8 @@ def setParameters(self, params: Dict):
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE)
self.__set_lemmagen_lang(lemmagen_lang)

def _set_method(self, method: int):
super()._set_method(method)
Expand All @@ -462,6 +475,14 @@ def __set_udpipe_lang(self, language: str):
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
    # Nothing to do when the language is unchanged.
    if self.__lemmagen_lang == language:
        return
    self.__lemmagen_lang = language
    self.__combo_lemm.setCurrentText(language)
    self.changed.emit()
    # Only an active Lemmagen method makes this an effective edit.
    if self.method == self.Lemmagen:
        self.edited.emit()

def __set_use_tokenizer(self, use: bool):
if self.__use_tokenizer != use:
self.__use_tokenizer = use
Expand All @@ -474,7 +495,8 @@ def parameters(self) -> Dict:
params = super().parameters()
params.update({"snowball_language": self.__snowball_lang,
"udpipe_language": self.__udpipe_lang,
"udpipe_tokenizer": self.__use_tokenizer})
"udpipe_tokenizer": self.__use_tokenizer,
"lemmagen_language": self.__lemmagen_lang})
return params

@staticmethod
Expand All @@ -488,6 +510,8 @@ def createinstance(params: Dict) -> BaseNormalizer:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_lang),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
return NormalizationModule.Methods[method](**args)

def __repr__(self):
Expand All @@ -497,6 +521,8 @@ def __repr__(self):
elif self.method == self.UDPipe:
text = f"{text} ({self.__udpipe_lang}, " \
f"Tokenize: {['No', 'Yes'][self.__use_tokenizer]})"
elif self.method == self.Lemmagen:
text = f"{text} ({self.__lemmagen_lang})"
return text


Expand Down
7 changes: 7 additions & 0 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,10 @@ def combo_sbl(self):
def combo_udl(self):
return self.editor._NormalizationModule__combo_udl

@property
def combo_lemm(self):
    """Editor's private Lemmagen language combo box (name-mangled access)."""
    combo = self.editor._NormalizationModule__combo_lemm
    return combo

@property
def check_use(self):
    """Editor's private UDPipe-tokenizer checkbox (name-mangled access)."""
    checkbox = self.editor._NormalizationModule__check_use
    return checkbox
Expand All @@ -374,18 +378,21 @@ def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"udpipe_tokenizer": False}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
    """setParameters must round-trip and update every language widget."""
    params = {"method": NormalizationModule.UDPipe,
              "snowball_language": "Dutch",
              "udpipe_language": "Finnish",
              "lemmagen_language": "Bulgarian",
              "udpipe_tokenizer": True}
    self.editor.setParameters(params)
    self.assertDictEqual(self.editor.parameters(), params)
    expectations = ((self.combo_sbl, "Dutch"),
                    (self.combo_udl, "Finnish"),
                    (self.combo_lemm, "Bulgarian"))
    for combo, language in expectations:
        self.assertEqual(combo.currentText(), language)
    self.assertTrue(self.check_use.isChecked())

def test_createinstance(self):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ ufal.udpipe >=1.2.0.3
orange-widget-base >=4.12.0
yake
conllu
lemmagen3