Skip to content

Commit

Permalink
Merge pull request #708 from ajdapretnar/lemmagen
Browse files Browse the repository at this point in the history
OWPreprocess Text: Add Lemmagen normalizer
  • Loading branch information
PrimozGodec authored Aug 23, 2021
2 parents f306d03 + 9d1a370 commit 994ff6a
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/widgets/preprocesstext.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Preprocesses corpus with selected methods.
- [Snowball Stemmer](http://snowballstem.org/) applies an improved version of Porter stemmer (Porter2). Set the language for normalization, default is English.
- [WordNet Lemmatizer](http://wordnet.princeton.edu/) applies a network of cognitive synonyms to tokens based on a large lexical database of English.
- [UDPipe](http://ufal.mff.cuni.cz/udpipe/1) applies a [pre-trained model](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2998?show=full) for normalizing data.
- [Lemmagen](https://github.com/vpodpecan/lemmagen3) applies a pre-trained model for normalizing data.
5. **Filtering** removes or keeps a selection of words.
- *Stopwords* removes stopwords from text (e.g. removes 'and', 'or', 'in'...). Select the language to filter by, English is set as default. You can also load your own list of stopwords provided in a simple \*.txt file with one stopword per line.
![](images/stopwords.png)
Expand Down
42 changes: 39 additions & 3 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import json
import ufal.udpipe as udpipe
from lemmagen3 import Lemmatizer
import serverfiles
from nltk import stem
from requests.exceptions import ConnectionError
Expand All @@ -14,7 +15,7 @@
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor

__all__ = ['BaseNormalizer', 'WordNetLemmatizer', 'PorterStemmer',
'SnowballStemmer', 'UDPipeLemmatizer']
'SnowballStemmer', 'UDPipeLemmatizer', 'LemmagenLemmatizer']


class BaseNormalizer(TokenizedPreprocessor):
Expand Down Expand Up @@ -52,7 +53,8 @@ class PorterStemmer(BaseNormalizer):

class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
supported_languages = [l.capitalize() for l in stem.SnowballStemmer.languages]
supported_languages = [l.capitalize() for l in
stem.SnowballStemmer.languages]

def __init__(self, language='English'):
self.normalizer = stem.SnowballStemmer(language.lower())
Expand All @@ -70,7 +72,7 @@ def file_to_name(file):


def file_to_language(file):
    """Convert a UDPipe model file name into a human-readable language name.

    Model file names look like ``english-ewt-ud-2.0.udpipe``: everything
    before the ``ud`` marker (minus the separator immediately before it)
    is the language part.

    :param file: the model file name
    :return: the language name with ``-``/``_`` turned into spaces and the
        first letter capitalized (e.g. ``"Ancient greek"``)
    """
    # NOTE: the span previously contained both the old and the new version
    # of this return statement (diff residue); only one is kept.
    return file[:file.find('ud') - 1] \
        .replace('-', ' ').replace('_', ' ').capitalize()


Expand Down Expand Up @@ -184,3 +186,37 @@ def __getstate__(self):
state['_UDPipeLemmatizer__model'] = None
state['_UDPipeLemmatizer__output_format'] = None
return state


class LemmagenLemmatizer(BaseNormalizer):
    """Normalize tokens with a pre-trained Lemmagen3 model.

    Wraps :class:`lemmagen3.Lemmatizer`; the model to load is selected by
    a human-readable language name via ``lemmagen_languages``.
    """
    name = 'Lemmagen Lemmatizer'
    # Display name -> two-letter code accepted by lemmagen3.Lemmatizer.
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk"
    }

    def __init__(self, language='English'):
        # Resolve the language name to its code, then load the model.
        lang_code = self.lemmagen_languages[language]
        self.lemmatizer = Lemmatizer(lang_code)

    def normalizer(self, token):
        """Return the lemma of *token* (or *token* itself if none found)."""
        lemma = self.lemmatizer.lemmatize(token)
        # Lemmagen occasionally yields an empty string; fall back to the
        # original token in that case so no token is dropped.
        return lemma or token
21 changes: 19 additions & 2 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import nltk
from gensim import corpora
from lemmagen3 import Lemmatizer
from requests.exceptions import ConnectionError
import numpy as np

Expand Down Expand Up @@ -78,7 +79,8 @@ def _preprocess(cls, string):
return string.split()

p = SpaceTokenizer()
array = np.array([sent.split() for sent in self.corpus.documents])
array = np.array([sent.split() for sent in self.corpus.documents],
dtype=object)
np.testing.assert_equal(p(self.corpus).tokens, array)

def test_token_normalizer(self):
Expand All @@ -101,7 +103,7 @@ def _check(self, token):

p = LengthFilter()
tokens = np.array([[token for token in doc.split() if len(token) < 4]
for doc in self.corpus.documents])
for doc in self.corpus.documents], dtype=object)
np.testing.assert_equal(p(self.corpus).tokens, tokens)

def test_inplace(self):
Expand Down Expand Up @@ -238,6 +240,13 @@ def test_call_UDPipe(self):
self.assertTrue(corpus.has_tokens())
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)

def test_call_lemmagen(self):
    # Applying the Lemmagen lemmatizer to an untokenized corpus should
    # produce tokens and register itself in the preprocessor chain.
    lemmatizer = preprocess.LemmagenLemmatizer()
    self.assertFalse(self.corpus.has_tokens())
    processed = lemmatizer(self.corpus)
    self.assertTrue(processed.has_tokens())
    self.assertEqual(len(processed.used_preprocessor.preprocessors), 2)

def test_function(self):
stemmer = preprocess.BaseNormalizer()
stemmer.normalizer = lambda x: x[:-1]
Expand Down Expand Up @@ -291,6 +300,14 @@ def test_udpipe_deepcopy(self):
self.assertEqual(list(copied(self.corpus).tokens[0]),
['gora', 'na', 'gora', 'hiša', 'goreti'])

def test_lemmagen(self):
    # The preprocessor must agree with a directly-instantiated
    # lemmagen3 Lemmatizer for the same (Slovenian) model.
    word = 'veselja'
    expected = Lemmatizer("sl").lemmatize(word)
    normalizer = preprocess.LemmagenLemmatizer('Slovenian')
    self.assertEqual(normalizer._preprocess(word), expected)


class UDPipeModelsTests(unittest.TestCase):
def test_label_transform(self):
Expand Down
32 changes: 29 additions & 3 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,11 +384,12 @@ def __repr__(self):


class NormalizationModule(SingleMethodModule):
Porter, Snowball, WordNet, UDPipe = range(4)
Porter, Snowball, WordNet, UDPipe, Lemmagen = range(5)
Methods = {Porter: PorterStemmer,
Snowball: SnowballStemmer,
WordNet: WordNetLemmatizer,
UDPipe: UDPipeLemmatizer}
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_USE_TOKE = False
Expand All @@ -397,6 +398,7 @@ def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
Expand All @@ -410,6 +412,10 @@ def __init__(self, parent=None, **kwargs):
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
)

label = QLabel("Language:")
label.setAlignment(Qt.AlignRight | Qt.AlignVCenter)
Expand All @@ -425,6 +431,11 @@ def __init__(self, parent=None, **kwargs):
self.layout().setColumnStretch(2, 1)
self.__enable_udpipe()

label = QLabel("Language:")
label.setAlignment(Qt.AlignRight | Qt.AlignVCenter)
self.layout().addWidget(label, self.Lemmagen, 1)
self.layout().addWidget(self.__combo_lemm, self.Lemmagen, 2)

def __enable_udpipe(self):
enable = bool(self.__combo_udl.items)
layout = self.layout() # type: QGridLayout
Expand All @@ -441,6 +452,8 @@ def setParameters(self, params: Dict):
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
lemmagen_lang = params.get("lemmagen_language", self.DEFAULT_LANGUAGE)
self.__set_lemmagen_lang(lemmagen_lang)

def _set_method(self, method: int):
super()._set_method(method)
Expand All @@ -462,6 +475,14 @@ def __set_udpipe_lang(self, language: str):
if self.method == self.UDPipe:
self.edited.emit()

def __set_lemmagen_lang(self, language: str):
    # No-op when the language is unchanged (guard clause).
    if self.__lemmagen_lang == language:
        return
    self.__lemmagen_lang = language
    # Keep the combo box in sync with the stored setting.
    self.__combo_lemm.setCurrentText(language)
    self.changed.emit()
    # Only an active Lemmagen method counts as a user-visible edit.
    if self.method == self.Lemmagen:
        self.edited.emit()

def __set_use_tokenizer(self, use: bool):
if self.__use_tokenizer != use:
self.__use_tokenizer = use
Expand All @@ -474,7 +495,8 @@ def parameters(self) -> Dict:
params = super().parameters()
params.update({"snowball_language": self.__snowball_lang,
"udpipe_language": self.__udpipe_lang,
"udpipe_tokenizer": self.__use_tokenizer})
"udpipe_tokenizer": self.__use_tokenizer,
"lemmagen_language": self.__lemmagen_lang})
return params

@staticmethod
Expand All @@ -488,6 +510,8 @@ def createinstance(params: Dict) -> BaseNormalizer:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_lang),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
return NormalizationModule.Methods[method](**args)

def __repr__(self):
Expand All @@ -497,6 +521,8 @@ def __repr__(self):
elif self.method == self.UDPipe:
text = f"{text} ({self.__udpipe_lang}, " \
f"Tokenize: {['No', 'Yes'][self.__use_tokenizer]})"
elif self.method == self.Lemmagen:
text = f"{text} ({self.__lemmagen_lang})"
return text


Expand Down
7 changes: 7 additions & 0 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,10 @@ def combo_sbl(self):
def combo_udl(self):
return self.editor._NormalizationModule__combo_udl

@property
def combo_lemm(self):
    # Reach the editor's private combo box through its name-mangled
    # attribute (class-private names mangle to _ClassName__attr).
    return getattr(self.editor, "_NormalizationModule__combo_lemm")

@property
def check_use(self):
    # Reach the editor's private "UDPipe tokenizer" checkbox through its
    # name-mangled attribute (class-private names mangle to _ClassName__attr).
    return getattr(self.editor, "_NormalizationModule__check_use")
Expand All @@ -374,18 +378,21 @@ def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"udpipe_tokenizer": False}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
    # Round-trip a full parameter dict through the editor and verify the
    # widgets reflect each language/tokenizer setting.
    expected = {
        "method": NormalizationModule.UDPipe,
        "snowball_language": "Dutch",
        "udpipe_language": "Finnish",
        "lemmagen_language": "Bulgarian",
        "udpipe_tokenizer": True,
    }
    self.editor.setParameters(expected)
    self.assertDictEqual(self.editor.parameters(), expected)
    self.assertEqual(self.combo_sbl.currentText(), "Dutch")
    self.assertEqual(self.combo_udl.currentText(), "Finnish")
    self.assertEqual(self.combo_lemm.currentText(), "Bulgarian")
    self.assertTrue(self.check_use.isChecked())

def test_createinstance(self):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ ufal.udpipe >=1.2.0.3
orange-widget-base >=4.12.0
yake
conllu
lemmagen3

0 comments on commit 994ff6a

Please sign in to comment.