Skip to content

Commit

Permalink
Preprocess Text: add Spacy POS tagger
Browse files Browse the repository at this point in the history
  • Loading branch information
ajdapretnar committed Jul 19, 2024
1 parent d648299 commit a0ad22a
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 7 deletions.
87 changes: 85 additions & 2 deletions orangecontrib/text/tag/pos.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from typing import List, Callable
from typing import List, Callable, Tuple

import nltk
import spacy
from spacy.cli import info, download
from spacy.tokens import Doc
import numpy as np
from Orange.util import wrap_callback, dummy_callback

Expand All @@ -10,7 +13,38 @@
from orangecontrib.text.util import chunkable


__all__ = ["POSTagger", "AveragedPerceptronTagger", "MaxEntTagger"]
__all__ = ["POSTagger", "AveragedPerceptronTagger", "MaxEntTagger",
"SpacyPOSTagger"]


# Registry of spaCy pipelines, keyed by language code.
# Each entry holds:
#   "language"   - human-readable language name,
#   "package"    - name of the spaCy pipeline package (looked up by find_model),
#   "dependency" - presumably the extra Python package(s) the pipeline needs;
#                  note this is the *string* "None", not the None object, when
#                  nothing extra is listed - TODO confirm how it is consumed.
SPACY_MODELS = {
"ca": {"language": "Catalan", "package": "ca_core_news_sm", "dependency": "None"},
"zh": {"language": "Chinese", "package": "zh_core_web_sm", "dependency": "Jieba"},
"hr": {"language": "Croatian", "package": "hr_core_news_sm", "dependency": "None"},
"da": {"language": "Danish", "package": "da_core_news_sm", "dependency": "None"},
"nl": {"language": "Dutch", "package": "nl_core_news_sm", "dependency": "None"},
"en": {"language": "English", "package": "en_core_web_sm", "dependency": "None"},
"fi": {"language": "Finnish", "package": "fi_core_news_sm", "dependency": "None"},
"fr": {"language": "French", "package": "fr_core_news_sm", "dependency": "None"},
"de": {"language": "German", "package": "de_core_news_sm", "dependency": "None"},
"el": {"language": "Greek", "package": "el_core_news_sm", "dependency": "None"},
"it": {"language": "Italian", "package": "it_core_news_sm", "dependency": "None"},
"ja": {"language": "Japanese", "package": "ja_core_news_sm", "dependency": "SudachiPy"},
"ko": {"language": "Korean", "package": "ko_core_news_sm", "dependency": "None"},
"lt": {"language": "Lithuanian", "package": "lt_core_news_sm", "dependency": "None"},
"mk": {"language": "Macedonian", "package": "mk_core_news_sm", "dependency": "None"},
"xx": {"language": "Multi-language", "package": "xx_ent_wiki_sm", "dependency": "None"},
"nb": {"language": "Norwegian Bokmål", "package": "nb_core_news_sm", "dependency": "None"},
"pl": {"language": "Polish", "package": "pl_core_news_sm", "dependency": "None"},
"pt": {"language": "Portuguese", "package": "pt_core_news_sm", "dependency": "None"},
"ro": {"language": "Romanian", "package": "ro_core_news_sm", "dependency": "None"},
"ru": {"language": "Russian", "package": "ru_core_news_sm", "dependency": "pymorphy3"},
"sl": {"language": "Slovenian", "package": "sl_core_news_sm", "dependency": "None"},
"es": {"language": "Spanish", "package": "es_core_news_sm", "dependency": "None"},
"sv": {"language": "Swedish", "package": "sv_core_news_sm", "dependency": "None"},
"uk": {"language": "Ukrainian", "package": "uk_core_news_sm", "dependency":
"pymorphy3, pymorphy3-dicts-uk"}
}


class POSTagger(TokenizedPreprocessor):
Expand Down Expand Up @@ -52,3 +86,52 @@ class MaxEntTagger(POSTagger):
def __init__(self):
    """Load NLTK's English maxent treebank tagger resource and pass it on
    to the base class constructor."""
    super().__init__(
        nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
    )


def find_model(language: str) -> str:
    """Return the spaCy package name registered for ``language``.

    ``language`` must be a key of ``SPACY_MODELS`` (e.g. ``"en"``); an
    unknown key raises ``KeyError``.
    """
    model_entry = SPACY_MODELS[language]
    return model_entry["package"]


class SpacyModels:
    """Resolve language codes to spaCy packages, downloading missing ones.

    Indexing an instance with a language code (``models["en"]``) returns the
    name of the matching package and, if the package is not known to be
    available, downloads it first.
    """
    # Snapshot of the spaCy installation, taken once when the class body is
    # executed. It is NOT refreshed after a later download.
    installed_models_info = info()
    # Packages fetched through this class after the snapshot was taken.
    # Deliberately shared by all instances: without it, a package missing
    # from the snapshot would be downloaded again on every lookup.
    _downloaded_models = set()

    def __init__(self):
        self.installed_models = self.installed_models_info['pipelines']

    def __getitem__(self, language: str) -> str:
        """Return the package name for ``language``, downloading if needed.

        Raises ``KeyError`` (from ``find_model``) for an unknown language.
        """
        model = find_model(language)
        if (model not in self.installed_models
                and model not in self._downloaded_models):
            download(model)
            # Record the package only after ``download`` has returned, so a
            # failed attempt is retried on the next lookup.
            self._downloaded_models.add(model)
        return model


class SpacyPOSTagger(TokenizedPreprocessor):
    """POS tagger that labels pre-tokenized documents with a spaCy pipeline.

    The pipeline for the chosen language is resolved through ``SpacyModels``
    and loaded lazily the first time tags are requested; the loaded pipeline
    is then reused by later calls on the same instance.
    """
    name = 'Spacy POS Tagger'
    supported_languages = set(SPACY_MODELS.keys())

    def __init__(self, language: str = "en"):
        self.__language = language
        self.models = SpacyModels()
        # Loaded on first use, see ``__load_model``.
        self.__model = None

    def __load_model(self):
        """Return the spaCy pipeline, loading it on the first request.

        ``spacy.load`` is expensive, so the result is cached on the instance
        instead of being reloaded for every corpus.
        """
        if self.__model is None:
            self.__model = spacy.load(self.models[self.__language])
        return self.__model

    def __call__(self, corpus: Corpus, callback: Callable = None,
                 **kw) -> Corpus:
        """ Marks tokens of a corpus with POS tags. """
        if callback is None:
            callback = dummy_callback
        # The base-class call reports into the first 20 % of the progress.
        corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))

        assert corpus.has_tokens()
        callback(0.2, "POS Tagging...")
        tags = np.array(self.tag(corpus.tokens), dtype=object)
        corpus.pos_tags = tags
        return corpus

    def tag(self, tokens):
        """Return one list of ``pos_`` tags per token list in ``tokens``.

        Works even when called directly, before ``__call__``, because the
        pipeline is loaded on demand.
        """
        model = self.__load_model()
        tagged = []
        for token_list in tokens:
            # required for Spacy to work with pre-tokenized texts
            doc = Doc(model.vocab, words=token_list)
            tagged.append([token.pos_ for token in model(doc)])
        return tagged
58 changes: 53 additions & 5 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration, UDPipeModels
from orangecontrib.text.tag import AveragedPerceptronTagger, MaxEntTagger, \
POSTagger
from orangecontrib.text.tag import (AveragedPerceptronTagger, MaxEntTagger,
SpacyPOSTagger, POSTagger)

_DEFAULT_NONE = "(none)"

Expand Down Expand Up @@ -1033,15 +1033,63 @@ def createinstance(params: Dict) -> NGrams:


class POSTaggingModule(SingleMethodModule):
    """Preprocess-widget module for choosing a POS tagger.

    Offers the two NLTK-based taggers and the spaCy tagger; for the spaCy
    tagger a language combo box is shown and its value is stored under the
    ``"spacy_language"`` parameter.
    """
    Averaged, MaxEnt, Spacy = range(3)
    Methods = {Averaged: AveragedPerceptronTagger,
               MaxEnt: MaxEntTagger,
               Spacy: SpacyPOSTagger}
    DEFAULT_METHOD = Averaged
    DEFAULT_LANGUAGE = "en"

    def __init__(self, parent=None, **kwargs):
        super().__init__(parent, **kwargs)
        self.__method = self.DEFAULT_METHOD
        self.spacy_lang = self.DEFAULT_LANGUAGE

        self.__combo_scy = LanguageComboBox(
            self,
            SpacyPOSTagger.supported_languages,
            self.DEFAULT_LANGUAGE,
            False,
            self.__set_spacy_lang
        )

        label = QLabel("Language:")
        label.setAlignment(Qt.AlignRight | Qt.AlignVCenter)
        # The method constant doubles as the position argument, so the
        # language selector lands next to the Spacy entry (presumably a
        # grid-layout row - confirm against SingleMethodModule).
        self.layout().addWidget(label, self.Spacy, 1)
        self.layout().addWidget(self.__combo_scy, self.Spacy, 2)

    def __set_spacy_lang(self, language: str):
        """Store a new spaCy language and emit the change signals.

        Does nothing when ``language`` equals the current value. ``edited``
        is emitted only while the Spacy method is the selected one.
        """
        if self.spacy_lang != language:
            self.spacy_lang = language
            self.__combo_scy.set_current_language(language)
            self.changed.emit()
            if self.method == self.Spacy:
                self.edited.emit()

    def setParameters(self, params: Dict):
        """Apply ``params``, falling back to the default spaCy language."""
        super().setParameters(params)
        spacy_lang = params.get("spacy_language", self.DEFAULT_LANGUAGE)
        self.__set_spacy_lang(spacy_lang)

    def parameters(self) -> Dict:
        """Return the base parameters extended with ``"spacy_language"``."""
        params = super().parameters()
        params.update({"spacy_language": self.spacy_lang})
        return params

    @staticmethod
    def createinstance(params: Dict) -> POSTagger:
        """Build the tagger selected in ``params``.

        Only the spaCy tagger receives a ``language`` argument; the other
        taggers are constructed without arguments.
        """
        method = params.get("method", POSTaggingModule.DEFAULT_METHOD)
        args = {}
        if method == POSTaggingModule.Spacy:
            args = {"language": params.get("spacy_language",
                                           POSTaggingModule.DEFAULT_LANGUAGE)}
        return POSTaggingModule.Methods[method](**args)

    def __repr__(self):
        text = super().__repr__()
        # Append the language only when it is relevant to the chosen method.
        if self.method == self.Spacy:
            text = f"{text} ({self.spacy_lang})"
        return text


PREPROCESS_ACTIONS = [
Expand Down

0 comments on commit a0ad22a

Please sign in to comment.