Refactor somajo module. #53

Merged · 5 commits · Jul 6, 2023
Changes from all commits
118 changes: 70 additions & 48 deletions mltb2/somajo.py
@@ -10,52 +10,88 @@
"""


from abc import ABC
from dataclasses import dataclass, field
from typing import List, Set
from typing import Container, Iterable, List, Optional, Set

from somajo import SoMaJo
from tqdm import tqdm


@dataclass
class SoMaJoSentenceSplitter:
"""Use SoMaJo to split text into sentences.
class SoMaJoBaseClass(ABC):
"""Base Class for SoMaJo tools.

Args:
language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
show_progress_bar: Show a progressbar during processing.

Note:
This class is an abstract base class. It should not be used directly.
"""

language: str
somajo: SoMaJo = field(init=False, repr=False)
show_progress_bar: bool = False

def __post_init__(self):
"""Do post init."""
self.somajo = SoMaJo(self.language)

# see https://github.com/tsproisl/SoMaJo/issues/17
@staticmethod
def detokenize(tokens) -> str:
"""Convert SoMaJo tokens to sentence (string).

Args:
tokens: The tokens to be de-tokenized.
Returns:
The de-tokenized sentence.
"""
result_list = []
for token in tokens:
if token.original_spelling is not None:
result_list.append(token.original_spelling)
else:
result_list.append(token.text)

if token.space_after:
result_list.append(" ")
result = "".join(result_list)
result = result.strip()
return result
def detokenize(tokens) -> str:
"""Convert SoMaJo tokens to sentence (string).

Args:
tokens: The tokens to be de-tokenized.
Returns:
The de-tokenized sentence.

See Also:
`How do I split sentences but not words? <https://github.com/tsproisl/SoMaJo/issues/17>`_
"""
result_list = []
for token in tokens:
if token.original_spelling is not None:
result_list.append(token.original_spelling)
else:
result_list.append(token.text)

if token.space_after:
result_list.append(" ")
result = "".join(result_list)
result = result.strip()
return result


def extract_token_class_set(sentences: Iterable, keep_token_classes: Optional[Container[str]] = None) -> Set[str]:
"""Extract token from sentences by token class.

Args:
sentences: The sentences from which to extract.
keep_token_classes: The token classes to keep. If ``None`` all will be kept.
Returns:
The set of extracted token texts.
"""
result = set()
for sentence in sentences:
for token in sentence:
if keep_token_classes is None:
result.add(token.text)
elif token.token_class in keep_token_classes:
result.add(token.text)
# else ignore
return result


@dataclass
class SoMaJoSentenceSplitter(SoMaJoBaseClass):
"""Use SoMaJo to split text into sentences.

Args:
language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
show_progress_bar: Show a progressbar during processing.
"""

show_progress_bar: bool = False

def __call__(self, text: str) -> List[str]:
"""Split the text into a list of sentences.
@@ -70,27 +106,20 @@ def __call__(self, text: str) -> List[str]:
         result = []

         for sentence in tqdm(sentences, disable=not self.show_progress_bar):
-            sentence_string = self.detokenize(sentence)
+            sentence_string = detokenize(sentence)
             result.append(sentence_string)

         return result


 @dataclass
-class JaccardSimilarity:
+class JaccardSimilarity(SoMaJoBaseClass):
     """Calculate the `Jaccard similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_.

     Args:
         language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
     """

-    language: str
-    somajo: SoMaJo = field(init=False, repr=False)
-
-    def __post_init__(self):
-        """Do post init."""
-        self.somajo = SoMaJo(self.language)
-
     def get_token_set(self, text: str) -> Set[str]:
         """Get token set for text.
@@ -100,9 +129,9 @@ def get_token_set(self, text: str) -> Set[str]:
             The set of tokens (words).
         """
         sentences = self.somajo.tokenize_text([text])
-        tokens = [t.text.lower() for sentence in sentences for t in sentence]
-        # TODO: add option to filter tokens
-        token_set = set(tokens)
+        token_set = extract_token_class_set(sentences)  # TODO: filter tokens
+        token_set = {t.lower() for t in token_set}

         return token_set

     def __call__(self, text1: str, text2: str) -> float:
@@ -123,28 +152,21 @@


 @dataclass
-class TokenExtractor:
+class TokenExtractor(SoMaJoBaseClass):
     """Extract tokens from text.

     Args:
         language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
     """

-    language: str
-    somajo: SoMaJo = field(init=False, repr=False)
-
-    def __post_init__(self):
-        """Do post init."""
-        self.somajo = SoMaJo(self.language)
-
     def extract_url_set(self, text: str) -> Set[str]:
-        """Extract tokens from text.
+        """Extract URLs from text.

         Args:
             text: the text
         Returns:
             Set of extracted links.
         """
         sentences = self.somajo.tokenize_text([text])
-        result = {token.text for sentence in sentences for token in sentence if token.token_class == "URL"}
+        result = extract_token_class_set(sentences, keep_token_classes="URL")
         return result
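
For context (not part of this PR's diff), a minimal usage sketch of the refactored API. The sample strings are borrowed from the tests below; everything else is illustrative:

    from mltb2.somajo import SoMaJoSentenceSplitter, TokenExtractor

    # Split German text into sentences; detokenize() reassembles each sentence string.
    splitter = SoMaJoSentenceSplitter("de_CMC")
    sentences = splitter("Das ist ein Satz. Das ist ein anderer Satz.")
    # -> ["Das ist ein Satz.", "Das ist ein anderer Satz."]

    # URL extraction now goes through extract_token_class_set() internally.
    extractor = TokenExtractor("de_CMC")
    urls = extractor.extract_url_set("Das ist ein Link: http://github.com")
    # -> {"http://github.com"}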
33 changes: 32 additions & 1 deletion tests/test_somajo.py
@@ -4,7 +4,9 @@

 from math import isclose

-from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor
+from somajo import SoMaJo
+
+from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor, detokenize, extract_token_class_set


 def test_SoMaJoSentenceSplitter_call() -> None:
@@ -75,3 +77,32 @@ def test_TokenExtractor_call_no_url():
     token_extractor = TokenExtractor("de_CMC")
     result = token_extractor.extract_url_set(text_with_url)
     assert len(result) == 0
+
+
+def test_extract_token_class_set_symbol():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein anderer Satz."])
+    result = extract_token_class_set(sentences, keep_token_classes="symbol")
+
+    assert isinstance(result, set)
+    assert len(result) == 1
+    assert "." in result
+
+
+def test_extract_token_class_set_url():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein Link: http://github.com"])
+    result = extract_token_class_set(sentences, keep_token_classes="URL")
+
+    assert isinstance(result, set)
+    assert len(result) == 1
+    assert "http://github.com" in result
+
+
+def test_detokenize():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein anderer Satz."])
+    result = detokenize(list(sentences)[0])
+
+    assert isinstance(result, str)
+    assert result == "Das ist ein Satz."
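
A side note on JaccardSimilarity, whose behavior this PR does not change: the score is the Jaccard index |A ∩ B| / |A ∪ B| of the two lower-cased token sets returned by get_token_set. A minimal sketch with made-up inputs (the __call__ body is not shown in this diff, so the expected value is inferred from the Jaccard definition):

    from mltb2.somajo import JaccardSimilarity

    jaccard_similarity = JaccardSimilarity("de_CMC")

    # Token sets: {"das", "ist", "ein", "satz", "."} vs. {"das", "ist", "ein", "haus", "."}.
    # The intersection has 4 elements and the union has 6, so the score should be 4 / 6 ≈ 0.667.
    score = jaccard_similarity("Das ist ein Satz.", "Das ist ein Haus.")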