Refactor somajo module. #53

Merged · 5 commits · Jul 6, 2023
Changes from all commits
118 changes: 70 additions & 48 deletions mltb2/somajo.py
@@ -10,52 +10,88 @@
"""


from abc import ABC
from dataclasses import dataclass, field
from typing import List, Set
from typing import Container, Iterable, List, Optional, Set

from somajo import SoMaJo
from tqdm import tqdm


@dataclass
class SoMaJoSentenceSplitter:
"""Use SoMaJo to split text into sentences.
class SoMaJoBaseClass(ABC):
"""Base Class for SoMaJo tools.

Args:
language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
show_progress_bar: Show a progressbar during processing.

Note:
This class is an abstract base class. It should not be used directly.
"""

language: str
somajo: SoMaJo = field(init=False, repr=False)
show_progress_bar: bool = False

def __post_init__(self):
"""Do post init."""
self.somajo = SoMaJo(self.language)

# see https://github.com/tsproisl/SoMaJo/issues/17
@staticmethod
def detokenize(tokens) -> str:
"""Convert SoMaJo tokens to sentence (string).

Args:
tokens: The tokens to be de-tokenized.
Returns:
The de-tokenized sentence.
"""
result_list = []
for token in tokens:
if token.original_spelling is not None:
result_list.append(token.original_spelling)
else:
result_list.append(token.text)

if token.space_after:
result_list.append(" ")
result = "".join(result_list)
result = result.strip()
return result
def detokenize(tokens) -> str:
"""Convert SoMaJo tokens to sentence (string).

Args:
tokens: The tokens to be de-tokenized.
Returns:
The de-tokenized sentence.

See Also:
`How do I split sentences but not words? <https://github.com/tsproisl/SoMaJo/issues/17>`_
"""
result_list = []
for token in tokens:
if token.original_spelling is not None:
result_list.append(token.original_spelling)
else:
result_list.append(token.text)

if token.space_after:
result_list.append(" ")
result = "".join(result_list)
result = result.strip()
return result


def extract_token_class_set(sentences: Iterable, keep_token_classes: Optional[Container[str]] = None) -> Set[str]:
"""Extract token from sentences by token class.

Args:
sentences: The sentences from which to extract.
keep_token_classes: The token classes to keep. If ``None`` all will be kept.
Returns:
The set of extracted token texts.
"""
result = set()
for sentence in sentences:
for token in sentence:
if keep_token_classes is None:
result.add(token.text)
elif token.token_class in keep_token_classes:
result.add(token.text)
# else ignore
return result


@dataclass
class SoMaJoSentenceSplitter(SoMaJoBaseClass):
"""Use SoMaJo to split text into sentences.

Args:
language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
show_progress_bar: Show a progressbar during processing.
"""

show_progress_bar: bool = False

def __call__(self, text: str) -> List[str]:
"""Split the text into a list of sentences.
@@ -70,27 +106,20 @@ def __call__(self, text: str) -> List[str]:
         result = []

         for sentence in tqdm(sentences, disable=not self.show_progress_bar):
-            sentence_string = self.detokenize(sentence)
+            sentence_string = detokenize(sentence)
             result.append(sentence_string)

         return result


 @dataclass
-class JaccardSimilarity:
+class JaccardSimilarity(SoMaJoBaseClass):
     """Calculate the `Jaccard similarity <https://en.wikipedia.org/wiki/Jaccard_index>`_.

     Args:
         language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
     """

-    language: str
-    somajo: SoMaJo = field(init=False, repr=False)
-
-    def __post_init__(self):
-        """Do post init."""
-        self.somajo = SoMaJo(self.language)
-
     def get_token_set(self, text: str) -> Set[str]:
         """Get token set for text.
@@ -100,9 +129,9 @@ def get_token_set(self, text: str) -> Set[str]:
             The set of tokens (words).
         """
         sentences = self.somajo.tokenize_text([text])
-        tokens = [t.text.lower() for sentence in sentences for t in sentence]
-        # TODO: add option to filter tokens
-        token_set = set(tokens)
+        token_set = extract_token_class_set(sentences)  # TODO: filter tokens
+        token_set = {t.lower() for t in token_set}

         return token_set

     def __call__(self, text1: str, text2: str) -> float:
@@ -123,28 +152,21 @@


 @dataclass
-class TokenExtractor:
+class TokenExtractor(SoMaJoBaseClass):
     """Extract tokens from text.

     Args:
         language: The language. ``de_CMC`` for German or ``en_PTB`` for English.
     """

-    language: str
-    somajo: SoMaJo = field(init=False, repr=False)
-
-    def __post_init__(self):
-        """Do post init."""
-        self.somajo = SoMaJo(self.language)
-
     def extract_url_set(self, text: str) -> Set[str]:
-        """Extract tokens from text.
+        """Extract URLs from text.

         Args:
             text: the text
         Returns:
             Set of extracted links.
         """
         sentences = self.somajo.tokenize_text([text])
-        result = {token.text for sentence in sentences for token in sentence if token.token_class == "URL"}
+        result = extract_token_class_set(sentences, keep_token_classes="URL")
         return result
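
For context (not part of this PR's diff), a minimal usage sketch of the refactored API. The sample strings are borrowed from the tests below; everything else is illustrative:

    from mltb2.somajo import SoMaJoSentenceSplitter, TokenExtractor

    # Split German text into sentences; detokenize() reassembles each sentence string.
    splitter = SoMaJoSentenceSplitter("de_CMC")
    sentences = splitter("Das ist ein Satz. Das ist ein anderer Satz.")
    # -> ["Das ist ein Satz.", "Das ist ein anderer Satz."]

    # URL extraction now goes through extract_token_class_set() internally.
    extractor = TokenExtractor("de_CMC")
    urls = extractor.extract_url_set("Das ist ein Link: http://github.com")
    # -> {"http://github.com"}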
33 changes: 32 additions & 1 deletion tests/test_somajo.py
@@ -4,7 +4,9 @@

 from math import isclose

-from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor
+from somajo import SoMaJo
+
+from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor, detokenize, extract_token_class_set


 def test_SoMaJoSentenceSplitter_call() -> None:
@@ -75,3 +77,32 @@ def test_TokenExtractor_call_no_url():
     token_extractor = TokenExtractor("de_CMC")
     result = token_extractor.extract_url_set(text_with_url)
     assert len(result) == 0
+
+
+def test_extract_token_class_set_symbol():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein anderer Satz."])
+    result = extract_token_class_set(sentences, keep_token_classes="symbol")
+
+    assert isinstance(result, set)
+    assert len(result) == 1
+    assert "." in result
+
+
+def test_extract_token_class_set_url():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein Link: http://github.com"])
+    result = extract_token_class_set(sentences, keep_token_classes="URL")
+
+    assert isinstance(result, set)
+    assert len(result) == 1
+    assert "http://github.com" in result
+
+
+def test_detokenize():
+    somajo = SoMaJo("de_CMC")
+    sentences = somajo.tokenize_text(["Das ist ein Satz. Das ist ein anderer Satz."])
+    result = detokenize(list(sentences)[0])
+
+    assert isinstance(result, str)
+    assert result == "Das ist ein Satz."
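
A side note on JaccardSimilarity, whose behavior this PR does not change: the score is the Jaccard index |A ∩ B| / |A ∪ B| of the two lower-cased token sets returned by get_token_set. A minimal sketch with made-up inputs (the __call__ body is not shown in this diff, so the expected value is inferred from the Jaccard definition):

    from mltb2.somajo import JaccardSimilarity

    jaccard_similarity = JaccardSimilarity("de_CMC")

    # Token sets: {"das", "ist", "ein", "satz", "."} vs. {"das", "ist", "ein", "haus", "."}.
    # The intersection has 4 elements and the union has 6, so the score should be 4 / 6 ≈ 0.667.
    score = jaccard_similarity("Das ist ein Satz.", "Das ist ein Haus.")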