Skip to content

Commit

Permalink
add test
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipMay committed Jul 5, 2023
1 parent dbd6599 commit 562fd6b
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
4 changes: 2 additions & 2 deletions mltb2/somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def __post_init__(self):
"""Do post init."""
self.somajo = SoMaJo(self.language)

def extract_url(self, text: str) -> str:
sentences = self.somajo.tokenize_text(text)
def extract_url_set(self, text: str) -> Set[str]:
    """Collect the distinct URL tokens found in ``text``.

    Args:
        text: The text to search for URLs.

    Returns:
        The set of ``token.text`` values for every token whose
        ``token_class`` equals ``"URL"``. Empty when no such token exists.
    """
    url_set: Set[str] = set()
    # The text is handed to the tokenizer wrapped in a one-element list.
    for sentence in self.somajo.tokenize_text([text]):
        for token in sentence:
            if token.token_class == "URL":
                url_set.add(token.text)
    return url_set
20 changes: 19 additions & 1 deletion tests/test_somajo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from math import isclose

from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter
from mltb2.somajo import JaccardSimilarity, SoMaJoSentenceSplitter, TokenExtractor


def test_SoMaJoSentenceSplitter_call() -> None:
Expand Down Expand Up @@ -57,3 +57,21 @@ def test_JaccardSimilarity_call_no_overlap():
result = jaccard_similarity(text1, text2)

assert isclose(result, 0.0)


def test_TokenExtractor_call():
    """Check that ``extract_url_set`` returns both URLs embedded in a text."""
    first_url = "http://may.la"
    second_url = "github.com"
    sample_text = f"{first_url} Das ist eine Text. {second_url} Er enthält eine URL."

    extractor = TokenExtractor("de_CMC")
    url_set = extractor.extract_url_set(sample_text)

    # Exactly the two URLs are expected, nothing more.
    assert len(url_set) == 2
    for expected_url in (first_url, second_url):
        assert expected_url in url_set


def test_TokenExtractor_call_no_url():
    """Check that ``extract_url_set`` returns an empty result for text without URLs."""
    # The name must be unique within the module: a second function called
    # ``test_TokenExtractor_call`` would rebind that name and hide the earlier
    # test from collection.
    text_without_url = "Das ist eine Text. Er enthält keine URLs."
    token_extractor = TokenExtractor("de_CMC")
    result = token_extractor.extract_url_set(text_without_url)
    assert len(result) == 0

0 comments on commit 562fd6b

Please sign in to comment.