Skip to content

Commit

Permalink
Add default language for detection. (#149)
Browse files Browse the repository at this point in the history
* Add support for always detecting specific languages in language identification

* Update language identification function to return language probabilities

* Update copyright year to 2024
  • Loading branch information
PhilipMay authored Jan 29, 2024
1 parent 9e83c95 commit 313ebb0
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
14 changes: 11 additions & 3 deletions mltb2/fasttext.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 Philip May
# Copyright (c) 2023-2024 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

Expand All @@ -11,6 +11,7 @@

import os
from dataclasses import dataclass, field
from typing import List, Optional

import fasttext
from fasttext.FastText import _FastText
Expand Down Expand Up @@ -51,12 +52,15 @@ def get_model_path_and_download() -> str:

return model_full_path

def __call__(self, text: str, num_lang: int = 10):
def __call__(self, text: str, num_lang: int = 10, always_detect_lang: Optional[List[str]] = None):
"""Identify languages of a given text.
Args:
text: the text for which the language is to be recognized
num_lang: number of returned languages
num_lang: number of returned language probabilities
always_detect_lang: A list of languages that should always be returned
even if not detected. If the language is not detected, the probability
is set to 0.0.
Returns:
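A dict from language to probability.
This dict contains no more than ``num_lang`` detected languages, plus one entry with probability 0.0 for each language in ``always_detect_lang`` that was not detected.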
Expand All @@ -76,4 +80,8 @@ def __call__(self, text: str, num_lang: int = 10):
languages = predictions[0]
probabilities = predictions[1]
lang_to_prob = {lang[9:]: prob for lang, prob in zip(languages, probabilities)}
if always_detect_lang is not None:
for lang in always_detect_lang:
if lang not in lang_to_prob:
lang_to_prob[lang] = 0.0
return lang_to_prob
10 changes: 9 additions & 1 deletion tests/test_fasttext.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 Philip May
# Copyright (c) 2023-2024 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

Expand All @@ -16,3 +16,11 @@ def test_fasttext_language_identification_call():
languages = language_identification("This is an English sentence.")
assert languages is not None
assert len(languages) == 10


def test_fasttext_language_identification_call_with_always_detect_lang():
    """Check that a language passed via ``always_detect_lang`` is always part of the result."""
    forced_lang = "fake_language"
    identifier = FastTextLanguageIdentification()
    result = identifier("This is an English sentence.", always_detect_lang=[forced_lang])
    assert result is not None
    # The default call yields 10 languages; the forced language is not one
    # of them, so it is expected to show up as one additional entry.
    assert len(result) == 11
    assert forced_lang in result

0 comments on commit 313ebb0

Please sign in to comment.