Skip to content

Commit

Permalink
feat:standardize_lang_tag
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Sep 16, 2024
1 parent 1ab200b commit df1de83
Show file tree
Hide file tree
Showing 13 changed files with 80 additions and 52 deletions.
15 changes: 8 additions & 7 deletions ovos_plugin_manager/templates/coreference.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import word_tokenize

Expand Down Expand Up @@ -64,10 +65,10 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-us")

def contains_corefs(self, text, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang, macro=True)
if lang.startswith("en"):
indicators = self.COREFERENCE_INDICATORS_EN
elif lang.startswith("pt"):
Expand Down Expand Up @@ -120,7 +121,7 @@ def extract_replacements(original, solved):
return bucket

def add_context(self, word, solved, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
if lang not in self.contexts:
self.contexts[lang] = {}
if word not in self.contexts[lang]:
Expand All @@ -130,7 +131,7 @@ def add_context(self, word, solved, lang=None):
self.contexts[lang][word].append(solved)

def extract_context(self, text=None, solved=None, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
text = text or self._prev_sentence
solved = solved or self._prev_solved
replaced = self.extract_replacements(text, solved)
Expand All @@ -139,7 +140,7 @@ def extract_context(self, text=None, solved=None, lang=None):
return replaced

def replace_coreferences(self, text, lang=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
solved = self.solve_corefs(text, lang=lang)
self._prev_sentence = text
self._prev_solved = solved
Expand All @@ -148,7 +149,7 @@ def replace_coreferences(self, text, lang=None, set_context=False):
return solved

def replace_coreferences_with_context(self, text, lang=None, context=None, set_context=False):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
lang_context = self.contexts.get(lang) or {}
default_context = {k: v[0] for k, v in lang_context.items() if v}

Expand All @@ -168,7 +169,7 @@ def replace_coreferences_with_context(self, text, lang=None, context=None, set_c
return solved

def solve_corefs(self, text, lang=None):
lang = lang or self.lang
lang = standardize_lang_tag(lang or self.lang)
return text


Expand Down
3 changes: 2 additions & 1 deletion ovos_plugin_manager/templates/hotwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""
from ovos_config import Configuration
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand Down Expand Up @@ -49,7 +50,7 @@ def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
self.expected_duration = self.num_phonemes * phoneme_duration

self.listener_config = mycroft_config.get("listener") or {}
self.lang = str(self.config.get("lang", lang)).lower()
self.lang = standardize_lang_tag(self.config.get("lang", lang))

@classproperty
def runtime_requirements(self):
Expand Down
3 changes: 2 additions & 1 deletion ovos_plugin_manager/templates/postag.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements


Expand Down Expand Up @@ -48,7 +49,7 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-us")

def postag(self, spans, lang=None):
lang = lang or self.lang
Expand Down
6 changes: 3 additions & 3 deletions ovos_plugin_manager/templates/segmentation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils import flatten_list
from ovos_utils import classproperty, flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import sentence_tokenize

Expand Down Expand Up @@ -58,7 +58,7 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-us")

@staticmethod
def __extract(text, markers):
Expand Down
5 changes: 5 additions & 0 deletions ovos_plugin_manager/templates/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from json_database import JsonStorageXDG
from ovos_utils.log import LOG, log_deprecation
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.xdg_utils import xdg_cache_home

from ovos_plugin_manager.templates.language import LanguageTranslator, LanguageDetector
Expand All @@ -26,6 +27,8 @@ def func_wrapper(*args, **kwargs):
return func(*args, **kwargs)

lang = kwargs.get("lang")
if lang:
lang = standardize_lang_tag(lang)
# check if translation can be skipped
if any([lang is None,
lang == solver.default_lang,
Expand Down Expand Up @@ -91,6 +94,8 @@ def func_wrapper(*args, **kwargs):
lang = solver.detect_language(v)
LOG.debug(f"detected 'lang': {lang} in argument '{idx}' for func: {func}")

if lang:
lang = standardize_lang_tag(lang)
kwargs["lang"] = lang
return func(*args, **kwargs)

Expand Down
33 changes: 26 additions & 7 deletions ovos_plugin_manager/templates/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ovos_utils import classproperty
from ovos_utils.log import deprecated
from ovos_utils.process_utils import RuntimeRequirements

from ovos_utils.lang import standardize_lang_tag
from ovos_plugin_manager.utils.config import get_plugin_config


Expand Down Expand Up @@ -78,9 +78,9 @@ def recognizer(self, val):

@property
def lang(self):
return self._lang or \
return standardize_lang_tag(self._lang or \
self.config.get("lang") or \
Configuration().get("lang", "en-us")
Configuration().get("lang", "en-us"))

@lang.setter
def lang(self, val):
Expand Down Expand Up @@ -114,10 +114,7 @@ def credential(self, val):
"implement config handling directly instead", "1.0.0")
def init_language(config_core):
lang = config_core.get("lang", "en-US")
langs = lang.split("-")
if len(langs) == 2:
return langs[0].lower() + "-" + langs[1].upper()
return lang
return standardize_lang_tag(lang, macro=True)

@abstractmethod
def execute(self, audio, language: Optional[str] = None) -> str:
Expand All @@ -140,6 +137,28 @@ def available_languages(self) -> set:
return set()


class STTT(STT):
    """Speech-to-text translation plugin template.

    Besides transcribing audio, a plugin of this kind outputs the text in a
    different language. From the OVOS point of view it is a regular STT plugin.
    """

    @property
    def output_language(self) -> str:
        """Standardized output language tag.

        Uses the "output_lang" config value when it is set (truthy),
        otherwise falls back to ``self.lang``.
        """
        configured = self.config.get("output_lang")
        return standardize_lang_tag(configured or self.lang)

    def execute(self, audio, language: Optional[str] = None) -> str:
        """Return the text of the first candidate produced by ``transcribe``."""
        # TODO - eventually deprecate this, just for compat
        candidates = self.transcribe(audio, language)
        first_candidate = candidates[0]
        return first_candidate[0]

    @abstractmethod
    def transcribe(self, audio,
                   lang: Optional[str] = None,
                   output_lang: Optional[str] = None) -> List[Tuple[str, float]]:
        """Transcribe audio data into possible transcriptions.

        Returns:
            A list of (transcription, confidence) tuples.
        """


class TokenSTT(STT, metaclass=ABCMeta):
@deprecated("TokenSTT is deprecated, please subclass from STT directly", "1.0.0")
def __init__(self, config=None):
Expand Down
5 changes: 2 additions & 3 deletions ovos_plugin_manager/templates/tokenization.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ovos_bus_client.message import dig_for_message
from ovos_utils import classproperty
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.process_utils import RuntimeRequirements
from quebra_frases import span_indexed_word_tokenize, word_tokenize

Expand Down Expand Up @@ -49,14 +50,12 @@ def lang(self):
msg = dig_for_message()
if msg:
lang = msg.data.get("lang")
return lang or "en-us"
return standardize_lang_tag(lang or "en-us")

def span_tokenize(self, text, lang=None):
    """Tokenize ``text`` by delegating to ``span_indexed_word_tokenize``.

    Args:
        text: the text to tokenize.
        lang: accepted for interface compatibility; this implementation
            does not use it (presumably subclasses may - verify).

    Returns:
        Whatever ``span_indexed_word_tokenize(text)`` returns.
    """
    # The former `lang = lang or self.lang` assignment was dead code that
    # still evaluated the `lang` property on every call; it is dropped.
    return span_indexed_word_tokenize(text)

def tokenize(self, text, lang=None):
    """Tokenize ``text`` by delegating to ``word_tokenize``.

    Args:
        text: the text to tokenize.
        lang: accepted for interface compatibility; this implementation
            does not use it (presumably subclasses may - verify).

    Returns:
        Whatever ``word_tokenize(text)`` returns.
    """
    # The former `lang = lang or self.lang` assignment was dead code that
    # still evaluated the `lang` property on every call; it is dropped.
    return word_tokenize(text)

@staticmethod
Expand Down
3 changes: 2 additions & 1 deletion ovos_plugin_manager/templates/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ovos_utils.fakebus import FakeBus
from ovos_utils.file_utils import get_cache_directory
from ovos_utils.lang.visimes import VISIMES
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG, deprecated, log_deprecation
from ovos_utils.metrics import Stopwatch
from ovos_utils.process_utils import RuntimeRequirements
Expand Down Expand Up @@ -62,7 +63,7 @@ def __init__(self, plugin_id: str, lang: str, voice: str, synth_kwargs: dict = N
synth_kwargs (dict, optional): Additional keyword arguments for the synthesizer.
"""
self.plugin_id = plugin_id
self.lang = lang
self.lang = standardize_lang_tag(lang)
self.voice = voice
self.synth_kwargs = synth_kwargs or {}

Expand Down
9 changes: 5 additions & 4 deletions ovos_plugin_manager/thirdparty/solvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import Optional, List, Dict

from ovos_utils import flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG
from quebra_frases import sentence_tokenize

Expand All @@ -53,7 +54,7 @@ def __init__(self, config=None,
self.enable_cache = enable_cache
self.config = config or {}
self.supported_langs = self.config.get("supported_langs") or []
self.default_lang = internal_lang or self.config.get("lang", "en")
self.default_lang = standardize_lang_tag(internal_lang or self.config.get("lang", "en"), macro=True)
if self.default_lang not in self.supported_langs:
self.supported_langs.insert(0, self.default_lang)
self._translator = translator or OVOSLangTranslationFactory.create() if self.enable_tx else None
Expand Down Expand Up @@ -123,9 +124,9 @@ def translate(self, text: str,
:param source_lang: Source language code.
:return: Translated text.
"""
source_lang = source_lang or self.detect_language(text)
target_lang = target_lang or self.default_lang
if source_lang.split("-")[0] == target_lang.split("-")[0]:
source_lang = standardize_lang_tag(source_lang or self.detect_language(text), macro=True)
target_lang = standardize_lang_tag(target_lang or self.default_lang, macro=True)
if source_lang == target_lang:
return text # skip translation
return self.translator.translate(text,
target=target_lang,
Expand Down
32 changes: 15 additions & 17 deletions ovos_plugin_manager/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@
from ovos_utils.log import LOG, log_deprecation


def standardize_lang_tag(lang_code, macro=True):
    """Normalize a language tag, preferring ``langcodes`` when it is usable.

    https://langcodes-hickford.readthedocs.io/en/sphinx/index.html

    Args:
        lang_code: the language tag to normalize (e.g. "en-US").
        macro: forwarded to ``langcodes.standardize_tag``. In the fallback
            path, when true, only the part before the first "-" is kept.

    Returns:
        The result of ``langcodes.standardize_tag(lang_code, macro=macro)``.
        If langcodes cannot be imported or the call raises, the lower-cased
        tag is returned instead (only its first "-"-separated piece when
        ``macro`` is true).
    """
    # TODO - move to ovos-utils
    try:
        from langcodes import standardize_tag as std
        return std(lang_code, macro=macro)
    except Exception:
        # Best-effort fallback when langcodes is missing or rejects the tag.
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        if macro:
            return lang_code.split("-")[0].lower()
        return lang_code.lower()


class PluginTypes(str, Enum):
TRIPLES = "opm.triples"
PIPELINE = "opm.pipeline"
Expand Down Expand Up @@ -175,23 +187,9 @@ def load_plugin(plug_name: str, plug_type: Optional[PluginTypes] = None):


def normalize_lang(lang):
# TODO consider moving to LF or ovos_utils
# special handling, the parse sometimes messes this up
# eg, uk-ua gets normalized to uk-gb
# this also makes lookup easier as we
# often get duplicate entries with both variants
if "-" in lang:
pieces = lang.split("-")
if len(pieces) == 2 and pieces[0] == pieces[1]:
lang = pieces[0]

try:
from langcodes import standardize_tag as _normalize_lang
lang = _normalize_lang(lang, macro=True)
except ValueError:
# this lang code is apparently not valid ?
pass
return lang
# TODO - add deprecation warning
from ovos_utils.lang import standardize_lang_tag
return standardize_lang_tag(lang)


class ReadWriteStream:
Expand Down
13 changes: 7 additions & 6 deletions ovos_plugin_manager/utils/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional, Union
from ovos_config.config import Configuration
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG
from ovos_plugin_manager.utils import load_plugin, find_plugins, \
normalize_lang, PluginTypes, PluginConfigTypes
Expand All @@ -19,7 +20,7 @@ def get_plugin_config(config: Optional[dict] = None, section: str = None,
@return: Configuration for the requested module, including `lang` and `module` keys
"""
config = config or Configuration()
lang = config.get('lang') or Configuration().get('lang')
lang = standardize_lang_tag(config.get('lang') or Configuration().get('lang', "en"))
config = (config.get('intentBox', {}).get(section) or config.get(section)
or config) if section else config
module = module or config.get('module')
Expand Down Expand Up @@ -54,7 +55,7 @@ def get_valid_plugin_configs(configs: dict, lang: str,
valid_configs = list()
if include_dialects:
# Check other dialects of the requested language
base_lang = lang.split("-")[0]
base_lang = standardize_lang_tag(lang, macro=True)
for language, confs in configs.items():
if language.startswith(base_lang):
for config in confs:
Expand Down Expand Up @@ -144,7 +145,7 @@ def get_plugin_supported_languages(plug_type: PluginTypes) -> dict:
load_plugin_configs(plug,
PluginConfigTypes(f"{plug_type.value}.config"))
for lang, config in configs:
lang = normalize_lang(lang)
lang = standardize_lang_tag(lang)
lang_configs.setdefault(lang, list())
lang_configs[lang].append(plug)
return lang_configs
Expand All @@ -159,18 +160,18 @@ def get_plugin_language_configs(plug_type: PluginTypes, lang: str,
@param include_dialects: consider configurations in different locales
@return: dict {`plugin_name`: [`valid_configs`]}
"""
lang = normalize_lang(lang)
lang = standardize_lang_tag(lang)
plugin_configs = dict()
valid_configs = dict()
for plug in find_plugins(plug_type):
plugin_configs[plug] = list()
valid_configs = \
load_plugin_configs(plug,
PluginConfigTypes(f"{plug_type.value}.config"))
valid_configs = {normalize_lang(lang): conf
valid_configs = {standardize_lang_tag(lang): conf
for lang, conf in valid_configs.items()}
if include_dialects:
lang = lang.split('-')[0]
lang = standardize_lang_tag(lang, macro=True)
for language in valid_configs:
if language.startswith(lang):
plugin_configs[plug] += valid_configs[language]
Expand Down
3 changes: 2 additions & 1 deletion ovos_plugin_manager/utils/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional

from ovos_utils import flatten_list
from ovos_utils.lang import standardize_lang_tag
from ovos_utils.log import LOG
from ovos_plugin_manager import PluginTypes
from ovos_plugin_manager.stt import get_stt_lang_configs
Expand Down Expand Up @@ -40,7 +41,7 @@ def config2option(cls, cfg: dict, plugin_type: PluginTypes,
"""
cfg = cls._migrate_old_cfg(cfg)
engine = cfg["module"]
lang = lang or cfg.get("lang")
lang = standardize_lang_tag(lang or cfg.get("lang"), macro=True)

plugin_display_name = engine.replace("_", " ").replace("-",
" ").title()
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ovos-utils>=0.0.38,<1.0.0
ovos-utils>=0.2.1,<1.0.0
ovos_bus_client>=0.0.8,<1.0.0
ovos-config>=0.0.12,<1.0.0
combo_lock~=0.2
Expand Down

0 comments on commit df1de83

Please sign in to comment.