# Patch summary (commentary only; text before the first "diff --git" header
# is not part of any hunk).
#
# colexification_graphs/schema.py
#   - Adds SynonymSchema, a TypedDict with a single required key `word: str`,
#     describing entries of .senses[*].synonyms.
#   - Adds an optional `synonyms: list[SynonymSchema]` key to SenseSchema.
#   - Exports SynonymSchema through __all__.
#
# colexification_graphs/wordsenses.py
#   - Imports SynonymSchema.
#   - Adds get_synonyms(sense): collects sense["synonyms"] (default: empty
#     list) and returns [] for the whole sense as soon as one synonym has a
#     falsy "word" value.
#   - get_translations(): for each sense, remembers the "sense" value of the
#     first translation that passes check_translation() and carries a
#     non-None "sense"; afterwards yields every synonym of that sense as a
#     translation dict with lang=data["lang"], code=data["lang_code"],
#     word=synonym["word"], sense=<remembered sense>.
#
# NOTE(review): get_synonyms indexes synonym["word"] directly, so a synonym
#   entry without a "word" key would raise KeyError rather than being
#   treated as an error entry -- presumably the input always has the key;
#   confirm against the data, or use synonym.get("word").
# NOTE(review): when no accepted translation of a sense provides a "sense"
#   value, sense_description stays None and that sense's synonyms are
#   skipped entirely.
# NOTE(review): the docstring context line says a word is not considered a
#   translation of itself; no such check is visible for synonyms in these
#   hunks -- confirm whether a synonym equal to data["word"] can occur.
# NOTE(review): the line breaks of this patch appear to have been collapsed;
#   they must be restored before the hunks can be applied.
diff --git a/colexification_graphs/schema.py b/colexification_graphs/schema.py index 98798d2..8eb2c3a 100644 --- a/colexification_graphs/schema.py +++ b/colexification_graphs/schema.py @@ -15,10 +15,16 @@ class TranslationSchema(t.TypedDict): sense: t.NotRequired[str] # If `None`, treat as equal to `word`. +class SynonymSchema(t.TypedDict): + """Schema for values inside .senses[*].synonyms.""" + word: str + + class SenseSchema(t.TypedDict): """Schema for values inside .senses.""" - # There are other fields, but we only need the translations. + # There are other fields, but we only need translations and synonyms. translations: t.NotRequired[list[TranslationSchema]] + synonyms: t.NotRequired[list[SynonymSchema]] class Schema(t.TypedDict): @@ -31,4 +37,4 @@ class Schema(t.TypedDict): translations: t.NotRequired[list[TranslationSchema]] -__all__ = ["Schema", "SenseSchema", "TranslationSchema"] +__all__ = ["Schema", "SenseSchema", "SynonymSchema", "TranslationSchema"] diff --git a/colexification_graphs/wordsenses.py b/colexification_graphs/wordsenses.py index f35de98..b8ee18c 100644 --- a/colexification_graphs/wordsenses.py +++ b/colexification_graphs/wordsenses.py @@ -14,7 +14,12 @@ from orjson import loads # pylint: disable=no-name-in-module -from colexification_graphs.schema import Schema, SenseSchema, TranslationSchema +from colexification_graphs.schema import ( + Schema, + SenseSchema, + SynonymSchema, + TranslationSchema, +) TranslationError: t.TypeAlias = t.Literal[ @@ -42,6 +47,18 @@ def get_word_senses(data: Schema) -> t.Iterator[SenseSchema]: yield from data["senses"] +def get_synonyms(sense: SenseSchema) -> list[SynonymSchema]: + """Get synonyms from word sense.""" + synonyms = [] + for synonym in sense.get("synonyms", []): + # If there's an error in one synonym, there's probably an error in + # other synonyms, too. 
+ if not synonym["word"]: + return [] + synonyms.append(synonym) + return synonyms + + def warn( kind: TranslationError, language: str, @@ -78,6 +95,7 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]: word and sense values. A word is not considered a translation of itself. """ + language_name = data["lang"] language = data["lang_code"] word = data["word"] @@ -90,6 +108,14 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]: yield translation for sense in get_word_senses(data): + # Get sense description of first translation. + # The translations may have different senses, but the assumption is + # they're all roughly the same. + # But to be sure, we won't change the sense values of translations. + # We'll only use `sense_description` for synonyms. + sense_description = None + + # Yield translations. for translation in sense.get("translations", []): error = check_translation(translation) if error is not None: @@ -98,6 +124,20 @@ def get_translations(data: Schema) -> t.Iterator[TranslationSchema]: continue yield translation + # Set sense description. + if sense_description is None: + sense_description = translation.get("sense") + + # Treat synonyms as translations. + if sense_description is not None: + for synonym in get_synonyms(sense): + yield { + "lang": language_name, + "code": language, + "word": synonym["word"], + "sense": sense_description, + } + def fix_whitespace(text: str) -> tuple[bool, str]: """Fix whitespace characters in text.