Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created and Added all the languages that support Emoji #440

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.unicode.generate_emoji_keywords import generate_emoji


def get_data(
Expand Down Expand Up @@ -102,18 +103,7 @@ def get_data(
# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Path(__file__).parent.parent
/ "language_data_extraction"
/ lang
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)

subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
generate_emoji(language, output_dir)

# MARK: Query Data

Expand Down
61 changes: 61 additions & 0 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Centralized emoji-keyword generation file used to generate emoji keywords for a specified language.
.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import json
from scribe_data.unicode.process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data
from pathlib import Path

# Data-type identifier passed through to the export step.
DATA_TYPE = "emoji-keywords"
# Number of emojis to associate with each keyword in the generated lexicon.
EMOJI_KEYWORDS_DICT = 3

# JSON file mapping CLDR language codes to lowercase language names.
SUPPORTED_LANGUAGE_FILE = Path(__file__).parent / "supported_languages.json"

def generate_emoji(language, output_dir: str = None):
    """
    Generate emoji keywords for a specified language and export them.

    Parameters
    ----------
    language : str
        The language to generate emoji keywords for. Matching against
        supported_languages.json is case-insensitive.
    output_dir : str, optional
        The directory to export the generated data to. Defaults to the
        current directory when not provided.

    Returns
    -------
    None
        Prints a message and returns early if the language is not supported.
    """
    print(f"Got the language {language} for emoji generation")

    # supported_languages.json maps language codes to lowercase names, so
    # normalize user input before comparing (handles "English", "ENGLISH", ...).
    language = language.lower()

    with open(SUPPORTED_LANGUAGE_FILE, "r", encoding="utf-8") as file:
        supported_languages = json.load(file)

    if language not in supported_languages.values():
        print(f"Emoji Generation for language : {language} is not supported")
        return

    print(f"Emoji Generation for language : {language} is supported")

    # Guard the default None output_dir, which would otherwise raise an
    # AttributeError on .startswith below.
    # NOTE(review): confirm the intended default export directory against
    # the CLI's export-dir constants (e.g. DEFAULT_JSON_EXPORT_DIR).
    if output_dir is None:
        output_dir = "."

    updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
    # Capitalize the language for the directory name so exports land in
    # e.g. {output_dir}/English/ rather than {output_dir}/english/.
    export_dir = Path(updated_path) / language.capitalize()
    export_dir.mkdir(parents=True, exist_ok=True)

    if emoji_keywords_dict := gen_emoji_lexicon(
        language=language,
        emojis_per_keyword=EMOJI_KEYWORDS_DICT,
    ):
        export_formatted_data(
            file_path=output_dir,
            formatted_data=emoji_keywords_dict,
            query_data_in_use=True,
            language=language,
            data_type=DATA_TYPE,
        )
12 changes: 7 additions & 5 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def gen_emoji_lexicon(
# Pre-set up the emoji popularity data.
popularity_dict = {}

with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
with (Path(__file__).parent / "2021_ranked.tsv").open(
encoding="utf-8"
) as popularity_file:
tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
for tsv_row in tsv_reader:
popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
Expand Down Expand Up @@ -107,7 +109,7 @@ def gen_emoji_lexicon(
}

for cldr_file_key, cldr_file_path in cldr_file_paths.items():
with open(cldr_file_path, "r") as file:
with open(cldr_file_path, "r", encoding="utf-8") as file:
cldr_data = json.load(file)

cldr_dict = cldr_data[cldr_file_key]["annotations"]
Expand Down Expand Up @@ -185,9 +187,9 @@ def gen_emoji_lexicon(
noun_data = json.load(f)

plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if noun_data[row]["plural"] != "isPlural"
noun["singular"].lower(): noun["lexemeID"].lower()
for noun in noun_data
if noun.get("singular") # Ensure the singular field exists
}

for plural, singular in plurals_to_singulars_dict.items():
Expand Down
157 changes: 157 additions & 0 deletions src/scribe_data/unicode/supported_languages.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
{
"am": "amharic",
"ar": "arabic",
"ar-SA": "arabic (saudi arabia)",
"as": "assamese",
"ast": "asturian",
"az": "azerbaijani",
"be": "belarusian",
"bew": "betawi",
"bg": "bulgarian",
"bgn": "western balochi",
"bn": "bengali",
"br": "breton",
"bs": "bosnian",
"ca": "catalan",
"ccp": "chakma",
"ceb": "cebuano",
"chr": "cherokee",
"ckb": "sorani kurdish",
"cs": "czech",
"cv": "chuvash",
"cy": "welsh",
"da": "danish",
"de": "german",
"de-CH": "german (switzerland)",
"doi": "dogri",
"dsb": "lower sorbian",
"el": "greek",
"en": "english",
"en-001": "english (world)",
"en-AU": "english (australia)",
"en-CA": "english (canada)",
"en-GB": "english (united kingdom)",
"en-IN": "english (india)",
"es": "spanish",
"es-419": "spanish (latin america)",
"es-MX": "spanish (mexico)",
"es-US": "spanish (united states)",
"et": "estonian",
"eu": "basque",
"fa": "persian",
"ff": "fulah",
"ff-Adlm": "fulah (adlam)",
"fi": "finnish",
"fil": "filipino",
"fo": "faroese",
"fr": "french",
"fr-CA": "french (canada)",
"ga": "irish",
"gd": "scottish gaelic",
"gl": "galician",
"gu": "gujarati",
"ha": "hausa",
"ha-NE": "hausa (niger)",
"he": "hebrew",
"hi": "hindi",
"hi-Latn": "hindi (latin script)",
"hr": "croatian",
"hsb": "upper sorbian",
"hu": "hungarian",
"hy": "armenian",
"ia": "interlingua",
"id": "indonesian",
"ig": "igbo",
"is": "icelandic",
"it": "italian",
"ja": "japanese",
"jv": "javanese",
"ka": "georgian",
"kab": "kabyle",
"kk": "kazakh",
"kl": "greenlandic",
"km": "khmer",
"kn": "kannada",
"ko": "korean",
"kok": "konkani",
"ku": "kurdish",
"ky": "kyrgyz",
"lb": "luxembourgish",
"lij": "ligurian",
"lo": "lao",
"lt": "lithuanian",
"lv": "latvian",
"mai": "maithili",
"mi": "māori",
"mk": "macedonian",
"ml": "malayalam",
"mn": "mongolian",
"mni": "meitei",
"mr": "marathi",
"ms": "malay",
"mt": "maltese",
"my": "burmese",
"ne": "nepali",
"nl": "dutch",
"nn": "norwegian nynorsk",
"no": "norwegian",
"nso": "northern sotho",
"oc": "occitan",
"or": "odia",
"pa": "punjabi",
"pa-Arab": "punjabi (arabic script)",
"pcm": "nigerian pidgin",
"pl": "polish",
"ps": "pashto",
"pt": "portuguese",
"pt-PT": "portuguese (portugal)",
"qu": "quechua",
"quc": "k'iche'",
"rhg": "rohingya",
"rm": "romansh",
"ro": "romanian",
"ru": "russian",
"rw": "kinyarwanda",
"sa": "sanskrit",
"sat": "santali",
"sc": "sardinian",
"sd": "sindhi",
"si": "sinhala",
"sk": "slovak",
"sl": "slovenian",
"so": "somali",
"sq": "albanian",
"sr": "serbian",
"sr-Cyrl": "serbian (cyrillic)",
"sr-Cyrl-BA": "serbian (cyrillic, bosnia and herzegovina)",
"sr-Latn": "serbian (latin)",
"sr-Latn-BA": "serbian (latin, bosnia and herzegovina)",
"su": "sundanese",
"sv": "swedish",
"sw": "swahili",
"sw-KE": "swahili (kenya)",
"ta": "tamil",
"te": "telugu",
"tg": "tajik",
"th": "thai",
"ti": "tigrinya",
"tk": "turkmen",
"tn": "tswana",
"to": "tongan",
"tr": "turkish",
"tt": "tatar",
"ug": "uyghur",
"uk": "ukrainian",
"und": "undetermined",
"ur": "urdu",
"uz": "uzbek",
"vi": "vietnamese",
"wo": "wolof",
"xh": "xhosa",
"yi": "yiddish",
"yo": "yoruba",
"zh": "chinese",
"zh-Hans": "chinese (simplified)",
"zh-Hant": "chinese (traditional)",
"zu": "zulu"
}
Loading