Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created and Added all the languages that support Emoji #440

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.unicode.generate_emoji_keywords import generate_emoji


def get_data(
Expand Down Expand Up @@ -102,18 +103,7 @@ def get_data(
# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Path(__file__).parent.parent
/ "language_data_extraction"
/ lang
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)

subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
generate_emoji(language, output_dir)

# MARK: Query Data

Expand Down
61 changes: 61 additions & 0 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Centralized emoji-keyword generation file used to generate emoji keywords for a specified language.
.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import json
from scribe_data.unicode.process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data
from pathlib import Path

# Data-type identifier passed through to the export step.
DATA_TYPE = "emoji-keywords"
# Number of emojis to associate with each keyword in the generated lexicon.
EMOJI_KEYWORDS_DICT = 3

# JSON file mapping CLDR language codes to lowercase language names.
SUPPORTED_LANGUAGE_FILE = Path(__file__).parent / "supported_languages.json"

def generate_emoji(language, output_dir: str = None):
    """
    Generate emoji keywords for a specified language and export them.

    Parameters
    ----------
    language : str
        The language to generate emoji keywords for. Matching against
        supported_languages.json is case-insensitive.
    output_dir : str, optional
        The directory to export the generated data to. Defaults to the
        current directory when not provided.

    Returns
    -------
    None
        Prints a message and returns early if the language is not supported.
    """
    print(f"Got the language {language} for emoji generation")

    # supported_languages.json maps language codes to lowercase names, so
    # normalize user input before comparing (handles "English", "ENGLISH", ...).
    language = language.lower()

    with open(SUPPORTED_LANGUAGE_FILE, "r", encoding="utf-8") as file:
        supported_languages = json.load(file)

    if language not in supported_languages.values():
        print(f"Emoji Generation for language : {language} is not supported")
        return

    print(f"Emoji Generation for language : {language} is supported")

    # Guard the default None output_dir, which would otherwise raise an
    # AttributeError on .startswith below.
    # NOTE(review): confirm the intended default export directory against
    # the CLI's export-dir constants (e.g. DEFAULT_JSON_EXPORT_DIR).
    if output_dir is None:
        output_dir = "."

    updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
    # Capitalize the language for the directory name so exports land in
    # e.g. {output_dir}/English/ rather than {output_dir}/english/.
    export_dir = Path(updated_path) / language.capitalize()
    export_dir.mkdir(parents=True, exist_ok=True)

    if emoji_keywords_dict := gen_emoji_lexicon(
        language=language,
        emojis_per_keyword=EMOJI_KEYWORDS_DICT,
    ):
        export_formatted_data(
            file_path=output_dir,
            formatted_data=emoji_keywords_dict,
            query_data_in_use=True,
            language=language,
            data_type=DATA_TYPE,
        )
12 changes: 7 additions & 5 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def gen_emoji_lexicon(
# Pre-set up the emoji popularity data.
popularity_dict = {}

with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
with (Path(__file__).parent / "2021_ranked.tsv").open(
encoding="utf-8"
) as popularity_file:
tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
for tsv_row in tsv_reader:
popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
Expand Down Expand Up @@ -107,7 +109,7 @@ def gen_emoji_lexicon(
}

for cldr_file_key, cldr_file_path in cldr_file_paths.items():
with open(cldr_file_path, "r") as file:
with open(cldr_file_path, "r", encoding="utf-8") as file:
cldr_data = json.load(file)

cldr_dict = cldr_data[cldr_file_key]["annotations"]
Expand Down Expand Up @@ -185,9 +187,9 @@ def gen_emoji_lexicon(
noun_data = json.load(f)

plurals_to_singulars_dict = {
noun_data[row]["plural"].lower(): row.lower()
for row in noun_data
if noun_data[row]["plural"] != "isPlural"
noun["singular"].lower(): noun["lexemeID"].lower()
for noun in noun_data
if noun.get("singular") # Ensure the singular field exists
}

for plural, singular in plurals_to_singulars_dict.items():
Expand Down
157 changes: 157 additions & 0 deletions src/scribe_data/unicode/supported_languages.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
{
"am": "amharic",
"ar": "arabic",
"ar-SA": "arabic (saudi arabia)",
"as": "assamese",
"ast": "asturian",
"az": "azerbaijani",
"be": "belarusian",
"bew": "betawi",
"bg": "bulgarian",
"bgn": "western balochi",
"bn": "bengali",
"br": "breton",
"bs": "bosnian",
"ca": "catalan",
"ccp": "chakma",
"ceb": "cebuano",
"chr": "cherokee",
"ckb": "sorani kurdish",
"cs": "czech",
"cv": "chuvash",
"cy": "welsh",
"da": "danish",
"de": "german",
"de-CH": "german (switzerland)",
"doi": "dogri",
"dsb": "lower sorbian",
"el": "greek",
"en": "english",
"en-001": "english (world)",
"en-AU": "english (australia)",
"en-CA": "english (canada)",
"en-GB": "english (united kingdom)",
"en-IN": "english (india)",
"es": "spanish",
"es-419": "spanish (latin america)",
"es-MX": "spanish (mexico)",
"es-US": "spanish (united states)",
"et": "estonian",
"eu": "basque",
"fa": "persian",
"ff": "fulah",
"ff-Adlm": "fulah (adlam)",
"fi": "finnish",
"fil": "filipino",
"fo": "faroese",
"fr": "french",
"fr-CA": "french (canada)",
"ga": "irish",
"gd": "scottish gaelic",
"gl": "galician",
"gu": "gujarati",
"ha": "hausa",
"ha-NE": "hausa (niger)",
"he": "hebrew",
"hi": "hindi",
"hi-Latn": "hindi (latin script)",
"hr": "croatian",
"hsb": "upper sorbian",
"hu": "hungarian",
"hy": "armenian",
"ia": "interlingua",
"id": "indonesian",
"ig": "igbo",
"is": "icelandic",
"it": "italian",
"ja": "japanese",
"jv": "javanese",
"ka": "georgian",
"kab": "kabyle",
"kk": "kazakh",
"kl": "greenlandic",
"km": "khmer",
"kn": "kannada",
"ko": "korean",
"kok": "konkani",
"ku": "kurdish",
"ky": "kyrgyz",
"lb": "luxembourgish",
"lij": "ligurian",
"lo": "lao",
"lt": "lithuanian",
"lv": "latvian",
"mai": "maithili",
"mi": "māori",
"mk": "macedonian",
"ml": "malayalam",
"mn": "mongolian",
"mni": "meitei",
"mr": "marathi",
"ms": "malay",
"mt": "maltese",
"my": "burmese",
"ne": "nepali",
"nl": "dutch",
"nn": "norwegian nynorsk",
"no": "norwegian",
"nso": "northern sotho",
"oc": "occitan",
"or": "odia",
"pa": "punjabi",
"pa-Arab": "punjabi (arabic script)",
"pcm": "nigerian pidgin",
"pl": "polish",
"ps": "pashto",
"pt": "portuguese",
"pt-PT": "portuguese (portugal)",
"qu": "quechua",
"quc": "k'iche'",
"rhg": "rohingya",
"rm": "romansh",
"ro": "romanian",
"ru": "russian",
"rw": "kinyarwanda",
"sa": "sanskrit",
"sat": "santali",
"sc": "sardinian",
"sd": "sindhi",
"si": "sinhala",
"sk": "slovak",
"sl": "slovenian",
"so": "somali",
"sq": "albanian",
"sr": "serbian",
"sr-Cyrl": "serbian (cyrillic)",
"sr-Cyrl-BA": "serbian (cyrillic, bosnia and herzegovina)",
"sr-Latn": "serbian (latin)",
"sr-Latn-BA": "serbian (latin, bosnia and herzegovina)",
"su": "sundanese",
"sv": "swedish",
"sw": "swahili",
"sw-KE": "swahili (kenya)",
"ta": "tamil",
"te": "telugu",
"tg": "tajik",
"th": "thai",
"ti": "tigrinya",
"tk": "turkmen",
"tn": "tswana",
"to": "tongan",
"tr": "turkish",
"tt": "tatar",
"ug": "uyghur",
"uk": "ukrainian",
"und": "undetermined",
"ur": "urdu",
"uz": "uzbek",
"vi": "vietnamese",
"wo": "wolof",
"xh": "xhosa",
"yi": "yiddish",
"yo": "yoruba",
"zh": "chinese",
"zh-Hans": "chinese (simplified)",
"zh-Hant": "chinese (traditional)",
"zu": "zulu"
}
Loading