From 8e82f6237a09c103172bdc3e810167e822af194b Mon Sep 17 00:00:00 2001 From: JuniorTux Date: Thu, 1 Aug 2024 21:45:46 +0800 Subject: [PATCH 1/2] Add Pinyin to Zhuyin conversion --- commands/tocfl/chewing.py | 267 ++++++++++++++++++++++++++++++++++++++ commands/tocfl/tocfl.py | 9 +- 2 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 commands/tocfl/chewing.py diff --git a/commands/tocfl/chewing.py b/commands/tocfl/chewing.py new file mode 100644 index 0000000..3a4cda5 --- /dev/null +++ b/commands/tocfl/chewing.py @@ -0,0 +1,267 @@ +PINYIN_INITIALS = { + "b": "ㄅ", + "p": "ㄆ", + "m": "ㄇ", + "f": "ㄈ", + "d": "ㄉ", + "t": "ㄊ", + "n": "ㄋ", + "l": "ㄌ", + "g": "ㄍ", + "k": "ㄎ", + "h": "ㄏ", + "j": "ㄐ", + "q": "ㄑ", + "x": "ㄒ", + "zh": "ㄓ", + "ch": "ㄔ", + "sh": "ㄕ", + "r": "ㄖ", + "z": "ㄗ", + "c": "ㄘ", + "s": "ㄙ", +} + +# In Hanyu Pinyin, 「ㄜ」 and 「ㄝ」 share the same character 〔e〕 +# because the possible initials are almost different. +# Only 「誒」 (「ㄝˋ」) interferes with 「惡」(「ㄜˋ」), +# in this case 「ㄝ」 is spelt as 〔ê〕 +PINYIN_ALONE = { + "zhi": "ㄓ", + "chi": "ㄔ", + "shi": "ㄕ", + "ri": "ㄖ", + "zi": "ㄗ", + "ci": "ㄘ", + "si": "ㄙ", + "a": "ㄚ", + "o": "ㄛ", + "e": "ㄜ", + "ê": "ㄝ", + "ai": "ㄞ", + "ei": "ㄟ", + "ao": "ㄠ", + "ou": "ㄡ", + "an": "ㄢ", + "en": "ㄣ", + "ang": "ㄤ", + "er": "ㄦ", + "yi": "ㄧ", + "wu": "ㄨ", + "yu": "ㄩ", + # Combined + "ya": "ㄧㄚ", + "yo": "ㄧㄛ", + "ye": "ㄧㄝ", + "yai": "ㄧㄞ", + "yao": "ㄧㄠ", + "you": "ㄧㄡ", + "yan": "ㄧㄢ", + "yin": "ㄧㄣ", + "yang": "ㄧㄤ", + "ying": "ㄧㄥ", + "wa": "ㄨㄚ", + "wo": "ㄨㄛ", + "wai": "ㄨㄞ", + "wei": "ㄨㄟ", + "wan": "ㄨㄢ", + "wen": "ㄨㄣ", + "wang": "ㄨㄤ", + "weng": "ㄨㄥ", + "yue": "ㄩㄝ", + "yuan": "ㄩㄢ", + "yun": "ㄩㄣ", + "yong": "ㄩㄥ", + # v is used to replace ü in typing + "nü": "ㄋㄩ", + "lü": "ㄌㄩ", + "nv": "ㄋㄩ", + "lv": "ㄌㄩ", +} + +PINYIN_CENTER = { + "i": "ㄧ", + "u": "ㄨ", + "ü": "ㄩ", + "v": "ㄩ", +} + +# The designer of Hanyu Pinyin used e to represent both 「ㄜ」 and 「ㄝ」. 
+# This is because 「ㄝ」 could only be used in 「ㄩㄝ」 and 「ㄧㄝ」 +PINYIN_FINALS = { + "a": "ㄚ", + "o": "ㄛ", + "e": "ㄜ", # also ㄝ + "ai": "ㄞ", + "ei": "ㄟ", + "ao": "ㄠ", + "ou": "ㄡ", + "an": "ㄢ", + "en": "ㄣ", + "ang": "ㄤ", + "eng": "ㄥ", + "er": "ㄦ", +} + +PINYIN_COMBINED = { + "iu": "ㄧㄡ", + "ian": "ㄧㄢ", + "in": "ㄧㄣ", + "iang": "ㄧㄤ", + "ing": "ㄧㄥ", + "ui": "ㄨㄟ", + "uan": "ㄨㄢ", # also ㄩㄢ + "un": "ㄨㄣ", # also ㄩㄣ + "uang": "ㄨㄤ", + "ong": "ㄨㄥ", + "ue": "ㄩㄝ", + "iong": "ㄩㄥ", +} + +# Only 「a﹑o﹑e﹑i﹑u﹑ü」 are added diacritics in Hanyu Pinyin +# Neutral tones are not labeled +DIACRITIC_TO_BASE_AND_TONE = { + # ā (ɑ̄) ē ī ō ū ǖ + # á (ɑ́) é í ó ú ǘ + # ǎ (ɑ̌) ě ǐ ǒ ǔ ǚ + # à (ɑ̀) è ì ò ù ǜ + "ā": ("a", "¯"), + "á": ("a", "ˊ"), + "ǎ": ("a", "ˇ"), + "ă": ("a", "ˇ"), + "à": ("a", "ˋ"), + "ē": ("e", "¯"), + "é": ("e", "ˊ"), + "ě": ("e", "ˇ"), + "ĕ": ("e", "ˇ"), + "è": ("e", "ˋ"), + "ī": ("i", "¯"), + "í": ("i", "ˊ"), + "ǐ": ("i", "ˇ"), + "ĭ": ("i", "ˇ"), + "ì": ("i", "ˋ"), + "ō": ("o", "¯"), + "ó": ("o", "ˊ"), + "ǒ": ("o", "ˇ"), + "ŏ": ("o", "ˇ"), + "ò": ("o", "ˋ"), + "ū": ("u", "¯"), + "ú": ("u", "ˊ"), + "ǔ": ("u", "ˇ"), + "ŭ": ("u", "ˇ"), + "ù": ("u", "ˋ"), + "ǖ": ("ü", "¯"), + "ǘ": ("ü", "ˊ"), + "ǚ": ("ü", "ˇ"), + "ü̆": ("ü", "ˇ"), + "ǜ": ("ü", "ˋ"), + "ề": ("ê", "ˋ"), # 「ㄝ」 could only possibly be the fourth tone +} + + +def to_chewing(pinyin: str) -> str: + # Remove leading and trailing spaces + pinyin = pinyin.strip() + + # Matches chewing from substrings + def match_chewing(string: str, index: int, target: dict[str, str]): + global PINYIN_COMBINED, PINYIN_FINALS + # Substrings only to the maxium possible character amount + for i in range(max([len(i) for i in target.keys()]), 0, -1): + target_str = string[index : index + i] + result = target.get(target_str) + if result: + # Resolve duplicates + if target == PINYIN_COMBINED: + if target_str == "uan" and string[index - 1] in [ + "y", + "j", + "q", + "x", + ]: + result = "ㄩㄢ" + elif target_str == "un" and string[index - 1] in [ + "y", 
"j", + "q", + "x", + ]: + result = "ㄩㄣ" + elif target == PINYIN_FINALS: + if target_str == "e" and string[index - 1] == "y": + result = "ㄝ" + + return (index + i, result) + return (index + 1, None) + + # Ensure there are no trailing characters unable to form word + def forms_new_word(index: int): + nonlocal pinyin + global PINYIN_INITIALS, PINYIN_ALONE + return ( + match_chewing(pinyin, index, PINYIN_INITIALS | PINYIN_ALONE)[1] + or index >= len(pinyin) + or not pinyin[index].isalpha() + ) + + # Temporarily store the chewing tones and original index + tones = [] + for index in range(len(pinyin)): + value = DIACRITIC_TO_BASE_AND_TONE.get(pinyin[index]) + if not value: + continue + pinyin = pinyin[:index] + value[0] + pinyin[index + 1 :] + tones.append((index, value[1])) + + chewing = "" + index = 0 + while index < len(pinyin): + + # Ignore special characters + if not (pinyin[index].isalpha() and pinyin[index].islower()): + chewing += pinyin[index] + index += 1 + continue + + # In case of Erhua, 〔r〕 is adeed to the last character instead of 〔er〕 + if index == len(pinyin) - 1 and pinyin[index] == "r": + chewing += "ㄦ¯" + break + + res = match_chewing(pinyin, index, PINYIN_ALONE) + if res[1] and forms_new_word(res[0]): + chewing += res[1] + index = res[0] + + else: + initial = match_chewing(pinyin, index, PINYIN_INITIALS) + assert initial[1] + index = initial[0] + chewing += initial[1] + combined = match_chewing(pinyin, index, PINYIN_COMBINED) + if combined[1]: + index = combined[0] + chewing += combined[1] + else: + center = match_chewing(pinyin, index, PINYIN_CENTER) + if center[1]: + chewing += center[1] + index = center[0] + final = match_chewing(pinyin, index, PINYIN_FINALS) + safety_ctr = 0 + while final[1]: + # incase things explode causing infinite loop + assert safety_ctr < 2 + chewing += final[1] + index = final[0] + final = match_chewing(pinyin, index, PINYIN_FINALS) + safety_ctr += 1 + + if len(tones) and tones[0][0] < index: + chewing += tones.pop(0)[1] + 
else: + chewing += "˙" + + chewing += " " # Add a fullwidth space between words + + return chewing diff --git a/commands/tocfl/tocfl.py b/commands/tocfl/tocfl.py index d106bde..3677e7e 100644 --- a/commands/tocfl/tocfl.py +++ b/commands/tocfl/tocfl.py @@ -6,6 +6,7 @@ from modules.supabase import supabaseClient from random import randint from .consts import TOCFL_LEVELS_CHOICES, TOCFL_LEVELS +from .chewing import to_chewing def register_commands( @@ -56,10 +57,16 @@ async def tocfl_rand( def _create_word_embed( word: str, level: int, part_of_speech: str, pinyin: str ): + try: + chewing = to_chewing(pinyin) + except AssertionError: + chewing = "ERROR OCCURRED" + embed = discord.Embed( title=word ) # ^ add description="desc" for translation - embed.add_field(name="Pronunciation", value=pinyin, inline=False) + embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False) + embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False) embed.add_field( name="Dictionary Reference", value=f"https://cdict.net/?q={word}", From d38ac42bb7ed5f57c4f1a5b900099e3fed96218e Mon Sep 17 00:00:00 2001 From: JuniorTux Date: Thu, 1 Aug 2024 23:04:27 +0800 Subject: [PATCH 2/2] Address code review --- commands/tocfl/chewing.py | 107 ++++++++++++++++++-------------------- commands/tocfl/tocfl.py | 8 +-- 2 files changed, 57 insertions(+), 58 deletions(-) diff --git a/commands/tocfl/chewing.py b/commands/tocfl/chewing.py index 3a4cda5..96ea3ae 100644 --- a/commands/tocfl/chewing.py +++ b/commands/tocfl/chewing.py @@ -159,50 +159,51 @@ } -def to_chewing(pinyin: str) -> str: - # Remove leading and trailing spaces - pinyin = pinyin.strip() +# Matches chewing from substrings +def match_chewing(string: str, index: int, target: dict[str, str]): + global PINYIN_COMBINED, PINYIN_FINALS + # Substrings only to the maxium possible character amount + for i in range(max([len(i) for i in target.keys()]), 0, -1): + target_str = string[index : index + i] + result = 
target.get(target_str) + if result: + # Resolve duplicates + if target == PINYIN_COMBINED: + if target_str == "uan" and string[index - 1] in [ + "y", + "j", + "q", + "x", + ]: + result = "ㄩㄢ" + elif target_str == "un" and string[index - 1] in [ + "y", + "j", + "q", + "x", + ]: + result = "ㄩㄣ" + elif target == PINYIN_FINALS: + if target_str == "e" and string[index - 1] == "y": + result = "ㄝ" + + return (index + i, result) + return (index + 1, None) - # Matches chewing from substrings - def match_chewing(string: str, index: int, target: dict[str, str]): - global PINYIN_COMBINED, PINYIN_FINALS - # Substrings only to the maxium possible character amount - for i in range(max([len(i) for i in target.keys()]), 0, -1): - target_str = string[index : index + i] - result = target.get(target_str) - if result: - # Resolve duplicates - if target == PINYIN_COMBINED: - if target_str == "uan" and string[index - 1] in [ - "y", - "j", - "q", - "x", - ]: - result = "ㄩㄢ" - elif target_str == "un" and string[index - 1] in [ - "y", - "j", - "q", - "x", - ]: - result = "ㄩㄣ" - elif target == PINYIN_FINALS: - if target_str == "e" and string[index - 1] == "y": - result = "ㄝ" - return (index + i, result) - return (index + 1, None) +# Ensure there are no trailing characters unable to form word +def forms_new_word(pinyin: str, index: int): + global PINYIN_INITIALS, PINYIN_ALONE + return ( + match_chewing(pinyin, index, PINYIN_INITIALS | PINYIN_ALONE)[1] + or index >= len(pinyin) + or not pinyin[index].isalpha() + ) - # Ensure there are no trailing characters unable to form word - def forms_new_word(index: int): - nonlocal pinyin - global PINYIN_INITIALS, PINYIN_ALONE - return ( - match_chewing(pinyin, index, PINYIN_INITIALS | PINYIN_ALONE)[1] - or index >= len(pinyin) - or not pinyin[index].isalpha() - ) + +def to_chewing(pinyin: str) -> str: + # Remove leading and trailing spaces + pinyin = pinyin.strip() # Temporarily store the chewing tones and original index tones = [] @@ -228,37 +229,33 @@ 
def forms_new_word(index: int): chewing += "ㄦ¯" break + # Check matches for independent words res = match_chewing(pinyin, index, PINYIN_ALONE) - if res[1] and forms_new_word(res[0]): - chewing += res[1] + if res[1] and forms_new_word(pinyin, res[0]): + chewing += res[1] # ㄧㄚ index = res[0] else: initial = match_chewing(pinyin, index, PINYIN_INITIALS) - assert initial[1] + assert initial[1], f"Failed to match initial in '{pinyin}' at index {index - 1}" index = initial[0] - chewing += initial[1] + chewing += initial[1] # ㄍ combined = match_chewing(pinyin, index, PINYIN_COMBINED) if combined[1]: index = combined[0] - chewing += combined[1] + chewing += combined[1] # ㄨㄤ else: center = match_chewing(pinyin, index, PINYIN_CENTER) if center[1]: - chewing += center[1] + chewing += center[1] # ㄍㄨ index = center[0] final = match_chewing(pinyin, index, PINYIN_FINALS) - safety_ctr = 0 - while final[1]: - # incase things explode causing infinite loop - assert safety_ctr < 2 - chewing += final[1] + if final[1]: + chewing += final[1] # ㄍㄨㄛ index = final[0] - final = match_chewing(pinyin, index, PINYIN_FINALS) - safety_ctr += 1 if len(tones) and tones[0][0] < index: - chewing += tones.pop(0)[1] + chewing += tones.pop(0)[1] # ㄍㄨㄛˊ else: chewing += "˙" diff --git a/commands/tocfl/tocfl.py b/commands/tocfl/tocfl.py index 3677e7e..716273e 100644 --- a/commands/tocfl/tocfl.py +++ b/commands/tocfl/tocfl.py @@ -59,14 +59,16 @@ def _create_word_embed( ): try: chewing = to_chewing(pinyin) - except AssertionError: - chewing = "ERROR OCCURRED" + except AssertionError as e: + print("Error occurred during chewing conversion: ", e) + chewing = None embed = discord.Embed( title=word ) # ^ add description="desc" for translation embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False) - embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False) + if chewing: + embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False) embed.add_field( 
name="Dictionary Reference", value=f"https://cdict.net/?q={word}",