Add Pinyin to Zhuyin conversion

placeTW · Aug 1, 2024 · 55a1832 · 55a1832
1 parent d0bad54
commit 55a1832
Show file tree

Hide file tree

Showing 2 changed files with 270 additions and 1 deletion.
diff --git a/commands/tocfl/chewing.py b/commands/tocfl/chewing.py
@@ -0,0 +1,262 @@
+# Pinyin initials (leading consonants) mapped to their Zhuyin symbols.
+# 〔zh〕, 〔ch〕 and 〔sh〕 share a first letter with 〔z〕, 〔c〕 and 〔s〕;
+# to_chewing() tries the longest spelling first, so the digraphs win.
+PINYIN_INITIALS = {
+    "b": "ㄅ",
+    "p": "ㄆ",
+    "m": "ㄇ",
+    "f": "ㄈ",
+    "d": "ㄉ",
+    "t": "ㄊ",
+    "n": "ㄋ",
+    "l": "ㄌ",
+    "g": "ㄍ",
+    "k": "ㄎ",
+    "h": "ㄏ",
+    "j": "ㄐ",
+    "q": "ㄑ",
+    "x": "ㄒ",
+    "zh": "ㄓ",
+    "ch": "ㄔ",
+    "sh": "ㄕ",
+    "r": "ㄖ",
+    "z": "ㄗ",
+    "c": "ㄘ",
+    "s": "ㄙ",
+}
+
+# Spellings that to_chewing() converts as a whole, without first splitting
+# off an initial.
+# In Hanyu Pinyin, 「ㄜ」 and 「ㄝ」 share the same letter 〔e〕
+# because the initials they can follow almost never overlap.
+# Only 「誒」 (「ㄝˋ」) collides with 「惡」 (「ㄜˋ」);
+# in that case 「ㄝ」 is spelt as 〔ê〕.
+PINYIN_ALONE = {
+    "zhi": "ㄓ",
+    "chi": "ㄔ",
+    "shi": "ㄕ",
+    "ri": "ㄖ",
+    "zi": "ㄗ",
+    "ci": "ㄘ",
+    "si": "ㄙ",
+    "a": "ㄚ",
+    "o": "ㄛ",
+    "e": "ㄜ",
+    "ê": "ㄝ",
+    "ai": "ㄞ",
+    "ei": "ㄟ",
+    "ao": "ㄠ",
+    "ou": "ㄡ",
+    "an": "ㄢ",
+    "en": "ㄣ",
+    "ang": "ㄤ",
+    "er": "ㄦ",
+    # In case of Erhua, 〔r〕 is added to the last character instead of 〔er〕
+    "r": "ㄦ",
+    "yi": "ㄧ",
+    "wu": "ㄨ",
+    "yu": "ㄩ",
+    # Combined
+    "ya": "ㄧㄚ",
+    "yo": "ㄧㄛ",
+    "ye": "ㄧㄝ",
+    "yai": "ㄧㄞ",
+    # The medial must be Bopomofo 「ㄧ」 (U+3127), not the CJK ideograph
+    # 「一」 (U+4E00), which looks alike but is a different character.
+    "yao": "ㄧㄠ",
+    "you": "ㄧㄡ",
+    "yan": "ㄧㄢ",
+    "yin": "ㄧㄣ",
+    "yang": "ㄧㄤ",
+    "ying": "ㄧㄥ",
+    "wa": "ㄨㄚ",
+    "wo": "ㄨㄛ",
+    "wai": "ㄨㄞ",
+    "wei": "ㄨㄟ",
+    "wan": "ㄨㄢ",
+    "wen": "ㄨㄣ",
+    "wang": "ㄨㄤ",
+    "weng": "ㄨㄥ",
+    "yue": "ㄩㄝ",
+    "yuan": "ㄩㄢ",
+    "yun": "ㄩㄣ",
+    "yong": "ㄩㄥ",
+    # v is used to replace ü in typing
+    "nü": "ㄋㄩ",
+    "lü": "ㄌㄩ",
+    "nv": "ㄋㄩ",
+    "lv": "ㄌㄩ",
+}
+
+# Medial glides that to_chewing() looks for between an initial and a final.
+# 〔v〕 is the keyboard substitute for 〔ü〕.
+PINYIN_CENTER = {
+    "i": "ㄧ",
+    "u": "ㄨ",
+    "ü": "ㄩ",
+    "v": "ㄩ",
+}
+
+# Simple finals, matched after the initial and optional medial.
+# Hanyu Pinyin uses 〔e〕 to represent both 「ㄜ」 and 「ㄝ」.
+# This is unambiguous because 「ㄝ」 only occurs in 「ㄩㄝ」 and 「ㄧㄝ」.
+PINYIN_FINALS = {
+    "a": "ㄚ",
+    "o": "ㄛ",
+    "e": "ㄜ",  # also ㄝ
+    "ai": "ㄞ",
+    "ei": "ㄟ",
+    "ao": "ㄠ",
+    "ou": "ㄡ",
+    "an": "ㄢ",
+    "en": "ㄣ",
+    "ang": "ㄤ",
+    "eng": "ㄥ",
+    "er": "ㄦ",
+}
+
+# Medial + final spellings that are contracted or irregular in Pinyin and
+# therefore cannot be assembled from PINYIN_CENTER + PINYIN_FINALS.
+# Entries marked "also" are ambiguous; to_chewing() picks the ㄩ reading
+# when the preceding letter is y, j, q or x.
+PINYIN_COMBINED = {
+    "iu": "ㄧㄡ",
+    "ian": "ㄧㄢ",
+    "in": "ㄧㄣ",
+    "iang": "ㄧㄤ",
+    "ing": "ㄧㄥ",
+    "ui": "ㄨㄟ",
+    "uan": "ㄨㄢ",  # also ㄩㄢ
+    "un": "ㄨㄣ",  # also ㄩㄣ
+    "uang": "ㄨㄤ",
+    "ong": "ㄨㄥ",
+    "ue": "ㄩㄝ",
+    "iong": "ㄩㄥ",
+}
+
+# Maps a tone-marked vowel to (bare vowel, Zhuyin tone mark).
+# Only 「a﹑o﹑e﹑i﹑u﹑ü」 carry tone diacritics in Hanyu Pinyin.
+# Neutral tones are not marked, so they have no entry here.
+# Breve forms (ă, ĕ, ...) are mapped to the third tone as well,
+# presumably to tolerate a breve typed in place of the caron.
+DIACRITIC_TO_BASE_AND_TONE = {
+    # ā (ɑ̄) ē ī ō ū ǖ
+    # á (ɑ́) é í ó ú ǘ
+    # ǎ (ɑ̌) ě ǐ ǒ ǔ ǚ
+    # à (ɑ̀) è ì ò ù ǜ
+    "ā": ("a", "¯"),
+    "á": ("a", "ˊ"),
+    "ǎ": ("a", "ˇ"),
+    "ă": ("a", "ˇ"),
+    "à": ("a", "ˋ"),
+    "ē": ("e", "¯"),
+    "é": ("e", "ˊ"),
+    "ě": ("e", "ˇ"),
+    "ĕ": ("e", "ˇ"),
+    "è": ("e", "ˋ"),
+    "ī": ("i", "¯"),
+    "í": ("i", "ˊ"),
+    "ǐ": ("i", "ˇ"),
+    "ĭ": ("i", "ˇ"),
+    "ì": ("i", "ˋ"),
+    "ō": ("o", "¯"),
+    "ó": ("o", "ˊ"),
+    "ǒ": ("o", "ˇ"),
+    "ŏ": ("o", "ˇ"),
+    "ò": ("o", "ˋ"),
+    "ū": ("u", "¯"),
+    "ú": ("u", "ˊ"),
+    "ǔ": ("u", "ˇ"),
+    "ŭ": ("u", "ˇ"),
+    "ù": ("u", "ˋ"),
+    "ǖ": ("ü", "¯"),
+    "ǘ": ("ü", "ˊ"),
+    "ǚ": ("ü", "ˇ"),
+    "ü̆": ("ü", "ˇ"),
+    "ǜ": ("ü", "ˋ"),
+    "ề": ("ê", "ˋ"),  # 「ㄝ」 could only possibly be the fourth tone
+}
+
+
+def to_chewing(pinyin: str) -> str:
+    """Convert Hanyu Pinyin with tone diacritics to Zhuyin (Bopomofo).
+
+    Every recognised syllable is written as its Zhuyin spelling followed by
+    a tone mark and a fullwidth space.  When no diacritic is pending for the
+    syllable, the neutral-tone mark 「˙」 is used.  Characters that are not
+    lowercase letters are copied to the output unchanged.
+
+    Args:
+        pinyin: Pinyin text, e.g. a single word or several syllables.
+
+    Returns:
+        The Zhuyin transcription.
+
+    Raises:
+        AssertionError: if a lowercase letter cannot start a known syllable,
+            or a syllable contains more finals than expected.  Raised
+            explicitly (not via ``assert``) so it still fires under ``-O``.
+    """
+
+    # Matches chewing from substrings
+    def match_chewing(string: str, index: int, target: dict[str, str]):
+        previous = string[index - 1] if index > 0 else ""
+        # Longest candidate first, so e.g. "ang" wins over "an" and "a"
+        for length in range(max(len(key) for key in target), 0, -1):
+            target_str = string[index : index + length]
+            result = target.get(target_str)
+            if not result:
+                continue
+            # Resolve spellings whose sound depends on the preceding letter
+            if target is PINYIN_COMBINED:
+                if previous in ("y", "j", "q", "x"):
+                    if target_str == "uan":
+                        result = "ㄩㄢ"
+                    elif target_str == "un":
+                        result = "ㄩㄣ"
+            elif target is PINYIN_CENTER:
+                # After j/q/x the letter 〔u〕 stands for ü (ju, qu, xu)
+                if target_str == "u" and previous in ("j", "q", "x"):
+                    result = "ㄩ"
+            elif target is PINYIN_FINALS:
+                # 〔e〕 after i/ü/y is 「ㄝ」 (ie, üe), not 「ㄜ」
+                if target_str == "e" and previous in ("y", "i", "ü", "v"):
+                    result = "ㄝ"
+            return (index + length, result)
+        return (index + 1, None)
+
+    new_word_starts = PINYIN_INITIALS | PINYIN_ALONE
+
+    # Ensure there are no trailing characters unable to form word
+    def forms_new_word(index: int) -> bool:
+        return (
+            index >= len(pinyin)
+            or not pinyin[index].isalpha()
+            or bool(match_chewing(pinyin, index, new_word_starts)[1])
+        )
+
+    # Strip the diacritics, remembering each tone mark and where it was.
+    # Every replacement is one character long, so positions stay valid.
+    tones = []
+    letters = []
+    for position, char in enumerate(pinyin):
+        base_and_tone = DIACRITIC_TO_BASE_AND_TONE.get(char)
+        if base_and_tone:
+            letters.append(base_and_tone[0])
+            tones.append((position, base_and_tone[1]))
+        else:
+            letters.append(char)
+    pinyin = "".join(letters)
+
+    parts = []
+    index = 0
+    while index < len(pinyin):
+
+        # Ignore special characters
+        if not (pinyin[index].isalpha() and pinyin[index].islower()):
+            parts.append(pinyin[index])
+            index += 1
+            continue
+
+        alone_end, alone = match_chewing(pinyin, index, PINYIN_ALONE)
+        # A bare 〔r〕 is Erhua only at the end of a syllable; directly
+        # before a vowel it is the initial 「ㄖ」 (ren, ran, rou, ...).
+        r_is_initial = (
+            pinyin[index:alone_end] == "r"
+            and alone_end < len(pinyin)
+            and pinyin[alone_end] in "aeiou"
+        )
+        if alone and not r_is_initial and forms_new_word(alone_end):
+            parts.append(alone)
+            index = alone_end
+
+        else:
+            end, initial = match_chewing(pinyin, index, PINYIN_INITIALS)
+            if not initial:
+                raise AssertionError(
+                    f"no Pinyin syllable starts at {pinyin[index:]!r}"
+                )
+            index = end
+            parts.append(initial)
+            end, combined = match_chewing(pinyin, index, PINYIN_COMBINED)
+            if combined:
+                index = end
+                parts.append(combined)
+            else:
+                end, center = match_chewing(pinyin, index, PINYIN_CENTER)
+                if center:
+                    parts.append(center)
+                    index = end
+                end, final = match_chewing(pinyin, index, PINYIN_FINALS)
+                finals_seen = 0
+                while final:
+                    # in case things explode causing an infinite loop
+                    if finals_seen >= 2:
+                        raise AssertionError(
+                            f"too many finals near {pinyin[index:]!r}"
+                        )
+                    parts.append(final)
+                    index = end
+                    end, final = match_chewing(pinyin, index, PINYIN_FINALS)
+                    finals_seen += 1
+
+        # Attach the pending tone if its vowel lies inside this syllable
+        if tones and tones[0][0] < index:
+            parts.append(tones.pop(0)[1])
+        else:
+            parts.append("˙")
+
+        parts.append("\u3000")  # Add a fullwidth space between words
+
+    return "".join(parts)
diff --git a/commands/tocfl/tocfl.py b/commands/tocfl/tocfl.py
@@ -6,6 +6,7 @@
 from modules.supabase import supabaseClient
 from random import randint
 from .consts import TOCFL_LEVELS_CHOICES, TOCFL_LEVELS
+from .chewing import to_chewing
 
 
 def register_commands(
@@ -56,10 +57,16 @@ async def tocfl_rand(
 def _create_word_embed(
     word: str, level: int, part_of_speech: str, pinyin: str
 ):
+    try:
+        chewing = to_chewing(pinyin)
+    except AssertionError:
+        chewing = "ERROR OCCURRED"
+
     embed = discord.Embed(
         title=word
     )  # ^ add description="desc" for translation
-    embed.add_field(name="Pronunciation", value=pinyin, inline=False)
+    embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False)
+    embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False)
     embed.add_field(
         name="Dictionary Reference",
         value=f"https://cdict.net/?q={word}",