From 8e82f6237a09c103172bdc3e810167e822af194b Mon Sep 17 00:00:00 2001
From: JuniorTux <school.shsps@gmail.com>
Date: Thu, 1 Aug 2024 21:45:46 +0800
Subject: [PATCH 1/2] Add Pinyin to Zhuyin conversion

---
 commands/tocfl/chewing.py | 267 ++++++++++++++++++++++++++++++++++++++
 commands/tocfl/tocfl.py   |   9 +-
 2 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 commands/tocfl/chewing.py

diff --git a/commands/tocfl/chewing.py b/commands/tocfl/chewing.py
new file mode 100644
index 0000000..3a4cda5
--- /dev/null
+++ b/commands/tocfl/chewing.py
@@ -0,0 +1,267 @@
+PINYIN_INITIALS = {  # Pinyin initial consonant -> Zhuyin symbol
+    "b": "ㄅ",
+    "p": "ㄆ",
+    "m": "ㄇ",
+    "f": "ㄈ",
+    "d": "ㄉ",
+    "t": "ㄊ",
+    "n": "ㄋ",
+    "l": "ㄌ",
+    "g": "ㄍ",
+    "k": "ㄎ",
+    "h": "ㄏ",
+    "j": "ㄐ",
+    "q": "ㄑ",
+    "x": "ㄒ",
+    "zh": "ㄓ",
+    "ch": "ㄔ",
+    "sh": "ㄕ",
+    "r": "ㄖ",
+    "z": "ㄗ",
+    "c": "ㄘ",
+    "s": "ㄙ",
+}
+
+# In Hanyu Pinyin, 「ㄜ」 and 「ㄝ」 share the same letter 〔e〕
+# because the initials they can follow almost never overlap.
+# Only 「誒」 (「ㄝˋ」) collides with 「惡」 (「ㄜˋ」);
+# in that case 「ㄝ」 is spelt as 〔ê〕.
+PINYIN_ALONE = {  # syllables spelt without a separate initial -> Zhuyin
+    "zhi": "ㄓ",
+    "chi": "ㄔ",
+    "shi": "ㄕ",
+    "ri": "ㄖ",
+    "zi": "ㄗ",
+    "ci": "ㄘ",
+    "si": "ㄙ",
+    "a": "ㄚ",
+    "o": "ㄛ",
+    "e": "ㄜ",
+    "ê": "ㄝ",
+    "ai": "ㄞ",
+    "ei": "ㄟ",
+    "ao": "ㄠ",
+    "ou": "ㄡ",
+    "an": "ㄢ",
+    "en": "ㄣ",
+    "ang": "ㄤ",
+    "er": "ㄦ",
+    "yi": "ㄧ",
+    "wu": "ㄨ",
+    "yu": "ㄩ",
+    # Combined
+    "ya": "ㄧㄚ",
+    "yo": "ㄧㄛ",
+    "ye": "ㄧㄝ",
+    "yai": "ㄧㄞ",
+    "yao": "ㄧㄠ",  # must be Bopomofo ㄧ (U+3127), not the CJK ideograph 一
+    "you": "ㄧㄡ",
+    "yan": "ㄧㄢ",
+    "yin": "ㄧㄣ",
+    "yang": "ㄧㄤ",
+    "ying": "ㄧㄥ",
+    "wa": "ㄨㄚ",
+    "wo": "ㄨㄛ",
+    "wai": "ㄨㄞ",
+    "wei": "ㄨㄟ",
+    "wan": "ㄨㄢ",
+    "wen": "ㄨㄣ",
+    "wang": "ㄨㄤ",
+    "weng": "ㄨㄥ",
+    "yue": "ㄩㄝ",
+    "yuan": "ㄩㄢ",
+    "yun": "ㄩㄣ",
+    "yong": "ㄩㄥ",
+    # v is used to replace ü in typing
+    "nü": "ㄋㄩ",
+    "lü": "ㄌㄩ",
+    "nv": "ㄋㄩ",
+    "lv": "ㄌㄩ",
+}
+
+PINYIN_CENTER = {  # medial letter tried between an initial and a final
+    "i": "ㄧ",
+    "u": "ㄨ",
+    "ü": "ㄩ",
+    "v": "ㄩ",  # typing substitute for ü
+}
+
+# The designers of Hanyu Pinyin used 〔e〕 to represent both 「ㄜ」 and 「ㄝ」.
+# This works because, after an initial, 「ㄝ」 only occurs in 「ㄩㄝ」 and 「ㄧㄝ」.
+PINYIN_FINALS = {  # simple final -> Zhuyin symbol
+    "a": "ㄚ",
+    "o": "ㄛ",
+    "e": "ㄜ",  # also ㄝ; match_chewing picks one from the preceding letter
+    "ai": "ㄞ",
+    "ei": "ㄟ",
+    "ao": "ㄠ",
+    "ou": "ㄡ",
+    "an": "ㄢ",
+    "en": "ㄣ",
+    "ang": "ㄤ",
+    "eng": "ㄥ",
+    "er": "ㄦ",
+}
+
+PINYIN_COMBINED = {  # medial + final spelling tried right after an initial
+    "iu": "ㄧㄡ",
+    "ian": "ㄧㄢ",
+    "in": "ㄧㄣ",
+    "iang": "ㄧㄤ",
+    "ing": "ㄧㄥ",
+    "ui": "ㄨㄟ",
+    "uan": "ㄨㄢ",  # also ㄩㄢ (after y/j/q/x, see match_chewing)
+    "un": "ㄨㄣ",  # also ㄩㄣ (after y/j/q/x, see match_chewing)
+    "uang": "ㄨㄤ",
+    "ong": "ㄨㄥ",
+    "ue": "ㄩㄝ",
+    "iong": "ㄩㄥ",
+}
+
+# Only 「a﹑o﹑e﹑i﹑u﹑ü」 carry tone diacritics in Hanyu Pinyin.
+# Neutral tones are not marked.
+DIACRITIC_TO_BASE_AND_TONE = {  # toned vowel -> (base vowel, tone mark)
+    # ā (ɑ̄) ē ī ō ū ǖ
+    # á (ɑ́) é í ó ú ǘ
+    # ǎ (ɑ̌) ě ǐ ǒ ǔ ǚ
+    # à (ɑ̀) è ì ò ù ǜ
+    "ā": ("a", "¯"),
+    "á": ("a", "ˊ"),
+    "ǎ": ("a", "ˇ"),
+    "ă": ("a", "ˇ"),
+    "à": ("a", "ˋ"),
+    "ē": ("e", "¯"),
+    "é": ("e", "ˊ"),
+    "ě": ("e", "ˇ"),
+    "ĕ": ("e", "ˇ"),
+    "è": ("e", "ˋ"),
+    "ī": ("i", "¯"),
+    "í": ("i", "ˊ"),
+    "ǐ": ("i", "ˇ"),
+    "ĭ": ("i", "ˇ"),
+    "ì": ("i", "ˋ"),
+    "ō": ("o", "¯"),
+    "ó": ("o", "ˊ"),
+    "ǒ": ("o", "ˇ"),
+    "ŏ": ("o", "ˇ"),
+    "ò": ("o", "ˋ"),
+    "ū": ("u", "¯"),
+    "ú": ("u", "ˊ"),
+    "ǔ": ("u", "ˇ"),
+    "ŭ": ("u", "ˇ"),
+    "ù": ("u", "ˋ"),
+    "ǖ": ("ü", "¯"),
+    "ǘ": ("ü", "ˊ"),
+    "ǚ": ("ü", "ˇ"),
+    "ü̆": ("ü", "ˇ"),  # NOTE(review): looks like a combining sequence (>1 code point); a per-character lookup would never hit it — confirm
+    "ǜ": ("ü", "ˋ"),
+    "ề": ("ê", "ˋ"),  # 「ㄝ」 could only possibly be the fourth tone
+}
+
+
+def to_chewing(pinyin: str) -> str:
+    # Remove leading and trailing spaces
+    pinyin = pinyin.strip()
+
+    # Matches chewing from substrings
+    def match_chewing(string: str, index: int, target: dict[str, str]):
+        global PINYIN_COMBINED, PINYIN_FINALS
+        # Substrings only to the maxium possible character amount
+        for i in range(max([len(i) for i in target.keys()]), 0, -1):
+            target_str = string[index : index + i]
+            result = target.get(target_str)
+            if result:
+                # Resolve duplicates
+                if target == PINYIN_COMBINED:
+                    if target_str == "uan" and string[index - 1] in [
+                        "y",
+                        "j",
+                        "q",
+                        "x",
+                    ]:
+                        result = "ㄩㄢ"
+                    elif target_str == "un" and string[index - 1] in [
+                        "y",
+                        "j",
+                        "q",
+                        "x",
+                    ]:
+                        result = "ㄩㄣ"
+                elif target == PINYIN_FINALS:
+                    if target_str == "e" and string[index - 1] == "y":
+                        result = "ㄝ"
+
+                return (index + i, result)
+        return (index + 1, None)
+
+    # Ensure there are no trailing characters unable to form word
+    def forms_new_word(index: int):
+        nonlocal pinyin
+        global PINYIN_INITIALS, PINYIN_ALONE
+        return (
+            match_chewing(pinyin, index, PINYIN_INITIALS | PINYIN_ALONE)[1]
+            or index >= len(pinyin)
+            or not pinyin[index].isalpha()
+        )
+
+    # Temporarily store the chewing tones and original index
+    tones = []
+    for index in range(len(pinyin)):
+        value = DIACRITIC_TO_BASE_AND_TONE.get(pinyin[index])
+        if not value:
+            continue
+        pinyin = pinyin[:index] + value[0] + pinyin[index + 1 :]
+        tones.append((index, value[1]))
+
+    chewing = ""
+    index = 0
+    while index < len(pinyin):
+
+        # Ignore special characters
+        if not (pinyin[index].isalpha() and pinyin[index].islower()):
+            chewing += pinyin[index]
+            index += 1
+            continue
+
+        # In case of Erhua, 〔r〕 is added to the last character instead of 〔er〕
+        if index == len(pinyin) - 1 and pinyin[index] == "r":
+            chewing += "ㄦ¯"
+            break
+
+        res = match_chewing(pinyin, index, PINYIN_ALONE)
+        if res[1] and forms_new_word(res[0]):
+            chewing += res[1]
+            index = res[0]
+
+        else:
+            initial = match_chewing(pinyin, index, PINYIN_INITIALS)
+            assert initial[1]
+            index = initial[0]
+            chewing += initial[1]
+            combined = match_chewing(pinyin, index, PINYIN_COMBINED)
+            if combined[1]:
+                index = combined[0]
+                chewing += combined[1]
+            else:
+                center = match_chewing(pinyin, index, PINYIN_CENTER)
+                if center[1]:
+                    chewing += center[1]
+                    index = center[0]
+                final = match_chewing(pinyin, index, PINYIN_FINALS)
+                safety_ctr = 0
+                while final[1]:
+                    # incase things explode causing infinite loop
+                    assert safety_ctr < 2
+                    chewing += final[1]
+                    index = final[0]
+                    final = match_chewing(pinyin, index, PINYIN_FINALS)
+                    safety_ctr += 1
+
+        if len(tones) and tones[0][0] < index:
+            chewing += tones.pop(0)[1]
+        else:
+            chewing += "˙"
+
+        chewing += "　"  # Add a fullwidth space between words
+
+    return chewing
diff --git a/commands/tocfl/tocfl.py b/commands/tocfl/tocfl.py
index d106bde..3677e7e 100644
--- a/commands/tocfl/tocfl.py
+++ b/commands/tocfl/tocfl.py
@@ -6,6 +6,7 @@
 from modules.supabase import supabaseClient
 from random import randint
 from .consts import TOCFL_LEVELS_CHOICES, TOCFL_LEVELS
+from .chewing import to_chewing
 
 
 def register_commands(
@@ -56,10 +57,16 @@ async def tocfl_rand(
 def _create_word_embed(
     word: str, level: int, part_of_speech: str, pinyin: str
 ):
+    try:
+        chewing = to_chewing(pinyin)
+    except AssertionError:
+        chewing = "ERROR OCCURRED"
+
     embed = discord.Embed(
         title=word
     )  # ^ add description="desc" for translation
-    embed.add_field(name="Pronunciation", value=pinyin, inline=False)
+    embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False)
+    embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False)
     embed.add_field(
         name="Dictionary Reference",
         value=f"https://cdict.net/?q={word}",

From d38ac42bb7ed5f57c4f1a5b900099e3fed96218e Mon Sep 17 00:00:00 2001
From: JuniorTux <school.shsps@gmail.com>
Date: Thu, 1 Aug 2024 23:04:27 +0800
Subject: [PATCH 2/2] Address code review

---
 commands/tocfl/chewing.py | 107 ++++++++++++++++++--------------------
 commands/tocfl/tocfl.py   |   8 +--
 2 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/commands/tocfl/chewing.py b/commands/tocfl/chewing.py
index 3a4cda5..96ea3ae 100644
--- a/commands/tocfl/chewing.py
+++ b/commands/tocfl/chewing.py
@@ -159,50 +159,51 @@
 }
 
 
-def to_chewing(pinyin: str) -> str:
-    # Remove leading and trailing spaces
-    pinyin = pinyin.strip()
+def match_chewing(string: str, index: int, target: dict[str, str]):
+    """Match the longest key of *target* that starts at string[index].
+
+    Returns ``(next_index, zhuyin)`` on a match, else ``(index + 1, None)``.
+    Keys in PINYIN_COMBINED / PINYIN_FINALS that spell two different sounds
+    are disambiguated by the letter just before the match.
+    """
+    # Letter before the match; "" at index 0 so string[-1] is never consulted.
+    prev = string[index - 1] if index > 0 else ""
+    # Try substrings from the longest possible key down to one character.
+    for size in range(max(len(key) for key in target), 0, -1):
+        target_str = string[index : index + size]
+        result = target.get(target_str)
+        if not result:
+            continue
+        # Resolve duplicates
+        if target is PINYIN_COMBINED:
+            # After y/j/q/x the written "u" stands for ü (ㄩ).
+            if target_str == "uan" and prev in ("y", "j", "q", "x"):
+                result = "ㄩㄢ"
+            elif target_str == "un" and prev in ("y", "j", "q", "x"):
+                result = "ㄩㄣ"
+        elif target is PINYIN_FINALS:
+            # "e" after y, i or ü (typed v) is ㄝ, not ㄜ: jie -> ㄐㄧㄝ.
+            if target_str == "e" and prev in ("y", "i", "ü", "v"):
+                result = "ㄝ"
+
+        return (index + size, result)
+    # Nothing matched: report None and step one character forward.
+    return (index + 1, None)
 
-    # Matches chewing from substrings
-    def match_chewing(string: str, index: int, target: dict[str, str]):
-        global PINYIN_COMBINED, PINYIN_FINALS
-        # Substrings only to the maxium possible character amount
-        for i in range(max([len(i) for i in target.keys()]), 0, -1):
-            target_str = string[index : index + i]
-            result = target.get(target_str)
-            if result:
-                # Resolve duplicates
-                if target == PINYIN_COMBINED:
-                    if target_str == "uan" and string[index - 1] in [
-                        "y",
-                        "j",
-                        "q",
-                        "x",
-                    ]:
-                        result = "ㄩㄢ"
-                    elif target_str == "un" and string[index - 1] in [
-                        "y",
-                        "j",
-                        "q",
-                        "x",
-                    ]:
-                        result = "ㄩㄣ"
-                elif target == PINYIN_FINALS:
-                    if target_str == "e" and string[index - 1] == "y":
-                        result = "ㄝ"
 
-                return (index + i, result)
-        return (index + 1, None)
+def forms_new_word(pinyin: str, index: int):
+    """Tell whether the text at *index* can begin a new syllable or ends a word."""
+    # Truthy if an initial or standalone syllable matches at *index*, or if
+    # *index* is past the end of *pinyin*, or if that character is no letter.
+    syllable_starts = PINYIN_INITIALS | PINYIN_ALONE
+    if matched := match_chewing(pinyin, index, syllable_starts)[1]:
+        return matched
+    return index >= len(pinyin) or not pinyin[index].isalpha()
 
-    # Ensure there are no trailing characters unable to form word
-    def forms_new_word(index: int):
-        nonlocal pinyin
-        global PINYIN_INITIALS, PINYIN_ALONE
-        return (
-            match_chewing(pinyin, index, PINYIN_INITIALS | PINYIN_ALONE)[1]
-            or index >= len(pinyin)
-            or not pinyin[index].isalpha()
-        )
+
+def to_chewing(pinyin: str) -> str:
+    # Remove leading and trailing spaces
+    pinyin = pinyin.strip()
 
     # Temporarily store the chewing tones and original index
     tones = []
@@ -228,37 +229,33 @@ def forms_new_word(index: int):
             chewing += "ㄦ¯"
             break
 
+        # Check matches for independent words
         res = match_chewing(pinyin, index, PINYIN_ALONE)
-        if res[1] and forms_new_word(res[0]):
-            chewing += res[1]
+        if res[1] and forms_new_word(pinyin, res[0]):
+            chewing += res[1] # ㄧㄚ
             index = res[0]
 
         else:
             initial = match_chewing(pinyin, index, PINYIN_INITIALS)
-            assert initial[1]
+            assert initial[1], f"Failed to match initial in '{pinyin}' at index {index - 1}"
             index = initial[0]
-            chewing += initial[1]
+            chewing += initial[1] # ㄍ
             combined = match_chewing(pinyin, index, PINYIN_COMBINED)
             if combined[1]:
                 index = combined[0]
-                chewing += combined[1]
+                chewing += combined[1] # ㄨㄤ
             else:
                 center = match_chewing(pinyin, index, PINYIN_CENTER)
                 if center[1]:
-                    chewing += center[1]
+                    chewing += center[1] # ㄍㄨ
                     index = center[0]
                 final = match_chewing(pinyin, index, PINYIN_FINALS)
-                safety_ctr = 0
-                while final[1]:
-                    # incase things explode causing infinite loop
-                    assert safety_ctr < 2
-                    chewing += final[1]
+                if final[1]:
+                    chewing += final[1] # ㄍㄨㄛ
                     index = final[0]
-                    final = match_chewing(pinyin, index, PINYIN_FINALS)
-                    safety_ctr += 1
 
         if len(tones) and tones[0][0] < index:
-            chewing += tones.pop(0)[1]
+            chewing += tones.pop(0)[1] # ㄍㄨㄛˊ
         else:
             chewing += "˙"
 
diff --git a/commands/tocfl/tocfl.py b/commands/tocfl/tocfl.py
index 3677e7e..716273e 100644
--- a/commands/tocfl/tocfl.py
+++ b/commands/tocfl/tocfl.py
@@ -59,14 +59,16 @@ def _create_word_embed(
 ):
     try:
         chewing = to_chewing(pinyin)
-    except AssertionError:
-        chewing = "ERROR OCCURRED"
+    except AssertionError as e:
+        print("Error occurred during chewing conversion: ", e)
+        chewing = None
 
     embed = discord.Embed(
         title=word
     )  # ^ add description="desc" for translation
     embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False)
-    embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False)
+    if chewing:
+         embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False)
     embed.add_field(
         name="Dictionary Reference",
         value=f"https://cdict.net/?q={word}",