Skip to content

Commit

Permalink
Add Pinyin to Zhuyin conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
JuniorSuperTux committed Aug 1, 2024
1 parent d0bad54 commit 55a1832
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 1 deletion.
262 changes: 262 additions & 0 deletions commands/tocfl/chewing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
# Pinyin initial consonant -> Zhuyin symbol, listed in the same order on both
# sides. The digraphs zh/ch/sh share their first letter with z/c/s, which is
# why to_chewing's matcher tries the longest key first.
PINYIN_INITIALS = dict(
    zip(
        "b p m f d t n l g k h j q x zh ch sh r z c s".split(),
        "ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙ",
    )
)

# Spellings that to_chewing converts as a whole syllable, without splitting
# them into initial + rime: the seven "zhi/chi/..." syllables whose Zhuyin is
# the initial symbol alone, vowel-initial syllables, and the y-/w- spellings.
#
# In Hanyu Pinyin, 「ㄜ」 and 「ㄝ」 share the same letter 〔e〕
# because they almost never follow the same initial.
# Only 「誒」 (「ㄝˋ」) collides with 「惡」 (「ㄜˋ」);
# in that case 「ㄝ」 is spelt 〔ê〕.
PINYIN_ALONE = {
    "zhi": "ㄓ",
    "chi": "ㄔ",
    "shi": "ㄕ",
    "ri": "ㄖ",
    "zi": "ㄗ",
    "ci": "ㄘ",
    "si": "ㄙ",
    "a": "ㄚ",
    "o": "ㄛ",
    "e": "ㄜ",
    "ê": "ㄝ",
    "ai": "ㄞ",
    "ei": "ㄟ",
    "ao": "ㄠ",
    "ou": "ㄡ",
    "an": "ㄢ",
    "en": "ㄣ",
    "ang": "ㄤ",
    "er": "ㄦ",
    # In case of Erhua, 〔r〕 is added to the last character instead of 〔er〕
    "r": "ㄦ",
    "yi": "ㄧ",
    "wu": "ㄨ",
    "yu": "ㄩ",
    # Combined
    "ya": "ㄧㄚ",
    "yo": "ㄧㄛ",
    "ye": "ㄧㄝ",
    "yai": "ㄧㄞ",
    # The first symbol must be Bopomofo ㄧ (U+3127), not the look-alike CJK
    # ideograph 一 (U+4E00).
    "yao": "ㄧㄠ",
    "you": "ㄧㄡ",
    "yan": "ㄧㄢ",
    "yin": "ㄧㄣ",
    "yang": "ㄧㄤ",
    "ying": "ㄧㄥ",
    "wa": "ㄨㄚ",
    "wo": "ㄨㄛ",
    "wai": "ㄨㄞ",
    "wei": "ㄨㄟ",
    "wan": "ㄨㄢ",
    "wen": "ㄨㄣ",
    "wang": "ㄨㄤ",
    "weng": "ㄨㄥ",
    "yue": "ㄩㄝ",
    "yuan": "ㄩㄢ",
    "yun": "ㄩㄣ",
    "yong": "ㄩㄥ",
    # v is used to replace ü in typing
    "nü": "ㄋㄩ",
    "lü": "ㄌㄩ",
    "nv": "ㄋㄩ",
    "lv": "ㄌㄩ",
}

# Medial (glide) letters that may sit between an initial and a final.
PINYIN_CENTER = dict(
    [
        ("i", "ㄧ"),
        ("u", "ㄨ"),
        ("ü", "ㄩ"),
        ("v", "ㄩ"),  # v is the typed stand-in for ü
    ]
)

# Simple finals, keys and symbols listed in matching order.
# Hanyu Pinyin writes both 「ㄜ」 and 「ㄝ」 as 〔e〕; this works because
# 「ㄝ」 can only appear in 「ㄩㄝ」 and 「ㄧㄝ」. The table stores 「ㄜ」 for
# 〔e〕 and to_chewing swaps in 「ㄝ」 where the context calls for it.
PINYIN_FINALS = dict(
    zip(
        "a o e ai ei ao ou an en ang eng er".split(),
        "ㄚㄛㄜㄞㄟㄠㄡㄢㄣㄤㄥㄦ",
    )
)

# Rimes that to_chewing matches as one unit right after the initial, because
# their Pinyin spelling does not map letter-by-letter onto medial + final
# (e.g. 〔iu〕 is ㄧㄡ and 〔ong〕 is ㄨㄥ).
PINYIN_COMBINED = {
    "iu": "ㄧㄡ",
    # 〔e〕 after a medial is ㄝ, not the ㄜ that the plain-final table gives.
    "ie": "ㄧㄝ",
    "ian": "ㄧㄢ",
    "in": "ㄧㄣ",
    "iang": "ㄧㄤ",
    "ing": "ㄧㄥ",
    "ui": "ㄨㄟ",
    "uan": "ㄨㄢ",  # also ㄩㄢ (after j/q/x, resolved in to_chewing)
    "un": "ㄨㄣ",  # also ㄩㄣ (after j/q/x, resolved in to_chewing)
    "uang": "ㄨㄤ",
    "ong": "ㄨㄥ",
    "ue": "ㄩㄝ",
    "üe": "ㄩㄝ",
    "ve": "ㄩㄝ",  # v typed in place of ü
    "iong": "ㄩㄥ",
}

# Tone-marked letter -> (plain letter, Zhuyin tone mark).
# Only 「a﹑o﹑e﹑i﹑u﹑ü」 carry diacritics in Hanyu Pinyin; neutral tones are
# written without one, so they have no entry here.
#
# Each row lists one vowel's spellings for the first, second, third and
# fourth tone. The third tone appears twice: once with a caron and once with
# the look-alike breve.
DIACRITIC_TO_BASE_AND_TONE = {
    marked: (base, tone)
    for base, variants in (
        ("a", ("ā", "á", "ǎ", "ă", "à")),
        ("e", ("ē", "é", "ě", "ĕ", "è")),
        ("i", ("ī", "í", "ǐ", "ĭ", "ì")),
        ("o", ("ō", "ó", "ǒ", "ŏ", "ò")),
        ("u", ("ū", "ú", "ǔ", "ŭ", "ù")),
        ("ü", ("ǖ", "ǘ", "ǚ", "ü̆", "ǜ")),
    )
    for marked, tone in zip(variants, ("¯", "ˊ", "ˇ", "ˇ", "ˋ"))
}
# 「ㄝ」 could only possibly be the fourth tone
DIACRITIC_TO_BASE_AND_TONE["ề"] = ("ê", "ˋ")


# Initials after which a written 〔u〕 stands for ü (ㄩ), as in ju/qu/xu.
# "y" is kept because the original uan/un disambiguation listed it as well.
_U_IS_YU_AFTER = ("j", "q", "x", "y")
# Letters after which a bare final 〔e〕 is ㄝ rather than ㄜ (ie, üe/ve, ye).
_E_IS_EH_AFTER = ("i", "ü", "v", "y")
# Letters that can begin a rime; used to tell the erhua 〔r〕 from the
# initial 〔r〕.
_RIME_LETTERS = ("a", "o", "e", "i", "u", "ü", "v", "ê")
# Stand-alone spellings that become the start of a longer syllable when 〔e〕
# follows (nüe, lüe).
_YU_STEMS = ("nü", "lü", "nv", "lv")


def to_chewing(pinyin: str) -> str:
    """Convert tone-marked Hanyu Pinyin text into Zhuyin (Bopomofo).

    The text is scanned left to right. Characters that are not lowercase
    letters (spaces, punctuation, digits, capitals) are copied through
    unchanged. Every recognised syllable is emitted as its Zhuyin symbols,
    followed by its tone mark (``˙`` when the syllable carried no diacritic)
    and a separator character.

    Differences from a naive table lookup that matter for correctness:

    * 〔u〕 directly after j/q/x is ㄩ, not ㄨ (``qu``, ``xuan``, ``jun``).
    * 〔e〕 directly after i/ü/v is ㄝ, not ㄜ (``xie``, ``lüe``).
    * The stand-alone erhua 〔r〕 and the stems 〔nü〕/〔lü〕 are not accepted
      when the following letter shows they start a longer syllable
      (``ren``, ``rang``, ``lüe``).
    * A combined rime such as 〔ian〕 is already complete, so no further
      final is read after it.

    Args:
        pinyin: Pinyin text using tone diacritics (for example ``"nǐhǎo"``).

    Returns:
        The converted string; an empty input gives an empty string.

    Raises:
        AssertionError: If a run of letters cannot be parsed as a syllable.
            This type is raised explicitly (not via ``assert``) so that the
            check survives ``python -O`` and existing callers that catch
            ``AssertionError`` keep working.
    """
    if not pinyin:
        return ""

    # Pass 1: replace tone-marked letters by their plain letter and remember
    # where each tone was seen (offsets refer to the stripped text). Keys may
    # span more than one code point (a letter plus a combining mark), so the
    # longest key is tried first.
    longest_mark = max(map(len, DIACRITIC_TO_BASE_AND_TONE))
    plain: list[str] = []
    plain_len = 0
    tones: list[tuple[int, str]] = []
    cursor = 0
    while cursor < len(pinyin):
        for size in range(longest_mark, 0, -1):
            marked = pinyin[cursor : cursor + size]
            entry = DIACRITIC_TO_BASE_AND_TONE.get(marked)
            if entry:
                base, tone = entry
                tones.append((plain_len, tone))
                plain.append(base)
                plain_len += len(base)
                cursor += len(marked)
                break
        else:
            plain.append(pinyin[cursor])
            plain_len += 1
            cursor += 1
    text = "".join(plain)

    def make_matcher(table: dict[str, str]):
        """Return ``match(pos) -> (end, symbols or None)``, longest key first."""
        longest = max(map(len, table))

        def match(pos: int):
            for size in range(longest, 0, -1):
                key = text[pos : pos + size]
                symbols = table.get(key)
                if symbols:
                    return pos + len(key), symbols
            return pos + 1, None

        return match

    match_alone = make_matcher(PINYIN_ALONE)
    match_initial = make_matcher(PINYIN_INITIALS)
    match_combined = make_matcher(PINYIN_COMBINED)
    match_center = make_matcher(PINYIN_CENTER)
    match_final = make_matcher(PINYIN_FINALS)
    # Anything that can begin a syllable; built once instead of per lookup.
    match_syllable_start = make_matcher(PINYIN_INITIALS | PINYIN_ALONE)

    def at_boundary(pos: int) -> bool:
        """True if a syllable may end at ``pos`` (no unparseable leftovers)."""
        return (
            pos >= len(text)
            or not text[pos].isalpha()
            or match_syllable_start(pos)[1] is not None
        )

    def starts_longer_syllable(start: int, end: int) -> bool:
        """True if the stand-alone match ``text[start:end]`` is only a prefix."""
        key = text[start:end]
        following = text[end : end + 1]
        if key == "r":
            # Erhua 〔r〕 closes a word; before a vowel it is the initial ㄖ.
            return following in _RIME_LETTERS
        if key in _YU_STEMS:
            return following == "e"
        return False

    pieces: list[str] = []
    next_tone = 0  # index of the first tone not yet attached to a syllable
    pos = 0
    while pos < len(text):
        char = text[pos]

        # Ignore special characters: copy them through untouched.
        if not (char.isalpha() and char.islower()):
            pieces.append(char)
            pos += 1
            continue

        end, symbols = match_alone(pos)
        if symbols and at_boundary(end) and not starts_longer_syllable(pos, end):
            pieces.append(symbols)
            pos = end
        else:
            end, symbols = match_initial(pos)
            if symbols is None:
                raise AssertionError(
                    f"cannot convert {pinyin!r}: no Pinyin initial at offset {pos}"
                )
            pieces.append(symbols)
            pos = end
            initial_tail = text[pos - 1]  # last letter of the initial

            end, symbols = match_combined(pos)
            if symbols:
                rime = text[pos:end]
                # Resolve duplicates: after j/q/x the 〔u〕 is really ü.
                if initial_tail in _U_IS_YU_AFTER:
                    if rime == "uan":
                        symbols = "ㄩㄢ"
                    elif rime == "un":
                        symbols = "ㄩㄣ"
                pieces.append(symbols)
                pos = end
            else:
                end, symbols = match_center(pos)
                if symbols:
                    if text[pos:end] == "u" and initial_tail in _U_IS_YU_AFTER:
                        symbols = "ㄩ"
                    pieces.append(symbols)
                    pos = end

                finals_taken = 0
                end, symbols = match_final(pos)
                while symbols:
                    # Guard kept from the original: at most two finals.
                    if finals_taken >= 2:
                        raise AssertionError(
                            f"cannot convert {pinyin!r}: "
                            f"too many finals before offset {pos}"
                        )
                    if text[pos:end] == "e" and text[pos - 1] in _E_IS_EH_AFTER:
                        symbols = "ㄝ"
                    pieces.append(symbols)
                    pos = end
                    end, symbols = match_final(pos)
                    finals_taken += 1

        # A tone belongs to this syllable if its letter lies before the
        # position the syllable ended at; otherwise the syllable is neutral.
        if next_tone < len(tones) and tones[next_tone][0] < pos:
            pieces.append(tones[next_tone][1])
            next_tone += 1
        else:
            pieces.append("˙")

        # Separator after every syllable.
        # NOTE(review): the original comment called this a fullwidth space;
        # confirm which space character is intended.
        pieces.append(" ")

    return "".join(pieces)
9 changes: 8 additions & 1 deletion commands/tocfl/tocfl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from modules.supabase import supabaseClient
from random import randint
from .consts import TOCFL_LEVELS_CHOICES, TOCFL_LEVELS
from .chewing import to_chewing


def register_commands(
Expand Down Expand Up @@ -56,10 +57,16 @@ async def tocfl_rand(
def _create_word_embed(
word: str, level: int, part_of_speech: str, pinyin: str
):
try:
chewing = to_chewing(pinyin)
except AssertionError:
chewing = "ERROR OCCURRED"

embed = discord.Embed(
title=word
) # ^ add description="desc" for translation
embed.add_field(name="Pronunciation", value=pinyin, inline=False)
embed.add_field(name="Pronunciation (Pinyin)", value=pinyin, inline=False)
embed.add_field(name="Pronunciation (Zhuyin)", value=chewing, inline=False)
embed.add_field(
name="Dictionary Reference",
value=f"https://cdict.net/?q={word}",
Expand Down

0 comments on commit 55a1832

Please sign in to comment.