-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d0bad54
commit 55a1832
Showing
2 changed files
with
270 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,262 @@ | ||
# Pinyin initials (consonant onsets) and the Zhuyin symbol each one maps to.
# The retroflex digraphs "zh", "ch" and "sh" are longer than every other key,
# so consumers must try the longest key first.
PINYIN_INITIALS = {
    "b": "ㄅ",
    "p": "ㄆ",
    "m": "ㄇ",
    "f": "ㄈ",
    "d": "ㄉ",
    "t": "ㄊ",
    "n": "ㄋ",
    "l": "ㄌ",
    "g": "ㄍ",
    "k": "ㄎ",
    "h": "ㄏ",
    "j": "ㄐ",
    "q": "ㄑ",
    "x": "ㄒ",
    "zh": "ㄓ",
    "ch": "ㄔ",
    "sh": "ㄕ",
    "r": "ㄖ",
    "z": "ㄗ",
    "c": "ㄘ",
    "s": "ㄙ",
}
|
||
# Syllables that are spelt as a whole (no separate initial) and their Zhuyin.
#
# In Hanyu Pinyin, 「ㄜ」 and 「ㄝ」 share the letter 〔e〕 because they almost
# never follow the same initial. The only clash is between the standalone
# syllables 「誒」 (「ㄝˋ」) and 「惡」 (「ㄜˋ」); there 「ㄝ」 is spelt 〔ê〕.
PINYIN_ALONE = {
    # Bare retroflex / sibilant syllables: the 〔i〕 is not pronounced as 「ㄧ」.
    "zhi": "ㄓ",
    "chi": "ㄔ",
    "shi": "ㄕ",
    "ri": "ㄖ",
    "zi": "ㄗ",
    "ci": "ㄘ",
    "si": "ㄙ",
    # Zero-initial finals.
    "a": "ㄚ",
    "o": "ㄛ",
    "e": "ㄜ",
    "ê": "ㄝ",
    "ai": "ㄞ",
    "ei": "ㄟ",
    "ao": "ㄠ",
    "ou": "ㄡ",
    "an": "ㄢ",
    "en": "ㄣ",
    "ang": "ㄤ",
    "er": "ㄦ",
    # In case of Erhua, 〔r〕 is added to the last syllable instead of 〔er〕.
    "r": "ㄦ",
    "yi": "ㄧ",
    "wu": "ㄨ",
    "yu": "ㄩ",
    # Combined: 〔y〕/〔w〕 spellings of a medial followed by a final.
    "ya": "ㄧㄚ",
    "yo": "ㄧㄛ",
    "ye": "ㄧㄝ",
    "yai": "ㄧㄞ",
    # Must start with the Bopomofo medial ㄧ (U+3127), not the Han digit 一.
    "yao": "ㄧㄠ",
    "you": "ㄧㄡ",
    "yan": "ㄧㄢ",
    "yin": "ㄧㄣ",
    "yang": "ㄧㄤ",
    "ying": "ㄧㄥ",
    "wa": "ㄨㄚ",
    "wo": "ㄨㄛ",
    "wai": "ㄨㄞ",
    "wei": "ㄨㄟ",
    "wan": "ㄨㄢ",
    "wen": "ㄨㄣ",
    "wang": "ㄨㄤ",
    "weng": "ㄨㄥ",
    "yue": "ㄩㄝ",
    "yuan": "ㄩㄢ",
    "yun": "ㄩㄣ",
    "yong": "ㄩㄥ",
    # 〔v〕 is the usual keyboard replacement for 〔ü〕.
    "nü": "ㄋㄩ",
    "lü": "ㄌㄩ",
    "nv": "ㄋㄩ",
    "lv": "ㄌㄩ",
}
|
||
# Medials (glides) that may sit between an initial and a final.
# 〔v〕 is the usual keyboard replacement for 〔ü〕.
PINYIN_CENTER = {
    "i": "ㄧ",
    "u": "ㄨ",
    "ü": "ㄩ",
    "v": "ㄩ",
}
|
||
# Plain finals (rimes without a medial) and their Zhuyin.
#
# Hanyu Pinyin writes both 「ㄜ」 and 「ㄝ」 as 〔e〕. This is unambiguous because
# 「ㄝ」 only occurs after a 「ㄧ」 or 「ㄩ」 medial (「ㄧㄝ」, 「ㄩㄝ」); the table
# stores the 「ㄜ」 reading and the caller has to pick 「ㄝ」 from context.
PINYIN_FINALS = {
    "a": "ㄚ",
    "o": "ㄛ",
    "e": "ㄜ",  # also ㄝ, depending on the preceding medial
    "ai": "ㄞ",
    "ei": "ㄟ",
    "ao": "ㄠ",
    "ou": "ㄡ",
    "an": "ㄢ",
    "en": "ㄣ",
    "ang": "ㄤ",
    "eng": "ㄥ",
    "er": "ㄦ",
}
|
||
# Medial + final spellings whose Zhuyin cannot be obtained by translating the
# medial and the final separately (contracted spellings such as 〔iu〕/〔ui〕,
# or finals whose vowel changes after a medial).
PINYIN_COMBINED = {
    "iu": "ㄧㄡ",
    # 〔e〕 after the 「ㄧ」 medial is 「ㄝ」, not the 「ㄜ」 stored in PINYIN_FINALS.
    "ie": "ㄧㄝ",
    "ian": "ㄧㄢ",
    "in": "ㄧㄣ",
    "iang": "ㄧㄤ",
    "ing": "ㄧㄥ",
    "ui": "ㄨㄟ",
    "uan": "ㄨㄢ",  # also ㄩㄢ (after j, q, x)
    "un": "ㄨㄣ",  # also ㄩㄣ (after j, q, x)
    "uang": "ㄨㄤ",
    "ong": "ㄨㄥ",
    "ue": "ㄩㄝ",
    "iong": "ㄩㄥ",
}
|
||
# Maps a tone-marked pinyin vowel to (bare vowel, Zhuyin tone mark).
#
# Only 「a﹑o﹑e﹑i﹑u﹑ü」 (and the rare 「ê」) carry diacritics in Hanyu Pinyin.
# Neutral tones are not marked, so they have no entry here.
# Each vowel lists: macron (1st tone), acute (2nd), caron (3rd), then the
# breve that is often typed by mistake for the caron, and grave (4th).
DIACRITIC_TO_BASE_AND_TONE = {
    "ā": ("a", "¯"),
    "á": ("a", "ˊ"),
    "ǎ": ("a", "ˇ"),
    "ă": ("a", "ˇ"),
    "à": ("a", "ˋ"),
    "ē": ("e", "¯"),
    "é": ("e", "ˊ"),
    "ě": ("e", "ˇ"),
    "ĕ": ("e", "ˇ"),
    "è": ("e", "ˋ"),
    "ī": ("i", "¯"),
    "í": ("i", "ˊ"),
    "ǐ": ("i", "ˇ"),
    "ĭ": ("i", "ˇ"),
    "ì": ("i", "ˋ"),
    "ō": ("o", "¯"),
    "ó": ("o", "ˊ"),
    "ǒ": ("o", "ˇ"),
    "ŏ": ("o", "ˇ"),
    "ò": ("o", "ˋ"),
    "ū": ("u", "¯"),
    "ú": ("u", "ˊ"),
    "ǔ": ("u", "ˇ"),
    "ŭ": ("u", "ˇ"),
    "ù": ("u", "ˋ"),
    "ǖ": ("ü", "¯"),
    "ǘ": ("ü", "ˊ"),
    "ǚ": ("ü", "ˇ"),
    # ü + combining breve: Unicode has no precomposed form, so this key is
    # two code points long and cannot be found by a one-character lookup.
    "\u00fc\u0306": ("ü", "ˇ"),
    "ǜ": ("ü", "ˋ"),
    # 「ㄝ」 as a standalone syllable (欸/誒). Only the tones that have a
    # precomposed, single-code-point letter are listed.
    "ế": ("ê", "ˊ"),
    "ề": ("ê", "ˋ"),
}
|
||
|
||
_MEDIAL_I = "\u3127"  # ㄧ
_MEDIAL_YU = "\u3129"  # ㄩ
_FINAL_EH = "\u311d"  # ㄝ
_NEUTRAL_TONE = "˙"
_SYLLABLE_SEPARATOR = "\u3000"  # fullwidth (ideographic) space
_PALATAL_INITIALS = frozenset("jqx")
# Letters that can follow an initial; "v" is the keyboard stand-in for "ü".
_VOWEL_LETTERS = frozenset("aeiou\u00fc\u00eav")


def _match_longest(
    text: str,
    start: int,
    table: dict[str, str],
    *,
    respect_boundaries: bool = False,
) -> "tuple[int, str | None]":
    """Find the longest key of *table* that occurs at ``text[start:]``.

    Returns ``(end, value)`` where ``text[start:end]`` is the matched key, or
    ``(start, None)`` when no key matches.

    With ``respect_boundaries`` a key that ends in 〔n〕, 〔ng〕 or 〔r〕 is skipped
    when a vowel follows it directly: pinyin requires an apostrophe before a
    syllable that starts with a vowel, so in unseparated text that consonant
    must be the initial of the next syllable (〔henan〕 is he + nan,
    〔fangan〕 is fan + gan). A key in 〔ng〕 is only skipped when the same key
    without the 〔g〕 exists, otherwise there is no valid shorter reading.
    """
    longest_key = max(map(len, table), default=0)
    for end in range(min(start + longest_key, len(text)), start, -1):
        key = text[start:end]
        value = table.get(key)
        if not value:
            continue
        if respect_boundaries and end < len(text) and text[end] in _VOWEL_LETTERS:
            if key.endswith("ng"):
                if key[:-1] in table:
                    continue
            elif key[-1] in "nr":
                continue
        return end, value
    return start, None


def _parse_syllable(
    text: str, start: int, *, respect_boundaries: bool
) -> "tuple[int, str | None]":
    """Translate the single syllable that begins at ``text[start]``.

    Two readings are tried: a whole-syllable spelling from PINYIN_ALONE, and
    initial + (combined rime | medial + final). The reading that consumes more
    letters wins, a tie goes to the whole-syllable spelling. This keeps
    〔zhi〕 → 「ㄓ」 and erhua 〔r〕 → 「ㄦ」 while 〔ren〕 and 〔nüe〕 are no longer
    cut short by the 〔r〕 / 〔nü〕 entries of PINYIN_ALONE.

    Returns ``(end, zhuyin)`` or ``(start, None)`` if nothing matches.
    """
    alone_end, alone = _match_longest(
        text, start, PINYIN_ALONE, respect_boundaries=respect_boundaries
    )
    rime_start, initial = _match_longest(text, start, PINYIN_INITIALS)
    if initial is None:
        return alone_end, alone

    # After j/q/x the letter 〔u〕 always stands for 〔ü〕.
    palatal = text[rime_start - 1] in _PALATAL_INITIALS

    end, rime = _match_longest(
        text, rime_start, PINYIN_COMBINED, respect_boundaries=respect_boundaries
    )
    if rime:
        spelling = text[rime_start:end]
        if palatal and spelling == "uan":
            rime = "ㄩㄢ"
        elif palatal and spelling == "un":
            rime = "ㄩㄣ"
    else:
        rime = ""
        end, medial = _match_longest(text, rime_start, PINYIN_CENTER)
        if medial:
            if palatal and text[rime_start:end] == "u":
                medial = _MEDIAL_YU
            rime = medial
        final_start = end
        end, final = _match_longest(
            text, final_start, PINYIN_FINALS, respect_boundaries=respect_boundaries
        )
        if final:
            # 〔e〕 is 「ㄝ」 after a 「ㄧ」 or 「ㄩ」 medial and 「ㄜ」 everywhere else.
            if text[final_start:end] == "e" and medial in (_MEDIAL_I, _MEDIAL_YU):
                final = _FINAL_EH
            rime += final

    if alone and alone_end >= end:
        return alone_end, alone
    return end, initial + rime


def _strip_tone_marks(pinyin: str) -> "tuple[str, list[tuple[int, str]]]":
    """Replace tone-marked vowels by their bare form.

    Returns the rewritten text and a list of ``(index, tone mark)`` pairs,
    where ``index`` is the position of the bare vowel in the rewritten text.
    Keys of DIACRITIC_TO_BASE_AND_TONE may span several code points (a letter
    plus a combining mark), so the longest key is tried first.
    """
    longest_key = max(map(len, DIACRITIC_TO_BASE_AND_TONE), default=0)
    pieces: list[str] = []
    tones: list[tuple[int, str]] = []
    out_len = 0
    pos = 0
    while pos < len(pinyin):
        for end in range(min(pos + longest_key, len(pinyin)), pos, -1):
            hit = DIACRITIC_TO_BASE_AND_TONE.get(pinyin[pos:end])
            if hit:
                base, tone = hit
                tones.append((out_len, tone))
                pieces.append(base)
                out_len += len(base)
                pos = end
                break
        else:
            pieces.append(pinyin[pos])
            out_len += 1
            pos += 1
    return "".join(pieces), tones


def to_chewing(pinyin: str) -> str:
    """Convert tone-marked Hanyu Pinyin text to Zhuyin (Bopomofo).

    Every run of lowercase letters is split into syllables. Each syllable is
    written as its Zhuyin symbols, followed by its tone mark (「˙」 when the
    syllable carries no diacritic) and a fullwidth space. Every other
    character (uppercase letters, Han characters, digits, punctuation,
    whitespace) is copied to the output unchanged.

    Syllables may be written without separators as long as the text follows
    the pinyin apostrophe rule (an apostrophe before any non-initial syllable
    that starts with a, o or e).

    Args:
        pinyin: Text containing Hanyu Pinyin with tone diacritics.

    Returns:
        The converted text.

    Raises:
        ValueError: If a lowercase letter cannot start any pinyin syllable
            (for example a stray 〔i〕 or 〔u〕).
    """
    text, tones = _strip_tone_marks(pinyin)

    out: list[str] = []
    next_tone = 0
    pos = 0
    while pos < len(text):
        char = text[pos]

        # Copy anything that is not a lowercase letter verbatim.
        if not (char.isalpha() and char.islower()):
            out.append(char)
            pos += 1
            continue

        end, zhuyin = _parse_syllable(text, pos, respect_boundaries=True)
        if zhuyin is None:
            # Input that ignores the apostrophe rule: fall back to plain
            # longest-match so it still converts where that is possible.
            end, zhuyin = _parse_syllable(text, pos, respect_boundaries=False)
        if zhuyin is None:
            raise ValueError(
                f"cannot parse pinyin at position {pos}: {text[pos:pos + 6]!r}"
            )

        # The first diacritic inside the syllable decides its tone; any
        # further ones are dropped so they cannot leak into later syllables.
        tone = _NEUTRAL_TONE
        tone_found = False
        while next_tone < len(tones) and tones[next_tone][0] < end:
            if not tone_found:
                tone = tones[next_tone][1]
                tone_found = True
            next_tone += 1

        out.append(zhuyin)
        out.append(tone)
        out.append(_SYLLABLE_SEPARATOR)
        pos = end

    return "".join(out)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters