-
Notifications
You must be signed in to change notification settings - Fork 1
/
syllabate.py
123 lines (90 loc) · 3.52 KB
/
syllabate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Syllabator by Grisha Khachaturyan
'''
The syllabator uses the P2TK syllabifier script and hyphenates words based on its syllabified output.
Here's an example of how to run it:
######################################
from syllabator import *
syl('orange')
output:
[(1, [], ['AO'], []), (0, ['R'], ['AH'], ['N', 'JH'])]
caught: o
word: range
parsed: [['o']]
caught: r
word: ange
parsed: [['o', ''], ['r', 'ange']]
['o', 'range']
index out of range means there is something missing from the phoneme_dict
keyerror means the word is not in cmudict
#############################################
'''
import syllabifier
from nltk.corpus import cmudict
# Language definition handed to syllabifier.syllabify() as its first argument.
eng = syllabifier.English
# Pronunciation lookup built from NLTK's cmudict corpus. The functions below
# index it by word and join the first entry of the result with spaces to get
# the phoneme string that is passed to the syllabifier.
cmu = cmudict.dict()
# Candidate spellings for each phoneme symbol whose written form is not just
# the symbol itself. syl() searches the word for every spelling listed under a
# phoneme and splits at the earliest match; phonemes that are absent from this
# table are searched for by their own lower-cased symbol instead.
phoneme_dict = {
    # consonants
    "K": ["k", "c", "qu"],
    "F": ["f", "ph", "gh"],
    "ZH": ["g", "sio", "su"],
    "JH": ["j", "dg", "g"],
    "HH": ["h"],
    "SH": ["sh", "tio", "sio", "ch"],
    "DH": ["th"],
    "Y": ["u", "y"],
    "CH": ["ch", "tu"],
    "W": ["w", "ou"],
    "Z": ["z", "s", "x"],
    # vowels
    "AA": ["o", "a"],
    "AE": ["a"],
    "AH": ["a", "u", "e"],
    "AO": ["o", "a"],
    "AW": ["ow"],
    "AY": ["ie", "i", "ay"],
    "EH": ["a", "e"],
    "ER": ["ar", "er", "ear"],
    "EY": ["a", "ei"],
    "IH": ["i", "e"],
    "IY": ["ea", "ee", "e", "i"],
    "OW": ["o"],
    "OY": ["oy", "oi"],
    "UH": ["ou"],
    "UW": ["ou", "oo", "ew", "u"],
}
def pro_syl(word):
    '''
    Return the syllabified pronunciation of word.

    The first pronunciation stored for the word in cmu is joined into a
    space-separated phoneme string and handed to syllabifier.syllabify().
    The result is the syllabifier's output: a list with one
    (stress, onset, nucleus, coda) entry per syllable.

    The word is lower-cased before the lookup, exactly as syl() does, so
    that capitalised input behaves the same in both functions.

    Raises KeyError if the word is not in cmu.
    '''
    pronounce = " ".join(cmu[word.lower()][0])
    return syllabifier.syllabify(eng, pronounce)
def syl_phrase(phrase):
    '''
    Print the hyphenation of each whitespace-separated word of phrase,
    one word per line.

    A word for which syl() returns -1 is echoed unchanged, followed by a
    "(not syllabified)" marker.
    '''
    for token in phrase.split():
        hyphenated = syl(token)
        if hyphenated == -1:
            # Joining with a single space reproduces the separator that a
            # comma-separated print statement inserts between its items.
            print(" ".join([token, " (not syllabified)"]))
            continue
        print(hyphenated)
def syl(word, verbose=True):
    '''
    Hyphenate word along the syllable boundaries of its pronunciation.

    The word's first pronunciation in cmu is syllabified, and for every
    syllable the spelling of its leading phoneme (the first onset phoneme,
    or the first nucleus phoneme when the onset is empty) is located in the
    not-yet-consumed part of the word. The word is cut at each of those
    positions.

    Parameters:
        word    -- the word to hyphenate; the lookup is case-insensitive and
                   the original casing is kept in the result.
        verbose -- when true (the default) the syllabified pronunciation and
                   a trace of every split are printed.

    Returns:
        -1 if the word is not in cmu;
        the word unchanged if its pronunciation has a single syllable;
        otherwise a string with the syllables separated by single spaces.

    Raises:
        IndexError if no spelling of a syllable's leading phoneme occurs in
        the remaining part of the word, i.e. phoneme_dict lacks a mapping.
    '''
    try:
        pronounce = " ".join(cmu[word.lower()][0])
        syl_pro = syllabifier.syllabify(eng, pronounce)
    except KeyError:
        return -1
    if verbose:
        print(syl_pro)
    if len(syl_pro) <= 1:
        return word

    parsed_word = []
    last_syllable = len(syl_pro) - 1
    for syllable, (stress, onset, nucleus, coda) in enumerate(syl_pro):
        split_point = onset[0] if onset else nucleus[0]
        if verbose:
            print(split_point)

        # Phonemes without an entry in phoneme_dict are assumed to be spelled
        # like their own lower-cased symbol.
        spellings = phoneme_dict.get(split_point, [split_point.lower()])
        lowered = word.lower()
        hits = [pos for pos in (lowered.find(s) for s in spellings)
                if pos != -1]
        if not hits:
            # Never index with find()'s -1: that would silently split the
            # word at its last character and return a bogus hyphenation.
            raise IndexError("no spelling of phoneme %r found in %r"
                             % (split_point, word))
        split_point_index = min(hits)

        caught = word[split_point_index]
        if verbose:
            print("caught:  %s" % caught)

        # Letters in front of this syllable's first letter belong to the
        # tail of the previous syllable.
        if syllable > 0:
            parsed_word[-1].append(word[:split_point_index])
        syl_split = [caught]
        parsed_word.append(syl_split)

        # Continue the search in what is left after the caught letter; the
        # final syllable takes all of it.
        word = word[split_point_index + 1:]
        if syllable == last_syllable:
            syl_split.append(word)
        if verbose:
            print("word:  %s" % word)
            print("parsed:  %s" % (parsed_word,))
    return " ".join("".join(pieces) for pieces in parsed_word)