-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathjoin_korean.py
executable file
·65 lines (58 loc) · 2.17 KB
/
join_korean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
import sys
import hangul
def get_next_char_list(infile):
    """Yield every line of *infile* as a list of its characters.

    Each item obtained by iterating *infile* is decoded as UTF-8 and split
    into a list of single-character strings; a line terminator, if the line
    has one, is kept as the last element.
    """
    for encoded_line in infile:
        yield list(encoded_line.decode("utf-8"))
def get_joined_char(infile):
    """Yield the text of *infile* with loose jamo combined into syllables.

    Each character list produced by ``get_next_char_list`` is scanned left
    to right:

    * A character accepted by ``hangul.isJaeum`` (consonant) followed by one
      accepted by ``hangul.isMoeum`` (vowel) is combined with
      ``hangul.join`` as initial + medial.
    * A consonant right after that pair is attached as the final when it is
      the last character before the end of the line, or when the character
      after it is another consonant or is not Hangul
      (``hangul.ishangul``) -- i.e. when it cannot be the initial of the
      next syllable.
    * Any other character is yielded unchanged.

    The trailing newline of a line is never inspected, only passed through.
    A line without one (typically the last line of a file that lacks a final
    newline) is processed exactly as if the newline were there; previously
    such a line could raise ``IndexError`` or leave its tail unjoined.
    """
    for charlist in get_next_char_list(infile):
        # Treat the line terminator as opaque: exclude it from the scan and
        # echo it afterwards, so lookahead never has to examine it.
        has_newline = bool(charlist) and charlist[-1] == "\n"
        body_len = len(charlist) - 1 if has_newline else len(charlist)

        i = 0
        while i < body_len:
            # Number of characters from position i up to the terminator.
            leftchar = body_len - i

            starts_syllable = (
                leftchar >= 2
                and hangul.isJaeum(charlist[i])
                and hangul.isMoeum(charlist[i + 1])
            )
            if not starts_syllable:
                yield charlist[i]
                i += 1
                continue

            # A third character can only be a final if it is a consonant.
            has_final = leftchar >= 3 and hangul.isJaeum(charlist[i + 2])
            if has_final and leftchar >= 4:
                # If the character after the candidate final is a vowel, the
                # candidate is really the initial of the next syllable.
                following = charlist[i + 3]
                has_final = (hangul.isJaeum(following)
                             or not hangul.ishangul(following))

            if has_final:
                yield hangul.join(
                    (charlist[i], charlist[i + 1], charlist[i + 2]))
                i += 3
            else:
                yield hangul.join((charlist[i], charlist[i + 1], ''))
                i += 2

        if has_newline:
            yield charlist[-1]
def main():
    """Convert the file named on the command line, writing ``<name>.jk``.

    Reads the path in ``sys.argv[1]``, feeds the file through
    ``get_joined_char`` and writes every yielded character, UTF-8 encoded,
    to a sibling file whose name is the input name plus ``".jk"``.

    Exits with a usage message when no input path is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

    input_filename = sys.argv[1]
    output_filename = input_filename + ".jk"

    # Binary mode on both ends: the pipeline decodes and encodes UTF-8
    # itself, so it must see and emit raw bytes. The with-block guarantees
    # both files are closed (the old code referenced .close without
    # calling it).
    with open(input_filename, "rb") as infile, \
            open(output_filename, "wb") as outfile:
        for char in get_joined_char(infile):
            outfile.write(char.encode("utf-8"))


if __name__ == "__main__":
    main()