-
Notifications
You must be signed in to change notification settings - Fork 928
/
simplify.py
90 lines (73 loc) · 2.66 KB
/
simplify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Radical folding and diacritic mark removal.

Handling a string with `cp1252` symbols:

>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

Handling a string with Greek and Latin accented characters:

>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'
"""
# tag::SHAVE_MARKS[]
import unicodedata
import string
def shave_marks(txt):
    """Remove all diacritic marks from `txt`.

    The text is decomposed (NFD) so that an accented character becomes a
    base character followed by its combining marks, every character with a
    non-zero canonical combining class is dropped, and what is left is
    recomposed (NFC).

    Args:
        txt: the string to process.

    Returns:
        A new NFC-normalized string with the combining marks removed.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    # combining() is 0 for anything that is not a combining mark.
    base_chars = [char for char in decomposed
                  if not unicodedata.combining(char)]
    # Recompose so the result is in the usual composed form.
    return unicodedata.normalize('NFC', ''.join(base_chars))
# end::SHAVE_MARKS[]
# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters.

    Works like a selective mark remover: after NFD decomposition, a
    combining mark is dropped only when the most recent base character is
    an ASCII letter. Marks attached to any other base character (for
    example Greek letters) are kept, and the text is recomposed with NFC.

    Args:
        txt: the string to process.

    Returns:
        A new NFC-normalized string in which ASCII-letter bases carry no
        combining marks.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    latin_base = False  # True while the current base char is an ASCII letter
    kept = []
    for char in decomposed:
        # Look the combining class up once per character.
        is_mark = bool(unicodedata.combining(char))
        if is_mark and latin_base:
            continue  # ignore diacritic on Latin base char
        kept.append(char)
        if not is_mark:
            # Not a combining char, so this is a new base char.
            latin_base = char in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))
# end::SHAVE_MARKS_LATIN[]
# tag::ASCIIZE[]
# Win1252 symbols that have a one-character ASCII stand-in; the two strings
# are paired position by position.
single_map = str.maketrans(
    """‚ƒ„ˆ‹‘’“”•–—˜›""",
    """'f"^<''""---~>""",
)

# Win1252 symbols that expand to a sequence of ASCII characters.
_expansions = {
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
}

# One combined translation table: the multi-character expansions first,
# then the single-character replacements layered on top.
multi_map = {**str.maketrans(_expansions), **single_map}
def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences.

    Args:
        txt: the string to process.

    Returns:
        A copy of `txt` translated through the module-level `multi_map`
        table; characters without an entry in that table are unchanged.
    """
    return txt.translate(multi_map)
def asciize(txt):
    """Fold `txt` towards plain ASCII as far as the helpers allow.

    Steps, in order: replace Win1252 symbols (`dewinize`), strip diacritics
    from Latin base characters (`shave_marks_latin`), spell the eszett as
    'ss', then apply NFKC to replace compatibility characters with their
    equivalents.

    The result is not guaranteed to be pure ASCII: characters that none of
    these steps touch are returned as they are.

    Args:
        txt: the string to process.

    Returns:
        The simplified, NFKC-normalized string.
    """
    dewinized = dewinize(txt)
    latin_shaved = shave_marks_latin(dewinized)
    # 'ß' has no decomposition for the steps above to act on, so it is
    # expanded explicitly.
    eszett_expanded = latin_shaved.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', eszett_expanded)
# end::ASCIIZE[]