-
Notifications
You must be signed in to change notification settings - Fork 0
/
charfinder.py
executable file
·80 lines (58 loc) · 1.83 KB
/
charfinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
"""
charfinder.py:
Searches for Unicode characters named with the words given.
Downloads and scans UCD (Unicode Character Database).
"""
import pathlib
from urllib import request
# URL of the UnicodeData.txt table in the "latest" UCD directory.
UCD_URL = 'http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'
# File name of the table: the last path segment of the URL.
UCD_NAME = UCD_URL.rsplit('/', 1)[-1]
def words_set(word_string):
    """Return the set of uppercased words found in *word_string*.

    Hyphens count as word separators, so ``'hyphen-minus'`` yields
    ``{'HYPHEN', 'MINUS'}``. Words are otherwise split on whitespace.
    """
    words = word_string.replace('-', ' ').upper().split()
    return set(words)
def parse(ucd_line):
    """Split one semicolon-separated UCD record into ``(char, name)``.

    Field 0 is read as the hexadecimal code point and field 1 as the
    character name. When field 10 (the old name) is non-empty and contains
    at least one word that the name lacks, it is appended to the name
    after ``' | '``.
    """
    fields = ucd_line.split(';')
    char = chr(int(fields[0], 16))
    name = fields[1]
    legacy_name = fields[10]
    # Only mention the old name when it adds words the name does not have.
    if legacy_name and words_set(legacy_name) - words_set(name):
        name = ' | '.join((name, legacy_name))
    return char, name
def match(query_set, name):
    """Tell whether every word in *query_set* appears among the words of *name*."""
    name_words = words_set(name)
    # Superset test from the name's side: same result as query_set <= name_words.
    return name_words >= query_set
def scan(lines, query):
    """Yield ``(char, name)`` for each UCD line whose name has all query words.

    *lines* is an iterable of UCD records and *query* a string of words.
    Nothing is yielded when *query* contains no words.
    """
    wanted = words_set(query)
    if wanted:
        # map() keeps parsing lazy: each line is parsed only when reached.
        for record in map(parse, lines):
            if match(wanted, record[1]):
                yield record
def download_ucd():
    """Download the UCD file, save it as ``UCD_NAME``, and return its text.

    The raw bytes are written to disk unchanged; the returned string is
    those bytes decoded as ASCII.
    """
    notice = 'downloading {}...'.format(UCD_NAME)
    print(notice)
    with request.urlopen(UCD_URL) as response:
        payload = response.read()
    # Keep a local copy of the downloaded bytes under UCD_NAME.
    pathlib.Path(UCD_NAME).write_bytes(payload)
    return payload.decode('ascii')
def read_ucd():
    """Return the UCD records, downloading the file if no local copy exists.

    The file named by ``UCD_NAME`` is read when it can be opened; if it
    is missing, ``download_ucd()`` supplies the text instead.

    Returns:
        A generator over the lines that are not blank and do not start
        with ``#``.
    """
    try:
        # EAFP: open the file directly rather than testing for existence
        # first, so a file removed between check and open cannot crash us.
        with open(UCD_NAME, 'rt', encoding='ascii') as fp_in:
            text = fp_in.read()
    except FileNotFoundError:
        text = download_ucd()
    return (line for line in text.split('\n')
            if line.strip() and not line.startswith('#'))
def main():
    """Command-line entry point: print each character matching the query.

    The query words are taken from ``sys.argv[1:]``. When none are given,
    a usage message is written to stderr and the process exits with
    status 2, so callers can tell a usage error from a successful run.
    """
    import sys

    if len(sys.argv) < 2:
        print('usage: {} <word1> <word2> ...'.format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(2)
    for char, name in scan(read_ucd(), ' '.join(sys.argv[1:])):
        try:
            print(char, name)
        except UnicodeEncodeError:
            # Some entries (e.g. lone surrogate code points) cannot be
            # encoded for the output stream; show the code point in
            # ASCII instead of aborting the whole listing.
            print('U+{:04X}'.format(ord(char)), name)
# Run the command-line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()