-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_pairs.py
44 lines (34 loc) · 1.48 KB
/
extract_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# coding: utf-8
import sqlite3, re, lxml.html
# Absolute path of the SQLite database file that suffixal() queries.
PATH_TO_BASE = '/home/lizaku/Документы/Big_Data/zal.db3'
# Absolute path of the saved HTML page that prefixal() parses.
PATH_TO_HTML = '/home/lizaku/PycharmProjects/verbal-aspect/prefixal.html'
# NOTE: the connection and cursor are created as a side effect of importing
# this module, not lazily inside suffixal().
base = sqlite3.connect(PATH_TO_BASE)
# Shared cursor; suffixal() runs its query through it.
c = base.cursor()
# Matches a single ASCII Latin letter; prefixal() discards every link text in
# which this pattern is found.
re_lat = re.compile('[a-zA-Z]')
def suffixal():
    """Write the ``source`` of selected headwords to ``verbs_zal.txt``.

    Runs a query through the module-level cursor ``c`` that selects
    ``headword.source`` for every descriptor whose ``id`` appears in
    ``aspect_pair.descriptor_id``, then writes one value per line.
    The output file is created in the current working directory and is
    overwritten on every call.
    """
    # todo: pair the verbs and assign suffixes
    # todo: align the formats of two verb lists
    # todo: extract information about aspect classes
    c.execute('SELECT source FROM headword JOIN descriptor ON descriptor.word_id=headword.id WHERE descriptor.id IN '
              '(SELECT descriptor_id FROM aspect_pair)')
    # Explicit UTF-8 keeps the output independent of the locale's default
    # encoding (the values are presumably non-ASCII -- confirm against the
    # database) and matches the encoding prefixal() uses for its input.
    with open('verbs_zal.txt', 'w', encoding='utf-8') as f:
        # Iterating the cursor streams rows instead of loading them all first.
        for (source,) in c:
            f.write(source + '\n')
def prefixal():
    """Extract link texts from the saved HTML page into ``verbs_prefixes.csv``.

    Reads ``PATH_TO_HTML`` as UTF-8 and takes the first ``<tr>`` that is a
    direct child of a table whose ``cellpadding`` attribute contains "5".
    For every child of that row's first child element it collects link
    texts located under ``<td>`` elements, drops those containing an ASCII
    Latin letter, and writes the remainder three per line, comma-separated.
    The output file is created in the current working directory and is
    overwritten on every call.

    Raises:
        ValueError: if the page contains no matching table row.
    """
    # todo: extract information about aspect classes
    with open(PATH_TO_HTML, 'r', encoding='utf-8') as f:
        html = f.read()
    root = lxml.html.fromstring(html)
    rows = root.xpath(u'//table[contains(@cellpadding, "5")]/tr')
    if not rows:
        # Fail with a clear message before the output file gets truncated.
        raise ValueError('no <tr> of a table with cellpadding "5" found in '
                         + PATH_TO_HTML)
    # Explicit UTF-8 so writing does not depend on the locale's default
    # encoding and agrees with the encoding used to read the page.
    with open('verbs_prefixes.csv', 'w', encoding='utf-8') as f:
        for row in rows[0][0]:
            # NOTE(review): the leading '//' makes this query start at the
            # document root rather than at `row`, so every iteration yields
            # the same list. './/td//a/text()' may have been intended --
            # confirm against the page before changing it.
            cells = row.xpath('//td//a/text()')
            cells = [cell for cell in cells if re_lat.search(cell) is None]
            # The same iterator is passed to zip() three times, so each
            # tuple takes three consecutive items; a trailing group of
            # fewer than three items is dropped.
            for group in zip(*(iter(cells),) * 3):
                f.write(','.join(group) + '\n')
if __name__ == '__main__':
    # Script entry point: only the prefix extraction is run.
    # The suffixal() step is intentionally left disabled here.
    prefixal()