-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgloss.py
executable file
·276 lines (239 loc) · 10.4 KB
/
gloss.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/usr/bin/env python3
import sys
import regex as re
import os
import sqlite3
import argparse
parser = argparse.ArgumentParser(prog='gloss.py', description='Applies foreign language glosses from Katersat to a stream of CG-formatted text')
parser.add_argument('-t', '--trace', action='store_true', help='append TR-LEX lexeme-id trace to glossed readings')
# Bug fix: '-l/--lang' previously used action='store_true', so passing -l set
# args.lang to the boolean True (which was then handed to the SQL query as the
# target language) instead of accepting a language code. It also shared its
# dest with the positional 'lang', whose nargs='?' default silently clobbered
# the option. The option now takes a value; the positional is kept for
# backward compatibility under its own dest, and the option wins when both
# are given. Default remains 'eng'.
parser.add_argument('-l', '--lang', default=None, help='target gloss language (default: eng)')
parser.add_argument('lang_pos', nargs='?', default=None, metavar='lang')
args = parser.parse_args()
if args.lang is None:
    args.lang = args.lang_pos if args.lang_pos is not None else 'eng'
# Katersat uses a few word-class tags that differ from the CG stream's
# (N -> T, Interj -> Intj); everything else maps to itself. Build both
# directions from a single pair list so they cannot drift apart.
_WC_PAIRS = [
    ('N', 'T'),
    ('V', 'V'),
    ('Pali', 'Pali'),
    ('Conj', 'Conj'),
    ('Adv', 'Adv'),
    ('Interj', 'Intj'),
    ('Pron', 'Pron'),
    ('Prop', 'Prop'),
    ('Num', 'Num'),
    ('Symbol', 'Symbol'),
    ('Adj', 'Adj'),
    ('Part', 'Part'),
    ('Prep', 'Prep'),
]
# Stream tag -> Katersat tag, and the reverse for emitting output.
wc_map_s = dict(_WC_PAIRS)
wc_map_k = {kat: cg for cg, kat in _WC_PAIRS}
# Resolve the database path relative to this script (not the CWD) and open
# the Katersat lexicon read-only; isolation_level=None leaves sqlite in
# autocommit mode, and check_same_thread=False permits cross-thread use.
# Renamed 'dir' -> 'script_dir': the original name shadowed the builtin dir().
script_dir = os.path.dirname(__file__)
con = sqlite3.connect('file:' + script_dir + '/katersat.sqlite?mode=ro', uri=True, isolation_level=None, check_same_thread=False)
db = con.cursor()
# Build sem_map_k: Katersat semantic-class code -> the tag used in the CG
# stream. Non-verbal codes map to themselves (the stream's Sem/ tag IS the
# code); verbal codes (V.*) map to a name extracted from their English gloss.
sem_map_k = {}
db.execute("SELECT sem_code, sem_eng FROM kat_semclasses WHERE sem_code != 'UNK' AND sem_code NOT LIKE 'V.%'")
while row := db.fetchone():
    # Identity mapping for non-verbal classes.
    sem_map_k[row[0]] = row[0]
db.execute("SELECT sem_code, sem_eng FROM kat_semclasses WHERE sem_code LIKE 'V.%'")
while row := db.fetchone():
    # sem_eng is assumed to start with ':name' — TODO confirm against the
    # kat_semclasses table; m is None (and m[1] raises) if it does not.
    m = re.match(r'^:([^\s,]+)', row[1])
    if m[1] in sem_map_k:
        # The verbal name collides with an existing non-verbal code, so
        # disambiguate it with a 'v' prefix.
        sem_map_k[row[0]] = 'v'+m[1]
    else:
        sem_map_k[row[0]] = m[1]
# Reverse map: stream Sem/ tag -> Katersat code (later codes win on collision).
sem_map_s = {v: k for k, v in sem_map_k.items()}
# Running counters: cache hits, cache misses, and how many times the cache
# was flushed after growing past its size bound.
stats = dict.fromkeys(('hit', 'miss', 'clear'), 0)
# Maps an already-seen (stripped, suffix-removed) input line to its gloss.
cache = {}
# Main loop: read a CG stream from stdin, gloss each reading line via the
# Katersat database, and echo everything else through unchanged.
for line in sys.stdin:
    line = line.rstrip()
    # Pass through anything that is not a reading line ('\t"...'), is already
    # glossed (<tr-done>), or carries no word class we know how to gloss.
    if not line.startswith('\t"') or (' <tr-done> ' in line) or not re.search(r' (?:N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol)(?: |$)', line):
        print(line)
        sys.stdout.flush()
        # Bound the cache; non-reading lines (e.g. cohort boundaries) are a
        # convenient point to flush it.
        if len(cache) >= 20000:
            stats['clear'] += 1
            cache = {}
        continue
    line = line.strip()
    # Hybrid reading (Hyb/ but not Hyb/1-): also try analyses without the
    # quoted baseform later on.
    hyb = (' Hyb/' in line and not ' Hyb/1-' in line)
    # Peel trailing tags off the line into 'suffix' so they survive the
    # lookup and are re-attached to the output verbatim.
    suffix = ''
    # Enclitic particles (LI/LU/LUUNNIIT) after an inflection tag.
    if m := re.search(r' (?:\d?(?:Sg|Pl|Du)(?:Poss|O)?)( (?:ADV-|CONJ-)?(?:LI|LU|LUUNNIIT)(?: |$).*)$', line):
        suffix += m[1]
        line = re.sub(r'( (?:ADV-|CONJ-)?(?:LI|LU|LUUNNIIT)(?: |$).*)$', '', line)
    # ¤-tags, %-tags, @-function tags, and dependency arcs (#N->M).
    if m := re.search(r'( ¤\S+)( |$)', line):
        suffix += m[1]
        line = line.replace(m[1], '')
    if m := re.search(r'((?: %\S+)+)( |$)', line):
        suffix += m[1]
        line = line.replace(m[1], '')
    if m := re.search(r'((?: @\S+)+)( |$)', line):
        suffix += m[1]
        line = line.replace(m[1], '')
    if m := re.search(r'( #\d+->\d+)( |$)', line):
        suffix += m[1]
        # Bug fix: was line.replace(m[0], ''), which also removed the
        # separator space captured by group 2 and fused adjacent tokens when
        # the dependency tag was not line-final. m[1] matches the siblings
        # above and is identical at end of line.
        line = line.replace(m[1], '')
    # Whole-line memoization of the glossing result.
    if line in cache:
        stats['hit'] += 1
        print('\t' + cache[line] + suffix)
        sys.stdout.flush()
        continue
    stats['miss'] += 1
    # Split the reading into morpheme chunks, each starting at a word-class
    # tag, an ALL-CAPS morpheme tag, or 'U'.
    origs = re.split(r' (?=(?:(?:i?(?:N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol))|(?:\p{Lu}[_\p{Lu}]+)|U)(?: |$))', line)
    # scleans: chunks cleaned of most secondary tags but keeping Sem/ and the
    # transitivity markers Gram/[HIT]V, Gram/Refl (temporarily lower-cased so
    # the Gram/ purge does not take them). cleans: same minus all Sem/ tags.
    scleans = []
    cleans = []
    for orig in origs:
        orig = re.sub(r' Gram/((?:[HIT]V)|(?:Refl))\b', r' gram/\1', orig)
        orig = re.sub(r' (Gram|Dial|Orth|O[lL]ang|Heur|Hyb|Err)/(\S+)', r'', orig)
        orig = re.sub(r' (ADV|CONJ)-L', r' L', orig)
        orig = re.sub(r' i?Sem/(Concessive|Temporal)', r'', orig)
        orig = orig.replace(' gram/', ' Gram/')
        scleans.append(orig)
        orig = re.sub(r' i?Sem/(\S+)', r'', orig)
        cleans.append(orig)
    # Greedy longest-match: for each start chunk i, try the longest span
    # [i, j) first and shrink j until a database match is found.
    i = 0
    e = len(origs)-1
    while i < e:
        for j in range(len(origs)-1, i, -1):
            # Candidate morpheme sequence: Sem-less chunks up to j-1, plus
            # the Sem-bearing form of chunk j-1 itself.
            cur = (' '.join(cleans[i:j-1]) + ' ' + ' '.join(scleans[j-1:j])).strip()
            # Word class and inflection come from the chunk AFTER the span:
            # either an explicit word-class tag or a derivation tag Der/xy
            # (whose source class [nv] stands in for the word class).
            m = None
            if (m := re.match(r'^i?(N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol)(?: |$)(.*)$', cleans[j])) or (m := re.search(r' Der/([nv])[nv]( |$)', cleans[j])):
                pass
            if not m:
                m = ['', '', '']
            wc = m[1][0:1].upper() + m[1][1:]
            flex = m[2].strip()
            ana = (cur + ' ' + wc).strip()
            #print(f'{i} {j-1}: {cur} | {wc} | {flex}')
            # Build the list of candidate FST analyses to look up, from most
            # to least specific.
            anas = []
            # Raw match for morpheme sequences
            anas.append(ana)
            # First try actual case/flexion
            if (m := re.match(r'^((?:i?\d?\p{Lu}\p{Ll}[^/\s]* *)+)', flex)) or (m := re.match(r'^(LU)(?: |$)', flex)):
                flex = re.sub(r'\bi(\p{Lu})', r'\1', m[1])
                ana2 = f'{ana} {flex}'.strip()
                anas.append(ana2)
                # Plural person markers: also try the singular counterpart.
                if re.search(r' \dPl(O?)$', ana2):
                    anas.append(re.sub(r' (\d)Pl(O?)$', r' \1Sg\2', ana2))
                # Oblique cases: also try with Abs substituted.
                anas.append((ana + ' ' + re.sub(r'\b(Rel|Trm|Abl|Lok|Aeq|Ins|Via|Nom|Akk)\b', r'Abs', flex)).strip())
                # Possessive markers: also try without them.
                if re.search(r'\b\d(Sg|Pl)Poss\b', flex):
                    anas.append((ana + ' ' + re.sub(r'\b\d(Sg|Pl)Poss\b', '', flex)).strip())
                    anas.append((ana + ' ' + re.sub(r'\b(?:Rel|Trm|Abl|Lok|Aeq|Ins|Via|Nom|Akk) (Sg|Pl) \d(Sg|Pl)Poss\b', r'Abs \1', flex)).strip())
            # Then fall back to baseforms
            if wc != 'V':
                anas.append(ana + ' Abs Sg')
                anas.append(ana + ' Ins Sg')
                anas.append(ana + ' Abs Pl')
                anas.append(ana + ' Ins Pl')
            else:
                # Verbal baseforms: intransitive indicative unless marked
                # strictly transitive; transitive unless marked intransitive.
                if re.search(r'^.* Gram/[HI]V', ana) or re.search(r'^.* Gram/Refl', ana) or not re.search(r'^.* Gram/TV', ana):
                    anas.append(ana + ' Ind 3Sg')
                    anas.append(ana + ' Ind 3Pl')
                if re.search(r'^.* Gram/[HT]V', ana) or not re.search(r'^.* Gram/IV', ana):
                    anas.append(ana + ' Ind 3Sg 3SgO')
                    anas.append(ana + ' Ind 3Pl 3PlO')
                    anas.append(ana + ' Ind 3Sg 3PlO')
                    anas.append(ana + ' Ind 3Pl 3SgO')
            # Hybrids: also try every candidate with the quoted baseform
            # unquoted.
            if hyb:
                anas.extend([re.sub(r'^"(\p{Lu}+)" ', r'\1 ', x) for x in anas])
            pfx = re.search(r' (Prefix/[TA]A) ', ana)
            prefix = ''
            # Extract up to two semantic codes from the last chunk of the
            # span; they constrain the lexeme lookup below. UNK = no filter.
            s1 = 'UNK'
            s2 = 'UNK'
            if (m := re.search(r'\bi?Sem/(\S+) i?Sem/(\S+)\b', origs[j-1])) and (m[1] in sem_map_s) and (m[2] in sem_map_s):
                s1, s2 = sem_map_s[m[1]], sem_map_s[m[2]]
                anas = list(map(lambda x: re.sub(fr' \bi?Sem/{m[1]} i?Sem/{m[2]}\b', '', x), anas))
            elif (m := re.search(r'\bi?Sem/(\S+)\b', origs[j-1])) and (m[1] in sem_map_s):
                s1 = sem_map_s[m[1]]
                anas = list(map(lambda x: re.sub(fr' \bi?Sem/{m[1]}\b', '', x), anas))
            elif (m := re.search(r'\bi?Sem/(?:an|Be|CognitiveMaking|dur|event|Fem|FirstName|Geo|H|HH|Hprof|Hum|Hunt|inst|Location|LastName|Mailadresse|Mask|ModeOfMovement|Remove|sem|temp|Time|Unit|Url|misse) \bi?Sem/(\S+) i?Sem/(\S+)\b', origs[j-1])) and (m[1] in sem_map_s) and (m[2] in sem_map_s):
                s1, s2 = sem_map_s[m[1]], sem_map_s[m[2]]
                anas = list(map(lambda x: re.sub(fr' \bi?Sem/{m[1]} i?Sem/{m[2]}\b', '', x), anas))
            elif (m := re.search(r'\bi?Sem/(?:an|Be|CognitiveMaking|dur|event|Fem|FirstName|Geo|H|HH|Hprof|Hum|Hunt|inst|Location|LastName|Mailadresse|Mask|ModeOfMovement|Remove|sem|temp|Time|Unit|Url|misse) \bi?Sem/(\S+)\b', origs[j-1])) and (m[1] in sem_map_s):
                s1 = sem_map_s[m[1]]
                anas = list(map(lambda x: re.sub(fr' \bi?Sem/{m[1]}\b', '', x), anas))
            #print(f'{i} {j-1}: {cur} | {anas} | {s1} {s2}')
            did = False
            for ana in anas:
                # Collect Katersat lexeme ids whose FST analysis matches this
                # candidate exactly (the 16-char prefix narrows via the index).
                ids = []
                db.execute("SELECT fst_ana, lex_id FROM kat_long_raw NATURAL JOIN kat_lexemes WHERE substr(fst_ana,1,16) = ? AND lex_semclass != 'meta-cat-lib'", [ana[0:16]])
                while r := db.fetchone():
                    if r[0] == ana:
                        ids.append(str(r[1]))
                # Allow looking up morphemes without Gram/[HIT]V
                if not ids and not ana.startswith('"'):
                    ana = re.sub(r' Gram/[HIT]V ', r' ', ana)
                    db.execute("SELECT fst_ana, lex_id FROM kat_long_raw NATURAL JOIN kat_lexemes WHERE substr(fst_ana,1,16) = ? AND lex_semclass != 'meta-cat-lib'", [ana[0:16]])
                    while r := db.fetchone():
                        if r[0] == ana:
                            ids.append(str(r[1]))
                # If there is a prefix, try without it
                if not ids and pfx:
                    ana = ana.replace(pfx[0], ' ')
                    db.execute("SELECT fst_ana, lex_id FROM kat_long_raw NATURAL JOIN kat_lexemes WHERE substr(fst_ana,1,16) = ? AND lex_semclass != 'meta-cat-lib'", [ana[0:16]])
                    while r := db.fetchone():
                        if r[0] == ana:
                            ids.append(str(r[1]))
                    prefix = pfx[1]
                # N may also be Pron
                if not ids and ' N ' in ana and not ' Pron ' in ana:
                    ana = ana.replace(' N ', ' Pron ')
                    # Pronouns inflect for person, so fan out over every
                    # person/number variant of the inflection.
                    anas2 = [ana]
                    for sgpl in ['Sg', 'Pl']:
                        for num in ['', '1', '2', '3', '4']:
                            anas2.append(ana.replace(' Sg', f' {num}{sgpl}'))
                            anas2.append(ana.replace(' Pl', f' {num}{sgpl}'))
                            anas2.append(ana.replace(f' {num}Sg', f' {num}{sgpl}'))
                            anas2.append(ana.replace(f' {num}Pl', f' {num}{sgpl}'))
                    for ana in anas2:
                        db.execute("SELECT fst_ana, lex_id FROM kat_long_raw NATURAL JOIN kat_lexemes WHERE substr(fst_ana,1,16) = ? AND lex_semclass != 'meta-cat-lib'", [ana[0:16]])
                        while r := db.fetchone():
                            if r[0] == ana:
                                ids.append(str(r[1]))
                        if ids:
                            break
                if ids:
                    # Pick the best translation synonym in the target
                    # language, constrained by the semantic classes.
                    db.execute("SELECT DISTINCT tr.lex_lexeme, tr.lex_semclass as sem, tr.lex_sem2 as sem2, tr.lex_wordclass as wc, kl.lex_id as k_id, tr.lex_id as t_id FROM kat_lexemes as kl NATURAL JOIN glue_lexeme_synonyms AS gls INNER JOIN kat_lexemes as tr ON (gls.lex_syn = tr.lex_id) WHERE kl.lex_id IN (" + ','.join(ids) + ") AND kl.lex_semclass = ? AND kl.lex_sem2 = ? AND tr.lex_language = ? ORDER BY kl.lex_id ASC, gls.syn_order ASC, tr.lex_id ASC LIMIT 1", [s1, s2, args.lang])
                    tr = db.fetchone()
                    # If there were no semantics and we did not find a match, try any semantics
                    if not tr and s1 == 'UNK':
                        db.execute("SELECT DISTINCT tr.lex_lexeme, tr.lex_semclass as sem, tr.lex_sem2 as sem2, tr.lex_wordclass as wc, kl.lex_id as k_id, tr.lex_id as t_id FROM kat_lexemes as kl NATURAL JOIN glue_lexeme_synonyms AS gls INNER JOIN kat_lexemes as tr ON (gls.lex_syn = tr.lex_id) WHERE kl.lex_id IN (" + ','.join(ids) + ") AND tr.lex_language = ? ORDER BY kl.lex_id ASC, gls.syn_order ASC, tr.lex_id ASC LIMIT 1", [args.lang])
                        tr = db.fetchone()
                    if tr:
                        # Emit the gloss in place of the span: translated
                        # lexeme, its semantics, and the stream word class.
                        wc = wc_map_k[tr[3].capitalize()]
                        sem = ''
                        if prefix:
                            sem = ' ' + prefix
                        if tr[1] in sem_map_k:
                            sem += ' Sem/' + sem_map_k[tr[1]]
                        if tr[2] in sem_map_k:
                            sem += ' Sem/' + sem_map_k[tr[2]]
                        #print(f'{i} {j-1}: {tr}')
                        out = f'"{tr[0]}"{sem} {wc}'
                        if args.trace:
                            out += f' TR-LEX:{tr[4]}:{tr[5]}'
                        origs[i] = f'{out} <tr>'
                        # Blank out the rest of the consumed span.
                        k = i+1
                        while k < j:
                            origs[k] = ''
                            k += 1
                        i = j-1
                        did = True
                        break
            if did:
                break
        i += 1
    orig = re.sub(r' +', r' ', ' '.join(origs))
    # Mark semantics before derivation as internal
    while (o := re.sub(r' (Sem/\S+.*? (?:U|\p{Lu}[_\p{Lu}]+) )', r' i\1', orig)) != orig:
        orig = o
    # Mark word classes before derivation or other word classes as internal
    while (o := re.sub(r' ((?:N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol) .*? (?:(?:U|\p{Lu}\p{Lu}+)|(?:N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol)) )', r' i\1', orig)) != orig:
        orig = o
    cache[line] = orig
    print('\t' + orig + suffix)
    sys.stdout.flush()
#print(stats, file=sys.stderr)