forked from nert-nlp/cgel
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
139 lines (119 loc) · 5.33 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import re, glob
import stanza
from stanza.utils.conll import CoNLL
from tqdm import tqdm
# Shared English Stanza pipeline (tokenisation through dependency parsing).
# Sentence splitting is switched off: the script below hands the pipeline
# text that is already split, one sentence per blank-line-separated chunk.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse',
    tokenize_no_ssplit=True,
)
# Patterns are compiled once.  The node-level patterns are applied with
# ``pattern.match(tree, pos)``, which anchors at ``pos`` without copying the
# rest of the tree for every opening parenthesis.
_PARSETREE_RE = re.compile(r'\\begin\{parsetree\}.*?\\end\{parsetree\}')
_NODE_LABEL_RE = re.compile(r'\(.*?\.(.*?)\.')
_TABULAR_LABEL_RE = re.compile(r'\\begin\{tabular\}\{c\}(.*?)\\end\{tabular\}')
_NL_LABEL_RE = re.compile(r'\\NL\{(.*?)\}\{(.*?)\}')
_LEAF_WORD_RE = re.compile(r"[^\)\(]*?`(.*?)' *?\)")
_SUBSCRIPT_RE = re.compile(r'\\textsubscript\{([^}]*)\}?')


def _tree_name(filename):
    """Return the base name of *filename* without directories or ``.tex``."""
    base = re.split(r'[\\/]', filename)[-1]
    return base[:-len('.tex')] if base.endswith('.tex') else base


def _read_nodes(tree, is_sieg):
    """Return the nodes of one parsetree in preorder.

    Each node is a ``(word, deprel, label, parent)`` tuple: *word* is the
    back-quoted leaf text or ``'_'`` when the node has none, *deprel* and
    *label* are the two halves of the node label (*label* is ``''`` for a
    one-part label), and *parent* is the index of the enclosing node
    (``-1`` for the root).
    """
    sublabel_re = _TABULAR_LABEL_RE if is_sieg else _NL_LABEL_RE
    nodes = []
    open_nodes = []  # indices of nodes whose ')' has not been reached yet
    for pos, char in enumerate(tree):
        if char == '(':
            raw_label = _NODE_LABEL_RE.match(tree, pos).group(1)
            parts = sublabel_re.search(raw_label)
            word = '_'
            if parts is None:
                deprel, label = raw_label, ''
            else:
                if is_sieg:
                    # tabular labels read "<deprel>:\\<label>"
                    pieces = parts.group(1).split('\\\\')
                    deprel = pieces[0].strip(':')
                    label = pieces[1] if len(pieces) > 1 else ''
                else:
                    deprel, label = parts.group(1), parts.group(2)
                # "X\textsubscript{i" -> "X_i".  The \NL pattern stops at the
                # first '}', so the closing brace is normally already gone; a
                # brace that is still present is dropped as well.
                # NOTE(review): applied to both label syntaxes -- confirm.
                label = _SUBSCRIPT_RE.sub(r'_\1', label)
                # A leaf's word follows its label before any other parenthesis.
                leaf = _LEAF_WORD_RE.match(tree, pos + 1)
                if leaf:
                    word = leaf.group(1)
            parent = open_nodes[-1] if open_nodes else -1
            nodes.append((word, deprel, label, parent))
            open_nodes.append(len(nodes) - 1)
        elif char == ')':
            open_nodes.pop()
    return nodes


def _render_nodes(nodes):
    """Return the word line plus the bracketed rendering of preorder *nodes*."""
    out = [' '.join(word for word, _, _, _ in nodes if word != '_')]
    # Because the nodes are in DFS preorder, a stack of the currently open
    # nodes is enough to know when to close brackets.  -1 stands for the
    # (virtual) parent of the root.
    open_nodes = [-1]
    depth = 0
    for index, (word, deprel, label, parent) in enumerate(nodes):
        # Close every node that is not an ancestor of the current one.
        while open_nodes[-1] != parent:
            open_nodes.pop()
            out.append(')')
            depth -= 1
        # "LABEL_name" carries an identifier (used for gaps); "_rel" labels
        # are left intact.  partition() also copes with extra underscores.
        name = None
        if '_' in label and '_rel' not in label:
            label, _, name = label.partition('_')
        indent = ' ' * depth
        takes_children = True
        if depth == 0:
            out.append(f'\n({deprel}')
        elif label == 'GAP':
            # A gap is written as a bare reference and opens no bracket.
            out.append(f'\n{indent}:{deprel} ')
            if name:
                out.append(f'{name}')
            takes_children = False
        else:
            out.append(f'\n{indent}:{deprel} (')
            if name:
                out.append(f'{name} / ')
            out.append(label)
        # NOTE(review): the word is appended for every branch, gaps
        # included -- confirm against the intended nesting.
        if word != '_':
            out.append(f' :t "{word}"')
        if takes_children:
            open_nodes.append(index)
            depth += 1
    out.append(')' * depth)
    return ''.join(out)


def parse(filename, fout):
    r"""Write every parsetree of a LaTeX file to *fout* in bracketed form.

    For each ``\begin{parsetree} ... \end{parsetree}`` block found in
    *filename*, three things are written: a ``Tree <name>-<n>`` header, a
    line with the tree's leaf words, and the tree itself with one node per
    line (``:<deprel> (<label> :t "<word>")``), followed by a blank line.
    ``<name>`` is the file's base name without ``.tex``; if it contains
    ``SIEG`` the two-part node labels are read from ``tabular`` blocks,
    otherwise from ``\NL{...}{...}``.

    Args:
        filename: Path of the ``.tex`` file to read (decoded as UTF-8).
        fout: Writable text stream that receives the converted trees.

    Returns:
        The leaf words of the last tree in the file joined by single
        spaces, with ``--`` leaves left out; ``''`` when the file contains
        no parsetree.

    Raises:
        AttributeError: if an opening parenthesis is not followed by a
            ``.label.`` pair.
        IndexError: if the parentheses of a tree are not balanced.
    """
    # The handle is only needed for the read.  Newlines become ';' so that
    # the '.'-based patterns can match across line breaks.
    with open(filename, 'r', encoding='utf-8') as fin:
        text = fin.read().replace('\n', ';')

    name = _tree_name(filename)
    is_sieg = 'SIEG' in name
    sentence = ''
    for num, tree in enumerate(_PARSETREE_RE.findall(text)):
        fout.write(f'Tree {name}-{num}\n')
        nodes = _read_nodes(tree, is_sieg)
        fout.write(_render_nodes(nodes) + '\n\n')
        sentence = ' '.join(
            word for word, _, _, _ in nodes if word not in ('_', '--')
        )
    # NOTE(review): returns after all trees have been written (the "-<n>"
    # header implies several trees per file) -- confirm.
    return sentence
# Convert every tree file and collect the sentence returned for each one.
# The paths are sorted because glob.glob() yields them in arbitrary order,
# which would make the order of the trees and sentences vary between runs.
# Output files are written as UTF-8 regardless of the platform default.
with open('datasets/twitter_parsed/parsed.txt', 'w', encoding='utf-8') as fout:
    sentences = []
    for file in tqdm(sorted(glob.glob('trees/*.tex'))):
        sentence = parse(file, fout)
        # Replace the LaTeX apostrophe macro (and the whitespace around it)
        # with a plain "'".
        sentence = re.sub(r'\s*\\textquoteright\s*', "'", sentence)
        sentences.append(sentence)

# One sentence per blank-line-separated chunk for the pipeline.
doc = nlp('\n\n'.join(sentences))
with open('datasets/twitter_parsed/sentences.txt', 'w', encoding='utf-8') as fout:
    fout.write('\n'.join(sentences))
conll = CoNLL.write_doc2conll(doc, 'ud_silver.conllu')