-
Notifications
You must be signed in to change notification settings - Fork 2
/
DepND.py
305 lines (269 loc) · 12.9 KB
/
DepND.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# Copyright 2013 Siyuan Guo
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
# python2.7
import os
import re
import sys
from collections import defaultdict
class DepNeg():
    """Pre-processing stage: load negation triggers, filter the input text
    down to sentences that contain a trigger, and run the external gDep
    parser on the filtered sentences.

    Attributes set by __init__:
        trigger_filepath -- trigger list, one trigger per line
        test_filepath    -- plain-text input, one sentence per line
        result_filepath  -- where the final result is written (used by subclasses)
        trimmed_filepath -- temporary file holding the filtered sentences
        parsed_filepath  -- temporary file holding the gDep output
        NegTriggers      -- list of trigger strings, in file order
    """

    ## Negative look-aheads appended to specific triggers so that idiomatic,
    ## non-negating phrases such as "no problem" or "not only" are ignored.
    _TRIGGER_LOOKAHEADS = {
        'no': r'(?!( problem| trouble| matter| further| one))',
        'not': r'(?!( certain| necessarily| only))',
        'without': r'(?!( difficulty| further| any further))',
    }

    def __init__(self, test_filepath, result_filepath):
        self.trigger_filepath = "./negTriggers.txt"
        self.test_filepath = test_filepath
        self.result_filepath = result_filepath
        self.trimmed_filepath = "./filtered.tmp"
        self.parsed_filepath = "./parsed.tmp"
        self.read_NegTriggers()

    def read_NegTriggers(self):
        """Load self.NegTriggers from self.trigger_filepath, skipping blank lines."""
        print("Start reading negation triggers ...")
        ## read triggers, one trigger per line
        self.NegTriggers = []
        with open(self.trigger_filepath, 'r') as fin:
            for line in fin:
                line = line.strip()
                if line:
                    self.NegTriggers.append(line)
        print("All negation triggers are\n%s" % (self.NegTriggers,))

    def _compile_trigger_patterns(self):
        """Return one compiled, case-insensitive whole-word regex per trigger."""
        ## the trigger text is inserted into the regex as-is (not escaped),
        ## so a trigger line may itself contain regex syntax
        return [
            re.compile(
                r'\b' + trigger + self._TRIGGER_LOOKAHEADS.get(trigger, '') + r'\b',
                re.IGNORECASE)
            for trigger in self.NegTriggers
        ]

    def read_TestFile(self):
        """Copy every non-empty input line that contains a negation trigger
        into self.trimmed_filepath, each followed by a blank line.
        """
        print("Start reading test file ...")
        ## compile the patterns once instead of once per (line, trigger) pair
        patterns = self._compile_trigger_patterns()
        ## aggregate/filter all sentences containing negation into a single
        ## text file, one sentence per line
        with open(self.trimmed_filepath, 'w') as tfout:
            with open(self.test_filepath, 'r') as tfin:
                for line in tfin:
                    line = line.strip()
                    ## a sentence is written once, however many triggers it has
                    if line and any(p.search(line) for p in patterns):
                        tfout.write(line + "\n\n")

    def parse(self):
        """Run the external ./gdep binary on the filtered sentences and
        redirect its standard output to self.parsed_filepath.
        """
        ## run gDep to produce dependency trees from plain text sentences
        cmd = "./gdep " + self.trimmed_filepath + " > " + self.parsed_filepath
        print("Start POS tagging, chunking, NER, and parsing ... (this may take several minutes, please wait.)")
        status = os.system(cmd)
        # if you get "sh: ./gdep permission denied" message, run command "chmod u+rwx ./gdep" first.
        if status != 0:
            ## keep going (best effort) but make the failure visible
            sys.stderr.write("Warning: '%s' exited with status %s\n" % (cmd, status))

    def run_parse(self):
        """Filter the input file, then parse the filtered sentences."""
        self.read_TestFile()
        self.parse()
# a wrapper for sentence
class SENT():
    """Column-wise store for one parsed sentence.

    Each call to add_Row() appends one token; the getters hand back
    independent copies of the per-token columns so callers may modify them.
    """

    def __init__(self):
        self.indices = []     # token numbers, taken from column 0
        self.words = []       # surface forms, column 1
        self.POS = []         # part-of-speech tags, column 4
        self.arc_end = []     # another end of the incoming arc (column 6); one per word
        self.dep = []         # dependency labels, column 7
        self.NegIndice = []   # token numbers recorded through add_NegIndex()

    def add_Row(self, row):
        """Append one tab-separated row of CoNLL output to the sentence."""
        cells = row.strip().split('\t')
        ## (destination column, source field, converter) in storage order
        layout = (
            (self.indices, 0, int),
            (self.words, 1, None),
            (self.POS, 4, None),
            (self.arc_end, 6, int),
            (self.dep, 7, None),
        )
        for column, field, convert in layout:
            value = cells[field]
            column.append(value if convert is None else convert(value))

    def add_NegIndex(self, i):
        """Record token number *i* as a negation trigger."""
        self.NegIndice.append(i)

    def get_indice(self):
        """Return a copy of the token numbers."""
        return list(self.indices)

    def get_words(self):
        """Return a copy of the word forms."""
        return list(self.words)

    def get_POS(self):
        """Return a copy of the part-of-speech tags."""
        return list(self.POS)

    def get_arc_end(self):
        """Return a copy of the incoming-arc endpoints."""
        return list(self.arc_end)

    def get_dep(self):
        """Return a copy of the dependency labels."""
        return list(self.dep)

    def get_NegIndice(self):
        """Return a copy of the recorded negation trigger token numbers."""
        return list(self.NegIndice)

    def whether_empty(self):
        """Return True while no row has been added."""
        return not self.indices
class DepND(DepNeg):
    """Negation detector working on dependency-parsed sentences.

    For every sentence in the parsed file it finds negation triggers
    (findNeg), picks a root for each trigger (elevate / getNegScope), spans
    the scope from that root (MST) and renders the sentence with <NEG> and
    <SCOPE> tags (indice2result).

    Token numbers are 1-based throughout, so token i lives at list position
    i-1. A head number of 0 is treated as "no real token" -- presumably the
    artificial root of the parse; confirm against the gDep output format.
    """

    ## number of leading entries of self.NegTriggers that are core triggers;
    ## change this when you change the core negation triggers
    CORE_TRIGGER_COUNT = 35

    ## first words of the two-word triggers "rather than" and "rule(s/d/ing) out"
    _RULE_FORMS = ('rule', 'rules', 'ruled', 'ruling')
    _PHRASE_HEADS = frozenset(('rather',) + _RULE_FORMS)

    ## arcs whose descendants may span across punctuation
    _CORE_ARCS = ('SUB', 'OBJ', 'PRD')

    ## $_Elevate rule: POS of the trigger -> arc labels to climb through
    _ELEVATE_DEPS = {
        'RB': ('DEP', 'AMOD'),
        'NN': ('PMOD',),
        'VBN': ('VC',),
        'VB': ('VC',),
    }

    ## map tagset to rules
    _TAGSET_GMST = frozenset(['RB', 'DT', 'JJ', 'CC'])
    _TAGSET_SMST = frozenset(['NN', 'IN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD'])

    def findNeg(self, sentwrapper):
        """Record the token numbers of all negation triggers on *sentwrapper*
        and return them (as returned by sentwrapper.get_NegIndice()).

        Triggers are: core trigger words, the phrases "rather than" and
        "rule/rules/ruled/ruling out", "would"/"could" governing an "if"
        clause (subjunctive mood), and "wish" (VBP) governing a VBD/VMOD
        dependent. Each token number is recorded at most once.
        """
        core_triggers = set(self.NegTriggers[:self.CORE_TRIGGER_COUNT])
        token_ids = sentwrapper.get_indice()
        words = [w.lower() for w in sentwrapper.get_words()]
        pos_tags = sentwrapper.get_POS()
        heads = sentwrapper.get_arc_end()
        deps = sentwrapper.get_dep()

        def mark(index):
            ## the same token must not produce duplicate result lines
            if index not in sentwrapper.get_NegIndice():
                sentwrapper.add_NegIndex(index)

        def dependents(head):
            ## (token number, word, POS, arc label) of every direct dependent
            return [(j, words[j-1], pos_tags[j-1], deps[j-1])
                    for j in token_ids if heads[j-1] == head]

        for i in token_ids:
            word = words[i-1]
            ## the word after token i, or None when i is the last token
            next_word = words[i] if i < len(words) else None
            if word in core_triggers:
                mark(i)
            ## deal with phrases
            elif word == 'rather':
                if next_word == 'than':
                    mark(i)
            elif word in self._RULE_FORMS:
                if next_word == 'out':
                    mark(i)
            ## deal with subjunctive mood
            elif word in ('would', 'could'):
                kids = dependents(i)
                has_verb = any((p, d) == ('VB', 'VC') for _, _, p, d in kids)
                for j, w, p, d in kids:
                    if (w, p, d) == ('if', 'IN', 'VMOD') and has_verb:
                        mark(j)
                        mark(i)
            elif word == 'wish':
                if pos_tags[i-1] == 'VBP':
                    for k, _, p, d in dependents(i):
                        if (p, d) == ('VBD', 'VMOD'):
                            mark(k)
        return sentwrapper.get_NegIndice()

    def MST(self, i_root, i_neg, sentwrapper):
        """Return the sorted token numbers of the scope spanned from *i_root*
        for the trigger at *i_neg*.

        ## maximal spanning tree with SUB&Right and Punc rules
        ## *SUB&Right* = at the root, dependents left of both root and trigger
        ##   are followed only through SUB/OBJ/PRD arcs.
        ## *Punc* = non-purebred nodes at or after the first non-purebred
        ##   punctuation mark are dropped.
        The trigger itself is always part of the result.
        """
        heads = sentwrapper.get_arc_end()
        deps = sentwrapper.get_dep()

        ## head -> dependents, in token order
        children = defaultdict(list)
        for j in sentwrapper.get_indice():
            children[heads[j-1]].append(j)

        left_limit = min(i_root, i_neg)
        ## (index, whether_purebred) tuple locates each word
        ## purebred means the node has an SUB, OBJ or PRD ancestor
        ## only purebred nodes can violate Punc-rule (span across punctuations)
        spanned = [(i_root, False)]
        openlist = [(i_root, False)]
        while openlist:
            i_now, whether_purebred = openlist.pop()
            at_root = (i_now == i_root)
            for j in children.get(i_now, ()):
                is_core = deps[j-1] in self._CORE_ARCS
                if at_root and j < left_limit:
                    ## SUB&Right rule: only core arcs may span to the left
                    if is_core:
                        openlist.append((j, True))
                        spanned.append((j, True))
                    continue
                if is_core:
                    ## NOTE(review): once set, the flag also applies to the
                    ## later siblings of j. This matches the original
                    ## behaviour and is kept on purpose; confirm it is intended.
                    whether_purebred = True
                openlist.append((j, whether_purebred))
                spanned.append((j, whether_purebred))

        ## find the first non-purebred punctuation
        ## (index 0 is not a real token and must not wrap around to the last one)
        i_punc = float("inf")
        for idx, pure in spanned:
            if idx > 0 and not pure and deps[idx-1] == 'P':
                i_punc = min(i_punc, idx)

        ## drop that punctuation and every non-purebred node after it; a new
        ## list is built because removing while iterating skips elements
        indices = [idx for idx, pure in spanned
                   if idx > 0 and (pure or idx < i_punc)]

        ## make <SCOPE> tag always enclose <NEG> tag
        if i_neg not in indices:
            indices.append(i_neg)
        indices.sort()
        return indices

    def oldMST(self, i_root, sentwrapper):
        """Return the sorted token numbers of the whole subtree under *i_root*."""
        # old maximal spanning tree without any additional rules
        # it's kept in case someone may need it someday
        heads = sentwrapper.get_arc_end()
        token_ids = sentwrapper.get_indice()
        indice = [i_root]
        openlist = [i_root]
        while openlist:
            i_now = openlist.pop()
            for j in token_ids:
                if heads[j-1] == i_now:
                    openlist.append(j)
                    indice.append(j)
        indice.sort()
        return indice

    def indice2result(self, indices, i_neg, sentwrapper):
        """Render the sentence as one space-joined string with <NEG>...</NEG>
        around the trigger and <SCOPE>...</SCOPE> around the tokens from
        indices[0] to indices[-1].
        """
        words = sentwrapper.get_words()
        trigger = words[i_neg-1]
        ## two-word triggers: compare lowercased, as findNeg does, and make
        ## sure a following word exists before tagging it
        if trigger.lower() in self._PHRASE_HEADS and i_neg < len(words):
            words[i_neg-1] = '<NEG>' + trigger
            words[i_neg] += '</NEG>'
        else:
            words[i_neg-1] = '<NEG>' + trigger + '</NEG>'
        words[indices[0]-1] = '<SCOPE>' + words[indices[0]-1]
        words[indices[-1]-1] += '</SCOPE>'
        return ' '.join(words)

    def elevate(self, i, sentwrapper):
        """Climb from token *i* towards its governors while the arc label is
        one of those listed for the POS tag of the starting token, and return
        the token number reached.
        """
        ## $_Elevate rule
        dep_elevate = self._ELEVATE_DEPS.get(sentwrapper.get_POS()[i-1], ())
        dep = sentwrapper.get_dep()
        heads = sentwrapper.get_arc_end()
        ## stop at 0: there is no token to inspect there
        while i > 0 and dep[i-1] in dep_elevate:
            i = heads[i-1]
        return i

    def getNegScope(self, sentwrapper):
        """Return the tagged sentence once per negation trigger, each followed
        by a newline; an empty string when the sentence has no usable trigger.
        """
        pos_tags = sentwrapper.get_POS()
        heads = sentwrapper.get_arc_end()
        rendered = []
        for i_neg in self.findNeg(sentwrapper):
            tag = pos_tags[i_neg-1]
            if tag in self._TAGSET_SMST:
                ## sMST rule
                ## maximal spanning tree from itself;
                i_root = self.elevate(i_neg, sentwrapper)
            elif tag in self._TAGSET_GMST:
                ## gMST rule
                ## maximal spanning tree from its immediate governor;
                i_root = self.elevate(i_neg, sentwrapper)
                if i_root > 0:
                    i_root = heads[i_root-1]
            else:
                ## no rule is mapped to this tag
                continue
            scope = self.MST(i_root, i_neg, sentwrapper)
            rendered.append(self.indice2result(scope, i_neg, sentwrapper) + '\n')
        return ''.join(rendered)

    def run_DepND(self):
        """Read self.parsed_filepath sentence by sentence (sentences are
        separated by blank lines) and write the tagged results, one block per
        sentence, to self.result_filepath.
        """
        # 1. read one sentence, find negation trigger words in the sentence;
        # 2. run corresponding rules to determine scope, return the index-range for the scope.
        print("Start generating scope for negation triggers ...")
        sent_tmp = SENT()
        with open(self.result_filepath, 'w') as fout:
            with open(self.parsed_filepath, 'r') as fin:
                for line in fin:
                    line = line.strip()
                    if line:
                        sent_tmp.add_Row(line)
                    elif not sent_tmp.whether_empty():
                        fout.write(self.getNegScope(sent_tmp) + '\n')
                        sent_tmp = SENT()
            ## the file may end without a trailing blank line; do not lose
            ## the last sentence in that case
            if not sent_tmp.whether_empty():
                fout.write(self.getNegScope(sent_tmp) + '\n')
        print("All done.\nPlease see results at " + self.result_filepath)
## run following command to execute this program:
## python DepND.py path/to/test.txt path/to/result.txt
if __name__ == '__main__':
    ## expects exactly two command-line arguments after the script name
    if len(sys.argv) == 3:
        test_filepath, result_filepath = sys.argv[1], sys.argv[2]
        toy = DepND(test_filepath, result_filepath)
        ## filter + parse first, then detect triggers and scopes
        toy.run_parse()
        toy.run_DepND()
    else:
        print("Arguments Error: please give 2 arguments - test_filepath, result_filepath.")