# TextProcessor.py
import requests
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import json
#from allennlp.predictors.predictor import Predictor
#from allennlp_models import pretrained
#import allennlp_models.tagging
import GPUtil
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token, Span
from spacy.language import Language
import textwrap
from util.RestCaller import callAllenNlpApi, amuse_wsd_api_call
from util.CallAllenNlpCoref import callAllenNlpCoref
from transformers import logging
logging.set_verbosity_error()
from py2neo import Graph
from py2neo import *
import configparser
import os
import traceback
from nltk.corpus import wordnet31 as wn
from nltk.corpus.reader.wordnet import WordNetError as wn_error
from functools import reduce
class TextProcessor(object):
    uri = ""
    username = ""
    password = ""
    graph = ""

    def __init__(self, nlp, driver):
        self.nlp = nlp
        self._driver = driver
        self.uri = ""
        self.username = ""
        self.password = ""
        config = configparser.ConfigParser()
        #config_file = os.path.join(os.path.dirname(__file__), '..', 'config.ini')
        config_file = os.path.join(os.path.dirname(__file__), 'config.ini')
        config.read(config_file)
        py2neo_params = config['py2neo']
        self.uri = py2neo_params.get('uri')
        self.username = py2neo_params.get('username')
        self.password = py2neo_params.get('password')
        #self.graph = Graph(self.uri, auth=(self.username, self.password))
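    # Illustrative usage sketch (comments only; the names and values below are examples, not part of
    # the pipeline). It assumes a config.ini with a [py2neo] section next to this file, a running
    # Neo4j instance, the official neo4j Python driver, and a spaCy Doc whose custom `text_id`
    # extension is registered elsewhere in the project:
    #
    #   import spacy
    #   from neo4j import GraphDatabase
    #
    #   nlp = spacy.load("en_core_web_sm")                      # assumed model
    #   driver = GraphDatabase.driver("bolt://localhost:7687",  # assumed connection details
    #                                 auth=("neo4j", "password"))
    #   processor = TextProcessor(nlp, driver)
    #   doc = nlp("Barack Obama visited Paris in 2016.")
    #   doc._.text_id = 1                                       # extension expected by the graph queries below
    #   processor.process_sentences(1, doc, storeTag=True, text_id=1)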
    def do_coref2(self, doc, textId):
        graph = Graph(self.uri, auth=(self.username, self.password))
        result = callAllenNlpCoref("coreference-resolution", doc.text)
        print("Coref Result: ", result)
        sg = ""
        PARTICIPANT = Relationship.type("PARTICIPANT")
        PARTICIPATES_IN = Relationship.type("PARTICIPATES_IN")
        MENTIONS = Relationship.type("MENTIONS")
        COREF = Relationship.type("COREF")
        #print("clusters: ", result["clusters"])
        # Store the coreference mentions as graph nodes linked to their antecedent via MENTIONS edges.
        # Steps:
        # 1. get each coref-mention and antecedent pair
        coref = []
        for cluster in result["clusters"]:
            i = 0
            antecedent_span = ""
            cag = ""  # coreferents - antecedent relationships sub-graph
            for span_token_indexes in cluster:
                if i == 0:
                    i += 1
                    # the first span will be the antecedent for all other references
                    antecedent_span = doc[span_token_indexes[0]:span_token_indexes[-1]]  # updated for index
                    antecedent_node = {'start_index': span_token_indexes[0], 'end_index': span_token_indexes[-1], 'text': antecedent_span.text}  # updated for -1 index
                    antecendent_node = Node("Antecedent", text=antecedent_span.text, startIndex=span_token_indexes[0], endIndex=span_token_indexes[-1])  # updated for -1 index
                    antecedent_node_start_index = span_token_indexes[0]
                    # connect the Antecedent node with all the participating TagOccurrences
                    index_range = range(span_token_indexes[0], span_token_indexes[-1])
                    atg = ""
                    for index in index_range:
                        query = "match (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) return x"
                        token_node = graph.evaluate(query)
                        if token_node is None:
                            #sga = antecendent_node
                            #graph.create(sga)
                            continue
                        token_mention_rel = PARTICIPATES_IN(token_node, antecendent_node)
                        if atg == "":
                            atg = token_mention_rel
                        else:
                            atg = atg | token_mention_rel
                    graph.create(atg)
                    # Antecedent-TagOccurrence sub-graph creation ends.
                    continue
                coref_mention_span = doc[span_token_indexes[0]:span_token_indexes[-1]]  # updated index
                coref_mention_node = {'start_index': span_token_indexes[0], 'end_index': span_token_indexes[-1], 'text': coref_mention_span.text}  # updated index
                corefMention_node = Node("CorefMention", text=coref_mention_span.text, startIndex=span_token_indexes[0], endIndex=span_token_indexes[-1])  # updated index
                #mention = {'from_index': span[-1], 'to_index': antecedent}
                #mention = {'referent': coref_mention_span, 'antecedent': antecedent_span}
                mention = {'referent': coref_mention_node, 'antecedent': antecedent_node}
                # connect the CorefMention node with all the participating TagOccurrences
                index_range = range(span_token_indexes[0], span_token_indexes[-1])  # updated index
                ctg = ""
                for index in index_range:
                    query = "match (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) return x"
                    token_node = graph.evaluate(query)
                    if token_node is None:
                        #sgc = corefMention_node
                        #graph.create(sgc)
                        continue
                    token_mention_rel = PARTICIPATES_IN(token_node, corefMention_node)
                    if ctg == "":
                        ctg = token_mention_rel
                    else:
                        ctg = ctg | token_mention_rel
                graph.create(ctg)
                # CorefMention - TagOccurrence sub-graph ends.
                coref_rel = COREF(corefMention_node, antecendent_node)
                if cag == "":
                    cag = coref_rel
                else:
                    cag = cag | coref_rel
                coref.append(mention)
                # connect the CorefMention node with the antecedent NamedEntity.
                # np_query = "MATCH (document:AnnotatedText {id:" + str(doc._.text_id) + "})-[*2]->(np:TagOccurrence)-[:PARTICIPATES_IN]->(end:NamedEntity) WHERE np.index = " + str(antecedent_node_start_index) + " RETURN end"
                # np_node = graph.evaluate(np_query)
                # if np_node is None:
                #     """ try:
                #         #graph.create(sg)
                #     except BaseException as err:
                #         print(f"Unexpected {err=}, {type(err)=}") """
                #     continue
                # coref_mention_np_rel = MENTIONS(corefMention_node, np_node)
                # cag = cag | coref_mention_np_rel
            graph.create(cag)
            # TODO: this query needs to be tested and should be made more specific, otherwise it may produce false positives
            # graph.evaluate("""match (ne:NamedEntity)<-[:PARTICIPATES_IN]-(tago:TagOccurrence)-[:PARTICIPATES_IN]->(ant:Antecedent)<-[:COREF]-(corefm:CorefMention)
            #     where tago.index = ne.index and tago.tok_index_doc = ant.startIndex
            #     merge (corefm)-[:MENTIONS]->(ne)""")
        print(coref)
        #self.store_coref_mentions(doc, coref)
        # create the referent span and attach it to the TagOccurrences
        # identify the NamedEntity that belongs to the antecedent
    def do_coref(self, doc, textId):
        result = callAllenNlpCoref("coreference-resolution", doc.text)
        #print("clusters: ", result["clusters"])
        # storing the coreference mentions as graph nodes linked with the antecedent via MENTIONS edges
        # steps
        # 1. get the coref-mention and antecedent pair
        coref = []
        for cluster in result["clusters"]:
            i = 0
            antecedent_span = ""
            for span_token_indexes in cluster:
                if i == 0:
                    i += 1
                    # the first span will be the antecedent for all other references
                    antecedent_span = doc[span_token_indexes[0]:span_token_indexes[-1] + 1]
                    antecedent_node = {'start_index': span_token_indexes[0], 'end_index': span_token_indexes[-1] + 1, 'text': antecedent_span.text}
                    continue
                coref_mention_span = doc[span_token_indexes[0]:span_token_indexes[-1] + 1]
                coref_mention_node = {'start_index': span_token_indexes[0], 'end_index': span_token_indexes[-1] + 1, 'text': coref_mention_span.text}
                #mention = {'from_index': span[-1], 'to_index': antecedent}
                #mention = {'referent': coref_mention_span, 'antecedent': antecedent_span}
                mention = {'referent': coref_mention_node, 'antecedent': antecedent_node}
                coref.append(mention)
        print(coref)
        self.store_coref_mentions(doc, coref)
    def store_coref_mentions(self, doc, mentions):
        graph = Graph(self.uri, auth=(self.username, self.password))
        # create the referent span and attach it to the TagOccurrences
        # identify the NamedEntity that belongs to the antecedent
        sg = ""
        PARTICIPANT = Relationship.type("PARTICIPANT")
        PARTICIPATES_IN = Relationship.type("PARTICIPATES_IN")
        MENTIONS = Relationship.type("MENTIONS")
        COREF = Relationship.type("COREF")
        for mention in mentions:
            start_index = mention['referent']['start_index']
            end_index = mention['referent']['end_index']
            start_index_antecedent = mention['antecedent']['start_index']
            end_index_antecedent = mention['antecedent']['end_index']
            sg = ""
            sgc = ""
            sga = ""
            # create a CorefMention node and its Antecedent node, linked via a COREF relationship
            corefMention_node = Node("CorefMention", text=mention['referent']['text'], startIndex=start_index, endIndex=end_index)
            antecendent_node = Node("Antecedent", text=mention['antecedent']['text'], startIndex=start_index_antecedent, endIndex=end_index_antecedent)
            coref_rel = COREF(corefMention_node, antecendent_node)
            #tx = self.graph.begin()
            graph.create(coref_rel)
            #self.graph.commit(tx)
            # connect the CorefMention node with all the participating TagOccurrences
            index_range = range(start_index, end_index)
            for index in index_range:
                query = "match (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) return x"
                token_node = graph.evaluate(query)
                if token_node is None:
                    sgc = corefMention_node
                    #graph.create(sgc)
                    continue
                token_mention_rel = PARTICIPATES_IN(token_node, corefMention_node)
                if sgc == "":
                    sgc = token_mention_rel
                else:
                    sgc = sgc | token_mention_rel
            graph.create(sgc)
            # connect the Antecedent node with all the participating TagOccurrences
            index_range = range(start_index_antecedent, end_index_antecedent)
            for index in index_range:
                query = "match (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) return x"
                token_node = graph.evaluate(query)
                if token_node is None:
                    sga = antecendent_node
                    #graph.create(sga)
                    continue
                token_mention_rel = PARTICIPATES_IN(token_node, antecendent_node)
                if sga == "":
                    sga = token_mention_rel
                else:
                    sga = sga | token_mention_rel
            graph.create(sga)
            #graph.create(sg | sga | coref_rel)
            # connect the CorefMention node with the antecedent NamedEntity.
            np_query = "MATCH (document:AnnotatedText {id:" + str(doc._.text_id) + "})-[*2]->(np:TagOccurrence)-[:PARTICIPATES_IN]->(end:NamedEntity) WHERE np.index = " + str(start_index_antecedent) + " RETURN end"
            np_node = graph.evaluate(np_query)
            if np_node is None:
                """ try:
                    #graph.create(sg)
                except BaseException as err:
                    print(f"Unexpected {err=}, {type(err)=}") """
                continue
            coref_mention_np_rel = MENTIONS(corefMention_node, np_node)
            if sg == "":
                sg = coref_mention_np_rel
            else:
                sg = sg | coref_mention_np_rel
            try:
                graph.create(sg)
            except BaseException as err:
                print(f"Unexpected {err=}, {type(err)=}")
        return mention
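    # For reference, the sub-graph written by do_coref2/store_coref_mentions has this shape
    # (summarised from the relationship types created above):
    #   (:TagOccurrence)-[:PARTICIPATES_IN]->(:CorefMention)-[:COREF]->(:Antecedent)<-[:PARTICIPATES_IN]-(:TagOccurrence)
    #   (:CorefMention)-[:MENTIONS]->(:NamedEntity)   # added in store_coref_mentions when the antecedent maps to a NamedEntity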
    # this method also includes code to create files for the TARSQI toolkit.
    # TODO: we need to port this code to a separate script file outside this project.
    def get_annotated_text(self):
        print(self.uri)
        graph = Graph(self.uri, auth=(self.username, self.password))
        query = "MATCH (n:AnnotatedText) RETURN n.text, n.id, n.creationtime"
        data = graph.run(query).data()
        annotated_text_docs = list()
        for record in data:
            #print(record)
            #print(record.get("n.text"))
            t = (record.get("n.text"), {'text_id': record.get("n.id")})
            dct = str(record.get("n.creationtime"))
            dct = dct[0:10]
            dct = dct.replace('-', '')
            # create a file named <id>_<creation date>.xml for the TARSQI toolkit, unless it already exists
            filename = """/home/neo/environments/text2graphs/text2graphs/tarsqi-dataset/""" + str(record.get("n.id")) + "_" + dct + ".xml"
            if not os.path.exists(filename):
                f = open(filename, "x")
                f.write(record.get("n.text"))
                f.close()
            annotated_text_docs.append(t)
        return annotated_text_docs
    def apply_pipeline_1(self, doc, flag_display=False):
        graph = Graph(self.uri, auth=(self.username, self.password))
        frameDict = {}
        v = None
        sg = None
        tv = None
        PARTICIPANT = Relationship.type("PARTICIPANT")
        PARTICIPATES_IN = Relationship.type("PARTICIPATES_IN")
        for tok in doc:
            sg = None
            v = None
            frameDict = {}
            for x, indices_list in tok._.SRL.items():
                for y in indices_list:
                    span = doc[y[0]: y[len(y) - 1] + 1]
                    if x == "V":
                        v = Node("Frame", text=span.text, startIndex=y[0], endIndex=y[len(y) - 1])
                        for index in y:
                            query = "MATCH (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) RETURN x"
                            token_node = graph.evaluate(query)
                            token_verb_rel = PARTICIPATES_IN(token_node, v)
                            graph.create(token_verb_rel)
                        tv = v
                    else:
                        a = Node("FrameArgument", type=x, text=span.text, startIndex=y[0], endIndex=y[len(y) - 1])
                        if a is None:
                            continue
                        for index in y:
                            query = "MATCH (x:TagOccurrence {tok_index_doc:" + str(index) + "})-[:HAS_TOKEN]-()-[:CONTAINS_SENTENCE]-(:AnnotatedText {id:" + str(doc._.text_id) + "}) RETURN x"
                            token_node = graph.evaluate(query)
                            if token_node is None:
                                continue
                            # Create PARTICIPATES_IN relationship between TagOccurrence and FrameArgument
                            token_arg_rel = PARTICIPATES_IN(token_node, a)
                            graph.create(token_arg_rel)
                        if x not in frameDict:
                            frameDict[x] = []
                        frameDict[x].append(a)
            if tv is not None:
                sg = tv
            for i in frameDict:
                if sg is None:
                    break
                for arg_node in frameDict[i]:
                    # Create PARTICIPANT relationship between FrameArgument and Frame
                    r = PARTICIPANT(arg_node, sg)
                    graph.create(r)
            if sg is not None:
                try:
                    graph.create(sg)
                except BaseException as err:
                    print(f"Unexpected {err=}, {type(err)=}")
            #print(x, ": ", y, span.text)
        # print("list pipeline: ", list_pipeline)
        # print("------------------------------------------------")
        # print(tok._.SRL)
    ######################################################################### Token Enrichment with WordNet ################################################################
    # Function to get all (transitive) hypernyms for a synset
    def get_all_hypernyms(self, synset):
        hypernyms = []
        hypernym_synsets = synset.hypernyms()
        for hypernym_synset in hypernym_synsets:
            hypernyms.append(hypernym_synset.name())  # Store hypernym synset name
            hypernyms.extend(self.get_all_hypernyms(hypernym_synset))  # Recursive call to get hypernyms of hypernyms
        return hypernyms

    # Function to get synonyms (lemma names) of a synset
    def get_synonyms(self, synset):
        synonyms = []
        for lemma in synset.lemmas():
            synonyms.append(lemma.name())  # Store synonym
        return synonyms

    # Earlier variant based on topic domains, kept for reference
    """ def get_domain_labels(self, synset):
        domain_labels = []
        topic_domains = synset.topic_domains()
        for domain in topic_domains:
            domain_labels.extend(domain.split("."))
        return domain_labels """

    # Function to get domain labels for a synset
    def get_domain_labels(self, synset):
        domain_labels = []
        lexname = synset.lexname()
        # Extract the domain label (the part after the POS) from the lexical name, e.g. 'noun.animal' -> 'animal'
        if "." in lexname:
            domain_labels.append(lexname.split(".")[1])
        return domain_labels
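    # Rough illustration of what these helpers return (shown for orientation only; exact
    # lists depend on the WordNet version loaded by the `wn` corpus reader above):
    #   s = wn.synset('dog.n.01')
    #   get_synonyms(s)       -> ['dog', 'domestic_dog', 'Canis_familiaris']
    #   get_all_hypernyms(s)  -> ['canine.n.02', 'carnivore.n.01', ..., 'domestic_animal.n.01', ...]
    #   get_domain_labels(s)  -> ['animal']        # from the lexname 'noun.animal'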
    # Assuming you have a running Neo4j server and a connected driver instance called 'driver'
    def assign_synset_info_to_tokens(self, doc_id):
        with self._driver.session() as session:
            # Step 1: Retrieve all Sentence nodes for the given AnnotatedText document
            query = """
                MATCH (d:AnnotatedText {id: $doc_id})-[:CONTAINS_SENTENCE]->(s:Sentence)
                RETURN s.id AS sentence_id, s.text AS sentence_text
            """
            params = {"doc_id": doc_id}
            result = self.execute_query3(query, params)
            for record in result:
                sentence_id = record["sentence_id"]
                sentence_text = record["sentence_text"]
                # Step 2: Retrieve the linked Token nodes for each Sentence node
                query = """
                    MATCH (s:Sentence {id: $sentence_id})-[:HAS_TOKEN]->(t:TagOccurrence)
                    RETURN t.id AS token_id, t.nltkSynset AS nltkSynset, t.wnSynsetOffset AS wnSynsetOffset
                """
                params = {"sentence_id": sentence_id}
                token_result = self.execute_query3(query, params)
                for token_record in token_result:
                    token_id = token_record["token_id"]
                    #wn_synset_offset = token_record["wnSynsetOffset"]
                    nltk_synset = token_record["nltkSynset"]
                    #print(wn_synset_offset)
                    if nltk_synset and nltk_synset != 'O':
                        try:
                            synset = wn.synset(nltk_synset)
                            synset_identifier = synset.name()
                            print(synset_identifier)
                            lemma, pos, sense_num = synset_identifier.split('.')
                            #print("Lemma:", lemma)
                            #print("POS:", pos)
                            #print("Sense Number:", sense_num)
                            wn_synset_offset = synset.offset()
                            wn_synset_offset = str(wn_synset_offset) + pos
                            # Step 3: Get synset information from WordNet
                            synset = wn.synset_from_pos_and_offset(wn_synset_offset[-1], int(wn_synset_offset[:-1]))
                            #synset = wn.synset_from_pos_and_offset(wn_synset_offset[-1], int(wn_synset_offset))
                            # Get hypernyms, synonyms, and domain labels for the synset
                            hypernyms = self.get_all_hypernyms(synset)
                            synonyms = self.get_synonyms(synset)
                            domain_labels = self.get_domain_labels(synset)
                            # Update the Token node in Neo4j with synset-related information
                            update_query = """
                                MATCH (t:TagOccurrence {id: $token_id})
                                SET t.hypernyms = $hypernyms, t.wn31SynsetOffset = $wn31SynsetOffset, t.synonyms = $synonyms, t.domain_labels = $domain_labels
                            """
                            params = {
                                "token_id": token_id,
                                "hypernyms": hypernyms,
                                "synonyms": synonyms,
                                "domain_labels": domain_labels,
                                "wn31SynsetOffset": wn_synset_offset
                            }
                            self.execute_query3(update_query, params)  # Call your existing execute_query method
                        except wn_error:
                            print(f"Synset not found for token_id: {token_id}. Skipping processing.")
                    else:
                        print(f"Synset offset 'O' or empty for token_id: {token_id}. Skipping processing.")
    #########################################################################################################################################################################

    ######################################################################### Word Sense Disambiguation Code #################################################################
    def amuse_wsd_api_call(self, api_endpoint, sentence):
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json"
        }
        data = [{"text": sentence, "lang": "EN"}]
        try:
            response = requests.post(api_endpoint, json=data, headers=headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error while calling AMuSE-WSD API: {e}")
            return None
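    # The response is consumed by perform_wsd below as a list with one entry per input sentence,
    # each carrying per-token sense annotations. An abridged, assumed shape (only the fields the
    # pipeline actually reads are shown):
    #   [
    #     {"tokens": [
    #         {"index": 0, "bnSynsetId": "bn:...", "wnSynsetOffset": "...", "nltkSynset": "..."},
    #         ...
    #     ]},
    #     ...
    #   ]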
    def update_tokens_in_neo4j(self, sentence_id, token_index, token_attrs):
        query = """
            MATCH (s:Sentence {id: $sentence_id})-[:HAS_TOKEN]->(t:TagOccurrence {tok_index_sent: $index})
            SET t.bnSynsetId = $bnSynsetId,
                t.wnSynsetOffset = $wnSynsetOffset,
                t.nltkSynset = $nltkSynset
        """
        params = {
            "sentence_id": sentence_id,
            "index": token_index,
            "bnSynsetId": token_attrs['bnSynsetId'],
            "wnSynsetOffset": token_attrs['wnSynsetOffset'],
            "nltkSynset": token_attrs['nltkSynset']
        }
        self.execute_query(query, params)
    def perform_wsd(self, document_id):
        amuse_wsd_api_endpoint = "http://localhost:81/api/model"
        query = """MATCH (d:AnnotatedText {id: $doc_id})-[:CONTAINS_SENTENCE]->(s:Sentence) RETURN s.id AS sentence_id, s.text AS text"""
        params = {"doc_id": document_id}
        result = self.execute_query3(query, params)
        # result = self.execute_query(query, params)
        sentences_to_process = []
        sentence_ids = []
        for record in result:
            sentence_id = record["sentence_id"]
            sentence_text = record["text"]
            # Collect sentences to process in batches
            sentences_to_process.append(sentence_text)
            sentence_ids.append(sentence_id)
        # Step 2.1: Call the AMuSE-WSD API with all collected sentences
        api_response = amuse_wsd_api_call(amuse_wsd_api_endpoint, sentences_to_process)
        if api_response:
            for idx, sentence_data in enumerate(api_response):
                sentence_id = sentence_ids[idx]
                # Step 2.2: Update the associated Token nodes in Neo4j with the API response.
                # Note: the API token index is matched against tok_index_sent, so it must align with
                # spaCy's within-sentence token indexing.
                for token_data in sentence_data['tokens']:
                    token_index = token_data['index']
                    token_attrs = {
                        'bnSynsetId': token_data['bnSynsetId'],
                        'wnSynsetOffset': token_data['wnSynsetOffset'],
                        'nltkSynset': token_data['nltkSynset']
                    }
                    self.update_tokens_in_neo4j(sentence_id=sentence_id, token_index=token_index, token_attrs=token_attrs)
    ########################################################## End of Section: Word Sense Disambiguation ################################################################

    # query = """MERGE (ann:AnnotatedText {id: $id})
    #            RETURN id(ann) as result
    #         """
    def create_annotated_text(self, data, content, id):
        #filename = "file://" + filename
        query = """WITH $data AS xmlString
            WITH apoc.xml.parse(xmlString) AS value
            UNWIND [item in value._children where item._type = "nafHeader"] AS nafHeader
            UNWIND [item in value._children where item._type = "raw"] AS raw
            UNWIND [item in nafHeader._children where item._type = "fileDesc"] AS fileDesc
            UNWIND [item in nafHeader._children where item._type = "public"] AS public
            WITH fileDesc.author as author, fileDesc.creationtime as creationtime, fileDesc.filename as filename, fileDesc.filetype as filetype, fileDesc.title as title, public.publicId as publicId, public.uri as uri, raw._text as text
            MERGE (at:AnnotatedText {id: $id}) set at.author = author, at.creationtime = creationtime, at.filename = filename, at.filetype = filetype, at.title = title, at.publicId = publicId, at.uri = uri, at.text = $text
        """
        # query = """ CALL apoc.load.xml($filename)
        #     YIELD value
        #     UNWIND [item in value._children where item._type = "nafHeader"] AS nafHeader
        #     UNWIND [item in value._children where item._type = "raw"] AS raw
        #     UNWIND [item in nafHeader._children where item._type = "fileDesc"] AS fileDesc
        #     UNWIND [item in nafHeader._children where item._type = "public"] AS public
        #     WITH fileDesc.author as author, fileDesc.creationtime as creationtime, fileDesc.filename as filename, fileDesc.filetype as filetype, fileDesc.title as title, public.publicId as publicId, public.uri as uri, raw._text as text
        #     MERGE (at:AnnotatedText {id: $id}) set at.author = author, at.creationtime = creationtime, at.filename = filename, at.filetype = filetype, at.title = title, at.publicId = publicId, at.uri = uri, at.text = $text
        # """
        params = {"id": id, "data": data, "text": content}
        #params = {"id": id, "filename": data}
        print(query)
        results = self.execute_query(query, params)
        #return results[0]
        #replace(text," "," ")

    def add_temporal_metadata(self, filename, id):
        return ""
    def process_sentences(self, annotated_text, doc, storeTag, text_id):
        i = 0
        for sentence in doc.sents:
            sentence_id = self.store_sentence(sentence, annotated_text, text_id, i, storeTag)
            #spans = list(doc.ents) + list(doc.noun_chunks) - just removed so that only entities get stored.
            #spans = list(doc.ents) - just disabled it as testing dbpedia spotlight
            spans = ''
            if doc.spans.get('ents_original') != None:
                spans = list(doc.ents) + list(doc.spans['ents_original'])
            else:
                spans = list(doc.ents)
            #spans = filter_spans(spans) - just disabled it as testing dbpedia spotlight
            i += 1
        return spans
    def store_sentence(self, sentence, annotated_text, text_id, sentence_id, storeTag):
        # sentence_query = """MATCH (ann:AnnotatedText) WHERE id(ann) = $ann_id
        #     MERGE (sentence:Sentence {id: $sentence_unique_id})
        #     SET sentence.text = $text
        #     MERGE (ann)-[:CONTAINS_SENTENCE]->(sentence)
        #     RETURN id(sentence) as result
        # """
        sentence_query = """MATCH (ann:AnnotatedText) WHERE ann.id = $ann_id
            MERGE (sentence:Sentence {id: $sentence_unique_id})
            SET sentence.text = $text
            MERGE (ann)-[:CONTAINS_SENTENCE]->(sentence)
            RETURN id(sentence) as result
        """

        tag_occurrence_query = """MATCH (sentence:Sentence) WHERE id(sentence) = $sentence_id
            WITH sentence, $tag_occurrences as tags
            FOREACH ( idx IN range(0,size(tags)-2) |
                MERGE (tagOccurrence1:TagOccurrence {id: tags[idx].id})
                SET tagOccurrence1 = tags[idx]
                MERGE (sentence)-[:HAS_TOKEN]->(tagOccurrence1)
                MERGE (tagOccurrence2:TagOccurrence {id: tags[idx + 1].id})
                SET tagOccurrence2 = tags[idx + 1]
                MERGE (sentence)-[:HAS_TOKEN]->(tagOccurrence2)
                MERGE (tagOccurrence1)-[r:HAS_NEXT {sentence: sentence.id}]->(tagOccurrence2))
            RETURN id(sentence) as result
        """

        tag_occurrence_with_tag_query = """MATCH (sentence:Sentence) WHERE id(sentence) = $sentence_id
            WITH sentence, $tag_occurrences as tags
            FOREACH ( idx IN range(0,size(tags)-2) |
                MERGE (tagOccurrence1:TagOccurrence {id: tags[idx].id})
                SET tagOccurrence1 = tags[idx]
                MERGE (sentence)-[:HAS_TOKEN]->(tagOccurrence1)
                MERGE (tagOccurrence2:TagOccurrence {id: tags[idx + 1].id})
                SET tagOccurrence2 = tags[idx + 1]
                MERGE (sentence)-[:HAS_TOKEN]->(tagOccurrence2)
                MERGE (tagOccurrence1)-[r:HAS_NEXT {sentence: sentence.id}]->(tagOccurrence2))
            FOREACH (tagItem in [tag_occurrence IN $tag_occurrences WHERE tag_occurrence.is_stop = False] |
                MERGE (tag:Tag {id: tagItem.lemma}) MERGE (tagOccurrence:TagOccurrence {id: tagItem.id}) MERGE (tag)<-[:REFERS_TO]-(tagOccurrence))
            RETURN id(sentence) as result
        """

        params = {"ann_id": annotated_text, "text": sentence.text,
                  "sentence_unique_id": str(text_id) + "_" + str(sentence_id)}
        results = self.execute_query(sentence_query, params)
        node_sentence_id = results[0]
        tag_occurrences = []
        tag_occurrence_dependencies = []
        for token in sentence:
            lexeme = self.nlp.vocab[token.text]
            # edited: included the punctuation as possible token candidates.
            #if not lexeme.is_punct and not lexeme.is_space:
            if not lexeme.is_space:
                tag_occurrence_id = str(text_id) + "_" + str(sentence_id) + "_" + str(token.idx)
                tag_occurrence = {"id": tag_occurrence_id,
                                  "index": token.idx,
                                  "end_index": (len(token.text) + token.idx),
                                  "text": token.text,
                                  "lemma": token.lemma_,
                                  "pos": token.tag_,
                                  "upos": token.pos_,
                                  "tok_index_doc": token.i,
                                  "tok_index_sent": (token.i - sentence.start),
                                  "is_stop": (lexeme.is_stop or lexeme.is_punct or lexeme.is_space)}
                tag_occurrences.append(tag_occurrence)
                tag_occurrence_dependency_source = str(text_id) + "_" + str(sentence_id) + "_" + str(token.head.idx)
                print(token.text, token.dep_, token.head.text, token.head.pos_,
                      [child for child in token.children])
                dependency = {"source": tag_occurrence_dependency_source, "destination": tag_occurrence_id,
                              "type": token.dep_}
                tag_occurrence_dependencies.append(dependency)
        params = {"sentence_id": node_sentence_id, "tag_occurrences": tag_occurrences}
        if storeTag:
            results = self.execute_query(tag_occurrence_with_tag_query, params)
        else:
            results = self.execute_query(tag_occurrence_query, params)
        self.process_dependencies(tag_occurrence_dependencies)
        return results[0]
    # this snippet is for the dbpedia-spotlight component
    def process_entities(self, spans, text_id):
        nes = []
        for entity in spans:
            if entity.kb_id_ != '':
                ne = {'value': entity.text, 'type': entity.label_, 'start_index': entity.start_char,
                      'end_index': entity.end_char,
                      'kb_id': entity.kb_id_, 'url_wikidata': entity.kb_id_, 'score': entity._.dbpedia_raw_result['@similarityScore'],
                      'normal_term': entity.text, 'description': entity._.dbpedia_raw_result.get('@surfaceForm')
                      }
            else:
                ne = {'value': entity.text, 'type': entity.label_, 'start_index': entity.start_char,
                      'end_index': entity.end_char
                      }
            nes.append(ne)
        self.store_entities(text_id, nes)
        return nes
    # end of this snippet

    # this snippet is only applicable for the entity-fishing component
    # def process_entities(self, spans, text_id):
    #     nes = []
    #     for entity in spans:
    #         ne = {'value': entity.text, 'type': entity.label_, 'start_index': entity.start_char,
    #               'end_index': entity.end_char,
    #               'kb_id': entity._.kb_qid, 'url_wikidata': entity._.url_wikidata, 'score': entity._.nerd_score,
    #               'normal_term': entity._.normal_term, 'description': entity._.description}
    #         nes.append(ne)
    #     self.store_entities(text_id, nes)
    #     return nes
    # end of this snippet.
    def process_noun_chunks(self, doc, text_id):
        ncs = []
        for noun_chunk in doc.noun_chunks:
            nc = {'value': noun_chunk.text, 'type': noun_chunk.label_, 'start_index': noun_chunk.start_char,
                  'end_index': noun_chunk.end_char}
            ncs.append(nc)
        self.store_noun_chunks(text_id, ncs)
        return ncs

    def store_noun_chunks(self, document_id, ncs):
        nc_query = """
            UNWIND $ncs as item
            MERGE (nc:NounChunk {id: toString($documentId) + "_" + toString(item.start_index)})
            SET nc.type = item.type, nc.value = item.value, nc.index = item.start_index
            WITH nc, item as ncIndex
            MATCH (text:AnnotatedText)-[:CONTAINS_SENTENCE]->(sentence:Sentence)-[:HAS_TOKEN]->(tagOccurrence:TagOccurrence)
            WHERE text.id = $documentId AND tagOccurrence.index >= ncIndex.start_index AND tagOccurrence.index < ncIndex.end_index
            MERGE (nc)<-[:PARTICIPATES_IN]-(tagOccurrence)
        """
        self.execute_query(nc_query, {"documentId": document_id, "ncs": ncs})
    def store_entities(self, document_id, nes):
        ne_query = """
            UNWIND $nes as item
            MERGE (ne:NamedEntity {id: toString($documentId) + "_" + toString(item.start_index) + "_" + toString(item.end_index) + "_" + toString(item.type)})
            SET ne.type = item.type, ne.value = item.value, ne.index = item.start_index,
                ne.kb_id = item.kb_id, ne.url_wikidata = item.url_wikidata, ne.score = item.score, ne.normal_term = item.normal_term,
                ne.description = item.description
            WITH ne, item as neIndex
            MATCH (text:AnnotatedText)-[:CONTAINS_SENTENCE]->(sentence:Sentence)-[:HAS_TOKEN]->(tagOccurrence:TagOccurrence)
            WHERE text.id = $documentId AND tagOccurrence.index >= neIndex.start_index AND tagOccurrence.index < neIndex.end_index
            MERGE (ne)<-[:PARTICIPATES_IN]-(tagOccurrence)
        """
        self.execute_query(ne_query, {"documentId": document_id, "nes": nes})
        #ne.kb_id = item.kb_id, ne.description = item.description, ne.score = item.score
    # NamedEntity: multi-token case
    def get_and_assign_head_info_to_entity_multitoken(self, document_id):
        # print(self.uri)
        # graph = Graph(self.uri, auth=(self.username, self.password))
        # Query to find the head of a NamedEntity (case for entities composed of multiple tokens).
        # The CASE expressions conditionally set syntacticType: NOMINAL for common-noun heads, NAM for proper-noun heads.
        # TODO: the head for a NAM should include the whole extent of the name; see the NewsReader annotation guidelines
        # for more information.
        query = """
            MATCH p = (text:AnnotatedText where text.id = $documentId)-[:CONTAINS_SENTENCE]->(sentence:Sentence)-[:HAS_TOKEN]->(a:TagOccurrence)-[:PARTICIPATES_IN]-(ne:NamedEntity), q = (a)-[:IS_DEPENDENT]->()--(ne)
            WHERE NOT exists((a)<-[:IS_DEPENDENT]-()--(ne))
            WITH ne, a, p
            SET ne.head = a.text, ne.headTokenIndex = a.tok_index_doc,
                (case when a.pos in ['NNS', 'NN'] then ne END).syntacticType = 'NOMINAL',
                (case when a.pos in ['NNP', 'NNPS'] then ne END).syntacticType = 'NAM'
        """
        self.execute_query(query, {'documentId': document_id})

    # NamedEntity: single-token case
    def get_and_assign_head_info_to_entity_singletoken(self, document_id):
        # print(self.uri)
        # graph = Graph(self.uri, auth=(self.username, self.password))
        # Query to find the head of a NamedEntity (case for entities composed of a single token).
        query = """
            MATCH p = (text:AnnotatedText where text.id = $documentId)-[:CONTAINS_SENTENCE]->(sentence:Sentence)-[:HAS_TOKEN]->(a:TagOccurrence)-[:PARTICIPATES_IN]-(ne:NamedEntity)
            WHERE NOT exists((a)<-[:IS_DEPENDENT]-()--(ne)) AND NOT exists((a)-[:IS_DEPENDENT]->()--(ne))
            WITH ne, a, p
            SET ne.head = a.text, ne.headTokenIndex = a.tok_index_doc,
                (case when a.pos in ['NNS', 'NN'] then ne END).syntacticType = 'NOMINAL',
                (case when a.pos in ['NNP', 'NNPS'] then ne END).syntacticType = 'NAM'
        """
        self.execute_query(query, {'documentId': document_id})
    def use_spacy_named_entities(self, document_id):
        # this query keeps spaCy named entities whose type is 'CARDINAL', 'DATE', 'ORDINAL', 'MONEY', 'TIME', 'QUANTITY' or 'PERCENT'
        # and removes the competing entity that shares the same head token
        query1 = """
            MATCH p = (ne:NamedEntity where ne.type in ['CARDINAL', 'DATE', 'ORDINAL', 'MONEY', 'TIME', 'QUANTITY', 'PERCENT'])--
                (a:TagOccurrence)--(ne2:NamedEntity)
            WHERE a.tok_index_doc = ne.headTokenIndex AND a.tok_index_doc = ne2.headTokenIndex AND ne.id <> ne2.id
            DETACH DELETE ne2
        """
        self.execute_query(query1, {"documentId": document_id})

    def use_dbpedia_named_entities(self, document_id):
        # this query keeps the DBpedia NER entity but copies the spaCy NER type information onto it
        query2 = """
            MATCH p = (ne:NamedEntity where ne.kb_id is not null)--(a:TagOccurrence)--(ne2:NamedEntity)
            WHERE a.tok_index_doc = ne.headTokenIndex AND a.tok_index_doc = ne2.headTokenIndex AND ne.id <> ne2.id
            SET ne.spacyType = ne2.type
            DETACH DELETE ne2
        """
        self.execute_query(query2, {"documentId": document_id})
    # In our pipeline we employ two named entity recognition (NER) components, spaCy NER and
    # DBpedia-spotlight. Using both gives high accuracy and recall, but their results have to be merged:
    # we obtain two lists of named entities, one from spaCy NER and one from DBpedia-spotlight, and in
    # some instances the same text span is classified by both components. We use the HEAD word to detect
    # such duplicate entries and remove them. For entities classified as 'CARDINAL', 'DATE', 'ORDINAL',
    # 'MONEY', 'TIME', 'QUANTITY' or 'PERCENT' we prioritise the spaCy NER result; for all other entities
    # we give priority to the DBpedia-spotlight result. Entities detected only by spaCy NER and not in the
    # preferred list are kept as they are.
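    # Illustrative example (hypothetical values): if spaCy yields ("Paris", type GPE) and
    # DBpedia-spotlight yields ("Paris", kb_id dbpedia.org/resource/Paris) over the same head token,
    # the spaCy node is removed, its type is preserved on the surviving node as ne.spacyType = 'GPE',
    # and the DBpedia node is kept. A spaCy-only entity such as ("2016", type DATE) is kept unchanged.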
    def deduplicate_named_entities(self, document_id):
        self.get_and_assign_head_info_to_entity_multitoken(document_id)
        self.get_and_assign_head_info_to_entity_singletoken(document_id)
        self.use_spacy_named_entities(document_id)
        self.use_dbpedia_named_entities(document_id)
        return ''
    def process_coreference2(self, doc, text_id):
        coref = []
        if doc._.has_coref:
            for cluster in doc._.coref_clusters:
                mention = {'from_index': cluster.mentions[-1].start_char, 'to_index': cluster.mentions[0].start_char}
                coref.append(mention)
            self.store_coref(text_id, coref)
        return coref
    def process_coreference_allennlp(self, doc, text_id):
        result = callAllenNlpCoref("coreference-resolution", doc.text)
        coref = []
        for cluster in result["clusters"]:
            #print("cluster: ", cluster)
            i = 0
            antecedent = ""
            for span in cluster:
                if i == 0:
                    i += 1
                    # the first span will be the antecedent for all other references
                    antecedent = span[0]
                    continue
                mention = {'from_index': span[-1], 'to_index': antecedent}
                coref.append(mention)
                print(mention)
        self.store_coref_allennlp(text_id, coref)
        return coref
    def process_coreference(self, doc, text_id):
        coref = []
        if len(doc._.coref_chains) > 0:
            for chain in doc._.coref_chains:
                for x in range(len(chain) - 1):
                    mention = {'from_index': doc[chain[x + 1].token_indexes[0]].idx, 'to_index': doc[chain[0].token_indexes[0]].idx}
                    coref.append(mention)
            self.store_coref(text_id, coref)
        return coref
    def store_coref2(self, document_id, corefs):
        coref_query = """
            MATCH (document:AnnotatedText)
            WHERE document.id = $documentId
            WITH document
            UNWIND $corefs as coref
            MATCH (document)-[*2]->(start:TagOccurrence), (document)-[*2]->(np:TagOccurrence)-[:PARTICIPATES_IN]->(end:NamedEntity)
            WHERE start.index = coref.from_index AND np.index = coref.to_index
            MERGE (start)-[:MENTIONS]->(end)
        """
        self.execute_query(coref_query,
                           {"documentId": document_id, "corefs": corefs})

    def store_coref(self, document_id, corefs):
        coref_query = """
            MATCH (document:AnnotatedText)