-
Notifications
You must be signed in to change notification settings - Fork 0
/
ddi_data_processor1.py
59 lines (50 loc) · 1.88 KB
/
ddi_data_processor1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
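# Convert DDI corpus XML files into step-1 data (saved as text and as pickle).
# data: list of [sent_id, sent_text, pair_list], one entry per sentence
# pair_list: list of [entity1, entity2, ddi], one entry per <pair> element
# entity1 / entity2: [text, type, charOffset] of the entities the pair refers to
# ddi: value of the pair's 'ddi' attribute (true/false)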
import os
import pickle
import xml.etree.ElementTree as ET
from utils import *
def read_ddi_xml(dir):
    """Parse every XML file in a directory into sentence-level records.

    Every entry of ``dir`` is parsed as an XML document. Each child element
    of a document's root is treated as a sentence and must carry ``id`` and
    ``text`` attributes. Inside a sentence, ``entity`` children are indexed
    by their ``id`` and ``pair`` children are resolved against that index;
    children with any other tag are ignored.

    The directory path and the total number of pairs are printed to stdout.

    Args:
        dir: Path of the directory containing the XML files. (The name
            shadows the builtin; it is kept so keyword callers still work.)

    Returns:
        A list with one ``[sent_id, sent_text, pair_list]`` entry per
        sentence, files being visited in sorted name order. ``sent_text`` is
        stripped of surrounding whitespace. ``pair_list`` holds one
        ``[entity1, entity2, ddi]`` entry per pair, where each entity is the
        ``[text, type, charOffset]`` list of the referenced entity and
        ``ddi`` is the raw string value of the pair's ``ddi`` attribute.

    Raises:
        KeyError: If a required attribute is missing, or a pair references
            an entity id that has not appeared earlier in the same sentence.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
        OSError: If the directory or one of its entries cannot be read.
    """
    data = []
    num_pair = 0
    print(dir)
    # os.listdir() yields names in arbitrary order; sorting keeps the output
    # (and anything serialized from it) reproducible across runs/machines.
    for fname in sorted(os.listdir(dir)):
        # A fresh parser is created for each file.
        parser = ET.XMLParser(encoding="UTF-8")
        root = ET.parse(os.path.join(dir, fname), parser=parser).getroot()
        for sent in root:
            sent_id = sent.attrib['id']
            sent_text = sent.attrib['text'].strip()
            ent_dict = {}   # entity id -> [text, type, charOffset]
            pair_list = []
            for child in sent:
                attrs = child.attrib
                if child.tag == 'entity':
                    # Record the entity so later pairs can look it up by id.
                    ent_dict[attrs['id']] = [
                        attrs['text'],
                        attrs['type'],
                        attrs['charOffset'],
                    ]
                elif child.tag == 'pair':
                    # Both ends must already be known; the same entity list
                    # object is shared by every pair that references it.
                    entity1 = ent_dict[attrs['e1']]
                    entity2 = ent_dict[attrs['e2']]
                    pair_list.append([entity1, entity2, attrs['ddi']])
                    num_pair += 1
            data.append([sent_id, sent_text, pair_list])
    print("num_pair:", num_pair)
    return data
# Input corpus directory and the two step-1 output files (text and pickle).
# NOTE(review): the input path says 'ppi_corpus' while everything else in this
# script is named "ddi" - confirm this is the intended corpus directory.
ddi_xml_dir = 'corpus/ppi_corpus'
ddi_step1_txt = 'dataset/ddi_data/step1/train.txt'
ddi_step1_pickle = 'dataset/ddi_data/step1/train.pickle'

# Parse the corpus, then persist the same records in both formats.
ddi_data = read_ddi_xml(ddi_xml_dir)
# write_step1_data_as_txt is not defined in this file; presumably it is
# provided by `from utils import *`.
write_step1_data_as_txt(ddi_data, ddi_step1_txt)
# Use a context manager so the pickle file is flushed and closed
# deterministically instead of whenever the handle is garbage-collected.
with open(ddi_step1_pickle, 'wb') as pickle_file:
    pickle.dump(ddi_data, pickle_file)
# Debug check: print(pickle.load(open(ddi_step1_pickle, 'rb')))