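# preprocess_mimic2.py
#
# Preprocess MIMIC-II discharge summaries for ICD code prediction:
# filter out empty notes, tokenize them (flat and sentence-split variants),
# apply the train/test split, build a vocabulary, and train word2vec and
# fastText embeddings. Everything is read from args.MIMIC_2_DIR, which must
# contain MIMIC_RAW_DSUMS plus training_indices.data/testing_indices.data.
# Usage is presumably just `python preprocess_mimic2.py`, with any
# command-line options defined in the companion options module.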
import codecs
import csv
import re

import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm

from options import args
from utils import build_vocab, word_embeddings, fasttext_embeddings, gensim_to_fasttext_embeddings, gensim_to_embeddings

term_pattern = re.compile('[A-Za-z]+')
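# Step 1: filter the raw dump. Each line of MIMIC_RAW_DSUMS is
# pipe-delimited with the note text in field 7; de-identification
# placeholders (bracketed spans) and boilerplate headers/footers are
# stripped, and a summary is kept only if real word tokens remain.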
with open('%s/MIMIC_RAW_DSUMS' % args.MIMIC_2_DIR, 'r') as f:
    with open('%s/MIMIC_FILTERED_DSUMS' % args.MIMIC_2_DIR, 'w') as f2:
        for line in f:
            raw_dsum = line.split('|')[6]
            raw_dsum = re.sub(r'\[[^\]]+\]', ' ', raw_dsum)
            raw_dsum = re.sub(r'admission date:', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'discharge date:', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'date of birth:', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'sex:', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'service:', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'dictated by:.*$', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'completed by:.*$', ' ', raw_dsum, flags=re.I)
            raw_dsum = re.sub(r'signed electronically by:.*$', ' ', raw_dsum, flags=re.I)
            tokens = [token.lower() for token in re.findall(term_pattern, raw_dsum)]
            tokens = [token for token in tokens if len(token) > 1]
            # Determine if this DSUM should stay; if so, write it to the filtered file.
            if len(tokens) > 0:
                f2.write(line)
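# Step 2: flatten each kept note into a single line of lowercased word
# tokens (purely numeric tokens dropped) and write proc_dsums.csv, with
# the comma-separated ICD-9 codes re-joined by ';'.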
tokenizer = RegexpTokenizer(r'\w+')
with codecs.open('%s/MIMIC_FILTERED_DSUMS' % args.MIMIC_2_DIR, 'r', encoding='latin-1') as f:
    with open('%s/proc_dsums.csv' % args.MIMIC_2_DIR, 'w') as of:
        r = csv.reader(f, delimiter='|')
        next(r)  # skip the header line
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT', 'LABELS'])
        for row in tqdm(r):
            note = row[6].replace('[NEWLINE]', '\n')
            tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
            text = ' '.join(tokens)
            codes = ';'.join(row[5].split(','))
            w.writerow([row[0], row[1], row[2], text, codes])
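# Step 3: the same CSV in a sentence-split variant. NLTK's punkt model
# locates sentence boundaries, and every sentence is wrapped in
# [CLS] ... [SEP] markers, the segment format BERT-style encoders expect.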
nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
with codecs.open('%s/MIMIC_FILTERED_DSUMS' % args.MIMIC_2_DIR, 'r', encoding='latin-1') as f:
    with open('%s/proc_dsums_sentsplit.csv' % args.MIMIC_2_DIR, 'w') as of:
        r = csv.reader(f, delimiter='|')
        next(r)  # skip the header line
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT', 'LABELS'])
        for row in tqdm(r):
            note = row[6].replace('[NEWLINE]', '\n')
            all_sents_inds = list(nlp_tool.span_tokenize(note))
            text = ""
            for ind, (start, end) in enumerate(all_sents_inds):
                sentence_txt = note[start:end]
                tokens = [t.lower() for t in tokenizer.tokenize(sentence_txt) if not t.isnumeric()]
                if ind == 0:
                    text += '[CLS] ' + ' '.join(tokens) + ' [SEP]'
                else:
                    text += ' [CLS] ' + ' '.join(tokens) + ' [SEP]'
            codes = ';'.join(row[5].split(','))
            w.writerow([row[0], row[1], row[2], text, codes])
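# Step 4: read the row indices that define the train/test split
# (one integer per line, matching the data rows of proc_dsums.csv).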
train_ids = set()
test_ids = set()
with open('%s/training_indices.data' % args.MIMIC_2_DIR) as f:
    for row in f:
        train_ids.add(int(row.rstrip()))
with open('%s/testing_indices.data' % args.MIMIC_2_DIR) as f:
    for row in f:
        test_ids.add(int(row.rstrip()))
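# Step 5: route each note to train_dsums.csv or test_dsums.csv by its row
# index, dropping the CHARTTIME column, which is unused downstream.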
with open('%s/proc_dsums.csv' % args.MIMIC_2_DIR, 'r') as nf:
    with open('%s/test_dsums.csv' % args.MIMIC_2_DIR, 'w') as test_f:
        with open('%s/train_dsums.csv' % args.MIMIC_2_DIR, 'w') as train_f:
            r = csv.reader(nf, delimiter=',')
            test_w = csv.writer(test_f)
            train_w = csv.writer(train_f)
            header = next(r)
            # Don't need chart time.
            del header[2]
            test_w.writerow(header)
            train_w.writerow(header)
            for i, row in enumerate(r):
                del row[2]
                if i in train_ids:
                    train_w.writerow(row)
                elif i in test_ids:
                    test_w.writerow(row)
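# The identical split, applied to the sentence-split CSV.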
with open('%s/proc_dsums_sentsplit.csv' % args.MIMIC_2_DIR, 'r') as nf:
    with open('%s/test_dsums_sentsplit.csv' % args.MIMIC_2_DIR, 'w') as test_f:
        with open('%s/train_dsums_sentsplit.csv' % args.MIMIC_2_DIR, 'w') as train_f:
            r = csv.reader(nf, delimiter=',')
            test_w = csv.writer(test_f)
            train_w = csv.writer(train_f)
            header = next(r)
            # Don't need chart time.
            del header[2]
            test_w.writerow(header)
            train_w.writerow(header)
            for i, row in enumerate(r):
                del row[2]
                if i in train_ids:
                    train_w.writerow(row)
                elif i in test_ids:
                    test_w.writerow(row)
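# Step 6: build the vocabulary from the training notes. Judging by its use
# here, build_vocab's first argument is presumably the minimum frequency
# (3) a token needs to be kept in vocab.csv.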
vfile = build_vocab(3, '%s/train_dsums.csv' % args.MIMIC_2_DIR, '%s/vocab.csv' % args.MIMIC_2_DIR)
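# Step 7: drop notes with at most one token and sort by length so that
# similarly sized notes can be batched together; the results are the
# final train.csv/test.csv.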
df = pd.read_csv('%s/train_dsums_sentsplit.csv' % args.MIMIC_2_DIR)
# Token count of each note; guard against empty/NaN TEXT fields.
df['length'] = df.apply(lambda row: len(row['TEXT'].split()) if not pd.isnull(row['TEXT']) else 0, axis=1)
df = df[df['length'] > 1]
df = df.sort_values(['length'])
df.to_csv('%s/train.csv' % args.MIMIC_2_DIR, index=False)

df = pd.read_csv('%s/test_dsums_sentsplit.csv' % args.MIMIC_2_DIR)
df['length'] = df.apply(lambda row: len(row['TEXT'].split()) if not pd.isnull(row['TEXT']) else 0, axis=1)
df = df[df['length'] > 1]
df = df.sort_values(['length'])
df.to_csv('%s/test.csv' % args.MIMIC_2_DIR, index=False)
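# Step 8: train word2vec and fastText embeddings on the full processed
# corpus and export matrices aligned with vocab.csv. The trailing
# arguments are presumably the embedding size (100), minimum token count
# (3), and number of training epochs (5) passed through to gensim.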
w2v_file = word_embeddings('full', '%s/proc_dsums.csv' % args.MIMIC_2_DIR, 100, 3, 5)
gensim_to_embeddings('%s/processed_full.w2v' % args.MIMIC_2_DIR, '%s/vocab.csv' % args.MIMIC_2_DIR, None)
fasttext_file = fasttext_embeddings('full', '%s/proc_dsums.csv' % args.MIMIC_2_DIR, 100, 3, 5)
gensim_to_fasttext_embeddings('%s/processed_full.fasttext' % args.MIMIC_2_DIR, '%s/vocab.csv' % args.MIMIC_2_DIR, None)