-
Notifications
You must be signed in to change notification settings - Fork 0
/
patient_splitter.py
98 lines (83 loc) · 3.56 KB
/
patient_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import sys
import re
from collections import defaultdict
from record import Record
from annotation_matcher import get_annotation
'''
Patient Splitter Method
Krista Watkins
Feb 10
The patients are returned sorted by the integer that follows the first three characters of each record's rid (see get_records).
'''
#rx_tagged_section = re.compile("<(\\w+)>(.+?)</\\1>", re.DOTALL)
# Both patterns capture the tag name in group 1 and the text between the
# opening and closing tag in group 2. DOTALL lets that text span several lines.
rx_patient_id = re.compile(
    "<(PATIENT_DISPLAY_ID)>(.+?)</PATIENT_DISPLAY_ID>",
    flags=re.DOTALL,
)
rx_record_id = re.compile(
    "<(RECORD_DOCUMENT_ID)>(.+?)</RECORD_DOCUMENT_ID>",
    flags=re.DOTALL,
)
def get_file_list(dir, test=False):
    '''
    List the record files found directly inside a directory.

    dir  -- path of the directory to scan. Forward slashes and backslashes
            are both accepted as separators, and trailing separators are
            ignored.
    test -- when false (the default) return the files whose names end in
            "train.txt"; when true return those ending in "test.txt".

    Returns a list of paths, each made of the normalized directory joined to
    the file name with os.sep, in the order os.listdir reports them.
    '''
    # Plain string replacement handles both separator styles the same way on
    # every platform. The previous regex version chose its replacement with
    # '"win" in sys.platform', a test that is also true for "darwin".
    normalized = dir.replace("\\", os.sep).replace("/", os.sep).rstrip(os.sep)
    suffix = "test.txt" if test else "train.txt"
    return [
        os.sep.join((normalized, name))
        for name in os.listdir(normalized)
        if name.endswith(suffix)
    ]
def get_record_id(record):
    '''
    Extract an identifier from the text of one record.

    The RECORD_DOCUMENT_ID tag is tried first and PATIENT_DISPLAY_ID second.
    The first tag whose contents are non-empty after stripping surrounding
    newlines supplies the ID.

    Returns the ID string. If neither tag yields one, a message with the
    first 160 characters of the record is written to stderr and None is
    returned.
    '''
    for pattern in (rx_record_id, rx_patient_id):
        found = pattern.search(record)
        if not found:
            continue
        candidate = found.group(2).strip("\n")
        if candidate != "":
            return candidate
    sys.stderr.write("No Id Found for record: " + record[0:160] + "\n")
    return None
def get_records(file_list, test):
    '''
    Build one Record per distinct record ID found in the given files.

    file_list -- paths of the record files to read.
    test      -- selects the annotations file variant: "test" when true,
                 "train" otherwise.

    Within a file, chunks that share an ID are concatenated (a warning is
    written to stderr for each repeat). Returns the Record objects from all
    files, sorted by the integer parsed from each record's rid after its
    first three characters.
    '''
    split_name = "test" if test else "train"
    collected = []
    for path in file_list:
        # Group the chunk text by ID. (May be PAT or REC)
        text_by_id = defaultdict(str)
        for chunk in split_records(path):
            chunk_id = get_record_id(chunk)
            if chunk_id in text_by_id:
                sys.stderr.write("Duplicate record ID: " + chunk_id + "\n")
            text_by_id[chunk_id] += chunk

        # Locate the annotations file for this record file: it sits in an
        # "Annotations" directory beside the directory holding the record
        # file, and is named after the 2nd and 3rd "_"-separated name parts.
        path_parts = path.split(os.sep)
        file_id = "_".join(path_parts[-1].split("_")[1:3])
        annot_name = "annotations_" + file_id + "_" + split_name + ".json"
        annot_path = os.sep.join(path_parts[:-2] + ["Annotations"] + [annot_name])

        collected.extend(
            Record(chunk_id, text, path, get_annotation(chunk_id, annot_path))
            for chunk_id, text in text_by_id.items()
        )

    # May hold several records per patient; order by the numeric ID portion.
    collected.sort(key=lambda r: int(r.rid[3:]))
    return collected
def split_records(file_name):
    '''
    Read a record file and split it into individual record strings.

    file_name -- path of the file to read.

    Returns a list with the text that follows each "**PROTECTED[begin]"
    marker. The marker itself is removed, and anything before the first
    marker is discarded. A file without any marker yields an empty list.
    '''
    # The context manager closes the file even if read() raises.
    with open(file_name) as record_file:
        text = record_file.read()
    # split() always returns the text before the first marker as element 0
    # (possibly empty), so dropping that element needs no dummy prefix.
    return text.split("**PROTECTED[begin]")[1:]
def load_records(dir, test=False):
    '''
    Entry point for other programs.

    dir  -- string path of the directory holding the record files.
    test -- False (the default) to load the training data, True to load
            the test data.

    Returns the list of record objects produced by get_records for the
    files that get_file_list finds in the directory.
    '''
    record_files = get_file_list(dir, test)
    return get_records(record_files, test)
if __name__ == "__main__":
    # Usage: python3 patient_splitter.py dir_name output_file
    if len(sys.argv) < 3:
        sys.exit("Usage: python3 patient_splitter.py dir_name output_file")

    # get_file_list needs the train/test flag. The script processes the
    # training files, matching the flag passed to get_records below.
    files = get_file_list(sys.argv[1], False)

    # List of records. The text is split on the opening line, so
    # "**PROTECTED[begin]" is deleted. It'd be easy enough to add back in
    # if necessary.
    records = get_records(files, False)

    with open(sys.argv[2], "w") as out_file:
        # Echo the first record's text, when there is one.
        if records:
            print(records[0].record)
        for record in records:
            # The return value of parse() was never used; the call is kept
            # in case dump() relies on it -- TODO confirm against Record.
            record.parse()
            record.dump(out_file)