-
Notifications
You must be signed in to change notification settings - Fork 1
/
demographic_report.py
216 lines (190 loc) · 7.11 KB
/
demographic_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""Flywheel Demographic report for a Flywheel project.

Usage:
    demographic_report.py GROUP PROJECT

Options:
    -h --help   Show this screen.
"""
from docopt import docopt
from scitran_client import ScitranClient, query, Projects, Acquisitions, Groups
import re
import math
from dateutil import parser
import pytz
def _subject_code(subject):
if 'test' in subject['code']:
# we don't try to parse test entries
return subject['code']
match = re.match(r'\w{2}\d{5}', subject['code'])
assert match, 'Could not find code for {}'.format(subject['code'])
# upper() is a good idea to get to a canonical ID
return match.group(0).upper()
def _session_id(session):
# UID is supplied by the scanner and label is added by
# the researcher. we prefer UID and require at least one
# of the two
if 'uid' in session:
return session['uid']
elif 'label' in session:
return session['label']
else:
raise Exception('missing uid and label keys in {}'.format(session))
def _session_day(session):
    """Return the session's calendar day as 'YYYY-MM-DD' in Los Angeles time.

    Args:
        session: mapping with a 'timestamp' string entry.

    Returns:
        The date string of the timestamp after conversion to the
        'America/Los_Angeles' timezone.
    """
    # A 'Z' suffix is appended before parsing so the result is parsed as UTC
    # (presumably the stored timestamps carry no zone of their own -- confirm).
    utc_moment = parser.parse(session['timestamp'] + 'Z')
    pacific = pytz.timezone('America/Los_Angeles')
    local_moment = utc_moment.astimezone(pacific)
    return local_moment.strftime('%Y-%m-%d')
def report(group, project):
    """Print a demographic and missing-data report for a Flywheel project.

    Searches for every acquisition of ``project`` within ``group``, drops
    acquisitions whose subject code contains 'test' or 'NO00000', and prints
    to stdout: the number of subjects, a per-sex age range, the subjects that
    have no 'T1w 1mm' acquisition, and -- for each expected behavioral and
    physio file -- the percentage and list of ``<day>:<subject code>`` visits
    that lack it.

    Args:
        group: group name given to the ``Groups.name`` search filter.
        project: project label given to the ``Projects.label`` search filter.

    Raises:
        AssertionError: if the search yields no usable acquisitions.
    """
    client = ScitranClient()
    # querying for acquisitions first since we have to fetch for them anyway.
    raw_results = client.search(query(Acquisitions).filter(
        Projects.label.match(project),
        Groups.name.match(group),
    ))
    results = []
    for hit in raw_results:
        source = hit['_source']
        code = source['session']['subject']['code']
        if 'test' in code or 'NO00000' in code.upper():
            continue
        results.append(source)
    if not results:
        # Explicit raise (not `assert`) so an empty search is still reported
        # under `python -O` instead of failing later with a division by zero.
        raise AssertionError(
            'Could not find results for project {} for group {}.'.format(project, group))

    # Group acquisitions by session and keep one session record per id
    # (when several acquisitions share a session, the last record wins).
    acquisitions_by_session = {}
    sessions_by_id = {}
    for result in results:
        session = result['session']
        session_id = _session_id(session)
        acquisitions_by_session.setdefault(session_id, []).append(result)
        sessions_by_id[session_id] = session

    # Index subjects by canonical code to get a unique list of subjects, and
    # collect every session that belongs to each of them.
    subjects_by_code = {}
    sessions_by_subject = {}
    for session in sessions_by_id.values():
        code = _subject_code(session['subject'])
        subjects_by_code[code] = session['subject']
        sessions_by_subject.setdefault(code, []).append(session)

    def _seconds_to_years(seconds):
        # whole years, using 365-day years
        return int(math.floor(float(seconds) / 60 / 60 / 24 / 365))

    def _report_by_sex(subjects, sex):
        # One summary line for the subjects whose 'sex' equals `sex`.
        matching = [s for s in subjects if s.get('sex') == sex]
        ages = [s['age'] for s in matching if s.get('age')]
        if not ages:
            # min()/max() raise ValueError on an empty sequence, so a group
            # with no recorded ages gets its own wording.
            return '{} {}s with no recorded age'.format(len(matching), sex)
        return '{} {}s between ages of {} and {}'.format(
            len(matching),
            sex,
            _seconds_to_years(min(ages)),
            _seconds_to_years(max(ages)),
        )

    def _name_matches(prefix, suffixes):
        # Build a predicate over file records: the 'name' must start with
        # `prefix` and end with `suffixes` (a string or a tuple of strings).
        def predicate(f):
            name = f['name']
            return name.startswith(prefix) and name.endswith(suffixes)
        return predicate

    # Sessions whose timestamp sorts after this string are also checked for
    # the Behavioral-EmoReg file; earlier sessions are not.
    emoreg_cutoff = '2016-08-23'
    # (report key, file predicate, only checked after emoreg_cutoff)
    expected_files = (
        ('Behavioral-GoNoGo', _name_matches('', '_GoNoGo.txt'), False),
        ('Behavioral-Consc', _name_matches('', '_EmotionConscious.txt'), False),
        ('Behavioral-NonConsc', _name_matches('', '_EmotionNonconscious.txt'), False),
        ('Behavioral-EmoReg',
         _name_matches('', ('_EmoReg08252016.csv', '_EmoReg09202016.csv')), True),
        ('Physio-GoNoGo', _name_matches('gonogo_', '.csv'), False),
        ('Physio-Consc', _name_matches('consc_', '.csv'), False),
        # helpful to do noncon, because some are nonconsc_ and others are noncons_
        ('Physio-NonConsc', _name_matches('noncon', '.csv'), False),
        ('Physio-EmoReg', _name_matches('emoreg_', '.csv'), False),
    )
    # Every key exists up front, so a key that is never flagged (e.g. no
    # session after the cutoff) reports 0% instead of raising KeyError.
    missing = dict((key, set()) for key, _, _ in expected_files)
    missing_t1w = set()

    def _missing_file_msg(key):
        return '{:.1f}% missing {}: {}'.format(
            len(missing[key]) * 100. / all_subject_visits,
            key,
            ', '.join(sorted(missing[key]))
        )

    subjects = list(subjects_by_code.values())
    all_subject_visits = 0
    for subject_code in subjects_by_code:
        sessions = sessions_by_subject[subject_code]

        # A subject is missing the T1w when none of their acquisitions,
        # across all sessions, carries that label.
        has_t1w = any(
            acquisition['label'] == 'T1w 1mm'
            for session in sessions
            for acquisition in acquisitions_by_session[_session_id(session)]
        )
        if not has_t1w:
            missing_t1w.add(subject_code)

        for session in sessions:
            acquisitions = acquisitions_by_session[_session_id(session)]
            # first acquisition whose uid marks it as behavioral/physio data
            behavioral = next((
                acquisition
                for acquisition in acquisitions
                if (acquisition.get('uid') or '').startswith('behavioral_and_physiological:')
            ), None)
            files = behavioral['files'] if behavioral else []
            for key, predicate, after_cutoff_only in expected_files:
                if after_cutoff_only and not (emoreg_cutoff < session['timestamp']):
                    continue
                if not any(predicate(f) for f in files):
                    missing[key].add(
                        '{}:{}'.format(_session_day(session), subject_code))
            all_subject_visits += 1

    missing['base files'] = (
        missing['Behavioral-GoNoGo'] |
        missing['Behavioral-Consc'] |
        missing['Behavioral-NonConsc'] |
        missing['Behavioral-EmoReg']
    )

    lines = [
        '{}: {}'.format(group, project),
        'Total # of subjects: {}'.format(len(subjects_by_code)),
        _report_by_sex(subjects, 'male'),
        _report_by_sex(subjects, 'female'),
        '{} subjects with unspecified sex'.format(
            sum(1 for s in subjects if s.get('sex') is None)),
        '{} missing T1w 1mm: {}'.format(
            len(missing_t1w), ', '.join(sorted(missing_t1w))),
        _missing_file_msg('Behavioral-GoNoGo'),
        _missing_file_msg('Behavioral-Consc'),
        _missing_file_msg('Behavioral-NonConsc'),
        _missing_file_msg('Behavioral-EmoReg'),
        _missing_file_msg('Physio-GoNoGo'),
        _missing_file_msg('Physio-Consc'),
        _missing_file_msg('Physio-NonConsc'),
        _missing_file_msg('Physio-EmoReg'),
        _missing_file_msg('base files'),
    ]
    # A single parenthesised argument prints identically whether `print` is
    # the Python 2 statement or the Python 3 function; the extra '\n' keeps
    # the trailing blank line of the report.
    print('\n'.join(lines) + '\n')
if __name__ == '__main__':
    # docopt builds the command-line parser from the module docstring and
    # returns a mapping keyed by the positional argument names.
    cli_args = docopt(__doc__, version='Flywheel Demographic Report 1.0')
    group_name = cli_args['GROUP']
    project_label = cli_args['PROJECT']
    report(group_name, project_label)