-
Notifications
You must be signed in to change notification settings - Fork 8
/
glint.py
318 lines (254 loc) · 15.7 KB
/
glint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#!/usr/bin/env python
import validate_dependencies
import argparse
import os
import sys
from configuration import configurelogging
import logging
from utils import common
from numpy import loadtxt
from utils import GlintArgumentParser
from parsers import ModuleParser, RefactorParser, EWASParser, \
MethylationDataParser, ImputingParser, \
EpistructureParser, PlotParser, HousemanParser #dont remove this is imported in,,,
LOGGER = configurelogging.Configure()
class GlintParser(ModuleParser):
def __init__(self, parser):
"""
--out: a prefix that will be added to any file the program will output.
If not supplied, default filenames will be used, and will override each other on the next run.
--loglevel: the loglevel to use. default is INFO. program terminates if it is not one of the list specified in configurelogging.OPTIONAL_LEVELS
flow:
First the program validates that all the required flags for all the selected mofules are specified and valid, otherwise it terminates
with a proper message.
This is done before data is loaded.
No datafile need to be provided in the following cases:
if --impute is selected,
if --qqplot --manhattan are selected without any --ewas test
Otherwise (datafile must be provided):
- samples preprocessing is done before executing anything else (remove / keep samples). The new methData will be used in the next modules
- refactor is executed if asked (with a new methData copy)
Its components are added (always) as covariates to the methylation data (which will be used in the next modules)
- sites preprocessing is done (exclude / include sites). The new methData will be used in the next modules
- methData is saved as glint file if asked
- run ewas test if asked (with a new methData copy)
- run plot if asked (with a new methData copy) (with the results of the ewas test, this is OK because if ewas test wasn't selecten than plot would have been executed first (see above))
- run epistructure is asked (with a new methData copy)
-
* Note that a new copy of methylation data object is created for any module
* modules that should not run together
- refactor and lmm (refactor will be executed twice as documented in LMMParser)
- epistructure and refactor
- which more? TODO Elior
"""
optional = parser.add_argument_group('3.Optional arguments')
optional.add_argument('-h', '--h', '--help', action='help', help = "Prints this help") # add help here so it will be under same group with all other optional argument
optional.add_argument('--out', type = str, help = "Changes the prefix of the output file ")
def loglevel_value(val):
# val = int(val)
if val.lower() not in configurelogging.OPTIONAL_LEVELS.keys():
common.terminate("Log level is not valid. should be one of %s" % str( configurelogging.OPTIONAL_LEVELS.keys()))
return configurelogging.OPTIONAL_LEVELS[val]
optional.add_argument('--loglevel', type = loglevel_value, default = 'info', help = "The log level to print")
modules = parser.add_argument_group('4.Glint modules')
modules.add_argument('--refactor', action='store_true', help = "Runs the ReFACTor algorithm for capturing cell type composition")
modules.add_argument('--ewas', action='store_true', help = "Runs EWAS" )
modules.add_argument('--impute', action='store_true', help = "Imputes methylation levels from genotypes" )
modules.add_argument('--epi', action='store_true', help = "Runs the EPISTRUCTURE algorithm for capturing population structure from methylation data" )
modules.add_argument('--plot', action='store_true', help = "Allows to generate plots" )
modules.add_argument('--houseman', action='store_true', help = "Runs the Houseman algorithm for estimating cell counts" )
super(GlintParser, self).__init__(optional, modules)
class ModulesArgumentParsers(object):
# functional args are flags that does somthing, i.e if the user didnt choose one of them GLINT will do nothing and we should warn about that
# FUNCTIONALITY_ARGS - functional args that runs a module
FUNCTIONALITY_ARGS = ['--plot', '--refactor', '--ewas', '--impute', '--houseman'] # TODO find better way to hold arguments that cause some functionality. glint is not supposed to be aware of those args
# DATA_FUNC_ARGS - functional args that does something on the data (i.e running --minmean without --gsave means nothing)
# if user doesn't chose anythong from FUNCTIONALITY_ARGS but does choose to save his data, that's OK.
DATA_FUNC_ARGS = ['--gsave', '--txtsave']
#arguments that cannot be processed with arguments from FUNCTIONALITY_ARGS
SOLE_ARGS = ['--epi'] # functilnality flags that cannot be specified with other functionaity flags
DATA_PREPROCESSING_NOT_RELEVANT_FOR_REFACTOR = ['--include', '--exclude', '--minmean', '--maxmean']
def __init__(self, user_args_selection):
self.selected_args = user_args_selection
self.parser = GlintArgumentParser(prog=os.path.basename(sys.argv[0]),
add_help=False) # don't add help because it is added in 'optional group'
self.glint_parser = None
self.meth_parser = None
self.refactor_parser = None
self.ewas_parser = None
self.epi_parser = None
self.plot_parser = None
self.houseman_parser = None
self.args = None
def add_arguments(self):
# Notice to add arguments by the order you want them to be printed in --help
#main
self.meth_parser = MethylationDataParser(self.parser)
self.glint_parser = GlintParser(self.parser)
#modules in the order they'll appear on --help
self.refactor_parser = RefactorParser(self.parser)
self.ewas_parser = EWASParser(self.parser)
self.imputing_parser = ImputingParser(self.parser)
self.epi_parser = EpistructureParser(self.parser)
self.houseman_parser = HousemanParser(self.parser)
self.plot_parser = PlotParser(self.parser)
def parse_args(self):
logging.info("Validating arguments...")
self.args = self.parser.parse_args()
optional_args = []
self.glint_parser.validate_args(self.args)
optional_args.extend(self.glint_parser.all_args)
# imputation runs without datafile
if self.args.impute:
self.imputing_parser.validate_args(self.args)
optional_args.extend(self.imputing_parser.all_args)
self.check_selected_args(optional_args)
return self.args
#plot runs without datafile but could receive datafile if run with ewas
if self.args.plot:
self.plot_parser.validate_args(self.args)
optional_args.extend(self.plot_parser.all_args)
if not self.args.ewas:
self.check_selected_args(optional_args)
return self.args
# put here modules that require a datafile
self.meth_parser.validate_args(self.args)
optional_args.extend(self.meth_parser.all_args)
self.epi_parser.validate_args(self.args)
optional_args.extend(self.epi_parser.all_args)
if self.args.refactor:
self.refactor_parser.validate_args(self.args)
optional_args.extend(self.refactor_parser.all_args)
# ewas tests need to be after refactor
if self.args.ewas:
self.ewas_parser.validate_args(self.args)
optional_args.extend(self.ewas_parser.all_args)
if self.args.houseman:
self.houseman_parser.validate_args(self.args)
optional_args.extend(self.houseman_parser.all_args)
self.check_selected_args(optional_args)
return self.args
def check_selected_args(self, optional_args):
"""
validates that the user selected "action" argument (and didn't supply just a datafile)
and that the user didnt select an argument that is not an option for him (argument from a module that wasn't selected)
"""
selected_args = set(self.selected_args)
data_func_args = set(self.DATA_FUNC_ARGS)
func_args = set(self.FUNCTIONALITY_ARGS)
all_func_args = set(self.FUNCTIONALITY_ARGS + self.DATA_FUNC_ARGS)
optional_args = set(optional_args)
sole_args = set(self.SOLE_ARGS)
args_not_relevant_for_refactor = set(self.DATA_PREPROCESSING_NOT_RELEVANT_FOR_REFACTOR)
all_func_args.update(sole_args)
func_args_chosen = selected_args.intersection(all_func_args)
args_not_relevant_for_refactor_chosen = selected_args.intersection(args_not_relevant_for_refactor)
sole_args_chosen = selected_args.intersection(sole_args)
if len(func_args_chosen) == 0:
common.terminate("Nothing to do with the data, select at least one argument from %s." % ", ".join(list(all_func_args)))
# check that a sole argumet wasn't chosen with any other arg.
elif ((len(func_args_chosen - data_func_args) > len(sole_args_chosen)) and (len(sole_args_chosen) > 0))or (len(sole_args_chosen) > 1):
common.terminate("Options from %s cannot be specified with any arguments from %s in the same command. you chose %s." % (str(list(sole_args)), str(list(self.FUNCTIONALITY_ARGS)), str(list(func_args_chosen))))
differ = selected_args.difference(optional_args)
if differ:
optional_args_str = "".join(optional_args)
unrecognized_args = [] # args that are not an option of glint
redundent_args = [] # args that are not used with the selected flag
for flag in differ:
if flag in optional_args_str:
unrecognized_args.append(flag)
else:
redundent_args.append(flag)
if unrecognized_args:
common.terminate("Unrecognized argument " + ", ".join(unrecognized_args))
if redundent_args:
common.terminate("Selected redundent argument " + ", ".join(redundent_args))
# warn if only refactor module is selected and user selectes data management flags that are not relevant for refactor
if self.args.refactor and len(func_args_chosen) == 1:
not_relevant_atgs = list(args_not_relevant_for_refactor_chosen)
if len(not_relevant_atgs) != 0:
logging.warning("Selected data management arguments which are not relevant for ReFACTor: %s." % str(not_relevant_atgs))
def run(self):
# put here modules that doesn't required a datafile
# imputation runs without datafile
if self.args.impute:
self.imputing_parser.run(args, output_perfix = self.args.out)
return
# plot runs without datafile but could receive datafile if run with ewas
if (self.args.qqplot or args.manhattan) and not self.args.ewas: # if user asked to run plot without running EWAS test, run plot and quit
self.plot_parser.run(args)
return
# put here modules that require a datafile
self.meth_parser.run(self.args)
self.meth_parser.preprocess_samples_data() # preprocess samples before refactor and before ewas
self.meth_parser.preprocess_sites_data() #preprocess sites before refactor and before ewas
if self.args.refactor:
refactor_meth_data = self.meth_parser.module.copy()
self.refactor_parser.run(args = self.args,
meth_data = refactor_meth_data,
output_perfix = self.args.out)
if self.args.gsave:
logging.info("Adding the ReFACTor componetns to the covariates of the data...")
refactor_comp_names = self.meth_parser.module.add_covar_datas(self.refactor_parser.module.components, "rc") # add refactor components as covariate file
with open(self.refactor_parser.module.components_output_filename, 'r') as f:
data = f.read()
f.close()
with open(self.refactor_parser.module.components_output_filename, 'w') as f:
f.write(" ".join(["ID"]+refactor_comp_names) + "\n"+ data)
f.close()
if not self.args.gsave:
logging.info("To use ReFACTor componetns as covariates run glint.py with --covarfile %s" % self.refactor_parser.module.components_output_filename)
# add refactor components to the list of covariates to use:
if self.args.covar is not None:
self.args.covar.extend(refactor_comp_names)
else:
self.args.covar = refactor_comp_names
if self.args.houseman:
houseman_meth_data = self.meth_parser.module.copy()
self.houseman_parser.run(args = self.args,
meth_data = houseman_meth_data,
output_perfix = self.args.out)
if self.args.gsave:
logging.info("Adding the Houseman estimates to the covariates of the data.")
houseman_comp_names = self.meth_parser.module.add_covar_datas(self.houseman_parser.module.components, \
covarsnames = self.houseman_parser.module.names) # add houseman components as covariate file
if not self.args.gsave:
logging.info("To use Houseman estimates as covariates run glint.py with --covarfile %s" % self.houseman_parser.module.outputfile)
# add houseman components to the list of covariates to use:
if self.args.covar is not None:
self.args.covar.extend(houseman_comp_names)
else:
self.args.covar = houseman_comp_names
# ewas tests must be called after refactor
ewas_results = None
if self.args.ewas:
ewas_meth_data = self.meth_parser.module.copy()
ewas_results = self.ewas_parser.run(args = self.args,
meth_data = ewas_meth_data)
if self.args.plot: # if not selected should we plot by default?
self.plot_parser.run(args, meth_data = self.meth_parser.module, ewas_result_obj = ewas_results)
if self.args.epi:
epi_met_data = self.meth_parser.module.copy()
self.epi_parser.run(args = self.args,
meth_data = epi_met_data,
output_perfix = self.args.out)
self.meth_parser.module.add_covar_datas(self.epi_parser.module.components, "epi") # add refactor components as covariate file
if self.args.out:
prefix = self.args.out
else:
prefix = ".".join(os.path.basename(self.args.datafile.name).split(".")[:-1])
self.meth_parser.save(output_perfix = prefix) #save after all preprocessing add epi and refactor covars
if __name__ == '__main__':
import time
a = time.time()
selected_args = [arg for arg in sys.argv if arg.startswith("--")]
parser = ModulesArgumentParsers(selected_args)
logging.info(">>> python %s" % " ".join(sys.argv))
logging.info("Starting GLINT...")
parser.add_arguments()
args = parser.parse_args() # validate happens here
LOGGER.setLoggerLevel(args.loglevel)
LOGGER.setLoggerFile(args.out)
parser.run()
b = time.time()
logging.debug("TOTAL RUN TIME %s SECONDS"%(b-a))