forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdada2_inference.R
executable file
·345 lines (273 loc) · 14.3 KB
/
dada2_inference.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/usr/bin/env Rscript
# Read in package to read in command-line options.
library("optparse")
version <- "1.0"
option_list <- list(
make_option(c("-f", "--f_path"), type="character", default=NULL,
help="Location of forward reads (required)." , metavar="path"),
make_option(c("--seed"), type="integer", default=NULL,
help="Random seed to make command reproducible.", metavar = "integer"),
make_option(c("-r", "--r_path"), type="character", default=NULL,
help="Location of reverse reads (if applicable). Same as forward reads by default.",
metavar="path"),
make_option(c("-s", "--single"), action = "store_true", type="logical", default=FALSE,
help="Flag to indicate that reads are single-end (default: FALSE).", metavar = "boolean"),
make_option(c("--f_match"), type="character", default="_R1_.*fastq.*",
help="Regular expression to match in forward reads' filenames (default: \"_R1_.*fastq.*\").",
metavar="PATTERN"),
make_option(c("--r_match"), type="character", default="_R2_.*fastq.*",
help="Regular expression to match in reverse reads' filenames (default: \"_R2_.*fastq.*\").",
metavar="PATTERN"),
make_option(c("--sample_delim"), type="character", default="_",
help=paste("Character to split filenames on to determine sample name (default: \"_\").",
"Sample names are assumed to be field 1 after splitting.", sep=" "),
metavar="PATTERN"),
make_option(c("-t", "--threads"), type="integer", default=1,
help="Number of threads to use (default: 1).", metavar="integer"),
make_option(c("-o", "--output"), type="character", default="seqtab.rds",
help="Output RDS file for sequence table (default: seqtab.rds).", metavar="path"),
make_option(c("-n", "--num_learn"), type="integer", default=1e6,
help=paste("Min. number of reads for error rate learning (default: 1e6).",
"Samples will be read in until this read count is reached or all",
"samples are read in.", sep=" "), metavar="integer"),
make_option(c("--num_derep"), type="integer", default=1e6,
help="Number of reads to read into memory when dereplicating (default: 1e6).",
metavar="integer"),
make_option(c("--randomize"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that samples should be read in random order",
"for learnErrors step (default: FALSE).", sep=" "),
metavar = "boolean"),
make_option(c("--plot_errors"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that estimated error rates should be plotted",
"to pdfs (default: FALSE).", sep=" "),
metavar = "boolean"),
make_option(c("--selfConsist"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that dada algorithm should alternate between sample",
"inference and error rate estimate until convergence (default: FALSE).",
sep=" "), metavar = "boolean"),
make_option(c("--pool"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that all samples should be pooled before sample",
"inference step (default: FALSE).", sep=" "), metavar = "boolean"),
make_option(c("--minOverlap"), type="integer", default=20,
help="Min length of overlap required for merging reads (default: 20).",
metavar="integer"),
make_option(c("--maxMismatch"), type="integer", default=0,
help="Max number of mismatches allowed in overlap when merging reads (default: 0).",
metavar="integer"),
make_option(c("--returnRejects"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that pairs that had more than the max number",
"of mismatches should be retained in the output dataframe",
"(default: FALSE).", sep=" "), metavar = "boolean"),
make_option(c("--justConcatenate"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that forward and reverse reads should just",
"be concatenated with 10 Ns as spacers between them",
"(default: FALSE).", sep=" "), metavar = "boolean"),
make_option(c("--trimOverhang"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that overhands in alignment between forward",
"and reverse reads are trimmed off. This can happen when the reads",
"are bigger than the amplicon (default: False).", sep=" "),
metavar = "boolean"),
make_option(c("--OMEGA_A"), type="numeric", default=1e-40,
help="Threshold for when DADA2 calls unique sequences as clusters (default=1e-40)",
metavar="numeric"),
make_option(c("--BAND_SIZE"), type="integer", default=16,
help="Parameter for banding N-W alignments to restrict number of insertions (default=16)",
metavar="integer"),
make_option(c("--log"), type="character", default="dada2_inferred_read_counts.txt",
help="Output logfile for read count table (default: dada2_inferred_read_counts.txt).",
metavar="path"),
make_option(c("--verbose"), action = "store_true", type="logical", default=FALSE,
help="Write out status messages (default: FALSE).", metavar = "boolean"),
make_option(c("--version"), action = "store_true", type="logical", default=FALSE,
help="Print out version number and exit.", metavar = "boolean")
)
opt_parser <- OptionParser(
option_list=option_list,
usage = "%prog [options] -f PATH --seed 123",
description = paste(
"\nThis is a wrapper for the DADA2 inference step that is",
"based on the authors\' big data tutorial available here:",
"https://benjjneb.github.io/dada2/bigdata.html.\n\nBe sure to cite the DADA2",
"paper if you use this script:\nCallahan BJ et al. 2016. DADA2:",
"High-resolution sample inference from Illumina amplicon data.",
"Nature Methods 13:581-583.\n\nNote this script was tested with",
"DADA2 v1.4.0", sep=" ")
)
opt <- parse_args(opt_parser)
# Print out version if --version flag set.
if (opt$version) {
cat("Wrapper version:", version, "\n")
options_tmp <- options(show.error.messages=FALSE)
on.exit(options(options_tmp))
stop()
}
# Check if path to forward files set, if not stop job.
if(is.null(opt$f_path)) {
stop("path to forward FASTQs needs to be set.")
}
# Set multithread option (FALSE if no, otherwise give # core).
if (opt$threads > 1) {
multithread_opt <- opt$threads
} else {
multithread_opt <- FALSE
}
# Get forward filenames.
forward_in <- sort(list.files(opt$f_path, pattern=opt$f_match, full.names=TRUE))
forward_samples <- sapply(strsplit(basename(forward_in), opt$sample_delim), `[`, 1)
names(forward_in) <- forward_samples
# Read in and print out DADA2 version.
library("dada2")
# Set DADA2 options:
setDadaOpt(OMEGA_A = opt$OMEGA_A,
BAND_SIZE = opt$BAND_SIZE)
if(opt$verbose) {
cat("Running DADA2 version:", as.character(packageVersion("dada2")), "\n")
cat("Setting options of OMEGA_A=", as.character(opt$OMEGA_A), ", ",
"BAND_SIZE=", as.character(opt$BAND_SIZE))
}
# Set random seed for reproducibility if set.
if(! is.null(opt$seed)) {
set.seed(opt$seed)
if(opt$verbose){
cat("Random seed:", as.character(opt$seed), "\n")
}
}
if(opt$verbose) {
cat("Running learnErrors with below options.\n")
cat(paste("fls=", paste(forward_in, collapse=",") , "\n",
"nreads=", opt$num_learn, "\n",
"multithread=", multithread_opt, "\n",
"randomize=", opt$randomize, "\n\n", sep=""))
}
# Learn error model for forward reads.
err_forward <- learnErrors(fls = forward_in,
nreads = opt$num_learn,
multithread = multithread_opt,
randomize=opt$randomize)
# Initialize empty list with length of number of samples.
seq_variants <- vector("list", length(forward_samples))
names(seq_variants) <- forward_samples
if(opt$verbose) {
cat("Will run derepFastq and dada with the below options.\n")
cat("derepFastq n =", opt$num_derep, "\n")
cat("dada selfConsist =", opt$selfConsist, "\n")
cat("dada pool =", opt$pool, "\n")
cat("dada multithread =", multithread_opt, "\n")
}
# Run dereplication and dada inference just on forward reads in single-end mode.
if(opt$single) {
if(opt$verbose) {
cat("Running dereplication and dada algorithm in single-end mode\n")
}
# Initialize dataframe to keep track of read counts.
log_counts <- data.frame(matrix(NA, nrow=length(forward_samples), ncol=3))
rownames(log_counts) <- forward_samples
colnames(log_counts) <- c("sample", "derep_sum", "denoised")
# Loop over all samples.
for(sample in forward_samples) {
if(opt$verbose) {
cat("Processing: ", sample, "\n")
}
# Dereplicate reads.
derep_forward <- derepFastq(forward_in[[sample]],
n=opt$num_derep,
verbose=opt$verbose)
# Run inference and add inferred sequences to output list.
seq_variants[[sample]] <- dada(derep_forward,
err=err_forward,
multithread=multithread_opt)
# Add read counts to log dataframe for debug purposes.
log_counts[sample,] <- c(sample, sum(derep_forward$uniques), sum(seq_variants[[sample]]$denoised))
}
# Learn errors for reverse too if paired.
} else {
# Initialize dataframe to keep track of read counts.
log_counts <- data.frame(matrix(NA, nrow=length(forward_samples), ncol=6))
rownames(log_counts) <- forward_samples
colnames(log_counts) <- c("sample", "forward_derep_sum", "reverse_derep_sum",
"forward_denoised", "reverse_denoised", "merged")
# Read in path to reverse FASTQs.
if(is.null(opt$r_path)) {
opt$r_path <- opt$f_path
}
# Read in reverse FASTQs
reverse_in <- sort(list.files(opt$r_path, pattern=opt$r_match, full.names=TRUE))
reverse_samples <- sapply(strsplit(basename(reverse_in), opt$sample_delim), `[`, 1)
names(reverse_in) <- reverse_samples
# Check if forward and reverse sample names match.
if(! identical(forward_samples, reverse_samples)){
stop(paste("\n\nSample names parsed from forward and reverse filenames don't match.",
"\nForward sample name:", forward_samples,
"\nReverse sample name:", reverse_samples,
"\n\nUse the -s option if your reads are single-end.\n"))
}
if(opt$verbose) {
cat("Running learnErrors with below options for reverse reads.\n",
"fls =", paste(reverse_in, collapse=",") , "\n",
"nreads =", opt$num_learn, "\n",
"multithread =", multithread_opt, "\n",
"randomize =", opt$randomize, "\n\n",
"Running dereplication and dada algorithm in paired-end mode.\n\n",
"Will run mergePairs with below options.\n",
"minOverlap =", opt$minOverlap, "\n",
"maxMismatch =", opt$maxMismatch, "\n",
"returnRejects =", opt$returnRejects, "\n",
"justConcatenate =", opt$justConcatenate, "\n",
"trimOverhang =", opt$trimOverhang, "\n")
}
# Learn error model for forward reads.
err_reverse <- learnErrors(fls = reverse_in,
nreads = opt$num_learn,
multithread = multithread_opt,
randomize=opt$randomize)
# Loop over all samples.
for(sample in forward_samples) {
if(opt$verbose) {
cat("Processing: ", sample, "\n")
}
# Dereplicate reads.
derep_forward <- derepFastq(forward_in[[sample]], n=opt$num_derep, verbose=opt$verbose)
derep_reverse <- derepFastq(reverse_in[[sample]], n=opt$num_derep, verbose=opt$verbose)
# Run inference and add inferred sequences to output list.
dada_out_forward <- dada(derep_forward, err=err_forward, multithread=multithread_opt)
dada_out_reverse <- dada(derep_reverse, err=err_reverse, multithread=multithread_opt)
# Merge forward and reverse reads together
seq_variants[[sample]] <- mergePairs(dadaF=dada_out_forward,
derepF=derep_forward,
dadaR=dada_out_reverse,
derepR=derep_reverse,
minOverlap=opt$minOverlap,
maxMismatch=opt$maxMismatch,
returnRejects=opt$returnRejects,
justConcatenate=opt$justConcatenate,
trimOverhang=opt$trimOverhang,
verbose=opt$verbose)
# Add read counts to log dataframe for debug purposes.
log_counts[sample,] <- c(sample, sum(derep_forward$uniques), sum(derep_reverse$uniques), sum(dada_out_forward$denoised),
sum(dada_out_reverse$denoised), sum(seq_variants[[sample]]$abundance))
}
}
if(opt$plot_errors) {
if(opt$verbose){
cat("Plotting estimated error models.\n\n")
}
if(opt$single){
pdf("estimated_err.pdf", width=7, height=7)
print(plotErrors(err_forward, nominalQ=TRUE))
dev.off()
} else{
pdf("estimated_forward_err.pdf", width=7, height=7)
print(plotErrors(err_forward, nominalQ=TRUE))
dev.off()
pdf("estimated_reverse_err.pdf", width=7, height=7)
print(plotErrors(err_reverse, nominalQ=TRUE))
dev.off()
}
}
# Write out sequence table as RDS.
seqtab <- makeSequenceTable(seq_variants)
saveRDS(seqtab, opt$output)
# Write out logfile.
log_counts$tabled <- rowSums(seqtab)
write.table(x = log_counts, file = opt$log, quote = FALSE, sep="\t",
col.names = TRUE, row.names = FALSE)