forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dada2_filter.R
executable file
·324 lines (266 loc) · 14.9 KB
/
dada2_filter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env Rscript
# Read in package to read in command-line options.
suppressMessages(library("optparse"))
version <- "1.0"
option_list <- list(
make_option(c("-f", "--f_path"), type="character", default=NULL,
help="Location of forward reads (required).", metavar="path"),
make_option(c("-r", "--r_path"), type="character", default=NULL,
help="Location of reverse reads (if applicable). Same as forward reads by default.",
metavar="path"),
make_option(c("-s", "--single"), action = "store_true", type="logical", default=FALSE,
help="Flag to indicate that reads are single-end (default: FALSE).", metavar = "boolean"),
make_option(c("--f_match"), type="character", default="_R1_.*fastq.*",
help="Regular expression to match in forward reads' filenames (default: \"_R1_.*fastq.*\").",
metavar="PATTERN"),
make_option(c("--r_match"), type="character", default="_R2_.*fastq.*",
help="Regular expression to match in reverse reads' filenames (default: \"_R2_.*fastq.*\").",
metavar="PATTERN"),
make_option(c("--sample_delim"), type="character", default="_",
help=paste("Character to split filenames on to determine sample name (default: \"_\").",
"Sample names are assumed to be 1st field after splitting.", sep=" "),
metavar="PATTERN"),
make_option(c("--truncQ"), type="character", default="2",
help="Reads will be truncated at the first instance of Q <= truncQ (default: 2).",
metavar="numeric,numeric"),
make_option(c("--truncLen"), type="character", default="0",
help="Length at which to truncate reads - shorter reads will be discarded (default: 0).",
metavar="numeric,numeric"),
make_option(c("--trimLeft"), type="character", default="0",
help="Number of nucleotides to trim from start of reads (default: 0).",
metavar="numeric,numeric"),
make_option(c("--maxLen"), type="character", default="Inf",
help="Maximum read length BEFORE trimming and truncation (default: Inf).",
metavar="numeric,numeric"),
make_option(c("--minLen"), type="character", default="20",
help="Minimum read length AFTER trimming and truncation (default: 20).",
metavar="numeric,numeric"),
make_option(c("--maxN"), type="character", default="0",
help="Maximum number of Ns after truncation - should be 0 for use with DADA2 (default: 0).",
metavar="numeric,numeric"),
make_option(c("--minQ"), type="character", default="0",
help="Minimum quality score allowed in read after truncation (default: 0).",
metavar="numeric,numeric"),
make_option(c("--maxEE"), type="character", default="Inf",
help="Maximum expected errors allowed in a read after truncation (default: Inf). EE = sum(10^(-Q/10)).",
metavar="numeric,numeric"),
make_option(c("-t", "--threads"), type="integer", default=1,
help="Number of threads to use (default: 1).", metavar="integer"),
make_option(c("-n", "--num_reads"), type="integer", default=1e5,
help="Number of reads to read into memory at once (default: 1e5).", metavar="integer"),
make_option(c("-o", "--output"), type="character", default="filtered_fastqs",
help="Output folder for filtered reads (default: filtered_fastqs).", metavar="path"),
make_option(c("--log"), type="character", default="dada2_filter_read_counts.txt",
help="Output logfile for read count table (default: dada2_filter_read_counts.txt).", metavar="path"),
make_option(c("--no_rm_phiX"), action = "store_true", type="logical", default=FALSE,
help="Flag to indicate that reads matching PhiX genome should NOT be discarded (default: FALSE).",
metavar = "boolean"),
make_option(c("--no_gzip"), action = "store_true", type="logical", default=FALSE,
help="Flag to indicate that output FASTQs should NOT be gzipped (default: FALSE).",
metavar = "boolean"),
make_option(c("--matchIDs"), action = "store_true", type="logical", default=FALSE,
help=paste("Flag to indicate that paired-end reads that don't share ids",
"in both the forward and reverse FASTQs will be excluded (default: FALSE)", sep=" "),
metavar = "boolean"),
make_option(c("--id_sep"), type="character", default="\\s",
help="The separator between fields in the id line for paired-end FASTQs (default is whitespace: \"\\s\").",
metavar = "character"),
make_option(c("--id_field"), type="character", default=NULL,
help=paste("Field of the FASTQ id line containing sequenced id. By default",
"this will be automatically detected, which will work for most users.",
"Only for paired-end data.", sep=" "),
metavar = "boolean"),
make_option(c("--verbose"), action = "store_true", type="logical", default=FALSE,
help="Write out status messages (default: FALSE).", metavar = "boolean"),
make_option(c("--version"), action = "store_true", type="logical", default=FALSE,
help="Print out version number and exit.", metavar = "boolean")
)
opt_parser <- OptionParser(
option_list=option_list,
usage = "%prog [options] -f PATH",
description = paste(
"\nThis is a wrapper for DADA2 that is based on the authors\'",
"filtering step in the big data tutorial available here:",
"https://benjjneb.github.io/dada2/bigdata.html.\n\nBe sure to cite the DADA2",
"paper if you use this script:\nCallahan BJ et al. 2016. DADA2:",
"High-resolution sample inference from Illumina amplicon data.",
"Nature Methods 13:581-583.\n\nNote this script was tested with",
"DADA2 v1.4.0\n\nUSAGE NOTE: when passing filtering parameters for paired-end",
"reads if you pass one value it will be used for both forward and reverse reads.",
"\nIf you pass two values separated by a comma then these values will be for the forward",
"and reverse reads respectively.", sep=" ")
)
opt <- parse_args(opt_parser)
# Function to parse DADA2 filtering params and to throw error if any unexpected input.
# This function deals with cases where multiple options are specified for forward
# and reverse reads at the filtering step.
parse_dada2_filt_params <- function(in_param, single_end, param_name) {
# Split by comma (in case separate options given for R1 and R2).
in_param_split <- as.vector(strsplit(in_param, ",")[[1]])
# Report error if empty string.
if(length(in_param_split) == 0) {
stop(paste("Error when parsing parameter", param_name, "-",
"no values detected.", sep=" "))
}
# Make sure that only one value given if SE.
if(single_end & length(in_param_split) != 1) {
stop(paste("Error when parsing parameter", param_name, "-",
"expected 1 input value, but got", length(in_param_split),
sep=" "))
# Make sure that max of 2 values given if PE.
} else if(! single_end & length(in_param_split) > 2) {
stop(paste("Error when parsing parameter", param_name, "-",
"expected max of 2 input values, but got", length(in_param_split),
sep=" "))
}
if (length(in_param_split) == 1) {
result = tryCatch({
return(as.numeric(in_param_split[1]))
}, warning = function(w) {
stop("Warning!")
}, error = function(e) {
stop(paste("Error input parameter",
param_name, "of",
in_param_split[1],
"is not numeric.", sep=" "))
})
} else {
result = tryCatch({
return(c(as.numeric(in_param_split[1]), as.numeric(in_param_split[2])))
}, warning = function(w) {
stop("Warning!")
}, error = function(e) {
stop(paste("Error at least 1 input parameter for",
param_name, "of",
in_param_split[1], "or",
in_param_split[2],
"is not numeric.", sep=" "))
})
}
}
# Print out version if --version flag set.
if (opt$version) {
cat("Wrapper version:", version, "\n")
options_tmp <- options(show.error.messages=FALSE)
on.exit(options(options_tmp))
stop()
}
# Check if path to forward files set, if not stop job.
if(is.null(opt$f_path)) {
stop("path to forward FASTQs needs to be set.")
}
# Get forward filenames.
forward_in <- sort(list.files(opt$f_path, pattern=opt$f_match, full.names=TRUE))
forward_samples <- sapply(strsplit(basename(forward_in), opt$sample_delim), `[`, 1)
forward_out <- file.path(opt$output, paste0(forward_samples, "_R1_filt.fastq.gz"))
# Convert input filtering params to vectors and check for obvious input errors.
filt_params_raw <- opt[c("truncQ", "truncLen", "trimLeft", "maxLen",
"minLen", "maxN", "minQ", "maxEE")]
filt_params <- list(c())
for (param in names(filt_params_raw)) {
filt_params[[param]] <- parse_dada2_filt_params(in_param=filt_params_raw[[param]],
single_end=opt$single,
param_name=param)
}
# Set multithread option (FALSE if no, otherwise give # core).
if (opt$threads > 1) {
multithread_opt <- opt$threads
} else {
multithread_opt <- FALSE
}
# Finally read in and print out DADA2 version.
if(opt$verbose) {
cat("Running DADA2 version:", as.character(packageVersion("dada2")), "\n")
library("dada2")
cat("Running filterAndTrim function with the below options.\n")
} else {
suppressMessages(library("dada2"))
}
# Set id_field output log to be character of "NULL" so that it is printed (if applicable).
id_field_log <- opt$id_field
if(is.null(id_field_log)) {
id_field_log <- "NULL"
}
# Run DADA2 on just forward reads in single-end data.
if (opt$single) {
if(opt$verbose) {
cat("Single-end mode\n\n")
cat(paste("fwd =", paste(forward_in, collapse=",") , "\n",
"filt =", paste(forward_out, collapse=","), "\n",
"compress =", !opt$no_gzip, "\n",
"truncQ =", paste(filt_params$truncQ, collapse=","), "\n",
"truncLen =", paste(filt_params$truncLen, collapse=","), "\n",
"trimLeft =", paste(filt_params$trimLeft, collapse=","), "\n",
"maxLen =", paste(filt_params$maxLen, collapse=","), "\n",
"minLen =", paste(filt_params$minLen, collapse=","), "\n",
"maxN =", paste(filt_params$maxN, collapse=","), "\n",
"minQ =", paste(filt_params$minQ, collapse=","), "\n",
"maxEE =", paste(filt_params$maxEE, collapse=","), "\n",
"rm.phix =", !opt$no_rm_phiX, "\n",
"multithread =", multithread_opt, "\n",
"n =", opt$num_reads, "\n",
"verbose =", opt$verbose, "\n\n", sep=""))
}
read_counts <- filterAndTrim(fwd=forward_in, filt=forward_out, compress=!opt$no_gzip,
truncQ=filt_params$truncQ, truncLen=filt_params$truncLen,
trimLeft=filt_params$trimLeft, maxLen=filt_params$maxLen,
minLen=filt_params$minLen, maxN=filt_params$maxN,
minQ=filt_params$minQ, maxEE=filt_params$maxEE,
rm.phix=!opt$no_rm_phiX, multithread=multithread_opt,
n=opt$num_reads, verbose=opt$verbose)
} else {
# Read in path to reverse FASTQs.
if(is.null(opt$r_path)) {
opt$r_path <- opt$f_path
}
reverse_in <- sort(list.files(opt$r_path, pattern=opt$r_match, full.names=TRUE))
reverse_samples <- sapply(strsplit(basename(reverse_in), opt$sample_delim), `[`, 1)
reverse_out <- file.path(opt$output, paste0(reverse_samples, "_R2_filt.fastq.gz"))
# Check that sample names for forward and reverse FASTQs are the same.
if(! identical(forward_samples, reverse_samples)){
stop(paste("\n\nSample names parsed from forward and reverse filenames don't match.",
"\nForward sample name:", forward_samples,
"\nReverse sample name:", reverse_samples,
"\n\nUse the -s option if your reads are single-end.\n"))
}
if(opt$verbose) {
cat("Paired-end mode\n")
cat(paste("fwd =", paste(forward_in, collapse=",") , "\n",
"filt =", paste(forward_out, collapse=","), "\n",
"rev =", paste(reverse_in, collapse=","), "\n",
"filt.rev =", paste(reverse_out, collapse=","), "\n",
"compress =", !opt$no_gzip, "\n",
"truncQ =", paste(filt_params$truncQ, collapse=","), "\n",
"truncLen =", paste(filt_params$truncLen, collapse=","), "\n",
"trimLeft =", paste(filt_params$trimLeft, collapse=","), "\n",
"maxLen =", paste(filt_params$maxLen, collapse=","), "\n",
"minLen =", paste(filt_params$minLen, collapse=","), "\n",
"maxN =", paste(filt_params$maxN, collapse=","), "\n",
"minQ =", paste(filt_params$minQ, collapse=","), "\n",
"maxEE =", paste(filt_params$maxEE, collapse=","), "\n",
"rm.phix =", !opt$no_rm_phiX, "\n",
"multithread =", multithread_opt, "\n",
"n =", opt$num_reads, "\n",
"matchIDs =", opt$matchIDs, "\n",
"id.sep =", opt$id_sep, "\n",
"id.field =", id_field_log, "\n",
"verbose =", opt$verbose, "\n\n", sep=""))
}
read_counts <- filterAndTrim(fwd=forward_in, filt=forward_out, rev=reverse_in,
filt.rev=reverse_out, compress=!opt$no_gzip,
truncQ=filt_params$truncQ, truncLen=filt_params$truncLen,
trimLeft=filt_params$trimLeft, maxLen=filt_params$maxLen,
minLen=filt_params$minLen, maxN=filt_params$maxN,
minQ=filt_params$minQ, maxEE=filt_params$maxEE,
rm.phix=!opt$no_rm_phiX, multithread=multithread_opt,
n=opt$num_reads, matchIDs=opt$matchIDs, id.sep=opt$id_sep,
id.field=opt$id_field, verbose=opt$verbose)
}
# Convert log table rownames from R1 FASTQs to sample names.
read_counts <- as.data.frame(read_counts)
read_counts$sample <- sapply(strsplit(rownames(read_counts), opt$sample_delim), `[`, 1)
colnames(read_counts) <- c("input", "filtered", "sample")
read_counts <- read_counts[c("sample", "input", "filtered")]
# Print out input and output read counts to logfile.
write.table(x = read_counts, file = opt$log, quote = FALSE, sep="\t",
col.names = TRUE, row.names = FALSE)