############################
##### INPUT PARAMETERS #####
############################
# The rundir parameter is the name of a subdirectory under data/ that must contain
# - asv_seqs.fasta (ASV sequences in FASTA format)
# - asv_counts.tsv (counts of ASVs (rows) in samples (columns))
rundir: "test"
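# For illustration (directory sketch based on the description above), with
# rundir: "test" the workflow expects a layout like:
# data/
#   test/
#     asv_seqs.fasta
#     asv_counts.tsv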
# The run_name parameter names the workflow run and is used to separate runs
# that use different parameters for the clustering tools.
# By using different run_name values on the same rundir you can cluster the ASVs
# with different settings while reusing the same alignment output (from vsearch).
run_name: "run1"
# With the split_rank parameter you can split the ASVs by a given taxonomic rank
# prior to alignment and clustering, which parallelizes the workflow and can help
# speed things up on large datasets.
# Note that split_rank must exactly match one of the columns in the asv_taxa.tsv file.
split_rank: "Family"
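# As an example (hypothetical column layout): if asv_taxa.tsv has the columns
# ASV, Kingdom, Phylum, Class, Order, Family, Genus, Species, BOLD_bin, then
# split_rank: "Order" would align and cluster the ASVs separately per Order.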
####################################
##### PREPROCESSING PARAMETERS #####
####################################
preprocessing:
# Whether to perform length filtering of input seqs
filter_length: False
# Minimum and maximum length of input seqs
min_length: 403
max_length: 418
# Whether to perform codon filtering of input seqs
filter_codons: False
# Stop codons to look for. Specify multiple codons in a comma-separated list such as "TAA,TAG". By default, TAA and TAG are checked.
stop_codons: "TAA,TAG"
# Position in the ASV at which to start checking for stop codons.
start_position: 2
# Position in the ASV at which to stop checking for stop codons. The default of 0 means codons are checked until the end of the sequence.
end_position: 0
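# Example (alternative settings, sketch only): to keep only sequences between
# min_length and max_length and to drop sequences containing the stop codons
# listed above, you could set:
# filter_length: True
# filter_codons: True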
###############################
##### TAXONOMY PARAMETERS #####
###############################
# The split_size parameter specifies the number of sequences to include in each
# split fasta file. This is used to reduce the size of the fasta files that are
# input to the taxonomy assignment tools.
split_size: 500
# The taxonomy_source parameter specifies what taxonomy source to use for the
# clustering. This is the primary source of taxonomy information and is used
# to partition the ASVs into smaller fasta files based on the split_rank
# parameter. If you already have a tab-separated file with taxonomic
# assignments, set taxonomy_source to the path of that file.
taxonomy_source: "sintax+epa-ng"
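# Example (hypothetical path): to use a pre-computed, tab-separated taxonomy
# file instead of one of the taxonomy tools, point taxonomy_source to it, e.g.:
# taxonomy_source: "data/test/asv_taxa.tsv"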
# The ranks parameter specifies what taxonomic ranks are included in the optional
# ASV taxonomy file. If you don't have a taxonomy file, and instead want to assign
# taxonomy using the taxonomic tools specified under 'taxtools', do not modify this
# parameter. If you have a taxonomy file, set this parameter to the ranks that are
# included in the file, e.g. ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus",
# "Species", "BOLD_bin"]. If you leave this parameter empty the pipeline will attempt to
# guess which ranks are included in the taxonomy file.
ranks: []
# The taxtools parameter specifies what taxonomic tools to use for assigning taxonomy
taxtools:
- "sintax"
- "vsearch"
- "epa-ng"
- "sintax+epa-ng"
# The sintax parameter specifies the parameters to use for the sintax taxonomy
# assignment tool.
sintax:
ref: ""
randseed: 15
cutoff: 0.8
ranks:
- "Kingdom"
- "Phylum"
- "Class"
- "Order"
- "Family"
- "Genus"
- "Species"
- "BOLD_bin"
# Below are the parameters for qiime2 (vsearch and sklearn) taxonomy assignment
qiime2:
# ref is the path to the reference database to use for taxonomy assignment
ref: ""
# taxfile is the path to the taxonomy file that corresponds to the reference database
taxfile: ""
ranks: []
# The epa-ng parameter specifies the parameters to use for the epa-ng taxonomy
# assignment tool.
epa-ng:
tree: ""
tree_format: "newick"
msa: ""
msa_format: "fasta"
model: "GTR+G+F"
heuristic: "dyn-heur"
ref_taxonomy: ""
chunk_size: 5000
tree_ranks:
- "Kingdom"
- "Phylum"
- "Class"
- "Order"
- "Family"
- "Genus"
- "Species"
gappa:
distribution_ratio: -1
consensus_thresh: 1
# Below are settings that control how ASVs that are unclassified by SINTAX will
# be handled. The default is to use the EPA-NG results to reassign the SINTAX
# taxonomy at high levels in the taxonomy tree.
reassign:
# placeholder_rank is the taxonomic rank at which ASVs must be classified by EPA-NG,
# but unclassified by SINTAX in order to be reclassified.
placeholder_rank: "Class"
# placeholder_taxa is a list of taxa that ASVs must be classified as by both EPA-NG
# and SINTAX (at placeholder_rank) in order to be reassigned at lower levels.
placeholder_taxa: ["Collembola", "Diplura", "Protura", "Insecta"]
# reassign_ranks lists the taxonomic rank(s) at which ASVs will be reassigned if they
# meet the criteria above.
reassign_ranks: ["Order"]
# downstream_ranks is a list of ranks which will be updated for ASVs with reassignments
downstream_ranks: ["Family", "Genus", "Species", "BOLD_bin"]
# The evaluation_rank parameter specifies what taxonomic rank to use for evaluating
# the clustering results. This rank should correspond to the taxonomic rank that
# you expect the clustering to represent, e.g. if you expect the clustering to
# represent species, set this parameter to "Species".
evaluation_rank: "Species"
# Ranks to use for calculating consensus taxonomy for clusters
consensus_ranks:
- "Family"
- "Genus"
- "Species"
# % threshold to use for calculating consensus taxonomy for clusters
# As an example, if a cluster has 10 sequences, and at least 8 of them are classified
# as "Species X" at the Species level, the consensus taxonomy for the cluster
# will be "Species X" if the threshold is set to 80; otherwise the workflow
# attempts to resolve the taxonomy at the Genus level, and so on.
consensus_threshold: 80
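# Worked example (illustration only): with consensus_threshold: 80, a cluster in
# which 7 of 10 sequences are "Species X" (70%) gets no Species-level consensus,
# but if 9 of 10 share "Genus Y" (90%) the consensus is resolved at the Genus level.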
########################################
##### CHIMERA FILTERING PARAMETERS #####
########################################
chimera:
# Should chimera detection and removal be performed on the input data?
remove_chimeras: True
# Set a run name for the chimera detection and removal step
run_name: "chimera1"
# Select method for chimera filtering:
#'batchwise' = run chimera detection on the dataset as a whole.
#'samplewise' = split input fasta into one file per sample and run chimera
# detection on each sample individually.
method: "samplewise"
# Select algorithm to use, you can choose from 'uchime_denovo', 'uchime2_denovo'
# and 'uchime3_denovo'.
algorithm: "uchime_denovo"
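# Example (alternative settings, sketch only): to detect chimeras on the whole
# dataset at once with the uchime2_denovo algorithm you could instead set:
# method: "batchwise"
# algorithm: "uchime2_denovo"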
# In the batchwise method, require that a sequence marked as chimeric is present
# with its parents in at least <min_samples_shared> samples.
min_samples_shared: 1
# In the batchwise method, require that a sequence marked as chimeric is present
# with its parents in at least a <min_frac_samples_shared> fraction of samples.
min_frac_samples_shared: 0
# In the samplewise method, require that a sequence is marked as chimeric in at
# least <min_chimeric_samples> samples in order for it to be removed from the
# analysis. If this value is set to 0, ASVs have to be marked as chimeric in all samples.
min_chimeric_samples: 0
# In the samplewise method, require that a sequence is marked as chimeric in at
# least a <min_frac_chimeric_samples> fraction of the samples in which the ASV is
# present in order for it to be removed from the analysis. If this value is set
# to 0, ASVs have to be marked as chimeric in all samples.
min_frac_chimeric_samples: 0
# The parameters below control how the chimeric score of sequences is
# calculated. Please see the Uchime
# [docs](https://www.drive5.com/usearch/manual6/UCHIME_score.html) for
# details. Note that these are not used for uchime2_denovo or uchime3_denovo;
# instead, these algorithms require 'perfect chimeras' (see the Uchime manual
# for more info).
dn: 1.4
mindiffs: 3
mindiv: 0.8
minh: 0.28
#abskew: 2.0
###################################
##### CLUSTERING TOOLS TO RUN #####
###################################
# Specify what clustering software to use. Choose from 'swarm', 'opticlust',
# 'dbotu3'.
software:
- "swarm"
#- "opticlust"
#- "dbotu3"
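# Example (sketch only): to compare all three tools, uncomment the entries
# above so that the list reads:
# software:
# - "swarm"
# - "opticlust"
# - "dbotu3"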
####################################
##### TOOL-SPECIFIC PARAMETERS #####
####################################
vsearch:
# minimum pairwise identity to report
id: 0.84
# pairwise identity definition (see vsearch manual for details and choices)
iddef: "1"
# threshold for query coverage
# reject hits if fraction of query aligned is less than this value
query_cov: 0.9
opticlust:
# For opticlust, choose whether pairwise alignments should be generated with
# 'vsearch' or 'mothur'
aligner: "vsearch"
# The delta parameter sets the stable value for the metric in opticlust
delta: 0.0001
# The cutoff parameter sets the dissimilarity threshold at which clusters are generated.
cutoff: 0.05
# The `initialize` parameter sets the initial randomization for opticlust.
# The default is `singleton` where each sequence is randomly assigned to its
# own OTU. The other accepted setting `oneotu` means that all sequences are
# assigned to one OTU.
initialize: "singleton"
# the floating point precision for opticlust.
precision: 1000
swarm:
# number of differences allowed to cluster sequences
differences: 15
# setting 'no-otu-breaking' to True deactivates the built-in cluster
# refinement when d=1
no-otu-breaking: False
# if 'fastidious' is set to True swarm performs a second clustering pass to
# reduce the number of small clusters when d=1
fastidious: False
# when 'fastidious' is set to True, 'boundary' defines the minimum abundance
# for a cluster to be considered a large cluster.
boundary: 0
# the settings below modify the pairwise global alignment scoring parameters
# when d>1
match-reward: 5
mismatch-penalty: 4
gap-opening-penalty: 12
gap-extension-penalty: 4
dbotu3:
# dist sets the maximum allowed genetic dissimilarity between sequences
dist: 0.1
# abund sets the minimum fold difference for comparing two OTUs
abund: 10.0
# pval sets the minimum p-value for merging ASVs
pval: 0.0005
###########################
##### NOISE FILTERING #####
###########################
noise_filtering:
# run_name sets the name of the run for the noise filtering step
run_name: "noise1"
# split_rank sets the taxonomic rank to split the ASVs by before running the
# noise filtering.
split_rank: "Order"
# assignment_rank sets the taxonomic rank to use for the noise filtering.
# ASVs unassigned or ambiguously assigned at this rank are considered noise and will be removed.
assignment_rank: "Order"
# maximum number of target sequences to return for aligner
max_target_seqs: 10
# min_match is the minimum threshold of sequence similarity for considering any
# OTU as an error of another
min_match: 84
# n_closest is the number of potential parent clusters to use in the comparison
# when identifying noise
n_closest: 10
# echo_min_overlap (numeric on the interval (0,1)): In the echo filter, the minimum fraction of
# the putative “noise” OTU samples that must also contain the putative
# authentic CO1 “parent” OTU
echo_min_overlap: 0.9
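# Worked example (illustration only): with echo_min_overlap: 0.9, a putative
# noise OTU found in 10 samples satisfies this criterion only if at least 9 of
# those samples also contain its putative parent OTU.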
# echo_read_ratio_type (“max”/“mean”): In the echo filter, the type of read
# ratio considered by the echo_max_read_ratio criterion: “max” = maximum ratio
# between noise and parent reads across the samples where they co-occur;
# “mean” = mean ratio between noise and parent reads across the samples where
# they co-occur.
echo_read_ratio_type: mean
# echo_max_read_ratio (numeric, positive real): In the echo filter, the
# maximum value of the ratio between noise and parent reads
echo_max_read_ratio: 0.1
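# Worked example (illustration only): with echo_read_ratio_type: mean and
# echo_max_read_ratio: 0.1, a noise OTU averaging 5 reads against 100 parent
# reads in the samples where they co-occur has a mean ratio of 0.05, which is
# below the 0.1 cutoff and therefore satisfies this echo criterion.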
# echo_require_corr (TRUE/FALSE): In the echo filter, should a significant
# correlation between noise and parent read numbers be required before the
# putative OTU is removed? The significance is evaluated using a linear model,
# requiring that the regression coefficient is smaller than
# echo_max_read_ratio, and the p-value <=0.05.
echo_require_corr: False
# evo_local_min_overlap (numeric on the interval (0,1)): In the local evo
# filter, minimum fraction of the putative “noise” OTU samples that must also
# contain the putative authentic CO1 “parent” OTU.
evo_local_min_overlap: 0.95
# dist_type_local (“dadn”/“wdadn”): In the local evo filter, should distances
# between representative ASVs be based on unweighted amino acid distances
# (“dadn”) or amino acid distances weighted based on biochemical similarities
# (“wdadn”)?
dist_type_local: dadn
# dist_threshold_local (numeric, positive real): In the local evo filter, the
# distance value above which an OTU is considered to represent noise.
dist_threshold_local: 1.8
# dist_threshold_global (numeric, positive real): In the global evo filter,
# the distance value above which an OTU is considered to represent noise.
dist_threshold_global: 4.0
# abundance_cutoff_type (“sum”/“max”): Should the abundance cutoff be based on
# the sum of reads across samples in the dataset (“sum”), or the max number of
# reads (“max”)?
abundance_cutoff_type: sum
# abundance_cutoff (numeric, positive integer): The minimum number of reads
# required for an OTU to pass the abundance filter
abundance_cutoff: 3
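# Worked example (illustration only): with abundance_cutoff_type: sum and
# abundance_cutoff: 3, an OTU needs at least 3 reads summed across all samples
# to pass the abundance filter; with abundance_cutoff_type: max it would instead
# need at least 3 reads in a single sample.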
# codon table to use for translation when aligning sequences with pal2nal (in NCBI numbering, table 5 corresponds to the invertebrate mitochondrial code)
codon_table: 5
# reading frame start (1-based)
codon_start: 2