-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.nf
369 lines (282 loc) · 10.7 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
#!/usr/bin/env nextflow
/*
=================================================================================================
LONGMETARG
=================================================================================================
Nextflow pipeline for the identifiation of antibiotic resistance genes in metagenomic long reads
-------------------------------------------------------------------------------------------------
@ Author
Louisa Wessels Perelo <[email protected]>
#### Homepage / Documentation
https://github.com/louperelo/longmetarg
-------------------------------------------------------------------------------------------------
*/
def helpMessage() {
//log.info nfcoreHeader()
log.info"""
==============================================================
LONGMETARG
==============================================================
Nextflow pipeline for the identifiation of antibiotic resistance genes in metagenomic long reads
--------------------------------------------------------------
Usage:
The typical command for running the pipeline is as follows:
nextflow run main.nf --input metagenome.fasta -profile conda
Mandatory arguments:
--input [file] Path to input fasta file
-profile [str] Configuration profile to use.
Available: conda. Soon available: docker, singularity
Options:
--readtype [str] long read type (ONT or PB) vs reference mapping in minimap2
Available: map-ont (default), map-pb
--flye [bool] run assembly with metaFlye before mapping
default: false
--flyeDt [str] specify sequencing type in metaFlye
Available: nano-raw (default), nano-corr, subassemblies,
pacbio-raw, pacbio-corr, pacbio-hifi
--length [int] cutoff ARG alignment length (%), default: 95
--ident [int] cutoff ARG cover identity (%), default: 80
--genomeSize [int] estimated genome size for metaFlye assembly, default: 1000000
--taxon [str] taxonomy alignment tool
Available: Diamond (default), Blast
--pfThreshold [float] manually specified threshold for probability filtering in PlasFlow
default: 0.7
--pfModels [file] custom location of models used for prediction in PlasFlow
default: $PWD/env/plasflow
References for use with -profile conda
--argDb [file] Path to the CARD database fasta
default: $PWD/card_db/card_database_v*.fasta
--drugClass [file] Path to CARD aro_index file
default: $PWD/aro_index.tsv
--blastDbDir [file] Path to folder containing Blast database
default: $PWD/blast_db
--blastDbName [str] Name of Blast database
default: ref_prok_rep_genomes
--diamondDbDir [file] Path to folder containing Diamond database
default: $PWD/diamond_db
--diamondDbName [str] Name of Diamond database
default: nr.dmnd
//Other options:
--outdir [file] The output directory where the results will be saved
--threads [int] Number of parallel threads, default: 15
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
// Setup
/*move params to nextflow.config file
*/
/*
params.help = false
params.input = "$PWD/sample.fa"
params.readtype = 'map-ont'
params.argDb = "$PWD/card_db/card_database_v*.fasta"
params.drugClass = "$PWD/aro_index.tsv"
params.blastDbDir = "$PWD/blast_db"
params.blastDbName = "ref_prok_rep_genomes"
params.diamondDbDir = "$PWD/diamond_db"
params.diamondDbName = "proteinDb.dmnd"
params.flye = false
params.flyeDt = "nano-raw"
params.length = 95
params.ident = 80
params.genomeSize = 1000000
params.outdir = "$PWD/out_dir"
params.taxon = "Diamond"
params.threads = 15
params.pfThreshold = 0.7
params.pfModels = "$PWD/env/plasflow/models"
*/
in_ch = Channel.fromPath(params.input)
.into{in_qc; in_flye; in_align; in_analysis}
/*
input FASTA statistical summary and plots using NanoPlot
*/
process stats_NanoPlot {
label 'mapcard' //use to define env in nextflow.config
input:
file in_qc
publishDir "$params.outdir/nanoPlot"
output:
file '*' //into qcheck
script:
"""
NanoPlot --fasta $in_qc -t $params.threads
"""
}
/*
MetaFlye assembly
execute only if params.flye is set to true
*/
process assembly_MetaFlye {
label 'metaflye'
input:
file in_flye
publishDir "$params.outdir"
output:
file 'out_flye/assembly.fasta' into (out_flye, flye_analysis)
when:
params.flye == true
script:
/*
minimum overlap was set to default 2000 (2kb), also used in Flye paper for Cow rumen set.
genome size default was set to 1 G, as in Flye paper
*/
"""
flye --$params.flyeDt $in_flye --threads $params.threads --out-dir out_flye --meta --plasmids --genome-size $params.genomeSize
"""
}
/*
Minimap2 alignment to CARD database
*/
process map_CARD {
label 'mapcard'
input:
file fa from in_align
publishDir "$params.outdir", mode: 'copy'//, overwrite: true
output:
file '*' into card_aln
when:
params.flye == false
script:
"""
minimap2 -ax $params.readtype $params.argDb $fa -t $params.threads -c | samtools view -S -b | samtools view -b -F 4 > aln_F4.bam
"""
}
process mapFlye_CARD {
label 'mapcard'
input:
file out_flye
publishDir "$params.outdir", mode: 'copy'
output:
file '*' into flyecard_aln
when:
params.flye == true
script:
"""
minimap2 -ax $params.readtype $params.argDb $out_flye -t $params.threads | samtools view -S -b | samtools view -b -F 4 > aln_F4.bam
"""
}
/*
Extract information from minimap2 alignment output
*/
process aln_analysis {
label 'mapcard'
input:
file card_aln
file in_analysis
publishDir "$params.outdir", mode:'copy'
output:
file "argHitsDf.tsv" into in_summary
file "argHits.fasta" into(in_tax, in_pf)
when:
params.flye == false
script:
"""
read_analysis.py --bam_file $card_aln --reads_fasta $in_analysis --ident $params.ident --cov_length $params.length --folder $params.outdir --aro_idx $PWD/bin/aro_index.tsv
"""
}
process alnFlye_analysis {
label 'mapcard'
input:
file flyecard_aln
file flye_analysis
publishDir "$params.outdir", mode:'copy'
output:
file "argHitsDf.tsv" into inflye_summary
file "argHits.fasta" into(inflye_tax, inflye_pf)
when:
params.flye == true
script:
"""
read_analysis.py --bam_file $flyecard_aln --reads_fasta $flye_analysis --ident $params.ident --cov_length $params.length --folder $params.outdir --aro_idx $PWD/bin/aro_index.tsv
"""
}
/*
PlasFlow analysis of results.fasta from alignment_analysis, adding column with Plasmid/Chromosome information to results_df
*/
process plasflow {
label 'plasflow'
input:
file argHits from in_pf.mix(inflye_pf)
publishDir "$params.outdir", mode:'copy'
output:
file 'result_pf.tsv' into out_pf
script:
"""
PlasFlow.py --input $argHits --output result_pf.tsv --threshold $params.pfThreshold --models $params.pfModels
"""
}
/*
BLAST or DIAMOND taxonomic analysis for result.fasta reads
*/
process taxonomy {
if( params.taxon == "Blast")
label 'blast'
if( params.taxon == "Diamond")
label 'diamond'
input:
file argHits from in_tax.mix(inflye_tax)
publishDir "$params.outdir", mode:'copy'
output:
file 'result_tax.tsv' into (in_ssciname, out_taxonomy)
script:
if( params.taxon == "Blast")
"""
blastn -task blastn -max_target_seqs 1 -num_threads $params.threads -outfmt "6 delim= qseqid staxid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -db $params.blastDbDir/$params.blastDbName -query $argHits -out result_tax.tsv
"""
else if( params.taxon == "Diamond")
"""
diamond blastx -d $params.diamondDbDir/$params.diamondDbName -q $argHits -p $params.threads -F 15 -f 6 qseqid staxids pident length mismatch gapopen qstart qend sstart send evalue bitscore --max-target-seqs 1 -o result_tax.tsv
"""
else
throw new IllegalArgumentException("Unknown taxonomy aligner $params.taxon")
}
/*
Get scientific name from NCBI taxid in process taxonomy output
*/
process get_ssciname {
label 'mapcard'
input:
file in_ssciname
publishDir "$params.outdir", mode:'copy'
output:
file "*" into out_ssciname
script:
"""
get_taxon.py --blast_out $in_ssciname
"""
}
/*
join results to output summary
*/
process summary {
label 'mapcard'
input:
file argHitsDf from in_summary.mix(inflye_summary)
file out_pf
file out_taxonomy
file out_ssciname
publishDir "$params.outdir", mode:'copy'
output:
file 'summary.tsv' //into summary
script:
"""
#!/usr/bin/python
import pandas as pd
aln_df = pd.read_csv("$argHitsDf", sep="\\t")
pf_df = pd.read_csv("$out_pf", sep="\\t")
taxon_df = pd.read_csv("$out_taxonomy", sep="\\t", header=None, names=['contig_name', 'taxid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
ssciname_df = pd.read_csv("$out_ssciname", sep="\\t")
tax_df = pd.merge(taxon_df, ssciname_df[['taxid', 'species', 'class']], on='taxid', how='left')
aln_pf_df = pd.merge(aln_df, pf_df[['contig_name', 'label']], on='contig_name', how='left')
aln_pf_df.rename(columns={'label':'gene_location'}, inplace = True)
aln_pf_df['gene_location'] = aln_pf_df['gene_location'].apply(lambda x: x.split('.')[0])
aln_pf_tax_df = pd.merge(aln_pf_df, tax_df[['contig_name', 'species', 'class', 'pident', 'evalue', 'bitscore']], on='contig_name', how='left')
summary_df = aln_pf_tax_df.drop(['index', 'contig_length', 'aln_length', 'aln_nm', 'ref_start', 'ref_length', 'ref_ARO', 'ref_ID', 'ref_NCBI', 'c_tuples', 'matches', 'insertions', 'deletions'], axis=1)
summary_df.to_csv('summary.tsv', sep="\\t")
"""
}