From 26558acc6956336a69cbe11c03f862a3268f73e4 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Tue, 20 Dec 2022 15:35:01 -0800 Subject: [PATCH] Add collect_outputs flag (#29) --- README.md | 20 +++++++++++++++++++ ...cken_percentages_for_unclassified_reads.py | 2 +- main.nf | 11 ++++++++++ modules/taxon_abundance.nf | 7 ++++++- nextflow.config | 8 +++++--- 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index af1283a..248b00b 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,26 @@ When the `--skip_bracken` flag is used, abundances will be calculated directly f estimates directly from kraken2 reports may under-estimate the actual abundances. Detailed rationale for including bracken analysis can be found in the [bracken paper](https://peerj.com/articles/cs-104/). +### Collecting Outputs + +By default, separate output files will be created for each sample, in independent output sub-directories under the directory provided with the `--outdir` flag. + +To generate summary files for all samples, add the `--collect_outputs` flag. The following files will be written to the output dir: + +``` +collected_<taxonomic_level>_<bracken|kraken>_abundances.csv +collected_<taxonomic_level>_<bracken|kraken>_abundances_top_5.csv +collected_fastp.csv +``` + +An alternative prefix (instead of `collected`) may be supplied using the `--collected_outputs_prefix` flag. For example, with `--collected_outputs_prefix test`: + +``` +test_<taxonomic_level>_<bracken|kraken>_abundances.csv +test_<taxonomic_level>_<bracken|kraken>_abundances_top_5.csv +test_fastp.csv +``` + ## Outputs An output directory will be created for each sample.
Within those directories, diff --git a/bin/adjust_bracken_percentages_for_unclassified_reads.py b/bin/adjust_bracken_percentages_for_unclassified_reads.py index 4ae0088..78fc9b3 100755 --- a/bin/adjust_bracken_percentages_for_unclassified_reads.py +++ b/bin/adjust_bracken_percentages_for_unclassified_reads.py @@ -73,7 +73,7 @@ def main(args): 'fraction_total_reads', ] - writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames) + writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix', quoting=csv.QUOTE_MINIMAL) writer.writeheader() for b in bracken_abundances: diff --git a/main.nf b/main.nf index 83148c7..4513d39 100644 --- a/main.nf +++ b/main.nf @@ -54,6 +54,17 @@ workflow { extract_reads(ch_fastq.join(kraken2.out.report).combine(ch_to_extract, by: 0)) } + if (params.collect_outputs) { + fastp.out.csv.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_fastp.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }) + if (!params.skip_bracken) { + ch_abundances.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_" + params.taxonomic_level + "_bracken_abundances.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1].split(',')[0] }) + abundance_top_5.out.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_" + params.taxonomic_level + "_bracken_abundances_top_5.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }) + } else { + ch_abundances.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_" + params.taxonomic_level + "_kraken_abundances.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1].split(',')[0] }) + abundance_top_5_kraken.out.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_" + params.taxonomic_level + "_kraken_abundances_top_5.csv", storeDir: params.outdir, keepHeader: 
true, skip: 1, sort: { it -> it.readLines()[1] }) + } + } + ch_provenance = fastp.out.provenance ch_provenance = ch_provenance.join(kraken2.out.provenance).map{ it -> [it[0], [it[1], it[2]]] } diff --git a/modules/taxon_abundance.nf b/modules/taxon_abundance.nf index 1ddd1f5..5a5799c 100644 --- a/modules/taxon_abundance.nf +++ b/modules/taxon_abundance.nf @@ -8,7 +8,8 @@ process fastp { tuple val(sample_id), path(reads_1), path(reads_2) output: - tuple val(sample_id), path("${sample_id}_fastp.{json,csv}"), emit: fastp_reports + tuple val(sample_id), path("${sample_id}_fastp.json"), emit: json + tuple val(sample_id), path("${sample_id}_fastp.csv"), emit: csv tuple val(sample_id), path("${sample_id}_trimmed_R1.fastq.gz"), path("${sample_id}_trimmed_R2.fastq.gz"), emit: reads tuple val(sample_id), path("${sample_id}_fastp_provenance.yml"), emit: provenance @@ -77,15 +78,19 @@ process bracken { -o ${sample_id}_${params.taxonomic_level}_bracken_abundances_unsorted.tsv \ -r ${params.read_length} \ -l ${params.taxonomic_level} + paste <(echo "sample_id") <(head -n 1 ${sample_id}_${params.taxonomic_level}_bracken_abundances_unsorted.tsv) | tr \$'\\t' ',' > bracken_abundances_header.csv + adjust_bracken_percentages_for_unclassified_reads.py \ -k ${kraken2_report} \ -b ${sample_id}_${params.taxonomic_level}_bracken.txt \ -a ${sample_id}_${params.taxonomic_level}_bracken_abundances_unsorted.tsv \ > ${sample_id}_${params.taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv + tail -n+2 ${sample_id}_${params.taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv | \ sort -t ',' -nrk 7,7 | \ awk -F ',' 'BEGIN {OFS=FS}; {print "${sample_id}",\$0}' > ${sample_id}_${params.taxonomic_level}_bracken_abundances_data.csv + cat bracken_abundances_header.csv ${sample_id}_${params.taxonomic_level}_bracken_abundances_data.csv > ${sample_id}_${params.taxonomic_level}_bracken_abundances.csv """ } diff --git a/nextflow.config b/nextflow.config index e68f480..763ed95 
100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,7 +4,7 @@ manifest { description = 'Taxon Abundance' mainScript = 'main.nf' nextflowVersion = '>=20.01.0' - version = '0.1.4' + version = '0.1.5' } params { @@ -14,8 +14,8 @@ params { fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) samplesheet_input = 'NO_FILE' - kraken_db = '/data/ref_databases/kraken2/2021-05-17_standard' - bracken_db = '/data/ref_databases/kraken2/2021-05-17_standard' + kraken_db = '/data/ref_databases/kraken2/latest_standard' + bracken_db = '/data/ref_databases/kraken2/latest_standard' confidence = 0.0 taxonomic_level = "S" read_length = 150 @@ -23,6 +23,8 @@ params { extract_reads = false extract_reads_threshold = 1.0 skip_bracken = false + collect_outputs = false + collected_outputs_prefix = 'collected' pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) }