From 65034b59d4e25980cdba117c06913a2d4506a155 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:25:04 +0000 Subject: [PATCH 01/16] Added ability to auto-create samplesheet for phageannotator --- conf/modules.config | 15 ++ conf/test.config | 14 ++ modules.json | 5 + modules/local/mag_merge_samplesheet.nf | 27 ++++ modules/local/mag_to_samplesheet.nf | 40 +++++ modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 62 +++++++ modules/nf-core/cat/cat/meta.yml | 36 +++++ modules/nf-core/cat/cat/tests/main.nf.test | 153 ++++++++++++++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 ++++++++++++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + nextflow.config | 1 + subworkflows/local/samplesheet_creation.nf | 115 +++++++++++++ workflows/mag.nf | 7 + 16 files changed, 619 insertions(+) create mode 100644 modules/local/mag_merge_samplesheet.nf create mode 100644 modules/local/mag_to_samplesheet.nf create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/main.nf create mode 100644 modules/nf-core/cat/cat/meta.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml create mode 100644 subworkflows/local/samplesheet_creation.nf diff --git a/conf/modules.config b/conf/modules.config index cbfa51fb..28d5afc5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -741,6 +741,21 @@ process { ] } + withName: MAG_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } + + withName: 'MAG_MERGE_SAMPLESHEET' { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 9c93278f..3a5ea4f4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,4 +30,18 @@ params { busco_clean = true skip_gtdbtk = true skip_concoct = true + + // For computational efficiency + nf_core_pipeline = 'phageannotator' + coassemble_group = false + skip_binning = true + skip_prokka = true + skip_spadeshybrid = true + skip_quast = true + skip_prodigal = true + skip_krona = true + skip_adapter_trimming = true + skip_metabat2 = true + skip_maxbin2 = true + skip_busco = true } diff --git a/modules.json b/modules.json index e9162243..cbe0a14f 100644 --- a/modules.json +++ b/modules.json @@ -36,6 +36,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "cat/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf new file mode 100644 index 00000000..5ad7d01a --- /dev/null +++ b/modules/local/mag_merge_samplesheet.nf @@ -0,0 +1,27 @@ +process MAG_MERGE_SAMPLESHEET { + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path ('samplesheets/*') + + output: + path "samplesheet.csv", emit: samplesheet + path "versions.yml" , emit: versions + + script: + """ + head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv + for fileid in `ls ./samplesheets/*`; do + awk 'NR>1' \$fileid >> samplesheet.csv + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/mag_to_samplesheet.nf b/modules/local/mag_to_samplesheet.nf new file mode 100644 index 00000000..a454bb44 --- /dev/null +++ b/modules/local/mag_to_samplesheet.nf @@ -0,0 +1,40 @@ +process MAG_TO_SAMPLESHEET { + tag "$meta.id" + + executor 'local' + memory 100.MB + + input: + val meta + val pipeline + + output: + tuple val(meta), path("*samplesheet.csv"), emit: samplesheet + + exec: + // + // Create samplesheet containing metadata + // + + // Add nf-core pipeline specific entries + if (pipeline) { + if (pipeline == 'phageannotator') { + pipeline_map = [ + sample : "${meta.id}", + group : "${meta.group}", + fastq_1 : meta.fastq_1, + fastq_2 : meta.fastq_2, + fasta : meta.fasta + ] + } + } + + // Create a samplesheet + samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + + // Write samplesheet to file + def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") + samplesheet_file.text = samplesheet + +} diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 
00000000..4264a92c --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..00a8db0b --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..5766daaf --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git 
a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..423571ba --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/nextflow.config b/nextflow.config index 66314e9a..90fde95a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null single_end = false assembly_input = null + nf_core_pipeline = null // short read preprocessing options skip_clipping = false diff --git a/subworkflows/local/samplesheet_creation.nf b/subworkflows/local/samplesheet_creation.nf new file mode 100644 index 00000000..d9edb182 --- /dev/null +++ b/subworkflows/local/samplesheet_creation.nf @@ -0,0 +1,115 @@ +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' +include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' +include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' + +workflow SAMPLESHEET_CREATION { + take: + short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] + assemblies //channel: [val(meta), path(fasta)] + main: + ch_versions = Channel.empty() + + // combine assemblies by sample/group if multiple assembly methods were used + ch_assemblies = assemblies + .map { + meta, fasta -> + def meta_new = meta.subMap('id') + [ meta_new, fasta ] + } + .groupTuple() + + // + // MODULE: Combine all assemblies from a sample into one FastA file + // + ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out + ch_versions = ch_versions.mix( CAT_CAT.out.versions ) + + // if no coassembly, join FastQ and FastA by ID + if ( !params.coassemble_group ){ + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def id = meta.id + + return [ id, fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ id, group, single_end, fastq ] + }.join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] ? reads[0] : '' + meta.fastq_2 = reads[1] && !meta.single_end ? 
reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } else { + // if coassembly was used, join FastQ and FastA by group + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def group = meta.id.split('-') + + return [ group[1], fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ group, id, single_end, fastq ] + } + .join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] ? reads[0] : '' + meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } + + // + // MODULE: Stage FastQ/FastA files generated by nf-core/mag together and auto-create a samplesheet + // + MAG_TO_SAMPLESHEET ( + ch_mag_metadata, + params.nf_core_pipeline ?: '' + ) + + // + // MODULE: Create a merged samplesheet across all samples for the pipeline + // + MAG_MERGE_SAMPLESHEET ( + MAG_TO_SAMPLESHEET.out.samplesheet.collect{it[1]} + ) + ch_versions = ch_versions.mix( MAG_MERGE_SAMPLESHEET.out.versions ) + + emit: + samplesheet = ch_assemblies + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/mag.nf b/workflows/mag.nf index 160928d2..ec69017d 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -98,6 +98,7 @@ include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' include { DEPTHS } from '../subworkflows/local/depths' +include { SAMPLESHEET_CREATION } from '../subworkflows/local/samplesheet_creation' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1013,6 +1014,12 @@ workflow MAG { } } + // + // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines + // + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads_assembly, ch_assemblies ).samplesheet + ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) From 43491d863d8f30787b1139ccd7806afbcce4933d Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:30:21 +0000 Subject: [PATCH 02/16] Created tests for samplesheet creation --- .github/workflows/ci.yml | 1 + conf/test.config | 14 ----------- conf/test_samplesheet.config | 47 ++++++++++++++++++++++++++++++++++++ nextflow.config | 1 + 4 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 conf/test_samplesheet.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3aaa6f3e..f6c12464 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,7 @@ jobs: test_adapterremoval, test_binrefinement, test_virus_identification, + test_samplesheet, ] steps: - name: Free some space diff --git a/conf/test.config b/conf/test.config index 3a5ea4f4..9c93278f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,18 +30,4 @@ params { busco_clean = true skip_gtdbtk = true skip_concoct = true - - // For computational efficiency - nf_core_pipeline = 'phageannotator' - coassemble_group = false - skip_binning = true - skip_prokka = true - 
skip_spadeshybrid = true - skip_quast = true - skip_prodigal = true - skip_krona = true - skip_adapter_trimming = true - skip_metabat2 = true - skip_maxbin2 = true - skip_busco = true } diff --git a/conf/test_samplesheet.config b/conf/test_samplesheet.config new file mode 100644 index 00000000..3a5ea4f4 --- /dev/null +++ b/conf/test_samplesheet.config @@ -0,0 +1,47 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' + centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" + kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_clean = true + skip_gtdbtk = true + skip_concoct = true + + // For computational efficiency + nf_core_pipeline = 'phageannotator' + coassemble_group = false + skip_binning = true + skip_prokka = true + skip_spadeshybrid = true + skip_quast = true + skip_prodigal = true + skip_krona = true + skip_adapter_trimming = true + skip_metabat2 = true + skip_maxbin2 = true + skip_busco = true +} diff --git a/nextflow.config b/nextflow.config index 90fde95a..67611d98 100644 --- a/nextflow.config +++ b/nextflow.config @@ -314,6 +314,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_virus_identification { includeConfig 'conf/test_virus_identification.config' } + test_samplesheet { includeConfig 'conf/test_samplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From e39111e9d4adc0c85c840cf6a21e6c09fa6fa6eb Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:41:31 +0000 Subject: [PATCH 03/16] Added nf-core-pipeline parameter to schema --- nextflow_schema.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 13f7e6bc..a747c7f7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,6 +52,10 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" + }, + "nf_core_pipeline": { + "type": "string", + "description": "Create a samplesheet for the specified nf-core pipeline" } } }, @@ -530,7 +534,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50.0, + "default": 50, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). 
Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -538,7 +542,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10.0, + "default": 10, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -546,7 +550,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10.0, + "default": 10, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -560,7 +564,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1.0, + "default": 1, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." }, From 69c684e77355a233505d7fb02f97ce9f06363296 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:44:37 +0000 Subject: [PATCH 04/16] Changed input to samplesheet creation --- workflows/mag.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index ec69017d..6cd17d01 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -1017,7 +1017,7 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads_assembly, ch_assemblies ).samplesheet + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) CUSTOM_DUMPSOFTWAREVERSIONS ( From bd065320dc61f2da5873186498ddd0872db6d633 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:59:48 +0000 Subject: [PATCH 05/16] Conditional running of samplesheet creation --- workflows/mag.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index 6cd17d01..d1b9eb0e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -1017,8 +1017,10 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet - ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + if ( params.nf_core_pipeline ) { + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet + ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') From a564425b5ec04d763b2ea37edcc04d2e432115d1 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Wed, 29 Nov 2023 18:25:51 +0000 Subject: [PATCH 06/16] Updated docs and added pipeline to output file name --- CHANGELOG.md | 2 ++ README.md | 1 + conf/modules.config | 1 + docs/output.md | 15 +++++++++++++++ modules/local/mag_merge_samplesheet.nf | 7 ++++--- nextflow_schema.json | 12 +++++++----- 6 files changed, 30 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc6eed0d..65a36b27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0.html).

 ### `Added`

+- [#543](https://github.com/nf-core/mag/pull/543) - Automatic samplesheet generation for nf-core/phageannotator (@CarsonJM)
+
 ### `Changed`

 ### `Fixed`

diff --git a/README.md b/README.md
index 8a4ba8d2..76ea9508 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ The pipeline then:
 - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
 - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool)
 - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara)
+- generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported.

 Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions.
diff --git a/conf/modules.config b/conf/modules.config
index 28d5afc5..e98280f5 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -749,6 +749,7 @@ process {
     }

     withName: 'MAG_MERGE_SAMPLESHEET' {
+        ext.prefix = "${params.nf_core_pipeline}"
         publishDir = [
             path: { "${params.outdir}/samplesheet" },
             mode: params.publish_dir_mode,
diff --git a/docs/output.md b/docs/output.md
index 88aba227..681dc371 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -21,6 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
 - [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
 - [Ancient DNA](#ancient-dna)
+- [Samplesheet generation](#samplesheet-generation)
 - [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -706,6 +707,20 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc

+### Samplesheet generation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `samplesheet/`
+  - `[nf_core_pipeline].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
+
+</details>
+
+Currently, samplesheets for the following nf-core pipelines can be automatically generated:
+
+- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)-genomic sequences.
+
 ### MultiQC
diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf index 5ad7d01a..34641de6 100644 --- a/modules/local/mag_merge_samplesheet.nf +++ b/modules/local/mag_merge_samplesheet.nf @@ -9,14 +9,15 @@ process MAG_MERGE_SAMPLESHEET { path ('samplesheets/*') output: - path "samplesheet.csv", emit: samplesheet + path "*_samplesheet.csv", emit: samplesheet path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.id}" """ - head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv + head -n 1 `ls ./samplesheets/* | head -n 1` > ${prefix}_samplesheet.csv for fileid in `ls ./samplesheets/*`; do - awk 'NR>1' \$fileid >> samplesheet.csv + awk 'NR>1' \$fileid >> ${prefix}_samplesheet.csv done cat <<-END_VERSIONS > versions.yml diff --git a/nextflow_schema.json b/nextflow_schema.json index a747c7f7..a50c9ae3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -55,7 +55,9 @@ }, "nf_core_pipeline": { "type": "string", - "description": "Create a samplesheet for the specified nf-core pipeline" + "description": "Create a samplesheet for the specified nf-core pipeline", + "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", + "enum": ["phageannotator"] } } }, @@ -534,7 +536,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50, + "default": 50.0, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -542,7 +544,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10, + "default": 10.0, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -550,7 +552,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10, + "default": 10.0, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -564,7 +566,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1, + "default": 1.0, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." 
}, From 647a197e464823e650c0c6484f39a87e41cf28c4 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Thu, 14 Dec 2023 21:50:50 +0000 Subject: [PATCH 07/16] Modified names and used collectFile() --- conf/modules.config | 16 ------- conf/test_samplesheet.config | 42 ++++++++----------- docs/output.md | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- ...f => create_phageannotator_samplesheet.nf} | 37 +++++++--------- workflows/mag.nf | 32 +++++++------- 7 files changed, 51 insertions(+), 82 deletions(-) rename subworkflows/local/{samplesheet_creation.nf => create_phageannotator_samplesheet.nf} (75%) diff --git a/conf/modules.config b/conf/modules.config index e98280f5..cbfa51fb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -741,22 +741,6 @@ process { ] } - withName: MAG_TO_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - enabled: false - ] - } - - withName: 'MAG_MERGE_SAMPLESHEET' { - ext.prefix = "${params.nf_core_pipeline}" - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test_samplesheet.config b/conf/test_samplesheet.config index 3a5ea4f4..d36c87fe 100644 --- a/conf/test_samplesheet.config +++ b/conf/test_samplesheet.config @@ -20,28 +20,22 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' - centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" - kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" - skip_krona = true - min_length_unbinned_contigs = 1 - max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - busco_clean = true - skip_gtdbtk = true - skip_concoct = true - - // For computational efficiency - nf_core_pipeline = 'phageannotator' - coassemble_group = false - skip_binning = true - skip_prokka = true - skip_spadeshybrid = true - skip_quast = true - skip_prodigal = true - skip_krona = true - skip_adapter_trimming = true - skip_metabat2 = true - skip_maxbin2 = true - skip_busco = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' + generate_downstream_samplesheet = 'phageannotator' + skip_clipping = true + skip_adapter_trimming = true + keep_phix = true + centrifuge_db = null + kraken2_db = null + skip_krona = true + coassemble_group = true + megahit_fix_cpu_1 = true + skip_spadeshybrid = true + skip_spades = true + skip_quast = true + skip_prodigal = true + skip_binning = true + skip_binqc = true + skip_gtdbtk = true + skip_prokka = true } diff --git a/docs/output.md b/docs/output.md index 681dc371..a4141232 100644 --- a/docs/output.md +++ b/docs/output.md @@ -713,7 +713,7 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc Output files - `samplesheet/` - - `[nf_core_pipeline].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline + - `[generate_downstream_samplesheet].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
diff --git a/nextflow.config b/nextflow.config index 67611d98..bf879ac9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { input = null single_end = false assembly_input = null - nf_core_pipeline = null + generate_downstream_samplesheet = null // short read preprocessing options skip_clipping = false diff --git a/nextflow_schema.json b/nextflow_schema.json index a50c9ae3..1d412462 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,7 +53,7 @@ "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" }, - "nf_core_pipeline": { + "generate_downstream_samplesheet": { "type": "string", "description": "Create a samplesheet for the specified nf-core pipeline", "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", diff --git a/subworkflows/local/samplesheet_creation.nf b/subworkflows/local/create_phageannotator_samplesheet.nf similarity index 75% rename from subworkflows/local/samplesheet_creation.nf rename to subworkflows/local/create_phageannotator_samplesheet.nf index d9edb182..6423486e 100644 --- a/subworkflows/local/samplesheet_creation.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -2,7 +2,7 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' -workflow SAMPLESHEET_CREATION { +workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { take: short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] assemblies //channel: [val(meta), path(fasta)] @@ -50,8 +50,8 @@ workflow SAMPLESHEET_CREATION { meta.id = id meta.group = group meta.single_end = single_end - meta.fastq_1 = reads[0] ? reads[0] : '' - meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' meta.fasta = fasta ? fasta : '' return meta @@ -62,7 +62,7 @@ workflow SAMPLESHEET_CREATION { ch_combined_assemblies_remap = ch_combined_assemblies .map { meta, fasta -> - def group = meta.id.split('-') + def group = meta.id.split('group-') return [ group[1], fasta ] } @@ -75,17 +75,17 @@ workflow SAMPLESHEET_CREATION { return [ group, id, single_end, fastq ] } - .join ( ch_combined_assemblies_remap ) + .combine ( ch_combined_assemblies_remap, by:0 ) .map { - id, group, single_end, fastq, fasta -> + group, id, single_end, fastq, fasta -> def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] def meta = [:] meta.id = id meta.group = group meta.single_end = single_end - meta.fastq_1 = reads[0] ? reads[0] : '' - meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' meta.fasta = fasta ? 
fasta : '' return meta @@ -93,23 +93,14 @@ workflow SAMPLESHEET_CREATION { .set { ch_mag_metadata } } - // - // MODULE: Stage FastQ/FastA files generated by nf-core/mag together and auto-create a samplesheet - // - MAG_TO_SAMPLESHEET ( - ch_mag_metadata, - params.nf_core_pipeline ?: '' - ) + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] + } - // - // MODULE: Create a merged samplesheet across all samples for the pipeline - // - MAG_MERGE_SAMPLESHEET ( - MAG_TO_SAMPLESHEET.out.samplesheet.collect{it[1]} - ) - ch_versions = ch_versions.mix( MAG_MERGE_SAMPLESHEET.out.versions ) + // Merge samplesheet across all samples for the pipeline + ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") emit: - samplesheet = ch_assemblies versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/mag.nf b/workflows/mag.nf index d1b9eb0e..75e6111e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -86,19 +86,19 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' -include { BINNING } from '../subworkflows/local/binning' -include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' -include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' -include { GTDBTK } from '../subworkflows/local/gtdbtk' -include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' -include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' -include { DEPTHS } from '../subworkflows/local/depths' -include { SAMPLESHEET_CREATION } from '../subworkflows/local/samplesheet_creation' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' +include { BINNING } from '../subworkflows/local/binning' +include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' +include { BUSCO_QC } from '../subworkflows/local/busco_qc' +include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { GUNC_QC } from '../subworkflows/local/gunc_qc' +include { GTDBTK } from '../subworkflows/local/gtdbtk' +include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' +include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' +include { DEPTHS } from '../subworkflows/local/depths' +include { CREATE_PHAGEANNOTATOR_SAMPLESHEET } from '../subworkflows/local/create_phageannotator_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1017,9 +1017,9 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - if ( params.nf_core_pipeline ) { - ch_samplesheet = 
SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet - ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + if ( params.generate_downstream_samplesheet == 'phageannotator' ) { + CREATE_PHAGEANNOTATOR_SAMPLESHEET ( ch_short_reads, ch_assemblies ) + ch_versions = ch_versions.mix( CREATE_PHAGEANNOTATOR_SAMPLESHEET.out.versions ) } CUSTOM_DUMPSOFTWAREVERSIONS ( From 6028f5a100e1dd1a6e3e5b2ca93628d02a9fb7b5 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Fri, 12 Jan 2024 16:12:33 +0000 Subject: [PATCH 08/16] Remove unused local modules --- modules/local/mag_merge_samplesheet.nf | 28 ------------- modules/local/mag_to_samplesheet.nf | 40 ------------------- .../create_phageannotator_samplesheet.nf | 2 - 3 files changed, 70 deletions(-) delete mode 100644 modules/local/mag_merge_samplesheet.nf delete mode 100644 modules/local/mag_to_samplesheet.nf diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf deleted file mode 100644 index 34641de6..00000000 --- a/modules/local/mag_merge_samplesheet.nf +++ /dev/null @@ -1,28 +0,0 @@ -process MAG_MERGE_SAMPLESHEET { - - conda "conda-forge::sed=4.7" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" - - input: - path ('samplesheets/*') - - output: - path "*_samplesheet.csv", emit: samplesheet - path "versions.yml" , emit: versions - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - head -n 1 `ls ./samplesheets/* | head -n 1` > ${prefix}_samplesheet.csv - for fileid in `ls ./samplesheets/*`; do - awk 'NR>1' \$fileid >> ${prefix}_samplesheet.csv - done - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/local/mag_to_samplesheet.nf b/modules/local/mag_to_samplesheet.nf deleted file mode 100644 index a454bb44..00000000 --- a/modules/local/mag_to_samplesheet.nf +++ /dev/null @@ -1,40 +0,0 @@ -process MAG_TO_SAMPLESHEET { - tag "$meta.id" - - executor 'local' - memory 100.MB - - input: - val meta - val pipeline - - output: - tuple val(meta), path("*samplesheet.csv"), emit: samplesheet - - exec: - // - // Create samplesheet containing metadata - // - - // Add nf-core pipeline specific entries - if (pipeline) { - if (pipeline == 'phageannotator') { - pipeline_map = [ - sample : "${meta.id}", - group : "${meta.group}", - fastq_1 : meta.fastq_1, - fastq_2 : meta.fastq_2, - fasta : meta.fasta - ] - } - } - - // Create a samplesheet - samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' - samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") - - // Write samplesheet to file - def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") - samplesheet_file.text = samplesheet - -} diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf index 6423486e..1299febb 100644 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -1,6 +1,4 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' -include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' -include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { take: From 
332b7c75f084e426eb3dff26c51896f44bf33a76 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 17 Jan 2024 23:12:32 +0000 Subject: [PATCH 09/16] [automated] Fix linting with Prettier --- .devcontainer/devcontainer.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4ecfbfe3..4a9bc5c7 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" + "python.linting.pylintPath": "/opt/conda/bin/pylint", }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], + }, + }, } From 622938ad6d93c88fb4a6d9f704c307f4f63bd044 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 9 Feb 2024 14:35:13 +0100 Subject: [PATCH 10/16] Fix linting --- .devcontainer/devcontainer.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4a9bc5c7..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint", + "python.linting.pylintPath": "/opt/conda/bin/pylint" }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], - }, - }, + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } } From 7c0d30debaca6c33da8b33a0d74f0518770fc3c8 Mon Sep 17 00:00:00 2001 From: Carson J Miller <68351153+CarsonJM@users.noreply.github.com> Date: Fri, 9 Feb 2024 07:55:43 -0800 Subject: [PATCH 11/16] Update subworkflows/local/create_phageannotator_samplesheet.nf Co-authored-by: James A. 
Fellows Yates --- subworkflows/local/create_phageannotator_samplesheet.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf index 1299febb..8021c7e5 100644 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -31,6 +31,7 @@ workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { return [ id, fasta ] } + short_reads .map { meta, fastq -> From 1bc6f310e383ad0034810fb73e6f4ea1c5a5ce57 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Fri, 9 Feb 2024 23:23:59 +0000 Subject: [PATCH 12/16] Made recommended updates --- .github/workflows/ci.yml | 2 +- ...config => test_generatesamplesheet.config} | 0 nextflow.config | 5 +- nextflow_schema.json | 35 ++++-- .../create_phageannotator_samplesheet.nf | 105 ---------------- .../local/generate_downstream_samplesheet.nf | 117 ++++++++++++++++++ workflows/mag.nf | 8 +- 7 files changed, 151 insertions(+), 121 deletions(-) rename conf/{test_samplesheet.config => test_generatesamplesheet.config} (100%) delete mode 100644 subworkflows/local/create_phageannotator_samplesheet.nf create mode 100644 subworkflows/local/generate_downstream_samplesheet.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee6240d3..9d6f8e5e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: test_adapterremoval, test_binrefinement, test_virus_identification, - test_samplesheet, + test_generatesamplesheet, ] steps: - name: Free some space diff --git a/conf/test_samplesheet.config b/conf/test_generatesamplesheet.config similarity index 100% rename from conf/test_samplesheet.config rename to conf/test_generatesamplesheet.config diff --git a/nextflow.config b/nextflow.config index 5a014d04..eab8ea0d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,6 @@ params { input = null single_end = false assembly_input = null - generate_downstream_samplesheet = null // short read preprocessing options skip_clipping = false @@ -147,6 +146,10 @@ params { metaeuk_db = null save_mmseqs_db = false + // Generate downstream samplesheet options + generate_downstream_samplesheet = null + samplesheet_combine_assemblers = false + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' diff --git a/nextflow_schema.json b/nextflow_schema.json index 2cfeb15e..439a6b35 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,12 +52,6 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" - }, - "generate_downstream_samplesheet": { - "type": "string", - "description": "Create a samplesheet for the specified nf-core pipeline", - "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", - "enum": ["phageannotator"] } } }, @@ -536,7 +530,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50.0, + "default": 50, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. 
If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -544,7 +538,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10.0, + "default": 10, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -552,7 +546,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10.0, + "default": 10, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -566,7 +560,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1.0, + "default": 1, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." }, @@ -890,6 +884,24 @@ "description": "minimum number of bases supporting the alternative allele" } } + }, + "downstream_sample_sheet_generation_options": { + "title": "Downstream sample sheet generation options", + "type": "object", + "description": "Generate sample sheets for downstream nf-core pipelines", + "default": "", + "properties": { + "generate_downstream_samplesheet": { + "type": "string", + "description": "Create a samplesheet for the specified nf-core pipeline", + "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", + "enum": ["phageannotator"] + }, + "samplesheet_combine_assemblers": { + "type": "boolean", + "description": "Combine assembler outputs into one assembly file" + } + } } }, "allOf": [ @@ -937,6 +949,9 @@ }, { "$ref": "#/definitions/ancient_dna_assembly" + }, + { + "$ref": "#/definitions/downstream_sample_sheet_generation_options" } ] } diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf deleted file mode 100644 index 8021c7e5..00000000 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ /dev/null @@ -1,105 +0,0 @@ -include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' - -workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { - take: - short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] - assemblies //channel: [val(meta), path(fasta)] - main: - ch_versions = Channel.empty() - - // combine assemblies by sample/group if multiple assembly methods were used - ch_assemblies = assemblies - .map { - meta, fasta -> - def meta_new = meta.subMap('id') - [ meta_new, fasta ] - } - .groupTuple() - - // - // MODULE: Combine all assemblies from a sample into one FastA file - // - ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out - ch_versions = ch_versions.mix( CAT_CAT.out.versions ) - - // if no coassembly, join FastQ and FastA by ID - if ( !params.coassemble_group ){ - ch_combined_assemblies_remap = ch_combined_assemblies - .map { - meta, fasta -> - def id = meta.id - - return [ id, fasta ] - } - - short_reads - .map { - meta, fastq -> - def id = meta.id - def group = meta.group - def single_end = meta.single_end - - return [ id, group, single_end, fastq ] - }.join ( ch_combined_assemblies_remap ) - .map { - id, group, single_end, fastq, fasta -> - def reads = fastq instanceof List ? 
fastq.flatten() : [ fastq ] - def meta = [:] - - meta.id = id - meta.group = group - meta.single_end = single_end - meta.fastq_1 = reads[0] - meta.fastq_2 = !meta.single_end ? reads[1] : '' - meta.fasta = fasta ? fasta : '' - - return meta - } - .set { ch_mag_metadata } - } else { - // if coassembly was used, join FastQ and FastA by group - ch_combined_assemblies_remap = ch_combined_assemblies - .map { - meta, fasta -> - def group = meta.id.split('group-') - - return [ group[1], fasta ] - } - short_reads - .map { - meta, fastq -> - def id = meta.id - def group = meta.group - def single_end = meta.single_end - - return [ group, id, single_end, fastq ] - } - .combine ( ch_combined_assemblies_remap, by:0 ) - .map { - group, id, single_end, fastq, fasta -> - def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] - def meta = [:] - - meta.id = id - meta.group = group - meta.single_end = single_end - meta.fastq_1 = reads[0] - meta.fastq_2 = !meta.single_end ? reads[1] : '' - meta.fasta = fasta ? fasta : '' - - return meta - } - .set { ch_mag_metadata } - } - - // Create samplesheet for each sample using meta information - ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> - [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] - } - - // Merge samplesheet across all samples for the pipeline - ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") - - emit: - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/generate_downstream_samplesheet.nf b/subworkflows/local/generate_downstream_samplesheet.nf new file mode 100644 index 00000000..a8c165cb --- /dev/null +++ b/subworkflows/local/generate_downstream_samplesheet.nf @@ -0,0 +1,117 @@ +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' + +workflow GENERATE_DOWNSTREAM_SAMPLESHEET { + take: + downstream_nfcore_pipelines // val: [ nf-core-pipeline, OPTIONAL: other-nf-core-pipelines ] + short_reads // channel: [val(meta), path(fastq_1), path(fastq_2)] + assemblies // channel: [val(meta), path(fasta)] + main: + + ch_versions = Channel.empty() + + // + // Create a samplesheet for nf-core/phageannotator + // + if ( 'phageannotator' in downstream_nfcore_pipelines ) { + + if ( params.samplesheet_combine_assemblers ) { + // combine assemblies by sample/group if multiple assembly methods were used + ch_assemblies = assemblies + .map { + meta, fasta -> + def meta_new = meta.subMap('id') + [ meta_new, fasta ] + } + .groupTuple() + + // + // MODULE: Combine all assemblies from a sample into one FastA file + // + ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out + ch_versions = ch_versions.mix( CAT_CAT.out.versions ) + } else { + ch_combined_assemblies = assemblies + } + + // if no coassembly, join FastQ and FastA by ID + if ( !params.coassemble_group ){ + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def id = meta.id + + return [ id, fasta ] + } + + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ id, group, single_end, fastq ] + }.join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? 
fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } else { + // if coassembly was used, join FastQ and FastA by group + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def group = meta.id.split('group-') + + return [ group[1], fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ group, id, single_end, fastq ] + } + .combine ( ch_combined_assemblies_remap, by:0 ) + .map { + group, id, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } + + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] + } + + // Merge samplesheet across all samples for the pipeline + ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") + } + + emit: + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/mag.nf b/workflows/mag.nf index 3cd5f9dd..941899e4 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -98,7 +98,7 @@ include { GTDBTK } from '../subworkflows/local/gtdbtk include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' include { DEPTHS } from '../subworkflows/local/depths' -include { CREATE_PHAGEANNOTATOR_SAMPLESHEET } from '../subworkflows/local/create_phageannotator_samplesheet' +include { GENERATE_DOWNSTREAM_SAMPLESHEET } from '../subworkflows/local/generate_downstream_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1045,9 +1045,9 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - if ( params.generate_downstream_samplesheet == 'phageannotator' ) { - CREATE_PHAGEANNOTATOR_SAMPLESHEET ( ch_short_reads, ch_assemblies ) - ch_versions = ch_versions.mix( CREATE_PHAGEANNOTATOR_SAMPLESHEET.out.versions ) + if ( params.generate_downstream_samplesheet ) { + GENERATE_DOWNSTREAM_SAMPLESHEET ( params.generate_downstream_samplesheet, ch_short_reads, ch_assemblies ) + ch_versions = ch_versions.mix( GENERATE_DOWNSTREAM_SAMPLESHEET.out.versions ) } CUSTOM_DUMPSOFTWAREVERSIONS ( From dbb8dc37cf6f6555d5b0c864269670d059e5b229 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 12 Feb 2024 17:46:59 +0000 Subject: [PATCH 13/16] Fixed profile name --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index eab8ea0d..12ccf636 100644 --- a/nextflow.config +++ b/nextflow.config @@ -320,7 +320,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } 
test_virus_identification { includeConfig 'conf/test_virus_identification.config' } - test_samplesheet { includeConfig 'conf/test_samplesheet.config' } + test_generatesamplesheet { includeConfig 'conf/test_samplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 15c3f198a2e2d463af68d688a7ee823c3ea0efee Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 12 Feb 2024 17:49:25 +0000 Subject: [PATCH 14/16] Part 2 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 12ccf636..78491f57 100644 --- a/nextflow.config +++ b/nextflow.config @@ -320,7 +320,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_virus_identification { includeConfig 'conf/test_virus_identification.config' } - test_generatesamplesheet { includeConfig 'conf/test_samplesheet.config' } + test_generatesamplesheet { includeConfig 'conf/test_generatesamplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 6cda4429552b8bc24a0e1620587da4fecd098dc6 Mon Sep 17 00:00:00 2001 From: Carson J Miller <68351153+CarsonJM@users.noreply.github.com> Date: Sun, 17 Mar 2024 14:33:13 -0700 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: James A. Fellows Yates --- README.md | 2 +- docs/output.md | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6be4bd5d..cec2c6bc 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The pipeline then: - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk) - performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). -- Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) +- performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) - generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported. 
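For orientation before the next hunk: a minimal sketch of the samplesheet this feature emits and of how it would feed the downstream pipeline. The five-column layout and the downstream_samplesheets output directory come from the subworkflow in the patches above; the sample values, file names, and the --input flag are assumptions (the usual nf-core convention), not taken from these commits.

    sample,group,fastq_1,fastq_2,fasta
    sample1,0,/path/to/sample1_1.fastq.gz,/path/to/sample1_2.fastq.gz,/path/to/sample1.fasta

    # hypothetical downstream launch, assuming the standard nf-core --input parameter
    nextflow run nf-core/phageannotator --input downstream_samplesheets/phageannotator_samplesheet.csv --outdir results -profile docker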
diff --git a/docs/output.md b/docs/output.md index a4141232..34c2b1e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -719,7 +719,7 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc Currently, samplesheets for the following nf-core pipelines can be automatically generated: -- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotation, and quantifying phage sequences in (meta)-genomic sequences. +- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)-genomic sequences. Utilizes quality-controlled reads and contigs generated by nf-core/mag. ### MultiQC diff --git a/nextflow_schema.json b/nextflow_schema.json index 439a6b35..f100eac9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -899,7 +899,7 @@ }, "samplesheet_combine_assemblers": { "type": "boolean", - "description": "Combine assembler outputs into one assembly file" + "description": "Combine all contigs from all assemblies of a given sample into a single FASTA file" } } } From 1e7abe508c735bd1e8770324899a296abeaca223 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Thu, 21 Mar 2024 17:58:05 +0000 Subject: [PATCH 16/16] Copy fastq and fasta files to outdir for samplesheet generation --- .../local/generate_downstream_samplesheet.nf | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/generate_downstream_samplesheet.nf b/subworkflows/local/generate_downstream_samplesheet.nf index a8c165cb..5c21682e 100644 --- a/subworkflows/local/generate_downstream_samplesheet.nf +++ b/subworkflows/local/generate_downstream_samplesheet.nf @@ -5,6 +5,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEET { downstream_nfcore_pipelines // val: [ nf-core-pipeline, OPTIONAL: other-nf-core-pipelines ] short_reads // channel: [val(meta), path(fastq_1), path(fastq_2)] assemblies // channel: [val(meta), path(fasta)] + main: ch_versions = Channel.empty() @@ -103,10 +104,35 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEET { .set { ch_mag_metadata } } - // Create samplesheet for each sample using meta information - ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> - [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] - } + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + // Save reads and assemblies to outdir so that they are in a stable location + file(meta.fastq_1.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}") + file(meta.fasta, checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}") + if ( !meta.single_end ){ + file(meta.fastq_2.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}") + [ "${meta.id}_phageannotator_samplesheet.csv", + "sample,group,fastq_1,fastq_2,fasta" + + '\n' + + "${meta.id},${meta.group}," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}").toString() + "," + + file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() + + '\n' + ] + } else { + // Single-end reads: leave the fastq_2 column empty
+ [ "${meta.id}_phageannotator_samplesheet.csv", + "sample,group,fastq_1,fastq_2,fasta" + + '\n' + + "${meta.id},${meta.group}," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," + + "," + + file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() + + '\n' + ] + } + } // Merge samplesheet across all samples for the pipeline ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/")
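A note on the merge step above: collectFile with keepHeader: true and skip: 1 skips the first line of every collected per-sample CSV but retains it once, from the first file, so the merged samplesheet ends up with exactly one header row. Below is a self-contained sketch of that behaviour; the file names and rows are hypothetical, not part of the pipeline.

    // merge_sketch.nf: illustrative only; file names and sample rows are made up
    workflow {
        Channel
            .of(
                [ 'A_samplesheet.csv', 'sample,group,fastq_1,fastq_2,fasta\nA,0,A_1.fastq.gz,A_2.fastq.gz,A.fasta\n' ],
                [ 'B_samplesheet.csv', 'sample,group,fastq_1,fastq_2,fasta\nB,0,B_1.fastq.gz,B_2.fastq.gz,B.fasta\n' ]
            )
            // write one CSV file per distinct name
            .collectFile { item -> [ item[0], item[1] ] }
            // concatenate them, keeping the header line of the first file only
            .collectFile( name: 'merged_samplesheet.csv', keepHeader: true, skip: 1 )
            .view { it.text }
    }

Running this prints a single header line followed by the A and B rows, which is the shape the storeDir output above relies on.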