From 65034b59d4e25980cdba117c06913a2d4506a155 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:25:04 +0000 Subject: [PATCH 01/16] Added ability to auto-create samplesheet for phageannotator --- conf/modules.config | 15 ++ conf/test.config | 14 ++ modules.json | 5 + modules/local/mag_merge_samplesheet.nf | 27 ++++ modules/local/mag_to_samplesheet.nf | 40 +++++ modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 62 +++++++ modules/nf-core/cat/cat/meta.yml | 36 +++++ modules/nf-core/cat/cat/tests/main.nf.test | 153 ++++++++++++++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 ++++++++++++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + nextflow.config | 1 + subworkflows/local/samplesheet_creation.nf | 115 +++++++++++++ workflows/mag.nf | 7 + 16 files changed, 619 insertions(+) create mode 100644 modules/local/mag_merge_samplesheet.nf create mode 100644 modules/local/mag_to_samplesheet.nf create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/main.nf create mode 100644 modules/nf-core/cat/cat/meta.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml create mode 100644 subworkflows/local/samplesheet_creation.nf diff --git a/conf/modules.config b/conf/modules.config index cbfa51fb..28d5afc5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -741,6 +741,21 @@ process { ] } + withName: MAG_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } + + withName: 'MAG_MERGE_SAMPLESHEET' { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 9c93278f..3a5ea4f4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,4 +30,18 @@ params { busco_clean = true skip_gtdbtk = true skip_concoct = true + + // For computational efficiency + nf_core_pipeline = 'phageannotator' + coassemble_group = false + skip_binning = true + skip_prokka = true + skip_spadeshybrid = true + skip_quast = true + skip_prodigal = true + skip_krona = true + skip_adapter_trimming = true + skip_metabat2 = true + skip_maxbin2 = true + skip_busco = true } diff --git a/modules.json b/modules.json index e9162243..cbe0a14f 100644 --- a/modules.json +++ b/modules.json @@ -36,6 +36,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "cat/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf new file mode 100644 index 00000000..5ad7d01a --- /dev/null +++ b/modules/local/mag_merge_samplesheet.nf @@ -0,0 +1,27 @@ +process MAG_MERGE_SAMPLESHEET { + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path ('samplesheets/*') + + output: + path "samplesheet.csv", emit: samplesheet + path "versions.yml" , emit: versions + + script: + """ + head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv + for fileid in `ls ./samplesheets/*`; do + awk 'NR>1' \$fileid >> samplesheet.csv + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/mag_to_samplesheet.nf b/modules/local/mag_to_samplesheet.nf new file mode 100644 index 00000000..a454bb44 --- /dev/null +++ b/modules/local/mag_to_samplesheet.nf @@ -0,0 +1,40 @@ +process MAG_TO_SAMPLESHEET { + tag "$meta.id" + + executor 'local' + memory 100.MB + + input: + val meta + val pipeline + + output: + tuple val(meta), path("*samplesheet.csv"), emit: samplesheet + + exec: + // + // Create samplesheet containing metadata + // + + // Add nf-core pipeline specific entries + if (pipeline) { + if (pipeline == 'phageannotator') { + pipeline_map = [ + sample : "${meta.id}", + group : "${meta.group}", + fastq_1 : meta.fastq_1, + fastq_2 : meta.fastq_2, + fasta : meta.fasta + ] + } + } + + // Create a samplesheet + samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + + // Write samplesheet to file + def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") + samplesheet_file.text = samplesheet + +} diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 
00000000..4264a92c --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..00a8db0b --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..5766daaf --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git 
a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..423571ba --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/nextflow.config b/nextflow.config index 66314e9a..90fde95a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null single_end = false assembly_input = null + nf_core_pipeline = null // short read preprocessing options skip_clipping = false diff --git a/subworkflows/local/samplesheet_creation.nf b/subworkflows/local/samplesheet_creation.nf new file mode 100644 index 00000000..d9edb182 --- /dev/null +++ b/subworkflows/local/samplesheet_creation.nf @@ -0,0 +1,115 @@ +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' +include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' +include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' + +workflow SAMPLESHEET_CREATION { + take: + short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] + assemblies //channel: [val(meta), path(fasta)] + main: + ch_versions = Channel.empty() + + // combine assemblies by sample/group if multiple assembly methods were used + ch_assemblies = assemblies + .map { + meta, fasta -> + def meta_new = meta.subMap('id') + [ meta_new, fasta ] + } + .groupTuple() + + // + // MODULE: Combine all assemblies from a sample into one FastA file + // + ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out + ch_versions = ch_versions.mix( CAT_CAT.out.versions ) + + // if no coassembly, join FastQ and FastA by ID + if ( !params.coassemble_group ){ + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def id = meta.id + + return [ id, fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ id, group, single_end, fastq ] + }.join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] ? reads[0] : '' + meta.fastq_2 = reads[1] && !meta.single_end ? 
reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } else { + // if coassembly was used, join FastQ and FastA by group + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def group = meta.id.split('-') + + return [ group[1], fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ group, id, single_end, fastq ] + } + .join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] ? reads[0] : '' + meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } + + // + // MODULE: Stage FastQ/FastA files generated by nf-core/mag together and auto-create a samplesheet + // + MAG_TO_SAMPLESHEET ( + ch_mag_metadata, + params.nf_core_pipeline ?: '' + ) + + // + // MODULE: Create a merged samplesheet across all samples for the pipeline + // + MAG_MERGE_SAMPLESHEET ( + MAG_TO_SAMPLESHEET.out.samplesheet.collect{it[1]} + ) + ch_versions = ch_versions.mix( MAG_MERGE_SAMPLESHEET.out.versions ) + + emit: + samplesheet = ch_assemblies + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/mag.nf b/workflows/mag.nf index 160928d2..ec69017d 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -98,6 +98,7 @@ include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' include { DEPTHS } from '../subworkflows/local/depths' +include { SAMPLESHEET_CREATION } from '../subworkflows/local/samplesheet_creation' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1013,6 +1014,12 @@ workflow MAG { } } + // + // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines + // + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads_assembly, ch_assemblies ).samplesheet + ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) From 43491d863d8f30787b1139ccd7806afbcce4933d Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:30:21 +0000 Subject: [PATCH 02/16] Created tests for samplesheet creation --- .github/workflows/ci.yml | 1 + conf/test.config | 14 ----------- conf/test_samplesheet.config | 47 ++++++++++++++++++++++++++++++++++++ nextflow.config | 1 + 4 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 conf/test_samplesheet.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3aaa6f3e..f6c12464 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,7 @@ jobs: test_adapterremoval, test_binrefinement, test_virus_identification, + test_samplesheet, ] steps: - name: Free some space diff --git a/conf/test.config b/conf/test.config index 3a5ea4f4..9c93278f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,18 +30,4 @@ params { busco_clean = true skip_gtdbtk = true skip_concoct = true - - // For computational efficiency - nf_core_pipeline = 'phageannotator' - coassemble_group = false - skip_binning = true - skip_prokka = true - 
skip_spadeshybrid = true - skip_quast = true - skip_prodigal = true - skip_krona = true - skip_adapter_trimming = true - skip_metabat2 = true - skip_maxbin2 = true - skip_busco = true } diff --git a/conf/test_samplesheet.config b/conf/test_samplesheet.config new file mode 100644 index 00000000..3a5ea4f4 --- /dev/null +++ b/conf/test_samplesheet.config @@ -0,0 +1,47 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' + centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" + kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_clean = true + skip_gtdbtk = true + skip_concoct = true + + // For computational efficiency + nf_core_pipeline = 'phageannotator' + coassemble_group = false + skip_binning = true + skip_prokka = true + skip_spadeshybrid = true + skip_quast = true + skip_prodigal = true + skip_krona = true + skip_adapter_trimming = true + skip_metabat2 = true + skip_maxbin2 = true + skip_busco = true +} diff --git a/nextflow.config b/nextflow.config index 90fde95a..67611d98 100644 --- a/nextflow.config +++ b/nextflow.config @@ -314,6 +314,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_virus_identification { includeConfig 'conf/test_virus_identification.config' } + test_samplesheet { includeConfig 'conf/test_samplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From e39111e9d4adc0c85c840cf6a21e6c09fa6fa6eb Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:41:31 +0000 Subject: [PATCH 03/16] Added nf-core-pipeline parameter to schema --- nextflow_schema.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 13f7e6bc..a747c7f7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,6 +52,10 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" + }, + "nf_core_pipeline": { + "type": "string", + "description": "Create a samplesheet for the specified nf-core pipeline" } } }, @@ -530,7 +534,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50.0, + "default": 50, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). 
Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -538,7 +542,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10.0, + "default": 10, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -546,7 +550,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10.0, + "default": 10, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -560,7 +564,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1.0, + "default": 1, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." }, From 69c684e77355a233505d7fb02f97ce9f06363296 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:44:37 +0000 Subject: [PATCH 04/16] Changed input to samplesheet creation --- workflows/mag.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index ec69017d..6cd17d01 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -1017,7 +1017,7 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads_assembly, ch_assemblies ).samplesheet + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) CUSTOM_DUMPSOFTWAREVERSIONS ( From bd065320dc61f2da5873186498ddd0872db6d633 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 20 Nov 2023 22:59:48 +0000 Subject: [PATCH 05/16] Conditional running of samplesheet creation --- workflows/mag.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index 6cd17d01..d1b9eb0e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -1017,8 +1017,10 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet - ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + if ( params.nf_core_pipeline ) { + ch_samplesheet = SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet + ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') From a564425b5ec04d763b2ea37edcc04d2e432115d1 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Wed, 29 Nov 2023 18:25:51 +0000 Subject: [PATCH 06/16] Updated docs and added pipeline to output file name --- CHANGELOG.md | 2 ++ README.md | 1 + conf/modules.config | 1 + docs/output.md | 15 +++++++++++++++ modules/local/mag_merge_samplesheet.nf | 7 ++++--- nextflow_schema.json | 12 +++++++----- 6 files changed, 30 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc6eed0d..65a36b27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic 
Versioning](https://semver.org/spec/v2.0.0.html).

 ### `Added`

+- [#543](https://github.com/nf-core/mag/pull/543) - Automatic samplesheet generation for nf-core/phageannotator (@CarsonJM)
+
 ### `Changed`

 ### `Fixed`

diff --git a/README.md b/README.md
index 8a4ba8d2..76ea9508 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ The pipeline then:
 - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
 - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool)
 - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara)
+- generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported.

 Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions.
diff --git a/conf/modules.config b/conf/modules.config
index 28d5afc5..e98280f5 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -749,6 +749,7 @@ process {
     }

     withName: 'MAG_MERGE_SAMPLESHEET' {
+        ext.prefix = "${params.nf_core_pipeline}"
         publishDir = [
             path: { "${params.outdir}/samplesheet" },
             mode: params.publish_dir_mode,
diff --git a/docs/output.md b/docs/output.md
index 88aba227..681dc371 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -21,6 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
 - [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
 - [Ancient DNA](#ancient-dna)
+- [Samplesheet generation](#samplesheet-generation)
 - [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -706,6 +707,20 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc

+### Samplesheet generation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `samplesheet/`
+  - `[nf_core_pipeline].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
+
+</details>
+
+Currently, samplesheets for the following nf-core pipelines can be automatically generated:
+
+- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)-genomic sequences.
+
 ### MultiQC
diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf index 5ad7d01a..34641de6 100644 --- a/modules/local/mag_merge_samplesheet.nf +++ b/modules/local/mag_merge_samplesheet.nf @@ -9,14 +9,15 @@ process MAG_MERGE_SAMPLESHEET { path ('samplesheets/*') output: - path "samplesheet.csv", emit: samplesheet + path "*_samplesheet.csv", emit: samplesheet path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.id}" """ - head -n 1 `ls ./samplesheets/* | head -n 1` > samplesheet.csv + head -n 1 `ls ./samplesheets/* | head -n 1` > ${prefix}_samplesheet.csv for fileid in `ls ./samplesheets/*`; do - awk 'NR>1' \$fileid >> samplesheet.csv + awk 'NR>1' \$fileid >> ${prefix}_samplesheet.csv done cat <<-END_VERSIONS > versions.yml diff --git a/nextflow_schema.json b/nextflow_schema.json index a747c7f7..a50c9ae3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -55,7 +55,9 @@ }, "nf_core_pipeline": { "type": "string", - "description": "Create a samplesheet for the specified nf-core pipeline" + "description": "Create a samplesheet for the specified nf-core pipeline", + "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", + "enum": ["phageannotator"] } } }, @@ -534,7 +536,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50, + "default": 50.0, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -542,7 +544,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10, + "default": 10.0, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -550,7 +552,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10, + "default": 10.0, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -564,7 +566,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1, + "default": 1.0, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." 
}, From 647a197e464823e650c0c6484f39a87e41cf28c4 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Thu, 14 Dec 2023 21:50:50 +0000 Subject: [PATCH 07/16] Modified names and used collectFile() --- conf/modules.config | 16 ------- conf/test_samplesheet.config | 42 ++++++++----------- docs/output.md | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- ...f => create_phageannotator_samplesheet.nf} | 37 +++++++--------- workflows/mag.nf | 32 +++++++------- 7 files changed, 51 insertions(+), 82 deletions(-) rename subworkflows/local/{samplesheet_creation.nf => create_phageannotator_samplesheet.nf} (75%) diff --git a/conf/modules.config b/conf/modules.config index e98280f5..cbfa51fb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -741,22 +741,6 @@ process { ] } - withName: MAG_TO_SAMPLESHEET { - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - enabled: false - ] - } - - withName: 'MAG_MERGE_SAMPLESHEET' { - ext.prefix = "${params.nf_core_pipeline}" - publishDir = [ - path: { "${params.outdir}/samplesheet" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test_samplesheet.config b/conf/test_samplesheet.config index 3a5ea4f4..d36c87fe 100644 --- a/conf/test_samplesheet.config +++ b/conf/test_samplesheet.config @@ -20,28 +20,22 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' - centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" - kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" - skip_krona = true - min_length_unbinned_contigs = 1 - max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - busco_clean = true - skip_gtdbtk = true - skip_concoct = true - - // For computational efficiency - nf_core_pipeline = 'phageannotator' - coassemble_group = false - skip_binning = true - skip_prokka = true - skip_spadeshybrid = true - skip_quast = true - skip_prodigal = true - skip_krona = true - skip_adapter_trimming = true - skip_metabat2 = true - skip_maxbin2 = true - skip_busco = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv' + generate_downstream_samplesheet = 'phageannotator' + skip_clipping = true + skip_adapter_trimming = true + keep_phix = true + centrifuge_db = null + kraken2_db = null + skip_krona = true + coassemble_group = true + megahit_fix_cpu_1 = true + skip_spadeshybrid = true + skip_spades = true + skip_quast = true + skip_prodigal = true + skip_binning = true + skip_binqc = true + skip_gtdbtk = true + skip_prokka = true } diff --git a/docs/output.md b/docs/output.md index 681dc371..a4141232 100644 --- a/docs/output.md +++ b/docs/output.md @@ -713,7 +713,7 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc Output files - `samplesheet/` - - `[nf_core_pipeline].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline + - `[generate_downstream_samplesheet].csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
diff --git a/nextflow.config b/nextflow.config index 67611d98..bf879ac9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { input = null single_end = false assembly_input = null - nf_core_pipeline = null + generate_downstream_samplesheet = null // short read preprocessing options skip_clipping = false diff --git a/nextflow_schema.json b/nextflow_schema.json index a50c9ae3..1d412462 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -53,7 +53,7 @@ "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" }, - "nf_core_pipeline": { + "generate_downstream_samplesheet": { "type": "string", "description": "Create a samplesheet for the specified nf-core pipeline", "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", diff --git a/subworkflows/local/samplesheet_creation.nf b/subworkflows/local/create_phageannotator_samplesheet.nf similarity index 75% rename from subworkflows/local/samplesheet_creation.nf rename to subworkflows/local/create_phageannotator_samplesheet.nf index d9edb182..6423486e 100644 --- a/subworkflows/local/samplesheet_creation.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -2,7 +2,7 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' -workflow SAMPLESHEET_CREATION { +workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { take: short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] assemblies //channel: [val(meta), path(fasta)] @@ -50,8 +50,8 @@ workflow SAMPLESHEET_CREATION { meta.id = id meta.group = group meta.single_end = single_end - meta.fastq_1 = reads[0] ? reads[0] : '' - meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' meta.fasta = fasta ? fasta : '' return meta @@ -62,7 +62,7 @@ workflow SAMPLESHEET_CREATION { ch_combined_assemblies_remap = ch_combined_assemblies .map { meta, fasta -> - def group = meta.id.split('-') + def group = meta.id.split('group-') return [ group[1], fasta ] } @@ -75,17 +75,17 @@ workflow SAMPLESHEET_CREATION { return [ group, id, single_end, fastq ] } - .join ( ch_combined_assemblies_remap ) + .combine ( ch_combined_assemblies_remap, by:0 ) .map { - id, group, single_end, fastq, fasta -> + group, id, single_end, fastq, fasta -> def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] def meta = [:] meta.id = id meta.group = group meta.single_end = single_end - meta.fastq_1 = reads[0] ? reads[0] : '' - meta.fastq_2 = reads[1] && !meta.single_end ? reads[1] : '' + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' meta.fasta = fasta ? 
fasta : '' return meta @@ -93,23 +93,14 @@ workflow SAMPLESHEET_CREATION { .set { ch_mag_metadata } } - // - // MODULE: Stage FastQ/FastA files generated by nf-core/mag together and auto-create a samplesheet - // - MAG_TO_SAMPLESHEET ( - ch_mag_metadata, - params.nf_core_pipeline ?: '' - ) + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] + } - // - // MODULE: Create a merged samplesheet across all samples for the pipeline - // - MAG_MERGE_SAMPLESHEET ( - MAG_TO_SAMPLESHEET.out.samplesheet.collect{it[1]} - ) - ch_versions = ch_versions.mix( MAG_MERGE_SAMPLESHEET.out.versions ) + // Merge samplesheet across all samples for the pipeline + ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") emit: - samplesheet = ch_assemblies versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/mag.nf b/workflows/mag.nf index d1b9eb0e..75e6111e 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -86,19 +86,19 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' -include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' -include { BINNING } from '../subworkflows/local/binning' -include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' -include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' -include { GTDBTK } from '../subworkflows/local/gtdbtk' -include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' -include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' -include { DEPTHS } from '../subworkflows/local/depths' -include { SAMPLESHEET_CREATION } from '../subworkflows/local/samplesheet_creation' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' +include { BINNING } from '../subworkflows/local/binning' +include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' +include { BUSCO_QC } from '../subworkflows/local/busco_qc' +include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { GUNC_QC } from '../subworkflows/local/gunc_qc' +include { GTDBTK } from '../subworkflows/local/gtdbtk' +include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' +include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' +include { DEPTHS } from '../subworkflows/local/depths' +include { CREATE_PHAGEANNOTATOR_SAMPLESHEET } from '../subworkflows/local/create_phageannotator_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1017,9 +1017,9 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - if ( params.nf_core_pipeline ) { - ch_samplesheet = 
SAMPLESHEET_CREATION ( ch_short_reads, ch_assemblies ).samplesheet - ch_versions = ch_versions.mix( SAMPLESHEET_CREATION.out.versions ) + if ( params.generate_downstream_samplesheet == 'phageannotator' ) { + CREATE_PHAGEANNOTATOR_SAMPLESHEET ( ch_short_reads, ch_assemblies ) + ch_versions = ch_versions.mix( CREATE_PHAGEANNOTATOR_SAMPLESHEET.out.versions ) } CUSTOM_DUMPSOFTWAREVERSIONS ( From 6028f5a100e1dd1a6e3e5b2ca93628d02a9fb7b5 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Fri, 12 Jan 2024 16:12:33 +0000 Subject: [PATCH 08/16] Remove unused local modules --- modules/local/mag_merge_samplesheet.nf | 28 ------------- modules/local/mag_to_samplesheet.nf | 40 ------------------- .../create_phageannotator_samplesheet.nf | 2 - 3 files changed, 70 deletions(-) delete mode 100644 modules/local/mag_merge_samplesheet.nf delete mode 100644 modules/local/mag_to_samplesheet.nf diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf deleted file mode 100644 index 34641de6..00000000 --- a/modules/local/mag_merge_samplesheet.nf +++ /dev/null @@ -1,28 +0,0 @@ -process MAG_MERGE_SAMPLESHEET { - - conda "conda-forge::sed=4.7" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" - - input: - path ('samplesheets/*') - - output: - path "*_samplesheet.csv", emit: samplesheet - path "versions.yml" , emit: versions - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - head -n 1 `ls ./samplesheets/* | head -n 1` > ${prefix}_samplesheet.csv - for fileid in `ls ./samplesheets/*`; do - awk 'NR>1' \$fileid >> ${prefix}_samplesheet.csv - done - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/local/mag_to_samplesheet.nf b/modules/local/mag_to_samplesheet.nf deleted file mode 100644 index a454bb44..00000000 --- a/modules/local/mag_to_samplesheet.nf +++ /dev/null @@ -1,40 +0,0 @@ -process MAG_TO_SAMPLESHEET { - tag "$meta.id" - - executor 'local' - memory 100.MB - - input: - val meta - val pipeline - - output: - tuple val(meta), path("*samplesheet.csv"), emit: samplesheet - - exec: - // - // Create samplesheet containing metadata - // - - // Add nf-core pipeline specific entries - if (pipeline) { - if (pipeline == 'phageannotator') { - pipeline_map = [ - sample : "${meta.id}", - group : "${meta.group}", - fastq_1 : meta.fastq_1, - fastq_2 : meta.fastq_2, - fasta : meta.fasta - ] - } - } - - // Create a samplesheet - samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' - samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") - - // Write samplesheet to file - def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") - samplesheet_file.text = samplesheet - -} diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf index 6423486e..1299febb 100644 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -1,6 +1,4 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' -include { MAG_TO_SAMPLESHEET } from '../../modules/local/mag_to_samplesheet' -include { MAG_MERGE_SAMPLESHEET } from '../../modules/local/mag_merge_samplesheet' workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { take: From 
332b7c75f084e426eb3dff26c51896f44bf33a76 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 17 Jan 2024 23:12:32 +0000 Subject: [PATCH 09/16] [automated] Fix linting with Prettier --- .devcontainer/devcontainer.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4ecfbfe3..4a9bc5c7 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint" + "python.linting.pylintPath": "/opt/conda/bin/pylint", }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } - } + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], + }, + }, } From 622938ad6d93c88fb4a6d9f704c307f4f63bd044 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Fri, 9 Feb 2024 14:35:13 +0100 Subject: [PATCH 10/16] Fix linting --- .devcontainer/devcontainer.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4a9bc5c7..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,11 +18,11 @@ "python.linting.flake8Path": "/opt/conda/bin/flake8", "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", - "python.linting.pylintPath": "/opt/conda/bin/pylint", + "python.linting.pylintPath": "/opt/conda/bin/pylint" }, // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"], - }, - }, + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } } From 7c0d30debaca6c33da8b33a0d74f0518770fc3c8 Mon Sep 17 00:00:00 2001 From: Carson J Miller <68351153+CarsonJM@users.noreply.github.com> Date: Fri, 9 Feb 2024 07:55:43 -0800 Subject: [PATCH 11/16] Update subworkflows/local/create_phageannotator_samplesheet.nf Co-authored-by: James A. 
Fellows Yates --- subworkflows/local/create_phageannotator_samplesheet.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf index 1299febb..8021c7e5 100644 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ b/subworkflows/local/create_phageannotator_samplesheet.nf @@ -31,6 +31,7 @@ workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { return [ id, fasta ] } + short_reads .map { meta, fastq -> From 1bc6f310e383ad0034810fb73e6f4ea1c5a5ce57 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Fri, 9 Feb 2024 23:23:59 +0000 Subject: [PATCH 12/16] Made recommended updates --- .github/workflows/ci.yml | 2 +- ...config => test_generatesamplesheet.config} | 0 nextflow.config | 5 +- nextflow_schema.json | 35 ++++-- .../create_phageannotator_samplesheet.nf | 105 ---------------- .../local/generate_downstream_samplesheet.nf | 117 ++++++++++++++++++ workflows/mag.nf | 8 +- 7 files changed, 151 insertions(+), 121 deletions(-) rename conf/{test_samplesheet.config => test_generatesamplesheet.config} (100%) delete mode 100644 subworkflows/local/create_phageannotator_samplesheet.nf create mode 100644 subworkflows/local/generate_downstream_samplesheet.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ee6240d3..9d6f8e5e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,7 +62,7 @@ jobs: test_adapterremoval, test_binrefinement, test_virus_identification, - test_samplesheet, + test_generatesamplesheet, ] steps: - name: Free some space diff --git a/conf/test_samplesheet.config b/conf/test_generatesamplesheet.config similarity index 100% rename from conf/test_samplesheet.config rename to conf/test_generatesamplesheet.config diff --git a/nextflow.config b/nextflow.config index 5a014d04..eab8ea0d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,6 @@ params { input = null single_end = false assembly_input = null - generate_downstream_samplesheet = null // short read preprocessing options skip_clipping = false @@ -147,6 +146,10 @@ params { metaeuk_db = null save_mmseqs_db = false + // Generate downstream samplesheet options + generate_downstream_samplesheet = null + samplesheet_combine_assemblers = false + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' diff --git a/nextflow_schema.json b/nextflow_schema.json index 2cfeb15e..439a6b35 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -52,12 +52,6 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" - }, - "generate_downstream_samplesheet": { - "type": "string", - "description": "Create a samplesheet for the specified nf-core pipeline", - "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", - "enum": ["phageannotator"] } } }, @@ -536,7 +530,7 @@ }, "gtdbtk_min_completeness": { "type": "number", - "default": 50.0, + "default": 50, "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. 
If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", "minimum": 0.01, @@ -544,7 +538,7 @@ }, "gtdbtk_max_contamination": { "type": "number", - "default": 10.0, + "default": 10, "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", "minimum": 0, @@ -552,7 +546,7 @@ }, "gtdbtk_min_perc_aa": { "type": "number", - "default": 10.0, + "default": 10, "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", "minimum": 0, "maximum": 100 @@ -566,7 +560,7 @@ }, "gtdbtk_pplacer_cpus": { "type": "number", - "default": 1.0, + "default": 1, "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." }, @@ -890,6 +884,24 @@ "description": "minimum number of bases supporting the alternative allele" } } + }, + "downstream_sample_sheet_generation_options": { + "title": "Downstream sample sheet generation options", + "type": "object", + "description": "Generate sample sheets for downstream nf-core pipelines", + "default": "", + "properties": { + "generate_downstream_samplesheet": { + "type": "string", + "description": "Create a samplesheet for the specified nf-core pipeline", + "help_text": "Automatically generate a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline.", + "enum": ["phageannotator"] + }, + "samplesheet_combine_assemblers": { + "type": "boolean", + "description": "Combine assembler outputs into one assembly file" + } + } } }, "allOf": [ @@ -937,6 +949,9 @@ }, { "$ref": "#/definitions/ancient_dna_assembly" + }, + { + "$ref": "#/definitions/downstream_sample_sheet_generation_options" } ] } diff --git a/subworkflows/local/create_phageannotator_samplesheet.nf b/subworkflows/local/create_phageannotator_samplesheet.nf deleted file mode 100644 index 8021c7e5..00000000 --- a/subworkflows/local/create_phageannotator_samplesheet.nf +++ /dev/null @@ -1,105 +0,0 @@ -include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' - -workflow CREATE_PHAGEANNOTATOR_SAMPLESHEET { - take: - short_reads //channel: [val(meta), path(fastq_1), path(fastq_2)] - assemblies //channel: [val(meta), path(fasta)] - main: - ch_versions = Channel.empty() - - // combine assemblies by sample/group if multiple assembly methods were used - ch_assemblies = assemblies - .map { - meta, fasta -> - def meta_new = meta.subMap('id') - [ meta_new, fasta ] - } - .groupTuple() - - // - // MODULE: Combine all assemblies from a sample into one FastA file - // - ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out - ch_versions = ch_versions.mix( CAT_CAT.out.versions ) - - // if no coassembly, join FastQ and FastA by ID - if ( !params.coassemble_group ){ - ch_combined_assemblies_remap = ch_combined_assemblies - .map { - meta, fasta -> - def id = meta.id - - return [ id, fasta ] - } - - short_reads - .map { - meta, fastq -> - def id = meta.id - def group = meta.group - def single_end = meta.single_end - - return [ id, group, single_end, fastq ] - }.join ( ch_combined_assemblies_remap ) - .map { - id, group, single_end, fastq, fasta -> - def reads = fastq instanceof List ? 
fastq.flatten() : [ fastq ] - def meta = [:] - - meta.id = id - meta.group = group - meta.single_end = single_end - meta.fastq_1 = reads[0] - meta.fastq_2 = !meta.single_end ? reads[1] : '' - meta.fasta = fasta ? fasta : '' - - return meta - } - .set { ch_mag_metadata } - } else { - // if coassembly was used, join FastQ and FastA by group - ch_combined_assemblies_remap = ch_combined_assemblies - .map { - meta, fasta -> - def group = meta.id.split('group-') - - return [ group[1], fasta ] - } - short_reads - .map { - meta, fastq -> - def id = meta.id - def group = meta.group - def single_end = meta.single_end - - return [ group, id, single_end, fastq ] - } - .combine ( ch_combined_assemblies_remap, by:0 ) - .map { - group, id, single_end, fastq, fasta -> - def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] - def meta = [:] - - meta.id = id - meta.group = group - meta.single_end = single_end - meta.fastq_1 = reads[0] - meta.fastq_2 = !meta.single_end ? reads[1] : '' - meta.fasta = fasta ? fasta : '' - - return meta - } - .set { ch_mag_metadata } - } - - // Create samplesheet for each sample using meta information - ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> - [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] - } - - // Merge samplesheet across all samples for the pipeline - ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") - - emit: - versions = ch_versions // channel: [ versions.yml ] -} diff --git a/subworkflows/local/generate_downstream_samplesheet.nf b/subworkflows/local/generate_downstream_samplesheet.nf new file mode 100644 index 00000000..a8c165cb --- /dev/null +++ b/subworkflows/local/generate_downstream_samplesheet.nf @@ -0,0 +1,117 @@ +include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' + +workflow GENERATE_DOWNSTREAM_SAMPLESHEET { + take: + downstream_nfcore_pipelines // val: [ nf-core-pipeline, OPTIONAL: other-nf-core-pipelines ] + short_reads // channel: [val(meta), path(fastq_1), path(fastq_2)] + assemblies // channel: [val(meta), path(fasta)] + main: + + ch_versions = Channel.empty() + + // + // Create a samplesheet for nf-core/phageannotator + // + if ( 'phageannotator' in downstream_nfcore_pipelines ) { + + if ( params.samplesheet_combine_assemblers ) { + // combine assemblies by sample/group if multiple assembly methods were used + ch_assemblies = assemblies + .map { + meta, fasta -> + def meta_new = meta.subMap('id') + [ meta_new, fasta ] + } + .groupTuple() + + // + // MODULE: Combine all assemblies from a sample into one FastA file + // + ch_combined_assemblies = CAT_CAT ( ch_assemblies ).file_out + ch_versions = ch_versions.mix( CAT_CAT.out.versions ) + } else { + ch_combined_assemblies = assemblies + } + + // if no coassembly, join FastQ and FastA by ID + if ( !params.coassemble_group ){ + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def id = meta.id + + return [ id, fasta ] + } + + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ id, group, single_end, fastq ] + }.join ( ch_combined_assemblies_remap ) + .map { + id, group, single_end, fastq, fasta -> + def reads = fastq instanceof List ? 
fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } else { + // if coassembly was used, join FastQ and FastA by group + ch_combined_assemblies_remap = ch_combined_assemblies + .map { + meta, fasta -> + def group = meta.id.split('group-') + + return [ group[1], fasta ] + } + short_reads + .map { + meta, fastq -> + def id = meta.id + def group = meta.group + def single_end = meta.single_end + + return [ group, id, single_end, fastq ] + } + .combine ( ch_combined_assemblies_remap, by:0 ) + .map { + group, id, single_end, fastq, fasta -> + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] + def meta = [:] + + meta.id = id + meta.group = group + meta.single_end = single_end + meta.fastq_1 = reads[0] + meta.fastq_2 = !meta.single_end ? reads[1] : '' + meta.fasta = fasta ? fasta : '' + + return meta + } + .set { ch_mag_metadata } + } + + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] + } + + // Merge samplesheet across all samples for the pipeline + ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/") + } + + emit: + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/mag.nf b/workflows/mag.nf index 3cd5f9dd..941899e4 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -98,7 +98,7 @@ include { GTDBTK } from '../subworkflows/local/gtdbtk include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' include { DEPTHS } from '../subworkflows/local/depths' -include { CREATE_PHAGEANNOTATOR_SAMPLESHEET } from '../subworkflows/local/create_phageannotator_samplesheet' +include { GENERATE_DOWNSTREAM_SAMPLESHEET } from '../subworkflows/local/generate_downstream_samplesheet' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1045,9 +1045,9 @@ workflow MAG { // // SUBWORKFLOW: Auto-create samplesheets for downstream nf-core pipelines // - if ( params.generate_downstream_samplesheet == 'phageannotator' ) { - CREATE_PHAGEANNOTATOR_SAMPLESHEET ( ch_short_reads, ch_assemblies ) - ch_versions = ch_versions.mix( CREATE_PHAGEANNOTATOR_SAMPLESHEET.out.versions ) + if ( params.generate_downstream_samplesheet ) { + GENERATE_DOWNSTREAM_SAMPLESHEET ( params.generate_downstream_samplesheet, ch_short_reads, ch_assemblies ) + ch_versions = ch_versions.mix( GENERATE_DOWNSTREAM_SAMPLESHEET.out.versions ) } CUSTOM_DUMPSOFTWAREVERSIONS ( From dbb8dc37cf6f6555d5b0c864269670d059e5b229 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 12 Feb 2024 17:46:59 +0000 Subject: [PATCH 13/16] Fixed profile name --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index eab8ea0d..12ccf636 100644 --- a/nextflow.config +++ b/nextflow.config @@ -320,7 +320,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } 
test_virus_identification { includeConfig 'conf/test_virus_identification.config' } - test_samplesheet { includeConfig 'conf/test_samplesheet.config' } + test_generatesamplesheet { includeConfig 'conf/test_samplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 15c3f198a2e2d463af68d688a7ee823c3ea0efee Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Mon, 12 Feb 2024 17:49:25 +0000 Subject: [PATCH 14/16] Part 2 --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 12ccf636..78491f57 100644 --- a/nextflow.config +++ b/nextflow.config @@ -320,7 +320,7 @@ profiles { test_bbnorm { includeConfig 'conf/test_bbnorm.config' } test_nothing { includeConfig 'conf/test_nothing.config' } test_virus_identification { includeConfig 'conf/test_virus_identification.config' } - test_generatesamplesheet { includeConfig 'conf/test_samplesheet.config' } + test_generatesamplesheet { includeConfig 'conf/test_generatesamplesheet.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 6cda4429552b8bc24a0e1620587da4fecd098dc6 Mon Sep 17 00:00:00 2001 From: Carson J Miller <68351153+CarsonJM@users.noreply.github.com> Date: Sun, 17 Mar 2024 14:33:13 -0700 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: James A. Fellows Yates --- README.md | 2 +- docs/output.md | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6be4bd5d..cec2c6bc 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The pipeline then: - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk) - performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). -- Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) +- performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) - generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported. 
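For orientation before the next hunk: a minimal sketch of the samplesheet this feature emits and of how it would feed the downstream pipeline. The five-column layout and the downstream_samplesheets output directory come from the subworkflow in the patches above; the sample values, file names, and the --input flag are assumptions (the usual nf-core convention), not taken from these commits.

    sample,group,fastq_1,fastq_2,fasta
    sample1,0,/path/to/sample1_1.fastq.gz,/path/to/sample1_2.fastq.gz,/path/to/sample1.fasta

    # hypothetical downstream launch, assuming the standard nf-core --input parameter
    nextflow run nf-core/phageannotator --input downstream_samplesheets/phageannotator_samplesheet.csv --outdir results -profile docker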
diff --git a/docs/output.md b/docs/output.md index a4141232..34c2b1e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -719,7 +719,7 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc Currently, samplesheets for the following nf-core pipelines can be automatically generated: -- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotation, and quantifying phage sequences in (meta)-genomic sequences. +- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)-genomic sequences. Utilizes quality-controlled reads and contigs generated by nf-core/mag. ### MultiQC diff --git a/nextflow_schema.json b/nextflow_schema.json index 439a6b35..f100eac9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -899,7 +899,7 @@ }, "samplesheet_combine_assemblers": { "type": "boolean", - "description": "Combine assembler outputs into one assembly file" + "description": "Combine all contigs from all assemblies of a given sample into a single FASTA file" } } } From 1e7abe508c735bd1e8770324899a296abeaca223 Mon Sep 17 00:00:00 2001 From: Carson J Miller Date: Thu, 21 Mar 2024 17:58:05 +0000 Subject: [PATCH 16/16] Copy fastq and fasta files to outdir for samplesheet generation --- .../local/generate_downstream_samplesheet.nf | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/generate_downstream_samplesheet.nf b/subworkflows/local/generate_downstream_samplesheet.nf index a8c165cb..5c21682e 100644 --- a/subworkflows/local/generate_downstream_samplesheet.nf +++ b/subworkflows/local/generate_downstream_samplesheet.nf @@ -5,6 +5,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEET { downstream_nfcore_pipelines // val: [ nf-core-pipeline, OPTIONAL: other-nf-core-pipelines ] short_reads // channel: [val(meta), path(fastq_1), path(fastq_2)] assemblies // channel: [val(meta), path(fasta)] + main: ch_versions = Channel.empty() @@ -103,10 +104,35 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEET { .set { ch_mag_metadata } } - // Create samplesheet for each sample using meta information - ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> - [ "${meta.id}_phageannotator_samplesheet.csv", "sample,group,fastq_1,fastq_2,fasta" + '\n' + "${meta.id},${meta.group},${meta.fastq_1},${meta.fastq_2},${meta.fasta}" + '\n' ] - } + // Create samplesheet for each sample using meta information + ch_mag_id_samplesheets = ch_mag_metadata.collectFile() { meta -> + // Save reads and assemblies to outdir so that they are in a stable location + file(meta.fastq_1.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}") + file(meta.fasta, checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}") + if ( !meta.single_end ){ + file(meta.fastq_2.toUriString(), checkIfExists: true).copyTo("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}") + [ "${meta.id}_phageannotator_samplesheet.csv", + "sample,group,fastq_1,fastq_2,fasta" + + '\n' + + "${meta.id},${meta.group}," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_2.name}").toString() + "," + + file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() + + '\n' + ] + } else { + // Single-end reads: leave the fastq_2 column empty
+ [ "${meta.id}_phageannotator_samplesheet.csv", + "sample,group,fastq_1,fastq_2,fasta" + + '\n' + + "${meta.id},${meta.group}," + + file("${params.outdir}/downstream_samplesheets/fastq/${meta.fastq_1.name}").toString() + "," + + "," + + file("${params.outdir}/downstream_samplesheets/fasta/${meta.fasta.name}").toString() + + '\n' + ] + } + } // Merge samplesheet across all samples for the pipeline ch_mag_id_samplesheets.collectFile(name: "phageannotator_samplesheet.csv", keepHeader:true, skip:1, storeDir:"${params.outdir}/downstream_samplesheets/")
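A note on the merge step above: collectFile with keepHeader: true and skip: 1 skips the first line of every collected per-sample CSV but retains it once, from the first file, so the merged samplesheet ends up with exactly one header row. Below is a self-contained sketch of that behaviour; the file names and rows are hypothetical, not part of the pipeline.

    // merge_sketch.nf: illustrative only; file names and sample rows are made up
    workflow {
        Channel
            .of(
                [ 'A_samplesheet.csv', 'sample,group,fastq_1,fastq_2,fasta\nA,0,A_1.fastq.gz,A_2.fastq.gz,A.fasta\n' ],
                [ 'B_samplesheet.csv', 'sample,group,fastq_1,fastq_2,fasta\nB,0,B_1.fastq.gz,B_2.fastq.gz,B.fasta\n' ]
            )
            // write one CSV file per distinct name
            .collectFile { item -> [ item[0], item[1] ] }
            // concatenate them, keeping the header line of the first file only
            .collectFile( name: 'merged_samplesheet.csv', keepHeader: true, skip: 1 )
            .view { it.text }
    }

Running this prints a single header line followed by the A and B rows, which is the shape the storeDir output above relies on.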