diff --git a/modules/nf-core/galah/main.nf b/modules/nf-core/galah/main.nf
new file mode 100644
index 00000000000..19721a5d684
--- /dev/null
+++ b/modules/nf-core/galah/main.nf
@@ -0,0 +1,68 @@
+// Dereplicates genome FASTA files with `galah cluster`.
+//
+// Input tuple : meta map, genome FASTA file(s), an optional QC table (pass [] to omit)
+//               and the QC table format ('checkm' or 'genome_info').
+// Outputs     : cluster definition TSV, the files galah writes to
+//               "<prefix>-dereplicated/", and versions.yml.
+process GALAH {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::galah=0.3.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/galah%3A0.3.1--h031d066_3':
+        'biocontainers/galah:0.3.1--h031d066_3' }"
+
+    input:
+    tuple val(meta), path(bins), path(qc_table), val(qc_format)
+
+    output:
+    tuple val(meta), path("*.tsv")                    , emit: tsv
+    tuple val(meta), path("${prefix}-dereplicated/*") , emit: dereplicated_bins
+    path "versions.yml"                               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // prefix has no `def` so that the output path "${prefix}-dereplicated/*" can resolve it
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // The QC table is optional: pick the galah flag matching qc_format, or pass nothing
+    def qc_args = (qc_format == "checkm") ? "--checkm-tab-table ${qc_table}" : "--genome-info ${qc_table}"
+    def qc_input = qc_table ? qc_args : ""
+    def valid_qc_format = qc_format in ["checkm", "genome_info"]
+    if( qc_table && !valid_qc_format ) {
+        error "Invalid qc_format supplied! qc_format should be either 'checkm' or 'genome_info'."
+    }
+    """
+    mkdir ${prefix}-dereplicated
+
+    galah cluster \\
+        --threads ${task.cpus} \\
+        --genome-fasta-files ${bins} \\
+        ${qc_input} \\
+        --output-cluster-definition ${prefix}-dereplicated_bins.tsv \\
+        --output-representative-fasta-directory ${prefix}-dereplicated \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        galah: \$(galah --version | sed 's/galah //')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir ${prefix}-dereplicated/
+    touch ${prefix}-dereplicated/test.fa
+    touch ${prefix}-dereplicated_bins.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        galah: \$(galah --version | sed 's/galah //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/galah/meta.yml b/modules/nf-core/galah/meta.yml
new file mode 100644
index 00000000000..85263a8f591
--- /dev/null
+++ b/modules/nf-core/galah/meta.yml
@@ -0,0 +1,63 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "galah"
+description: Cluster genome FASTA files by average nucleotide identity
+keywords:
+  - genomics
+  - cluster
+  - genome
+  - metagenomics
+tools:
+  - "galah":
+      description: "Galah aims to be a more scalable metagenome assembled genome (MAG) dereplication method."
+      homepage: "https://github.com/wwood/galah"
+      documentation: "https://github.com/wwood/galah"
+      tool_dev_url: "https://github.com/wwood/galah"
+      doi: "10.1111/NODOI"
+      licence: "['GPL v3']"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+  - bins:
+      type: file
+      description: A list of fasta-formatted genomes for dereplication
+      pattern: "*.{fa,fna,fa.gz, etc}"
+  - qc_table:
+      type: file
+      description: |
+        (optional) Either a [CheckM](https://nf-co.re/modules/checkm_lineagewf) summary TSV containing
+        information on the completeness and contamination of the input genomes (13 columns),
+        or a 3-column csv with the header `genome,completeness,contamination`.
+        In both cases the first column should contain the names of the input genome files,
+        minus the last file extension
+        (i.e. if the genome is gzipped, the genome name should retain the .fasta extension).
+      pattern: "*.{csv,tsv}"
+  - qc_format:
+      type: string
+      description: Defines the type of input table in `qc_table`, if specified.
+      pattern: "checkm|genome_info"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test', single_end:false ]`
+  - tsv:
+      type: file
+      description: TSV file in the format `representative_genome` \t `member_genome`
+      pattern: "*.tsv"
+  - dereplicated_bins:
+      type: file
+      description: The representative genomes following dereplication by galah.
+      pattern: "*"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@prototaxites"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
index 833fbabd75a..0144bc892ef 100644
--- a/tests/config/pytest_modules.yml
+++ b/tests/config/pytest_modules.yml
@@ -1247,6 +1247,10 @@ freyja/variants:
   - modules/nf-core/freyja/variants/**
   - tests/modules/nf-core/freyja/variants/**
 
+galah:
+  - modules/nf-core/galah/**
+  - tests/modules/nf-core/galah/**
+
 gamma/gamma:
   - modules/nf-core/gamma/gamma/**
   - tests/modules/nf-core/gamma/gamma/**
diff --git a/tests/modules/nf-core/galah/main.nf b/tests/modules/nf-core/galah/main.nf
new file mode 100644
index 00000000000..dba70de9169
--- /dev/null
+++ b/tests/modules/nf-core/galah/main.nf
@@ -0,0 +1,51 @@
+
+include { GALAH                       } from '../../../../modules/nf-core/galah/main.nf'
+include { BIOAWK as BIOAWK_CHECKM     } from '../../../../modules/nf-core/bioawk/main.nf'
+include { BIOAWK as BIOAWK_GENOMEINFO } from '../../../../modules/nf-core/bioawk/main.nf'
+include { GUNZIP                      } from '../../../../modules/nf-core/gunzip/main.nf'
+
+// Dereplication without a QC table: qc_table and qc_format are passed as empty lists
+workflow test_galah {
+
+    input = [
+        [ id:'test' ], // meta map
+        [file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true),
+         file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true)],
+        [],
+        []
+    ]
+
+    GALAH ( input )
+
+}
+
+// Dereplication with a "genome_info" table built from a CheckM summary TSV via BIOAWK and GUNZIP
+workflow test_galah_genomeinfo {
+
+    genomeinfo = Channel.fromPath("https://raw.githubusercontent.com/nf-core/test-datasets/magmap/testdata/checkm.lineage_wf.qa_2.tsv", checkIfExists: true)
+        .map { file ->
+            [ [id: "genomeinfo"], file ]
+        }
+
+    BIOAWK_GENOMEINFO(genomeinfo)
+
+    GUNZIP(BIOAWK_GENOMEINFO.out.output)
+
+    ch_genomeinfo = GUNZIP.out.gunzip
+        .map { meta, tsv -> [tsv] }
+
+    input = Channel.of(
+        [
+            [ id:'test' ], // meta map
+            [file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true),
+             file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true)]
+        ]
+    )
+        .combine(ch_genomeinfo)
+        .map {meta, bins, qc ->
+            [ meta, bins, qc, "genome_info" ]
+        }
+
+    GALAH ( input )
+
+}
diff --git a/tests/modules/nf-core/galah/nextflow.config b/tests/modules/nf-core/galah/nextflow.config
new file mode 100644
index 00000000000..8cae6f9910f
--- /dev/null
+++ b/tests/modules/nf-core/galah/nextflow.config
@@ -0,0 +1,13 @@
+process {
+
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+
+    // write a horrid awk to munge the checkm_qa tsv to a 3 column csv
+    // as files are gzipped, genome has to include extension "fna" as galah expects the genome name to be
+    // the file name minus the last extension
+    withName: BIOAWK_GENOMEINFO {
+        ext.args = '\'BEGIN {{FS="\t"; OFS=","}} NR==1 {print "genome","completeness","contamination"} NR>1 {print $1".fna",$6, $7}\''
+        ext.prefix = "genome_info.tsv"
+    }
+
+}
diff --git a/tests/modules/nf-core/galah/test.yml b/tests/modules/nf-core/galah/test.yml
new file mode 100644
index 00000000000..47f9e900e96
--- /dev/null
+++ b/tests/modules/nf-core/galah/test.yml
@@ -0,0 +1,30 @@
+- name: galah test_galah
+  command: nextflow run ./tests/modules/nf-core/galah -entry test_galah -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config
+  tags:
+    - galah
+  files:
+    - path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz
+      md5sum: 0747c48f6693a4fb03c7164c2f472326
+    - path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz
+      md5sum: a8e9bac598df938f25e09418ff7214dd
+    - path: output/galah/test-dereplicated_bins.tsv
+      md5sum: d2f8a621bfa5794467f4fdd759e2bce7
+    - path: output/galah/versions.yml
+
+- name: galah test_galah_genomeinfo
+  command: nextflow run ./tests/modules/nf-core/galah -entry test_galah_genomeinfo -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config
+  tags:
+    - galah
+  files:
+    - path: output/bioawk/genome_info.tsv.gz
+    - path: output/bioawk/versions.yml
+    - path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz
+      md5sum: 0747c48f6693a4fb03c7164c2f472326
+    - path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz
+      md5sum: a8e9bac598df938f25e09418ff7214dd
+    - path: output/galah/test-dereplicated_bins.tsv
+      md5sum: d2f8a621bfa5794467f4fdd759e2bce7
+    - path: output/galah/versions.yml
+    - path: output/gunzip/genome_info.tsv
+      md5sum: f73b9131ab91ddb754725b94e5085955
+    - path: output/gunzip/versions.yml