Skip to content

Commit

Permalink
New module: galah (#3666)
Browse files Browse the repository at this point in the history
* Add galah

* NO DOI

* fix versions

* fix dashing version in conda

* fix inputs/outputs

* fix dashing version

* Update model inputs, add tests

* Update error message.

* Update main.nf

* Update test.yml

* Update main.nf

* Update modules/nf-core/galah/main.nf

Co-authored-by: Adam Talbot <[email protected]>

* Update main.nf

* Update modules/nf-core/galah/main.nf

Co-authored-by: Adam Talbot <[email protected]>

* Update main.nf

* Update test

* Update main.nf

* Update main.nf

* fix tests

---------

Co-authored-by: Adam Talbot <[email protected]>
  • Loading branch information
prototaxites and adamrtalbot authored Aug 7, 2023
1 parent b58a51f commit 4375e5a
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 0 deletions.
59 changes: 59 additions & 0 deletions modules/nf-core/galah/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Dereplicate a set of genome FASTA files with `galah cluster`.
//
// Inputs (one tuple):
//   meta      - Groovy map with sample information; `meta.id` is the default output prefix
//   bins      - genome FASTA files to cluster
//   qc_table  - optional genome quality table (pass `[]` to omit)
//   qc_format - 'checkm' or 'genome_info'; only checked when `qc_table` is supplied
//
// Outputs:
//   tsv               - cluster definition table (`<prefix>-dereplicated_bins.tsv`)
//   dereplicated_bins - representative genomes written to `<prefix>-dereplicated/`
//   versions          - versions.yml recording the galah version
process GALAH {
    tag "$meta.id"
    label 'process_medium'

    conda "bioconda::galah=0.3.1"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/galah%3A0.3.1--h031d066_3':
        'biocontainers/galah:0.3.1--h031d066_3' }"

    input:
    tuple val(meta), path(bins), path(qc_table), val(qc_format)

    output:
    tuple val(meta), path("*.tsv")                    , emit: tsv
    tuple val(meta), path("${prefix}-dereplicated/*") , emit: dereplicated_bins
    path "versions.yml"                               , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // `prefix` is intentionally not `def`-scoped: the output block above interpolates it.
    prefix = task.ext.prefix ?: "${meta.id}"

    // Validate the declared table format before building the command line.
    def valid_qc_format = qc_format in ["checkm", "genome_info"]
    if( qc_table && !valid_qc_format ) {
        error "Invalid qc_format supplied! qc_format should be either 'checkm' or 'genome_info'."
    }

    // Pick the galah option matching the table format; emit nothing when no table was given.
    def qc_args  = (qc_format == "checkm") ? "--checkm-tab-table ${qc_table}" : "--genome-info ${qc_table}"
    def qc_input = qc_table ? qc_args : ""
    """
    mkdir ${prefix}-dereplicated

    galah cluster \\
        --threads ${task.cpus} \\
        --genome-fasta-files ${bins} \\
        ${qc_input} \\
        ${args} \\
        --output-cluster-definition ${prefix}-dereplicated_bins.tsv \\
        --output-representative-fasta-directory ${prefix}-dereplicated

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        galah: \$(galah --version | sed 's/galah //')
    END_VERSIONS
    """

    stub:
    prefix = task.ext.prefix ?: "${meta.id}"
    """
    mkdir ${prefix}-dereplicated/
    touch ${prefix}-dereplicated/test.fa
    touch ${prefix}-dereplicated_bins.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        galah: \$(galah --version | sed 's/galah //')
    END_VERSIONS
    """
}
63 changes: 63 additions & 0 deletions modules/nf-core/galah/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
# Module metadata for GALAH: describes the tool and each input/output channel element.
name: "galah"
description: Cluster genome FASTA files by average nucleotide identity
keywords:
  - genomics
  - cluster
  - genome
  - metagenomics
tools:
  - "galah":
      description: "Galah aims to be a more scalable metagenome assembled genome (MAG) dereplication method."
      homepage: "https://github.com/wwood/galah"
      documentation: "https://github.com/wwood/galah"
      tool_dev_url: "https://github.com/wwood/galah"
      # NOTE(review): placeholder value - no DOI was available when the module was added.
      doi: "10.1111/NODOI"
      # A real list of licence names rather than a stringified list.
      licence: ["GPL v3"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. `[ id:'test', single_end:false ]`
  - bins:
      type: file
      description: A list of fasta-formatted genomes for dereplication
      pattern: "*.{fa,fna,fasta,fa.gz,fna.gz,fasta.gz}"
  - qc_table:
      type: file
      description: |
        (optional) Either a [CheckM](https://nf-co.re/modules/checkm_lineagewf) summary TSV containing
        information on the completeness and contamination of the input genomes (13 columns),
        or a 3-column csv with the header `genome,completeness,contamination`.
        In both cases the first column should contain the names of the input genome files,
        minus the last file extension
        (i.e. if the genome is gzipped, the genome name should retain the .fasta extension).
      pattern: "*.{csv,tsv}"
  - qc_format:
      type: string
      description: Defines the type of input table in `qc_table`, if specified.
      pattern: "checkm|genome_info"

output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. `[ id:'test', single_end:false ]`
  - tsv:
      type: file
      description: TSV file in the format `representative_genome` \t `member_genome`
      pattern: "*.tsv"
  - dereplicated_bins:
      type: file
      description: The representative genomes following dereplication by galah.
      pattern: "*"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@prototaxites"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,10 @@ freyja/variants:
- modules/nf-core/freyja/variants/**
- tests/modules/nf-core/freyja/variants/**

# Path globs for the `galah` entry: the module sources and the module's tests.
galah:
- modules/nf-core/galah/**
- tests/modules/nf-core/galah/**

gamma/gamma:
- modules/nf-core/gamma/gamma/**
- tests/modules/nf-core/gamma/gamma/**
Expand Down
49 changes: 49 additions & 0 deletions tests/modules/nf-core/galah/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

include { GALAH } from '../../../../modules/nf-core/galah/main.nf'
include { BIOAWK as BIOAWK_CHECKM } from '../../../../modules/nf-core/bioawk/main.nf'
include { BIOAWK as BIOAWK_GENOMEINFO } from '../../../../modules/nf-core/bioawk/main.nf'
include { GUNZIP } from '../../../../modules/nf-core/gunzip/main.nf'

// Runs GALAH on two gzipped genome assemblies with no QC table supplied.
workflow test_galah {

    // Genome FASTA files fetched from the nf-core test-datasets `magmap` branch
    genomes = [
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true),
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true)
    ]

    // Tuple layout: [ meta, bins, qc_table, qc_format ]; empty lists mean "not provided"
    galah_input = [ [ id:'test' ], genomes, [], [] ]

    GALAH ( galah_input )

}

// Runs GALAH with a `genome_info` quality table derived from a CheckM summary:
// BIOAWK_GENOMEINFO reshapes the table, GUNZIP decompresses it, and the result is
// attached to the genome tuple as the third element.
workflow test_galah_genomeinfo {

    // CheckM summary table wrapped as [ meta, file ] for BIOAWK
    checkm_summary = Channel
        .fromPath("https://raw.githubusercontent.com/nf-core/test-datasets/magmap/testdata/checkm.lineage_wf.qa_2.tsv", checkIfExists: true)
        .map { qa_table -> [ [id: "genomeinfo"], qa_table ] }

    BIOAWK_GENOMEINFO(checkm_summary)
    GUNZIP(BIOAWK_GENOMEINFO.out.output)

    // Drop the meta map, keeping only the decompressed table
    ch_genomeinfo = GUNZIP.out.gunzip.map { meta, tsv -> [tsv] }

    genomes = [
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true),
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true)
    ]

    // Combine genomes with the table, then append the format label expected by GALAH
    galah_input = Channel
        .of( [ [ id:'test' ], genomes ] )
        .combine(ch_genomeinfo)
        .map { meta, bins, qc -> [ meta, bins, qc, "genome_info" ] }

    GALAH ( galah_input )

}
13 changes: 13 additions & 0 deletions tests/modules/nf-core/galah/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
process {

    // Publish into a directory named after the lower-cased first `_`-separated token
    // of the process name (the part after the last `:`).
    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

    // Convert the CheckM QA table into the 3-column, comma-separated table
    // (genome, completeness, contamination) used as the `genome_info` input.
    // ".fna" is appended to each genome name because the test genomes are gzipped
    // and galah expects the genome name to be the file name minus its last extension.
    // Note the output keeps a ".tsv" name even though the fields are comma-separated.
    withName: BIOAWK_GENOMEINFO {
        ext.args   = '\'BEGIN {{FS="\t"; OFS=","}} NR==1 {print "genome","completeness","contamination"} NR>1 {print $1".fna",$6, $7}\''
        ext.prefix = "genome_info.tsv"
    }

}
30 changes: 30 additions & 0 deletions tests/modules/nf-core/galah/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Expected outputs when GALAH is run without a QC table.
- name: galah test_galah
  command: nextflow run ./tests/modules/nf-core/galah -entry test_galah -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config
  tags:
    - galah
  files:
    - path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz
      md5sum: 0747c48f6693a4fb03c7164c2f472326
    - path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz
      md5sum: a8e9bac598df938f25e09418ff7214dd
    - path: output/galah/test-dereplicated_bins.tsv
      md5sum: d2f8a621bfa5794467f4fdd759e2bce7
    - path: output/galah/versions.yml

# Expected outputs when GALAH is given a `genome_info` table prepared by bioawk and gunzip.
- name: galah test_galah_genomeinfo
  command: nextflow run ./tests/modules/nf-core/galah -entry test_galah_genomeinfo -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config
  tags:
    - galah
  files:
    - path: output/bioawk/genome_info.tsv.gz
    - path: output/bioawk/versions.yml
    - path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz
      md5sum: 0747c48f6693a4fb03c7164c2f472326
    - path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz
      md5sum: a8e9bac598df938f25e09418ff7214dd
    - path: output/galah/test-dereplicated_bins.tsv
      md5sum: d2f8a621bfa5794467f4fdd759e2bce7
    - path: output/galah/versions.yml
    - path: output/gunzip/genome_info.tsv
      md5sum: f73b9131ab91ddb754725b94e5085955
    - path: output/gunzip/versions.yml

0 comments on commit 4375e5a

Please sign in to comment.