-
Notifications
You must be signed in to change notification settings - Fork 713
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add galah * NO DOI * fix versions * fix dashing version in conda * fix inputs/outputs * fix dashing version * Update model inputs, add tests * Update error message. * Update main.nf * Update test.yml * Update main.nf * Update modules/nf-core/galah/main.nf Co-authored-by: Adam Talbot <[email protected]> * Update main.nf * Update modules/nf-core/galah/main.nf Co-authored-by: Adam Talbot <[email protected]> * Update main.nf * Update test * Update main.nf * Update main.nf * fix tests --------- Co-authored-by: Adam Talbot <[email protected]>
- Loading branch information
1 parent
b58a51f
commit 4375e5a
Showing
6 changed files
with
218 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
// Dereplicate a set of genome FASTA files with `galah cluster`. | ||
// | ||
// Inputs (one tuple): | ||
//   meta      - Groovy map with sample information; `meta.id` is the default output prefix | ||
//   bins      - genome FASTA files to cluster | ||
//   qc_table  - optional quality table; pass [] to run without one | ||
//   qc_format - 'checkm' or 'genome_info'; only validated when `qc_table` is supplied | ||
// Outputs: | ||
//   tsv               - every *.tsv written to the work directory (the cluster definition) | ||
//   dereplicated_bins - everything galah writes to "${prefix}-dereplicated/" | ||
//   versions          - versions.yml with the galah version | ||
// Extra galah options can be supplied through `task.ext.args`. | ||
process GALAH { | ||
    tag "$meta.id" | ||
    label 'process_medium' | ||
|
||
    conda "bioconda::galah=0.3.1" | ||
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? | ||
        'https://depot.galaxyproject.org/singularity/galah%3A0.3.1--h031d066_3': | ||
        'biocontainers/galah:0.3.1--h031d066_3' }" | ||
|
||
    input: | ||
    tuple val(meta), path(bins), path(qc_table), val(qc_format) | ||
|
||
    output: | ||
    tuple val(meta), path("*.tsv") , emit: tsv | ||
    tuple val(meta), path("${prefix}-dereplicated/*") , emit: dereplicated_bins | ||
    path "versions.yml" , emit: versions | ||
|
||
    when: | ||
    task.ext.when == null || task.ext.when | ||
|
||
    script: | ||
    def args = task.ext.args ?: '' | ||
    // `prefix` is intentionally assigned without `def`: the output block above refers to "${prefix}-dereplicated/*" | ||
    prefix = task.ext.prefix ?: "${meta.id}" | ||
    // Pick the galah option that matches the declared table type; no table means no quality option at all | ||
    def qc_args = (qc_format == "checkm") ? "--checkm-tab-table ${qc_table}" : "--genome-info ${qc_table}" | ||
    def qc_input = qc_table ? qc_args : "" | ||
    // The table type is only checked when a table was actually supplied | ||
    def valid_qc_format = qc_format in ["checkm", "genome_info"] | ||
    if( qc_table && !valid_qc_format ) { | ||
        error "Invalid qc_format supplied! qc_format should be either 'checkm' or 'genome_info'." | ||
    } | ||
    """ | ||
    mkdir ${prefix}-dereplicated | ||
    galah cluster \\ | ||
        --threads ${task.cpus} \\ | ||
        --genome-fasta-files ${bins} \\ | ||
        ${qc_input} \\ | ||
        --output-cluster-definition ${prefix}-dereplicated_bins.tsv \\ | ||
        --output-representative-fasta-directory ${prefix}-dereplicated \\ | ||
        ${args} | ||
|
||
    cat <<-END_VERSIONS > versions.yml | ||
    "${task.process}": | ||
        galah: \$(galah --version | sed 's/galah //') | ||
    END_VERSIONS | ||
    """ | ||
|
||
    stub: | ||
    prefix = task.ext.prefix ?: "${meta.id}" | ||
    // Create empty placeholders that satisfy every declared output without running the clustering | ||
    """ | ||
    mkdir ${prefix}-dereplicated/ | ||
    touch ${prefix}-dereplicated/test.fa | ||
    touch ${prefix}-dereplicated_bins.tsv | ||
|
||
    cat <<-END_VERSIONS > versions.yml | ||
    "${task.process}": | ||
        galah: \$(galah --version | sed 's/galah //') | ||
    END_VERSIONS | ||
    """ | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
--- | ||
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json | ||
name: "galah" | ||
description: Cluster genome FASTA files by average nucleotide identity | ||
keywords: | ||
- genomics | ||
- cluster | ||
- genome | ||
- metagenomics | ||
tools: | ||
- "galah": | ||
description: "Galah aims to be a more scalable metagenome assembled genome (MAG) dereplication method." | ||
homepage: "https://github.com/wwood/galah" | ||
documentation: "https://github.com/wwood/galah" | ||
tool_dev_url: "https://github.com/wwood/galah" | ||
doi: "10.1111/NODOI" | ||
licence: "['GPL v3']" | ||
|
||
input: | ||
- meta: | ||
type: map | ||
description: | | ||
Groovy Map containing sample information | ||
e.g. `[ id:'test', single_end:false ]` | ||
- bins: | ||
type: file | ||
description: A list of fasta-formatted genomes for dereplication | ||
pattern: "*.{fa,fna,fasta,fa.gz,fna.gz,fasta.gz}" | ||
- qc_table: | ||
type: file | ||
description: | | ||
(optional) Either a [CheckM](https://nf-co.re/modules/checkm_lineagewf) summary TSV containing | ||
information on the completeness and contamination of the input genomes (13 columns), | ||
or a 3-column csv with the header `genome,completeness,contamination`. | ||
In both cases the first column should contain the names of the input genome files, | ||
minus the last file extension | ||
(i.e. if the genome is gzipped, the genome name should retain the .fasta extension). | ||
pattern: "*.{csv,tsv}" | ||
- qc_format: | ||
type: string | ||
description: Defines the type of input table supplied in `qc_table`, if specified. | ||
pattern: "checkm|genome_info" | ||
|
||
output: | ||
- meta: | ||
type: map | ||
description: | | ||
Groovy Map containing sample information | ||
e.g. `[ id:'test', single_end:false ]` | ||
- tsv: | ||
type: file | ||
description: TSV file in the format `representative_genome` \t `member_genome` | ||
pattern: "*.tsv" | ||
- dereplicated_bins: | ||
type: file | ||
description: The representative genomes following dereplication by galah. | ||
pattern: "*" | ||
- versions: | ||
type: file | ||
description: File containing software versions | ||
pattern: "versions.yml" | ||
authors: | ||
- "@prototaxites" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
|
||
include { GALAH } from '../../../../modules/nf-core/galah/main.nf' | ||
include { BIOAWK as BIOAWK_CHECKM } from '../../../../modules/nf-core/bioawk/main.nf' | ||
include { BIOAWK as BIOAWK_GENOMEINFO } from '../../../../modules/nf-core/bioawk/main.nf' | ||
include { GUNZIP } from '../../../../modules/nf-core/gunzip/main.nf' | ||
|
||
// Run GALAH on two genomes with no quality table: both `qc_table` and `qc_format` are passed as empty lists. | ||
workflow test_galah { | ||
|
||
    genomes = [ | ||
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true), | ||
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true) | ||
    ] | ||
|
||
    // Tuple layout: meta map, bins, qc_table, qc_format | ||
    GALAH ( [ [ id:'test' ], genomes, [], [] ] ) | ||
|
||
} | ||
|
||
// Run GALAH with a quality table in "genome_info" format. | ||
// The table is produced from a downloaded TSV by BIOAWK_GENOMEINFO and then decompressed with GUNZIP. | ||
workflow test_galah_genomeinfo { | ||
|
||
    ch_qa_table = Channel | ||
        .fromPath("https://raw.githubusercontent.com/nf-core/test-datasets/magmap/testdata/checkm.lineage_wf.qa_2.tsv", checkIfExists: true) | ||
        .map { qa_table -> [ [id: "genomeinfo"], qa_table ] } | ||
|
||
    BIOAWK_GENOMEINFO(ch_qa_table) | ||
    GUNZIP(BIOAWK_GENOMEINFO.out.output) | ||
|
||
    // Drop the meta map; only the decompressed table is combined with the genomes below | ||
    ch_qc_table = GUNZIP.out.gunzip.map { meta, table -> [table] } | ||
|
||
    genomes = [ | ||
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCA_002688505.1_ASM268850v1_genomic.fna.gz", checkIfExists: true), | ||
        file("https://github.com/nf-core/test-datasets/raw/magmap/testdata/GCF_004296495.1_ASM429649v1_genomic.fna.gz", checkIfExists: true) | ||
    ] | ||
|
||
    // Assemble the 4-element GALAH input: meta map, bins, qc_table, qc_format | ||
    ch_galah_input = Channel | ||
        .of( [ [ id:'test' ], genomes ] ) | ||
        .combine(ch_qc_table) | ||
        .map { meta, fastas, qc_file -> [ meta, fastas, qc_file, "genome_info" ] } | ||
|
||
    GALAH ( ch_galah_input ) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
// Test configuration: sets where results are published and how BIOAWK_GENOMEINFO is parameterised. | ||
process { | ||
|
||
// Publish each process's results under <outdir>/<tool>, where <tool> is the last ':'-separated part of the | ||
// process name, reduced to the text before its first '_' and lower-cased (e.g. BIOAWK_GENOMEINFO -> bioawk) | ||
publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } | ||
|
||
// awk program that reshapes the tab-separated checkm_qa table into a comma-separated 3-column table: | ||
// the header row is replaced with genome,completeness,contamination and every later row keeps fields 1, 6 and 7. | ||
// As the genome files are gzipped, ".fna" is appended to field 1 because galah expects the genome name to be | ||
// the file name minus the last extension | ||
// NOTE(review): fields 6 and 7 are presumably completeness and contamination - confirm against the input table | ||
// NOTE(review): the prefix ends in .tsv although the awk output is comma-separated | ||
withName: BIOAWK_GENOMEINFO { | ||
ext.args = '\'BEGIN {{FS="\t"; OFS=","}} NR==1 {print "genome","completeness","contamination"} NR>1 {print $1".fna",$6, $7}\'' | ||
ext.prefix = "genome_info.tsv" | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
- name: galah test_galah | ||
command: nextflow run ./tests/modules/nf-core/galah -entry test_galah -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config | ||
tags: | ||
- galah | ||
files: | ||
- path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz | ||
md5sum: 0747c48f6693a4fb03c7164c2f472326 | ||
- path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz | ||
md5sum: a8e9bac598df938f25e09418ff7214dd | ||
- path: output/galah/test-dereplicated_bins.tsv | ||
md5sum: d2f8a621bfa5794467f4fdd759e2bce7 | ||
- path: output/galah/versions.yml | ||
|
||
- name: galah test_galah_genomeinfo | ||
command: nextflow run ./tests/modules/nf-core/galah -entry test_galah_genomeinfo -c ./tests/config/nextflow.config -c ./tests/modules/nf-core/galah/nextflow.config | ||
tags: | ||
- galah | ||
files: | ||
- path: output/bioawk/genome_info.tsv.gz | ||
- path: output/bioawk/versions.yml | ||
- path: output/galah/test-dereplicated/GCA_002688505.1_ASM268850v1_genomic.fna.gz | ||
md5sum: 0747c48f6693a4fb03c7164c2f472326 | ||
- path: output/galah/test-dereplicated/GCF_004296495.1_ASM429649v1_genomic.fna.gz | ||
md5sum: a8e9bac598df938f25e09418ff7214dd | ||
- path: output/galah/test-dereplicated_bins.tsv | ||
md5sum: d2f8a621bfa5794467f4fdd759e2bce7 | ||
- path: output/galah/versions.yml | ||
- path: output/gunzip/genome_info.tsv | ||
md5sum: f73b9131ab91ddb754725b94e5085955 | ||
- path: output/gunzip/versions.yml |