diff --git a/CHANGELOG.md b/CHANGELOG.md index 90fe8dec..29a8c094 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,8 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli) - [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites) - [#459](https://github.com/nf-core/mag/pull/459) - Adds ability to skip damage correction step in the ancient DNA workflow and just run pyDamage (by @jfy133) -- [#364](https://github.com/nf-core/mag/pull/364) - Added geNomad nf-core modules for identifying viruses in assemblies (by @PhilPalmer and @CarsonJM) +- [#364](https://github.com/nf-core/mag/pull/364) - Adds geNomad nf-core modules for identifying viruses in assemblies (by @PhilPalmer and @CarsonJM) - [#481](https://github.com/nf-core/mag/pull/481) - Adds MetaEuk for annotation of eukaryotic MAGs, and MMSeqs2 to enable downloading databases for MetaEuk (by @prototaxites) +- [#437](https://github.com/nf-core/mag/pull/437) - `--gtdb_db` also now supports directory input of a pre-uncompressed GTDB archive directory (reported by @alneberg, fix by @jfy133) ### `Changed` @@ -22,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133). 
- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133) - [#428](https://github.com/nf-core/mag/pull/429) - Update to nf-core 2.9 `TEMPLATE` (by @jfy133) +- [#437](https://github.com/nf-core/mag/pull/437) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133) ### `Fixed` diff --git a/conf/test.config b/conf/test.config index 2b846218..348b95d5 100644 --- a/conf/test.config +++ b/conf/test.config @@ -28,6 +28,6 @@ params { max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" busco_clean = true - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index 956ca7e0..92d51aec 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -28,7 +28,7 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true clip_tool = 'adapterremoval' skip_concoct = true bin_domain_classification = true diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index dcb8f7c9..325362fc 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -27,7 +27,7 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true ancient_dna = true binning_map_mode = 'own' skip_spades = false diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config index e081aa4e..5f481adf 100644 --- a/conf/test_bbnorm.config +++ b/conf/test_bbnorm.config @@ -34,7 +34,7 @@ params { max_unbinned_contigs = 2 busco_reference = 
"https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" busco_clean = true - gtdb = false + skip_gtdbtk = true bbnorm = true coassemble_group = true } diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config index 72da28c5..f4ec132f 100644 --- a/conf/test_binrefinement.config +++ b/conf/test_binrefinement.config @@ -28,7 +28,7 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true refine_bins_dastool = true refine_bins_dastool_threshold = 0 postbinning_input = 'both' diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config index 9480575c..6479012f 100644 --- a/conf/test_busco_auto.config +++ b/conf/test_busco_auto.config @@ -24,7 +24,7 @@ params { skip_spades = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - gtdb = false + skip_gtdbtk = true skip_prokka = true skip_prodigal = true skip_quast = true diff --git a/conf/test_full.config b/conf/test_full.config index 0e755192..4917332e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -22,7 +22,7 @@ params { centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz" kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz" cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz" - gtdb = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz" + gtdb_db = "s3://ngi-igenomes/test-data/mag/gtdbtk_r202_data.tar.gz" // reproducibility options for assembly spades_fix_cpus = 10 diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config index f91ef48c..b3487c6b 100644 --- a/conf/test_host_rm.config +++ b/conf/test_host_rm.config @@ -25,6 +25,6 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = 
true skip_concoct = true } diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index 8cf5e525..bc22d3d2 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -24,6 +24,6 @@ params { min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config index 8a37b813..7a0e4a15 100644 --- a/conf/test_hybrid_host_rm.config +++ b/conf/test_hybrid_host_rm.config @@ -26,4 +26,5 @@ params { max_unbinned_contigs = 2 skip_binqc = true skip_concoct = true + skip_gtdbtk = true } diff --git a/conf/test_nothing.config b/conf/test_nothing.config index d22582e3..53df219f 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -38,6 +38,6 @@ params { skip_concoct = true skip_prokka = true skip_binqc = true - gtdb = false + skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config index 7709a523..e15fab7d 100644 --- a/conf/test_virus_identification.config +++ b/conf/test_virus_identification.config @@ -27,7 +27,7 @@ params { // For computational efficiency reads_minlength = 150 coassemble_group = true - gtdb = false + skip_gtdbtk = true skip_binning = true skip_prokka = true skip_spades = true diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy index ebe07a0a..51822e4e 100755 --- a/lib/WorkflowMag.groovy +++ b/lib/WorkflowMag.groovy @@ -119,8 +119,8 @@ class WorkflowMag { Nextflow.error('Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.') } - if (params.skip_binqc && params.gtdb) { - log.warn '--skip_binqc and --gtdb are specified! 
GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.' + if (params.skip_binqc && !params.skip_gtdbtk) { + log.warn '--skip_binqc is specified, but --skip_gtdbtk is explicitly set to run! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.' } // Check if CAT parameters are valid diff --git a/modules/local/gtdbtk_db_preparation.nf b/modules/local/gtdbtk_db_preparation.nf index 5c4a991e..6c987a32 100644 --- a/modules/local/gtdbtk_db_preparation.nf +++ b/modules/local/gtdbtk_db_preparation.nf @@ -10,7 +10,7 @@ process GTDBTK_DB_PREPARATION { path(database) output: - tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*") + tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*"), emit: db script: """ diff --git a/nextflow.config b/nextflow.config index bb8e3de4..1846bc76 100644 --- a/nextflow.config +++ b/nextflow.config @@ -84,7 +84,8 @@ params { cat_db_generate = false cat_official_taxonomy = false save_cat_db = false - gtdb = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz" + skip_gtdbtk = false + gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" gtdbtk_min_completeness = 50.0 gtdbtk_max_contamination = 10.0 gtdbtk_min_perc_aa = 10 diff --git a/nextflow_schema.json b/nextflow_schema.json index 03b1992a..dcff2298 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -511,11 +511,15 @@ "type": "boolean", "description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT." 
}, - "gtdb": { + "skip_gtdbtk": { + "type": "boolean", + "description": "Skip the running of GTDB-tk, as well as the automatic download of the database", + "default": false + }, + "gtdb_db": { "type": "string", - "default": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", - "description": "GTDB database for taxonomic classification of bins with GTDB-tk.", - "help_text": "For information which GTDB reference databases are compatible with the used GTDB-tk version see https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data." + "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when neither GTDBTK nor binning QC is skipped.", + "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" }, "gtdbtk_min_completeness": { "type": "number", diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf index 5796de42..aa9a4a9b 100644 --- a/subworkflows/local/binning.nf +++ b/subworkflows/local/binning.nf @@ -130,9 +130,9 @@ workflow BINNING { ch_versions = ch_versions.mix(GUNZIP_UNBINS.out.versions.first()) emit: - bins = ch_binning_results_gunzipped.dump(tag: "ch_binning_results_gunzipped") + bins = ch_binning_results_gunzipped bins_gz = ch_binning_results_gzipped_final - unbinned = ch_splitfasta_results_gunzipped.dump(tag: "ch_splitfasta_results_gunzipped") + unbinned = ch_splitfasta_results_gunzipped unbinned_gz = SPLIT_FASTA.out.unbinned metabat2depths = METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth versions = ch_versions diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index ecbb1d40..21823962 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -59,10 +59,24 @@ workflow GTDBTK { return [it[0], it[1]] } - GTDBTK_DB_PREPARATION ( gtdb ) + if ( gtdb.extension 
== 'gz' ) { + // Expects to be tar.gz! + ch_db_for_gtdbtk = GTDBTK_DB_PREPARATION ( gtdb ).db + } else if ( gtdb.isDirectory() ) { + // Make up meta id to match expected channel cardinality for GTDBTK + ch_db_for_gtdbtk = Channel + .of(gtdb) + .map{ + [ it.toString().split('/').last(), it ] + } + .collect() + } else { + error("Unsupported object given to --gtdb_db, database must be supplied as either a directory or a .tar.gz file!") + } + GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), - GTDBTK_DB_PREPARATION.out + ch_db_for_gtdbtk ) GTDBTK_SUMMARY ( diff --git a/workflows/mag.nf b/workflows/mag.nf index d74ada0e..62797405 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -31,7 +31,7 @@ log.info logo + paramsSummaryLog(workflow) + citation WorkflowMag.initialise(params, log, hybrid) // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.gtdb, params.lambda_reference, params.busco_reference ] +def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.gtdb_db, params.lambda_reference, params.busco_reference ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } /* @@ -205,13 +205,12 @@ if (params.genomad_db){ ch_genomad_db = Channel.empty() } -gtdb = params.skip_binqc ? false : params.gtdb +gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? 
false : params.gtdb_db if (gtdb) { - ch_gtdb = Channel - .value(file( "${gtdb}" )) + gtdb = file( "${gtdb}", checkIfExists: true) } else { - ch_gtdb = Channel.empty() + gtdb = [] } if(params.metaeuk_db && !params.skip_metaeuk) { @@ -720,12 +719,12 @@ workflow MAG { } else { - ch_binning_results_bins = BINNING.out.bins.dump(tag: 'BINNING.out.bins') + ch_binning_results_bins = BINNING.out.bins .map { meta, bins -> def meta_new = meta + [domain: 'unclassified'] [meta_new, bins] } - ch_binning_results_unbins = BINNING.out.unbinned.dump(tag: 'BINNING.out.unbins') + ch_binning_results_unbins = BINNING.out.unbinned .map { meta, bins -> def meta_new = meta + [domain: 'unclassified'] [meta_new, bins] @@ -877,25 +876,31 @@ workflow MAG { /* * GTDB-tk: taxonomic classifications using GTDB reference */ - ch_gtdbtk_summary = Channel.empty() - if ( gtdb ){ - ch_gtdb_bins = ch_input_for_postbinning_bins_unbins - .filter { meta, bins -> - meta.domain != "eukarya" - } + if ( !params.skip_gtdbtk ) { - GTDBTK ( - ch_gtdb_bins, - ch_busco_summary, - ch_checkm_summary, - ch_gtdb - ) - ch_versions = ch_versions.mix(GTDBTK.out.versions.first()) - ch_gtdbtk_summary = GTDBTK.out.summary + ch_gtdbtk_summary = Channel.empty() + if ( gtdb ){ + + ch_gtdb_bins = ch_input_for_postbinning_bins_unbins + .filter { meta, bins -> + meta.domain != "eukarya" + } + + GTDBTK ( + ch_gtdb_bins, + ch_busco_summary, + ch_checkm_summary, + gtdb + ) + ch_versions = ch_versions.mix(GTDBTK.out.versions.first()) + ch_gtdbtk_summary = GTDBTK.out.summary + } + } else { + ch_gtdbtk_summary = Channel.empty() } - if ( ( !params.skip_binqc ) || !params.skip_quast || gtdb){ + if ( ( !params.skip_binqc ) || !params.skip_quast || !params.skip_gtdbtk){ BIN_SUMMARY ( ch_input_for_binsummary, ch_busco_summary.ifEmpty([]),