diff --git a/CHANGELOG.md b/CHANGELOG.md
index 53949c87..b4b91675 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,6 +71,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#438](https://github.com/genomic-medicine-sweden/nallo/pull/438) - Updated pipeline tests to use functions in nft-utils instead of checking hardcoded paths
 - [#440](https://github.com/genomic-medicine-sweden/nallo/pull/440) - Updated hifiasm to 0.20 with new default parameters for telomeres and scaffolding ([#295](https://github.com/genomic-medicine-sweden/nallo/issues/295))
 - [#441](https://github.com/genomic-medicine-sweden/nallo/pull/441) - Changed the minimap2 preset for hifi reads back to `map-hifi`
+- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Refactored reference channel assignments
+- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Updated schemas for `vep_plugin_files` and `snp_db`

 ### `Removed`

diff --git a/assets/schema_snpdb.json b/assets/schema_snp_db.json
similarity index 53%
rename from assets/schema_snpdb.json
rename to assets/schema_snp_db.json
index 648a5283..4d9141db 100644
--- a/assets/schema_snpdb.json
+++ b/assets/schema_snp_db.json
@@ -1,8 +1,8 @@
 {
     "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_gvcfs.json",
-    "title": "genomic-medicine-sweden/nallo pipeline - params.extra_gvcfs schema",
-    "description": "Schema for the file provided with params.extra_gvcfs",
+    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_snp_db.json",
+    "title": "genomic-medicine-sweden/nallo pipeline - params.snp_db schema",
+    "description": "Schema for the file provided with params.snp_db",
     "type": "array",
     "items": {
         "type": "object",
@@ -10,14 +10,13 @@
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces",
-                "meta": ["id"]
+                "errorMessage": "Sample must be provided and cannot contain spaces."
             },
             "file": {
                 "format": "file-path",
                 "type": "string",
                 "pattern": "^\\S+\\.zip$",
-                "errorMessage": "gVCF file must be provided, cannot contain spaces and must have extension 'g.vcf.gz' or 'gvcf.gz'"
+                "errorMessage": "Echtvar database must be provided, cannot contain spaces and must have extension '.zip'"
             }
         },
         "required": ["sample", "file"]
diff --git a/assets/schema_vep_plugin_files.json b/assets/schema_vep_plugin_files.json
new file mode 100644
index 00000000..0be393a3
--- /dev/null
+++ b/assets/schema_vep_plugin_files.json
@@ -0,0 +1,20 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_vep_plugin_files.json",
+    "title": "genomic-medicine-sweden/nallo pipeline - params.vep_plugin_files schema",
+    "description": "Schema for the file provided with params.vep_plugin_files",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "vep_files": {
+                "format": "file-path",
+                "type": "string",
+                "pattern": "^\\S+",
+                "exists": true,
+                "errorMessage": "Vep plugin file must be a path and exist."
+            }
+        },
+        "required": ["vep_files"]
+    }
+}
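
The new schema validates a one-column samplesheet. As a sketch, a `vep_plugin_files` CSV that would pass this validation could look like the following (the file paths are hypothetical):

    vep_files
    /references/spliceai_scores.raw.snv.hg38.vcf.gz
    /references/pLI_values.txt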
+ } + }, + "required": ["vep_files"] + } +} diff --git a/modules/nf-core/cadd/cadd.diff b/modules/nf-core/cadd/cadd.diff index f174cc67..2243f02d 100644 --- a/modules/nf-core/cadd/cadd.diff +++ b/modules/nf-core/cadd/cadd.diff @@ -15,8 +15,9 @@ Changes in 'cadd/main.nf': input: tuple val(meta), path(vcf) - path(annotation_dir) -+ path(prescored_dir) +- path(annotation_dir) ++ tuple val(meta2), path(annotation_dir) ++ tuple val(meta3), path(prescored_dir) output: tuple val(meta), path("*.tsv.gz"), emit: tsv diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf index 52490c64..d830ff72 100644 --- a/modules/nf-core/cadd/main.nf +++ b/modules/nf-core/cadd/main.nf @@ -13,8 +13,8 @@ process CADD { input: tuple val(meta), path(vcf) - path(annotation_dir) - path(prescored_dir) + tuple val(meta2), path(annotation_dir) + tuple val(meta3), path(prescored_dir) output: tuple val(meta), path("*.tsv.gz"), emit: tsv diff --git a/nextflow_schema.json b/nextflow_schema.json index ec1fa4b2..a459def4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -163,7 +163,7 @@ "pattern": "^\\S+\\.csv$", "format": "file-path", "mimetype": "text/csv", - "schema": "/assets/schema_snpdb.json", + "schema": "/assets/schema_snp_db.json", "description": "A csv file with echtvar databases to annotate SNVs with", "exists": true }, diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 9a1dc047..a56d1450 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -17,9 +17,9 @@ workflow ANNOTATE_CADD { ch_fai // channel: [mandatory] [ val(meta), path(fai) ] ch_vcf // channel: [mandatory] [ val(meta), path(vcfs) ] ch_index // channel: [mandatory] [ val(meta), path(tbis) ] - ch_header // channel: [mandatory] [ path(txt) ] - ch_cadd_resources // channel: [mandatory] [ path(dir) ] - ch_cadd_prescored // channel: [mandatory] [ path(dir) ] + ch_header // channel: [mandatory] [ val(meta), path(txt) ] + ch_cadd_resources // channel: [mandatory] [ val(meta), path(dir) ] + ch_cadd_prescored // channel: [mandatory] [ val(meta), path(dir) ] main: ch_versions = Channel.empty() @@ -64,7 +64,7 @@ workflow ANNOTATE_CADD { ANNOTATE_INDELS ( ch_annotate_indels_in, - ch_header, + ch_header.map { meta, header -> header }, CADD_TO_REFERENCE_CHRNAMES.out.output.map { meta, txt -> txt } ) ch_versions = ch_versions.mix(ANNOTATE_INDELS.out.versions) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index f18cf360..23bfe8f0 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -10,7 +10,6 @@ workflow PREPARE_GENOME { gunzip_fasta // bool: should we gunzip fasta ch_vep_cache // channel: [optional] [ val(meta), path(cache) ] split_vep_files // bool: are there vep extra files - ch_vep_extra_files_unsplit // channel: [optional] [ val(meta), path(csv) ] main: ch_versions = Channel.empty() @@ -40,33 +39,13 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(UNTAR_VEP_CACHE.out.versions) UNTAR_VEP_CACHE.out.untar - .map { meta, files -> [ files ] } .collect() .set { untarred_vep } - // Read and store paths in the vep_plugin_files file - if ( split_vep_files ) { - ch_vep_extra_files_unsplit - .splitCsv ( header:true ) - .map { row -> - path = file(row.vep_files[0]) - if(path.exists()) { - return [path] - } else { - error("\nVep database file ${path} does not exist.") - } - } - .collect() - .set { ch_vep_extra_files } - } else { - ch_vep_extra_files = 
diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
index f18cf360..23bfe8f0 100644
--- a/subworkflows/local/prepare_genome.nf
+++ b/subworkflows/local/prepare_genome.nf
@@ -10,7 +10,6 @@ workflow PREPARE_GENOME {
     gunzip_fasta               // bool: should we gunzip fasta
     ch_vep_cache               // channel: [optional] [ val(meta), path(cache) ]
     split_vep_files            // bool: are there vep extra files
-    ch_vep_extra_files_unsplit // channel: [optional] [ val(meta), path(csv) ]

     main:
     ch_versions = Channel.empty()
@@ -40,33 +39,13 @@ workflow PREPARE_GENOME {
     ch_versions = ch_versions.mix(UNTAR_VEP_CACHE.out.versions)

     UNTAR_VEP_CACHE.out.untar
-        .map { meta, files -> [ files ] }
         .collect()
         .set { untarred_vep }

-    // Read and store paths in the vep_plugin_files file
-    if ( split_vep_files ) {
-        ch_vep_extra_files_unsplit
-            .splitCsv ( header:true )
-            .map { row ->
-                path = file(row.vep_files[0])
-                if(path.exists()) {
-                    return [path]
-                } else {
-                    error("\nVep database file ${path} does not exist.")
-                }
-            }
-            .collect()
-            .set { ch_vep_extra_files }
-    } else {
-        ch_vep_extra_files = Channel.value([])
-    }
-
     emit:
     mmi             = MINIMAP2_INDEX.out.index.collect() // channel: [ val(meta), path(mmi) ]
     fai             = SAMTOOLS_FAIDX.out.fai.collect()   // channel: [ val(meta), path(fai) ]
     fasta           = ch_fasta                           // channel: [ val(meta), path(fasta) ]
-    vep_resources   = untarred_vep                       // channel: [ path(cache) ]
-    vep_extra_files = ch_vep_extra_files                 // channel: [ path(files) ]
+    vep_resources   = untarred_vep                       // channel: [ val(meta), path(cache) ]
    versions        = ch_versions                        // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/rank_variants/tests/main.nf.test b/subworkflows/local/rank_variants/tests/main.nf.test
index 581a7510..473f16c8 100644
--- a/subworkflows/local/rank_variants/tests/main.nf.test
+++ b/subworkflows/local/rank_variants/tests/main.nf.test
@@ -20,9 +20,6 @@ nextflow_workflow {
                     file(params.pipelines_testdata_base_path + 'reference/vep_cache_test_data.tar.gz', checkIfExists:true)
                 ]
                 input[3] = true
-                input[4] = Channel.of([
-                    file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ])
                 """
             }
        }
@@ -69,9 +66,11 @@ nextflow_workflow {
                 ]
                 input[2] = PREPARE_GENOME.out.fasta
                 input[3] = PREPARE_GENOME.out.fai
-                input[4] = PREPARE_GENOME.out.vep_resources
+                input[4] = PREPARE_GENOME.out.vep_resources.map { meta, cache -> cache }
                 input[5] = Channel.value('110')
-                input[6] = PREPARE_GENOME.out.vep_extra_files
+                input[6] = Channel.of([
+                    file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null
diff --git a/subworkflows/local/snv_annotation/main.nf b/subworkflows/local/snv_annotation/main.nf
index 82fd3695..d1e382be 100644
--- a/subworkflows/local/snv_annotation/main.nf
+++ b/subworkflows/local/snv_annotation/main.nf
@@ -8,7 +8,7 @@ workflow SNV_ANNOTATION {

     take:
     ch_vcf             // channel [mandatory] [ val(meta), path(vcf) ]
-    ch_databases       // channel: [mandatory] [ val(meta), path(db) ]
+    ch_databases       // channel: [mandatory] [ path(db) ]
     ch_fasta           // channel: [mandatory] [ val(meta), path(fasta) ]
     ch_fai             // channel: [mandatory] [ val(meta), path(fai) ]
     ch_vep_cache       // channel: [mandatory] [ path(cache) ]
@@ -16,8 +16,8 @@ workflow SNV_ANNOTATION {
     ch_vep_extra_files // channel: [mandatory] [ path(files) ]
     val_annotate_cadd  // bool: [mandatory]
     ch_cadd_header     // channel: [mandatory] [ path(txt) ]
-    ch_cadd_resources  // channel: [mandatory] [ path(annotation) ]
-    ch_cadd_prescored  // channel: [mandatory] [ path(prescored) ]
+    ch_cadd_resources  // channel: [mandatory] [ val(meta), path(annotation) ]
+    ch_cadd_prescored  // channel: [mandatory] [ val(meta), path(prescored) ]

     main:
     ch_versions = Channel.empty()
diff --git a/subworkflows/local/snv_annotation/tests/main.nf.test b/subworkflows/local/snv_annotation/tests/main.nf.test
index 3164b44a..dd5ee5ef 100644
--- a/subworkflows/local/snv_annotation/tests/main.nf.test
+++ b/subworkflows/local/snv_annotation/tests/main.nf.test
@@ -88,11 +88,11 @@ nextflow_workflow {
                 ]
                 input[2] = GUNZIP.out.gunzip
                 input[3] = SAMTOOLS_FAIDX.out.fai
-                input[4] = UNTAR.out.untar.map { meta, cache -> cache }
+                input[4] = UNTAR.out.untar.map { meta, cache -> cache}
                 input[5] = Channel.value('110')
-                input[6] = [
+                input[6] = Channel.of([
                     file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ]
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null
@@ -132,9 +132,9 @@ nextflow_workflow {
                 input[3] = SAMTOOLS_FAIDX.out.fai
                 input[4] = UNTAR.out.untar.map { meta, cache -> cache }
                 input[5] = Channel.value('110')
-                input[6] = [
+                input[6] = Channel.of([
                     file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
-                ]
+                ]).splitCsv(header:true).map { row -> row.vep_files }.collect()
                 input[7] = false
                 input[8] = Channel.value([])
                 input[9] = null
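
With `vep_extra_files` gone from PREPARE_GENOME, the tests parse the plugin-files CSV inline. Roughly, the operator chain flattens the one-column samplesheet into a single value list (row contents here are hypothetical):

    Channel.of([ file('vep_plugin_files.csv') ])
        .splitCsv(header: true)        // emits e.g. [vep_files: '/refs/plugin_a.txt'] per row
        .map { row -> row.vep_files }  // emits '/refs/plugin_a.txt', '/refs/plugin_b.vcf.gz', ...
        .collect()                     // emits [ '/refs/plugin_a.txt', '/refs/plugin_b.vcf.gz' ]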
diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
index 3061e3c0..5fca3199 100644
--- a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -640,3 +640,15 @@ def findKeyForValue(def valueToFind, Map map) {
     }
     return null // Value not found
 }
+
+// Utility function to create channels from references
+def createReferenceChannelFromPath(param, defaultValue = '') {
+    return param ? Channel.fromPath(param, checkIfExists: true)
+        .map { [ [ id: it.simpleName ], it ] }
+        .collect() : defaultValue
+}
+// Utility function to create channels from samplesheets
+def createReferenceChannelFromSamplesheet(param, schema, defaultValue = '') {
+    return param ? Channel.fromList(samplesheetToList(param, schema)) : defaultValue
+}
+
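
In use, an optional parameter that is set becomes a value channel of `[ meta, path ]`, while an unset one falls through to `defaultValue`. A sketch, with hypothetical parameter values:

    // params.par_regions = '/refs/par.bed'
    // -> value channel [ [ id: 'par' ], /refs/par.bed ]
    ch_par            = createReferenceChannelFromPath(params.par_regions)

    // params.tandem_repeats unset -> placeholder value channel [ [], [] ]
    ch_tandem_repeats = createReferenceChannelFromPath(params.tandem_repeats, Channel.value([[],[]]))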
diff --git a/tests/samplesheet.nf.test.snap b/tests/samplesheet.nf.test.snap
index 838d5318..d1480342 100644
--- a/tests/samplesheet.nf.test.snap
+++ b/tests/samplesheet.nf.test.snap
@@ -546,6 +546,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-30T10:27:37.120618269"
+        "timestamp": "2024-10-30T11:40:24.479263781"
     }
 }
\ No newline at end of file
diff --git a/tests/samplesheet_multisample_bam.nf.test.snap b/tests/samplesheet_multisample_bam.nf.test.snap
index 75143c8d..a19e876b 100644
--- a/tests/samplesheet_multisample_bam.nf.test.snap
+++ b/tests/samplesheet_multisample_bam.nf.test.snap
@@ -746,6 +746,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-30T10:29:12.353783346"
+        "timestamp": "2024-10-30T11:42:12.581768636"
     }
 }
\ No newline at end of file
diff --git a/tests/samplesheet_multisample_ont_bam.nf.test.snap b/tests/samplesheet_multisample_ont_bam.nf.test.snap
index 7bce4132..8538e967 100644
--- a/tests/samplesheet_multisample_ont_bam.nf.test.snap
+++ b/tests/samplesheet_multisample_ont_bam.nf.test.snap
@@ -490,6 +490,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-30T08:37:01.633018038"
+        "timestamp": "2024-10-30T11:43:41.130041374"
     }
 }
\ No newline at end of file
diff --git a/workflows/nallo.nf b/workflows/nallo.nf
index 45587e62..c78d9612 100644
--- a/workflows/nallo.nf
+++ b/workflows/nallo.nf
@@ -1,5 +1,8 @@
 include { samplesheetToList } from 'plugin/nf-schema'
-
+include {
+    createReferenceChannelFromPath
+    createReferenceChannelFromSamplesheet
+} from '../subworkflows/local/utils_nfcore_nallo_pipeline'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     IMPORT LOCAL SUBWORKFLOWS
@@ -69,46 +72,30 @@ workflow NALLO {
     ch_versions      = Channel.empty()
     ch_multiqc_files = Channel.empty()

-    // Optional input files that has to be set depending on which workflow is run
-    ch_cadd_header              = Channel.fromPath("$projectDir/assets/cadd_to_vcf_header_-1.0-.txt", checkIfExists: true).collect()
-    ch_cadd_resources           = params.cadd_resources ? Channel.fromPath(params.cadd_resources).collect()
-                                                        : ''
-    ch_cadd_prescored           = params.cadd_prescored ? Channel.fromPath(params.cadd_prescored).collect()
-                                                        : ''
-    ch_fasta                    = params.fasta ? Channel.fromPath(params.fasta).map { it -> [ it.simpleName, it ] }.collect()
-                                               : ''
-    ch_tandem_repeats           = params.tandem_repeats ? Channel.fromPath(params.tandem_repeats).map{ [ it.simpleName, it ] }.collect()
-                                                        : Channel.value([[],[]])
-    ch_input_bed                = params.bed ? Channel.fromPath(params.bed).map{ [ [ id:it.simpleName ] , it ] }.collect()
-                                             : Channel.value([[],[]])
-    ch_par                      = params.par_regions ? Channel.fromPath(params.par_regions).map { [ [ id: it.simpleName ], it ] }.collect()
-                                                     : ''
-    ch_trgt_bed                 = params.trgt_repeats ? Channel.fromPath(params.trgt_repeats).map { it -> [ it.simpleName, it ] }.collect()
-                                                      : ''
-    ch_variant_catalog          = params.variant_catalog ? Channel.fromPath(params.variant_catalog).map { it -> [ it.simpleName, it ] }.collect()
-                                                         : ''
-    ch_databases                = params.snp_db ? Channel.fromList(samplesheetToList(params.snp_db, 'assets/schema_snpdb.json')).map{ it[1] }.collect()
-                                                : ''
-    ch_variant_consequences_snv = params.variant_consequences_snv ? Channel.fromPath(params.variant_consequences_snv).map { it -> [ it.simpleName, it ] }.collect()
-                                                                  : Channel.value([])
-    ch_vep_cache_unprocessed    = params.vep_cache ? Channel.fromPath(params.vep_cache).map { it -> [ [ id:'vep_cache' ], it ] }.collect()
-                                                   : Channel.value([[],[]])
-    ch_vep_extra_files_unsplit  = params.vep_plugin_files ? Channel.fromPath(params.vep_plugin_files).collect()
-                                                          : ''
-    ch_expected_xy_bed          = params.hificnv_xy ? Channel.fromPath(params.hificnv_xy).map { it -> [ [ id: it.simpleName ], it ] }.collect()
-                                                    : ''
-    ch_expected_xx_bed          = params.hificnv_xx ? Channel.fromPath(params.hificnv_xx).map { it -> [ [ id: it.simpleName ], it ] }.collect()
-                                                    : ''
-    ch_exclude_bed              = params.hificnv_exclude ? Channel.fromPath(params.hificnv_exclude).map { it -> [ [ id: it.simpleName ], it ] }.collect()
-                                                         : ''
-    ch_reduced_penetrance       = params.reduced_penetrance ? Channel.fromPath(params.reduced_penetrance).map { it -> [ it.simpleName, it ] }.collect()
-                                                            : Channel.value([])
-    ch_score_config_snv         = params.score_config_snv ? Channel.fromPath(params.score_config_snv).map { it -> [ it.simpleName, it ] }.collect()
-                                                          : Channel.value([])
-    ch_somalier_sites           = params.somalier_sites ? Channel.fromPath(params.somalier_sites).map { [ it.simpleName, it ] }.collect()
-                                                        : ''
-    ch_svdb_dbs                 = params.svdb_dbs ? Channel.fromPath(params.svdb_dbs).map { [ it.simpleName, it ] }.collect()
-                                                  : ''
+    // Channels from (optional) input files
+    // If provided: [[id: 'reference'], [/path/to/reference_full_name.file]]
+    ch_cadd_header              = createReferenceChannelFromPath("$projectDir/assets/cadd_to_vcf_header_-1.0-.txt")
+    ch_cadd_resources           = createReferenceChannelFromPath(params.cadd_resources)
+    ch_cadd_prescored           = createReferenceChannelFromPath(params.cadd_prescored)
+    ch_fasta                    = createReferenceChannelFromPath(params.fasta)
+    ch_tandem_repeats           = createReferenceChannelFromPath(params.tandem_repeats, Channel.value([[],[]]))
+    ch_input_bed                = createReferenceChannelFromPath(params.bed, Channel.value([[],[]]))
+    ch_par                      = createReferenceChannelFromPath(params.par_regions)
+    ch_trgt_bed                 = createReferenceChannelFromPath(params.trgt_repeats)
+    ch_variant_catalog          = createReferenceChannelFromPath(params.variant_catalog)
+    ch_variant_consequences_snv = createReferenceChannelFromPath(params.variant_consequences_snv)
+    ch_vep_cache_unprocessed    = createReferenceChannelFromPath(params.vep_cache, Channel.value([]))
+    ch_expected_xy_bed          = createReferenceChannelFromPath(params.hificnv_xy)
+    ch_expected_xx_bed          = createReferenceChannelFromPath(params.hificnv_xx)
+    ch_exclude_bed              = createReferenceChannelFromPath(params.hificnv_exclude)
+    ch_reduced_penetrance       = createReferenceChannelFromPath(params.reduced_penetrance)
+    ch_score_config_snv         = createReferenceChannelFromPath(params.score_config_snv)
+    ch_somalier_sites           = createReferenceChannelFromPath(params.somalier_sites)
+    ch_svdb_dbs                 = createReferenceChannelFromPath(params.svdb_dbs)
+
+    // Channels from (optional) input samplesheets validated by schema
+    ch_databases                = createReferenceChannelFromSamplesheet(params.snp_db, 'assets/schema_snp_db.json')
+    ch_vep_plugin_files         = createReferenceChannelFromSamplesheet(params.vep_plugin_files, 'assets/schema_vep_plugin_files.json', Channel.value([]))

     // Check parameter that doesn't conform to schema validation here
     if (params.phaser.matches('hiphase') && params.preset == 'ONT_R10') {
         error "The HiPhase license only permits analysis of data from PacBio. For details see: https://github.com/PacificBiosciences/HiPhase/blob/main/LICENSE.md"
     }
@@ -125,24 +112,21 @@ workflow NALLO {
     //
     // Prepare references
     //
-    if(!params.skip_mapping_wf | !params.skip_assembly_wf ) {
+    if(!params.skip_mapping_wf || !params.skip_assembly_wf ) {
         PREPARE_GENOME (
             ch_fasta,
             params.fasta.endsWith('.gz'),
             ch_vep_cache_unprocessed,
-            params.vep_plugin_files,
-            ch_vep_extra_files_unsplit
+            params.vep_plugin_files
         )
         ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions)

-        if(!params.skip_snv_annotation) {
-            if (params.vep_cache) {
-                if (params.vep_cache.endsWith("tar.gz")) {
-                    ch_vep_cache = PREPARE_GENOME.out.vep_resources
-                } else {
-                    ch_vep_cache = Channel.fromPath(params.vep_cache).collect()
-                }
+        if(!params.skip_snv_annotation && params.vep_cache) {
+            if (params.vep_cache.endsWith("tar.gz")) {
+                ch_vep_cache = PREPARE_GENOME.out.vep_resources
+            } else {
+                ch_vep_cache = Channel.fromPath(params.vep_cache).collect()
             }
         }
@@ -349,12 +333,12 @@ workflow NALLO {
         //
         SNV_ANNOTATION(
             SHORT_VARIANT_CALLING.out.combined_bcf,
-            ch_databases,
+            ch_databases.map { meta, databases -> databases }.collect(),
             fasta,
             fai.map { name, fai -> [ [ id: name ], fai ] },
-            ch_vep_cache,
+            ch_vep_cache.map { meta, cache -> cache },
             params.vep_cache_version,
-            PREPARE_GENOME.out.vep_extra_files,
+            ch_vep_plugin_files.collect(),
             (params.cadd_resources && params.cadd_prescored),
             ch_cadd_header,
             ch_cadd_resources,
@@ -528,9 +512,9 @@ workflow NALLO {
             annotate_svs_in,
             fasta,
             ch_svdb_dbs,
-            ch_vep_cache,
+            ch_vep_cache.map { meta, cache -> cache },
             params.vep_cache_version,
-            PREPARE_GENOME.out.vep_extra_files
+            ch_vep_plugin_files.collect()
         )
     }
 }
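
Where a downstream subworkflow still expects a bare path, the call site strips the meta map at the boundary, and `.collect()` turns the result into a value channel that can be read by more than one process. A sketch of the shape change (channel contents hypothetical):

    // [ [ id: 'vep_cache' ], /refs/vep_cache ]  ->  /refs/vep_cache
    ch_vep_cache.map { meta, cache -> cache }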