Commit c333d53

Refactor reference channels

fellen31 committed Oct 30, 2024
1 parent 9000326 commit c333d53
Showing 16 changed files with 104 additions and 108 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -71,6 +71,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#438](https://github.com/genomic-medicine-sweden/nallo/pull/438) - Updated pipeline tests to use functions in nft-utils instead of checking hardcoded paths
- [#440](https://github.com/genomic-medicine-sweden/nallo/pull/440) - Updated hifiasm to 0.20 with new default parameters for telomeres and scaffolding ([#295](https://github.com/genomic-medicine-sweden/nallo/issues/295))
- [#441](https://github.com/genomic-medicine-sweden/nallo/pull/441) - Changed the minimap2 preset for hifi reads back to `map-hifi`
- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Refactored reference channel assignments
- [#443](https://github.com/genomic-medicine-sweden/nallo/pull/443) - Updated schemas for `vep_plugin_files` and `snp_db`

### `Removed`

11 changes: 5 additions & 6 deletions assets/schema_snpdb.json → assets/schema_snp_db.json
@@ -1,23 +1,22 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_gvcfs.json",
"title": "genomic-medicine-sweden/nallo pipeline - params.extra_gvcfs schema",
"description": "Schema for the file provided with params.extra_gvcfs",
"$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_snp_db.json",
"title": "genomic-medicine-sweden/nallo pipeline - params.snp_db schema",
"description": "Schema for the file provided with params.snp_db",
"type": "array",
"items": {
"type": "object",
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
"errorMessage": "Sample must be provided and cannot contain spaces."
},
"file": {
"format": "file-path",
"type": "string",
"pattern": "^\\S+\\.zip$",
"errorMessage": "gVCF file must be provided, cannot contain spaces and must have extension 'g.vcf.gz' or 'gvcf.gz'"
"errorMessage": "Echtvar database must be provided, cannot contain spaces and must have extension '.zip'"
}
},
"required": ["sample", "file"]
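A samplesheet satisfying the renamed `schema_snp_db.json` would be a CSV with `sample` and `file` columns, where `file` points at an echtvar `.zip` archive. A minimal sketch (the database name and path are illustrative, not from this commit):

```csv
sample,file
gnomad,/path/to/gnomad.echtvar.zip
```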
20 changes: 20 additions & 0 deletions assets/schema_vep_plugin_files.json
@@ -0,0 +1,20 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_vep_plugin_files.json",
"title": "genomic-medicine-sweden/nallo pipeline - params.vep_plugin_files schema",
"description": "Schema for the file provided with params.vep_plugin_files",
"type": "array",
"items": {
"type": "object",
"properties": {
"vep_files": {
"format": "file-path",
"type": "string",
"pattern": "^\\S+",
"exists": true,
"errorMessage": "Vep plugin file must be a path and exist."
}
},
"required": ["vep_files"]
}
}
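The new `schema_vep_plugin_files.json` validates a one-column CSV whose `vep_files` entries must be existing paths. A hypothetical example (the plugin file path is illustrative only):

```csv
vep_files
/path/to/spliceai_scores.raw.snv.hg38.vcf.gz
```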
5 changes: 3 additions & 2 deletions modules/nf-core/cadd/cadd.diff


4 changes: 2 additions & 2 deletions modules/nf-core/cadd/main.nf


2 changes: 1 addition & 1 deletion nextflow_schema.json
@@ -163,7 +163,7 @@
"pattern": "^\\S+\\.csv$",
"format": "file-path",
"mimetype": "text/csv",
"schema": "/assets/schema_snpdb.json",
"schema": "/assets/schema_snp_db.json",
"description": "A csv file with echtvar databases to annotate SNVs with",
"exists": true
},
8 changes: 4 additions & 4 deletions subworkflows/local/annotate_cadd/main.nf
@@ -17,9 +17,9 @@ workflow ANNOTATE_CADD {
ch_fai // channel: [mandatory] [ val(meta), path(fai) ]
ch_vcf // channel: [mandatory] [ val(meta), path(vcfs) ]
ch_index // channel: [mandatory] [ val(meta), path(tbis) ]
ch_header // channel: [mandatory] [ path(txt) ]
ch_cadd_resources // channel: [mandatory] [ path(dir) ]
ch_cadd_prescored // channel: [mandatory] [ path(dir) ]
ch_header // channel: [mandatory] [ val(meta), path(txt) ]
ch_cadd_resources // channel: [mandatory] [ val(meta), path(dir) ]
ch_cadd_prescored // channel: [mandatory] [ val(meta), path(dir) ]

main:
ch_versions = Channel.empty()
@@ -64,7 +64,7 @@ workflow ANNOTATE_CADD {

ANNOTATE_INDELS (
ch_annotate_indels_in,
ch_header,
ch_header.map { meta, header -> header },
CADD_TO_REFERENCE_CHRNAMES.out.output.map { meta, txt -> txt }
)
ch_versions = ch_versions.mix(ANNOTATE_INDELS.out.versions)
23 changes: 1 addition & 22 deletions subworkflows/local/prepare_genome.nf
@@ -10,7 +10,6 @@ workflow PREPARE_GENOME {
gunzip_fasta // bool: should we gunzip fasta
ch_vep_cache // channel: [optional] [ val(meta), path(cache) ]
split_vep_files // bool: are there vep extra files
ch_vep_extra_files_unsplit // channel: [optional] [ val(meta), path(csv) ]

main:
ch_versions = Channel.empty()
@@ -40,33 +39,13 @@
ch_versions = ch_versions.mix(UNTAR_VEP_CACHE.out.versions)

UNTAR_VEP_CACHE.out.untar
.map { meta, files -> [ files ] }
.collect()
.set { untarred_vep }

// Read and store paths in the vep_plugin_files file
if ( split_vep_files ) {
ch_vep_extra_files_unsplit
.splitCsv ( header:true )
.map { row ->
path = file(row.vep_files[0])
if(path.exists()) {
return [path]
} else {
error("\nVep database file ${path} does not exist.")
}
}
.collect()
.set { ch_vep_extra_files }
} else {
ch_vep_extra_files = Channel.value([])
}

emit:
mmi = MINIMAP2_INDEX.out.index.collect() // channel: [ val(meta), path(mmi) ]
fai = SAMTOOLS_FAIDX.out.fai.collect() // channel: [ val(meta), path(fai) ]
fasta = ch_fasta // channel: [ val(meta), path(fasta) ]
vep_resources = untarred_vep // channel: [ path(cache) ]
vep_extra_files = ch_vep_extra_files // channel: [ path(files) ]
vep_resources = untarred_vep // channel: [ val(meta), path(cache) ]
versions = ch_versions // channel: [ versions.yml ]
}
9 changes: 4 additions & 5 deletions subworkflows/local/rank_variants/tests/main.nf.test
@@ -20,9 +20,6 @@ nextflow_workflow {
file(params.pipelines_testdata_base_path + 'reference/vep_cache_test_data.tar.gz', checkIfExists:true)
]
input[3] = true
input[4] = Channel.of([
file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
])
"""
}
}
@@ -69,9 +66,11 @@ nextflow_workflow {
]
input[2] = PREPARE_GENOME.out.fasta
input[3] = PREPARE_GENOME.out.fai
input[4] = PREPARE_GENOME.out.vep_resources
input[4] = PREPARE_GENOME.out.vep_resources.map { meta, cache -> cache }
input[5] = Channel.value('110')
input[6] = PREPARE_GENOME.out.vep_extra_files
input[6] = Channel.of([
file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
]).splitCsv(header:true).map { row -> row.vep_files }.collect()
input[7] = false
input[8] = Channel.value([])
input[9] = null
6 changes: 3 additions & 3 deletions subworkflows/local/snv_annotation/main.nf
@@ -8,16 +8,16 @@ workflow SNV_ANNOTATION {

take:
ch_vcf // channel [mandatory] [ val(meta), path(vcf) ]
ch_databases // channel: [mandatory] [ val(meta), path(db) ]
ch_databases // channel: [mandatory] [ path(db) ]
ch_fasta // channel: [mandatory] [ val(meta), path(fasta) ]
ch_fai // channel: [mandatory] [ val(meta), path(fai) ]
ch_vep_cache // channel: [mandatory] [ path(cache) ]
val_vep_cache_version // string: [mandatory] default: 110
ch_vep_extra_files // channel: [mandatory] [ path(files) ]
val_annotate_cadd // bool: [mandatory]
ch_cadd_header // channel: [mandatory] [ path(txt) ]
ch_cadd_resources // channel: [mandatory] [ path(annotation) ]
ch_cadd_prescored // channel: [mandatory] [ path(prescored) ]
ch_cadd_resources // channel: [mandatory] [ val(meta), path(annotation) ]
ch_cadd_prescored // channel: [mandatory] [ val(meta), path(prescored) ]

main:
ch_versions = Channel.empty()
10 changes: 5 additions & 5 deletions subworkflows/local/snv_annotation/tests/main.nf.test
@@ -88,11 +88,11 @@ nextflow_workflow {
]
input[2] = GUNZIP.out.gunzip
input[3] = SAMTOOLS_FAIDX.out.fai
input[4] = UNTAR.out.untar.map { meta, cache -> cache }
input[4] = UNTAR.out.untar.map { meta, cache -> cache}
input[5] = Channel.value('110')
input[6] = [
input[6] = Channel.of([
file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
]
]).splitCsv(header:true).map { row -> row.vep_files }.collect()
input[7] = false
input[8] = Channel.value([])
input[9] = null
@@ -132,9 +132,9 @@
input[3] = SAMTOOLS_FAIDX.out.fai
input[4] = UNTAR.out.untar.map { meta, cache -> cache }
input[5] = Channel.value('110')
input[6] = [
input[6] = Channel.of([
file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true)
]
]).splitCsv(header:true).map { row -> row.vep_files }.collect()
input[7] = false
input[8] = Channel.value([])
input[9] = null
12 changes: 12 additions & 0 deletions subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -640,3 +640,15 @@ def findKeyForValue(def valueToFind, Map map) {
}
return null // Value not found
}

// Utility function to create channels from references
def createReferenceChannelFromPath(param, defaultValue = '') {
return param ? Channel.fromPath(param, checkIfExists: true)
.map { [ [ id: it.simpleName ], it ] }
.collect() : defaultValue
}
// Utility function to create channels from samplesheets
def createReferenceChannelFromSamplesheet(param, schema, defaultValue = '') {
return param ? Channel.fromList(samplesheetToList(param, schema)) : defaultValue
}
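The two helpers above centralize the reference-channel pattern: wrap a path parameter in a `[ meta, path ]` tuple keyed by the file's simple name, or parse a samplesheet against its JSON schema. Hypothetical call sites (the parameter names and file paths here are illustrative, not taken from this commit):

```nextflow
// Path-style reference: yields a value channel of
// [ [ id: 'repeats' ], /refs/repeats.bed ], or '' when the param is unset
ch_tandem_repeat_bed = createReferenceChannelFromPath(params.tandem_repeat_bed)

// Samplesheet-style reference: one tuple per validated row,
// e.g. [ [ sample: 'gnomad' ], gnomad.echtvar.zip ] for params.snp_db
ch_databases = createReferenceChannelFromSamplesheet(
    params.snp_db,
    "${projectDir}/assets/schema_snp_db.json"
)
```

Returning `defaultValue` (an empty string by default) when the parameter is unset lets downstream subworkflows receive a harmless placeholder instead of an unbound channel.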

2 changes: 1 addition & 1 deletion tests/samplesheet.nf.test.snap
@@ -546,6 +546,6 @@
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-10-30T10:27:37.120618269"
"timestamp": "2024-10-30T11:40:24.479263781"
}
}
2 changes: 1 addition & 1 deletion tests/samplesheet_multisample_bam.nf.test.snap
@@ -746,6 +746,6 @@
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-10-30T10:29:12.353783346"
"timestamp": "2024-10-30T11:42:12.581768636"
}
}
2 changes: 1 addition & 1 deletion tests/samplesheet_multisample_ont_bam.nf.test.snap
@@ -490,6 +490,6 @@
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-10-30T08:37:01.633018038"
"timestamp": "2024-10-30T11:43:41.130041374"
}
}