working

genomic-medicine-sweden · Oct 30, 2024 · 29a7b61 · 29a7b61
1 parent e1b8480
commit 29a7b61
Show file tree

Hide file tree

Showing 17 changed files with 85 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@
 
 ##### Ranking
 
-- Rank SNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)
+- Rank SNVs, INDELs and SVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)
 
 ## Usage
 

diff --git a/conf/modules/annotate_svs.config b/conf/modules/annotate_svs.config
@@ -42,15 +42,15 @@ process {
         publishDir = [
             path: { "${params.outdir}/svs/family/${meta.id}" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
         ]
     }
 
     withName: '.*ANNOTATE_SVS:TABIX_ENSEMBLVEP_SV' {
         publishDir = [
             path: { "${params.outdir}/svs/family/${meta.id}" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
         ]
     }
 

diff --git a/conf/modules/rank_variants.config b/conf/modules/rank_variants.config
@@ -70,4 +70,12 @@ process {
         ext.prefix = { "${meta.id}_svs_genmod_compound" }
         ext.args = "--temp_dir ./"
     }
+    withName: '.*:RANK_VARIANTS_SVS:TABIX_BGZIPTABIX' {
+        ext.prefix = { params.skip_cnv_calling ? "${meta.id}_svs_merged_annotated_ranked" : "${meta.id}_svs_cnvs_merged_annotated_ranked" }
+        publishDir = [
+            path: { "${params.outdir}/svs/family/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -18,7 +18,7 @@ params {
     modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'
 
     // Base directory for genomic-medicine-sweden/nallo test data
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
 
     // References
     fasta                    = params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz'
@@ -36,6 +36,7 @@ params {
     svdb_dbs                 = params.pipelines_testdata_base_path + 'testdata/svdb_dbs.csv'
     reduced_penetrance       = params.pipelines_testdata_base_path + 'reference/reduced_penetrance.tsv'
     score_config_snv         = params.pipelines_testdata_base_path + 'reference/rank_model_snv.ini'
+    score_config_svs         = params.pipelines_testdata_base_path + 'reference/rank_model_svs.ini'
     variant_consequences_snv = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
     variant_consequences_svs = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
     somalier_sites           = params.pipelines_testdata_base_path + 'reference/somalier_sites.vcf.gz'

diff --git a/docs/output.md b/docs/output.md
@@ -206,10 +206,11 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
 [Severus](https://github.com/KolmogorovLab/Severus) or [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call structural variants.
 [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs. It also produces copy number, depth, and MAF [visualization tracks](#visualization-tracks).
 [SVDB](https://github.com/J35P312/SVDB) is used to combine and merge SVs and CNVs within and between samples.
+[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.
 
 !!!note
 
-    Variants are only output without annotation if that subworkflow is turned off.
+    Variants are only output without annotation and/or ranking if these subworkflows are turned off.
 
 !!!note
 
@@ -237,6 +238,15 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
 | `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz`          | VCF file with merged and annotated SVs per family (output if CNV-calling is off) |
 | `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz.tbi`      | Index of the merged VCF file                                                     |
 
+[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.
+
+| Path                                                                             | Description                                                                              |
+| -------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz`     | VCF file with merged, annotated and ranked CNVs and SVs per family                       |
+| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi` | Index of the merged VCF file                                                             |
+| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz`          | VCF file with merged, annotated and ranked SVs per family (output if CNV-calling is off) |
+| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz.tbi`      | Index of the merged VCF file                                                             |
+
 ## Visualization Tracks
 
 [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs, but it also produces copy number, depth, and MAF tracks that can be visualized in for example IGV.

diff --git a/docs/parameters.md b/docs/parameters.md
@@ -41,15 +41,17 @@ Define where the pipeline should find input data and save output data.
 | `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details>| `string` |  |  |  |
 | `variant_catalog` | A variant catalog json-file for stranger | `string` |  |  |  |
 | `variant_consequences_snv` | File containing list of SO terms listed in the order of severity from most severe to lease severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
+| `variant_consequences_svs` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
 | `vep_cache` | A path to the VEP cache location | `string` |  |  |  |
 | `bed` | A BED file with regions of interest, used to limit short variant calling. | `string` |  |  |  |
 | `hificnv_xy` | A BED file containing expected copy number regions for XY samples. | `string` |  |  |  |
 | `hificnv_xx` | A BED file containing expected copy number regions for XX samples. | `string` |  |  |  |
 | `hificnv_exclude` | A BED file specifying regions to exclude with HiFiCNV, such as centromeres. | `string` |  |  |  |
 | `reduced_penetrance` | A file with gene ids that have reduced penetrance. For use with genmod. | `string` |  |  |  |
 | `score_config_snv` | A SNV rank model config file for genmod. | `string` |  |  |  |
+| `score_config_svs` | A SV rank model config file for genmod. | `string` |  |  |  |
 | `somalier_sites` | A VCF of known polymorphic sites for somalier | `string` |  |  |  |
-| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/ |  | True |
+| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/ |  | True |
 
 ## Reference genome options
 
@@ -106,7 +108,7 @@ Workflow options specific to genomic-medicine-sweden/nallo
 | `vep_cache_version` | VEP cache version | `integer` | 110 |  |  |
 | `vep_plugin_files` | A csv file with vep_plugins as header, and then paths to vep plugin files. Paths to pLI_values.txt and LoFtool_scores.txt are required. | `string` |  |  |  |
 | `deepvariant_model_type` | Sets the model type used for DeepVariant. This is set automatically using `--preset` by default. | `string` | PACBIO |  | True |
-| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` |  |  | True |
+| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` | map-hifi |  | True |
 | `extra_modkit_options` | Extra options to modkit, used for test profile. | `string` |  |  | True |
 | `extra_vep_options` | Extra options to VEP, used for test profile. | `string` |  |  | True |
 | `extra_paraphase_options` | Extra options to Paraphase, used for test profile. | `string` |  |  | True |

diff --git a/docs/usage.md b/docs/usage.md
@@ -298,6 +298,17 @@ These databases could for example come from [CoLoRSdb](https://zenodo.org/record
 
 Turned off with `--skip_sv_annotation`.
 
+### Rank SVs
+
+This subworkflow ranks SVs, and relies on the mapping, SV calling and SV annotation subworkflows, and requires the following additional files:
+
+| Parameter            | Description                                                                                                                                                                                                                                                 |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `score_config_svs`   | Used by GENMOD when ranking SVs. Sample file [here](https://github.com/genomic-medicine-sweden/test-datasets/blob/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/reference/rank_model_svs.ini).                                                                   |
+| `reduced_penetrance` | A list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv) |
+
+Turned off with `--skip_rank_variants`.
+
 ## Other highlighted parameters
 
 - Limit SNV calling to regions in BED file (`--bed`).

diff --git a/nextflow.config b/nextflow.config
@@ -20,9 +20,11 @@ params {
     variant_catalog              = null
     reduced_penetrance           = null
     score_config_snv             = null
+    score_config_svs             = null
     snp_db                       = null
     svdb_dbs                     = null
     variant_consequences_snv     = null
+    variant_consequences_svs     = null
     vep_cache                    = null
     vep_plugin_files             = null
     hificnv_xy                   = null
@@ -86,7 +88,7 @@ params {
     help_full                    = false
     show_hidden                  = false
     version                      = false
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
 
     // Config options
     config_profile_name        = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -189,6 +189,11 @@
                     "description": "File containing list of SO terms listed in the order of severity from most severe to lease severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
                     "fa_icon": "fas fa-file-csv"
                 },
+                "variant_consequences_svs": {
+                    "type": "string",
+                    "description": "File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
+                    "fa_icon": "fas fa-file-csv"
+                },
                 "vep_cache": {
                     "type": "string",
                     "description": "A path to the VEP cache location",
@@ -234,6 +239,13 @@
                     "fa_icon": "fas fa-file",
                     "description": "A SNV rank model config file for genmod."
                 },
+                "score_config_svs": {
+                    "type": "string",
+                    "exists": true,
+                    "format": "path",
+                    "fa_icon": "fas fa-file",
+                    "description": "A SV rank model config file for genmod."
+                },
                 "somalier_sites": {
                     "type": "string",
                     "pattern": "^\\S+\\.vcf(\\.gz)?$",
@@ -245,7 +257,7 @@
                     "type": "string",
                     "fa_icon": "far fa-check-circle",
                     "description": "Base URL or local path to location of pipeline test dataset files",
-                    "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/",
+                    "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/",
                     "hidden": true
                 }
             }
@@ -467,7 +479,8 @@
                     "type": "string",
                     "description": "Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default.",
                     "hidden": true,
-                    "enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"]
+                    "enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"],
+                    "default": "map-hifi"
                 },
                 "extra_modkit_options": {
                     "type": "string",

diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -73,9 +73,9 @@ def fileDependencies = [
     assembly         : ["fasta", "par_regions"], // The assembly workflow should be split into two - assembly and variant calling (requires ref)
     snv_calling      : ["fasta", "par_regions"],
     snv_annotation   : ["snp_db", "vep_cache", "vep_plugin_files", "variant_consequences_snv"],
-    sv_annotation    : ["svdb_dbs", "vep_cache", "vep_plugin_files"],
+    sv_annotation    : ["svdb_dbs", "vep_cache", "vep_plugin_files", "variant_consequences_svs"],
     cnv_calling      : ["hificnv_xy", "hificnv_xx", "hificnv_exclude"],
-    rank_variants    : ["reduced_penetrance", "score_config_snv"],
+    rank_variants    : ["reduced_penetrance", "score_config_snv", "score_config_svs"],
     repeat_calling   : ["trgt_repeats"],
     repeat_annotation: ["variant_catalog"],
 ]
@@ -108,10 +108,11 @@ def parameterStatus = [
         fasta                   : params.fasta,
         trgt_repeats            : params.trgt_repeats,
         variant_catalog         : params.variant_catalog,
-        score_config_snv        : params.score_config_snv,
         reduced_penetrance      : params.reduced_penetrance,
         score_config_snv        : params.score_config_snv,
+        score_config_svs        : params.score_config_svs,
         variant_consequences_snv: params.variant_consequences_snv,
+        variant_consequences_svs: params.variant_consequences_svs,
     ]
 ]
 

diff --git a/tests/samplesheet.nf.test b/tests/samplesheet.nf.test
@@ -9,7 +9,7 @@ nextflow_pipeline {
 
         when {
             params {
-                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
                 input                        = params.pipelines_testdata_base_path + 'testdata/samplesheet.csv'
                 outdir                       = "$outputDir"
             }

diff --git a/tests/samplesheet.nf.test.snap b/tests/samplesheet.nf.test.snap
@@ -1,7 +1,7 @@
 {
     "test profile": {
         "content": [
-            104,
+            112,
             {
                 "ADD_FOUND_IN_TAG": {
                     "bcftools": 1.2,
@@ -382,8 +382,8 @@
                 "svs",
                 "svs/family",
                 "svs/family/FAM",
-                "svs/family/FAM/FAM_svs_cnvs_merged_annotated.vcf.gz",
-                "svs/family/FAM/FAM_svs_cnvs_merged_annotated.vcf.gz.tbi",
+                "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz",
+                "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi",
                 "svs/single_sample",
                 "svs/single_sample/HG002_Revio",
                 "svs/single_sample/HG002_Revio/HG002_Revio_cnvs.vcf.gz",
@@ -507,7 +507,7 @@
                     "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=100, phased=false, phasedAutodetect=false]"
                 ],
                 [
-                    "FAM_svs_cnvs_merged_annotated.vcf.gz",
+                    "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz",
                     "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=87, phased=false, phasedAutodetect=false]"
                 ],
                 [
@@ -546,6 +546,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-30T10:27:37.120618269"
+        "timestamp": "2024-10-30T11:26:11.367381989"
     }
 }
diff --git a/tests/samplesheet_multisample_bam.nf.test b/tests/samplesheet_multisample_bam.nf.test
@@ -9,7 +9,7 @@ nextflow_pipeline {
 
         when {
             params {
-                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
                 input                        = params.pipelines_testdata_base_path + 'testdata/samplesheet_multisample_bam.csv'
                 outdir                       = "$outputDir"
                 phaser                       = "hiphase"

diff --git a/tests/samplesheet_multisample_bam.nf.test.snap b/tests/samplesheet_multisample_bam.nf.test.snap
@@ -1,7 +1,7 @@
 {
     "samplesheet_multisample_bam | --phaser hiphase": {
         "content": [
-            150,
+            158,
             {
                 "ADD_FOUND_IN_TAG": {
                     "bcftools": 1.2,
@@ -456,8 +456,8 @@
                 "svs",
                 "svs/family",
                 "svs/family/FAM",
-                "svs/family/FAM/FAM_svs_cnvs_merged_annotated.vcf.gz",
-                "svs/family/FAM/FAM_svs_cnvs_merged_annotated.vcf.gz.tbi",
+                "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz",
+                "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi",
                 "svs/single_sample",
                 "svs/single_sample/HG002_Revio_A",
                 "svs/single_sample/HG002_Revio_A/HG002_Revio_A_cnvs.vcf.gz",
@@ -683,7 +683,7 @@
                     "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=100, phased=false, phasedAutodetect=false]"
                 ],
                 [
-                    "FAM_svs_cnvs_merged_annotated.vcf.gz",
+                    "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz",
                     "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=87, phased=false, phasedAutodetect=false]"
                 ],
                 [
@@ -746,6 +746,6 @@
             "nf-test": "0.9.0",
             "nextflow": "24.04.4"
         },
-        "timestamp": "2024-10-30T10:29:12.353783346"
+        "timestamp": "2024-10-30T11:27:50.204023027"
     }
 }