Rank SVs (#450)

* wip * working * Update CHANGELOG * Update CHANGELOG again
genomic-medicine-sweden · Oct 30, 2024 · 558312d · 558312d
1 parent 9000326
commit 558312d
Show file tree

Hide file tree

Showing 19 changed files with 162 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#431](https://github.com/genomic-medicine-sweden/nallo/pull/431) - Added files needed to automatically build and publish docs to GitHub Pages
 - [#435](https://github.com/genomic-medicine-sweden/nallo/pull/435) - Added nf-test to rank variants
 - [#445](https://github.com/genomic-medicine-sweden/nallo/pull/445) - Added FOUND_IN tag and nf-test to rank variants
+- [#450](https://github.com/genomic-medicine-sweden/nallo/pull/450) - Added ranking of SVs (and CNVs)
 
 ### `Changed`
 

diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@
 
 ##### Ranking
 
-- Rank SNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)
+- Rank SNVs, INDELs and SVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)
 
 ## Usage
 

diff --git a/conf/modules/annotate_consequence_pli.config b/conf/modules/annotate_consequence_pli.config
@@ -12,6 +12,10 @@
 */
 
 process {
+
+    //
+    // SNVs
+    //
     withName: '.*:ANN_CSQ_PLI_SNV:.*' {
         publishDir = [
             enabled: false
@@ -29,4 +33,25 @@ process {
     withName: '.*ANN_CSQ_PLI_SNV:TABIX_BGZIPTABIX' {
         ext.prefix = { "${meta.id}_snv_csq_pli" }
     }
+
+    //
+    // SVs
+    //
+    withName: '.*:ANN_CSQ_PLI_SVS:.*' {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    withName: '.*ANN_CSQ_PLI_SVS:ADD_MOST_SEVERE_CSQ' {
+        ext.prefix = { "${meta.id}_svs_csq" }
+    }
+
+    withName: '.*ANN_CSQ_PLI_SVS:ADD_MOST_SEVERE_PLI' {
+        ext.prefix = { "${meta.id}_svs_csq_pli" }
+    }
+
+    withName: '.*ANN_CSQ_PLI_SVS:TABIX_BGZIPTABIX' {
+        ext.prefix = { "${meta.id}_svs_csq_pli" }
+    }
 }
diff --git a/conf/modules/annotate_svs.config b/conf/modules/annotate_svs.config
@@ -42,15 +42,15 @@ process {
         publishDir = [
             path: { "${params.outdir}/svs/family/${meta.id}" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
         ]
     }
 
     withName: '.*ANNOTATE_SVS:TABIX_ENSEMBLVEP_SV' {
         publishDir = [
             path: { "${params.outdir}/svs/family/${meta.id}" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
         ]
     }
 

diff --git a/conf/modules/rank_variants.config b/conf/modules/rank_variants.config
@@ -11,18 +11,16 @@
 ----------------------------------------------------------------------------------------
 */
 
-//
-// Score and rank SNVs
-//
-
 process {
 
+    //
+    // Score and rank SNVs
+    //
     withName: '.*:RANK_VARIANTS_SNV:.*' {
         publishDir = [
             enabled: false,
         ]
     }
-
     withName: '.*:RANK_VARIANTS_SNV:GENMOD_ANNOTATE' {
         ext.prefix = { "${meta.id}_snv_genmod_annotate" }
         ext.args = { [
@@ -31,20 +29,53 @@ process {
             '--temp_dir ./'
             ].join(' ') }
     }
-
     withName: '.*:RANK_VARIANTS_SNV:GENMOD_MODELS' {
         ext.prefix = { "${meta.id}_snv_genmod_models" }
         ext.args = "--whole_gene --temp_dir ./"
     }
-
     withName: '.*:RANK_VARIANTS_SNV:GENMOD_SCORE' {
         ext.prefix = { "${meta.id}_snv_genmod_score" }
         ext.args = "--rank_results"
     }
-
     withName: '.*:RANK_VARIANTS_SNV:GENMOD_COMPOUND' {
         ext.prefix = { "${meta.id}_snv_genmod_compound" }
         ext.args = "--temp_dir ./"
     }
 
+    //
+    // Score and rank SVs
+    //
+    withName: '.*:RANK_VARIANTS_SVS:.*' {
+        publishDir = [
+            enabled: false,
+        ]
+    }
+    withName: '.*:RANK_VARIANTS_SVS:GENMOD_ANNOTATE' {
+        ext.prefix = { "${meta.id}_svs_genmod_annotate" }
+        ext.args = { [
+            '--annotate_regions',
+            '--genome-build 38',
+            '--temp_dir ./'
+            ].join(' ') }
+    }
+    withName: '.*:RANK_VARIANTS_SVS:GENMOD_MODELS' {
+        ext.prefix = { "${meta.id}_svs_genmod_models" }
+        ext.args = "--whole_gene --temp_dir ./"
+    }
+    withName: '.*:RANK_VARIANTS_SVS:GENMOD_SCORE' {
+        ext.prefix = { "${meta.id}_svs_genmod_score" }
+        ext.args = "--rank_results"
+    }
+    withName: '.*:RANK_VARIANTS_SVS:GENMOD_COMPOUND' {
+        ext.prefix = { "${meta.id}_svs_genmod_compound" }
+        ext.args = "--temp_dir ./"
+    }
+    withName: '.*:RANK_VARIANTS_SVS:TABIX_BGZIPTABIX' {
+        ext.prefix = { params.skip_cnv_calling ? "${meta.id}_svs_merged_annotated_ranked" : "${meta.id}_svs_cnvs_merged_annotated_ranked" }
+        publishDir = [
+            path: { "${params.outdir}/svs/family/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
 }
diff --git a/conf/test.config b/conf/test.config
@@ -18,7 +18,7 @@ params {
     modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'
 
     // Base directory for genomic-medicine-sweden/nallo test data
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
 
     // References
     fasta                    = params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz'
@@ -36,7 +36,9 @@ params {
     svdb_dbs                 = params.pipelines_testdata_base_path + 'testdata/svdb_dbs.csv'
     reduced_penetrance       = params.pipelines_testdata_base_path + 'reference/reduced_penetrance.tsv'
     score_config_snv         = params.pipelines_testdata_base_path + 'reference/rank_model_snv.ini'
+    score_config_svs         = params.pipelines_testdata_base_path + 'reference/rank_model_svs.ini'
     variant_consequences_snv = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
+    variant_consequences_svs = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
     somalier_sites           = params.pipelines_testdata_base_path + 'reference/somalier_sites.vcf.gz'
 
     // Pipeline options

diff --git a/docs/output.md b/docs/output.md
@@ -206,10 +206,11 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
 [Severus](https://github.com/KolmogorovLab/Severus) or [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call structural variants.
 [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs. It also produces copy number, depth, and MAF [visualization tracks](#visualization-tracks).
 [SVDB](https://github.com/J35P312/SVDB) is used to combine and merge SVs and CNVs within and between samples.
+[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.
 
 !!!note
 
-    Variants are only output without annotation if that subworkflow is turned off.
+    Variants are only output without annotation and/or ranking if these subworkflows are turned off.
 
 !!!note
 
@@ -237,6 +238,15 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
 | `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz`          | VCF file with merged and annotated SVs per family (output if CNV-calling is off) |
 | `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz.tbi`      | Index of the merged VCF file                                                     |
 
+[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.
+
+| Path                                                                             | Description                                                                              |
+| -------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz`     | VCF file with merged, annotated and ranked CNVs and SVs per family                       |
+| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi` | Index of the merged VCF file                                                             |
+| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz`          | VCF file with merged, annotated and ranked SVs per family (output if CNV-calling is off) |
+| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz.tbi`      | Index of the merged VCF file                                                             |
+
 ## Visualization Tracks
 
 [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs, but it also produces copy number, depth, and MAF tracks that can be visualized in for example IGV.

diff --git a/docs/parameters.md b/docs/parameters.md
@@ -41,15 +41,17 @@ Define where the pipeline should find input data and save output data.
 | `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details>| `string` |  |  |  |
 | `variant_catalog` | A variant catalog json-file for stranger | `string` |  |  |  |
 | `variant_consequences_snv` | File containing list of SO terms listed in the order of severity from most severe to lease severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
+| `variant_consequences_svs` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
 | `vep_cache` | A path to the VEP cache location | `string` |  |  |  |
 | `bed` | A BED file with regions of interest, used to limit short variant calling. | `string` |  |  |  |
 | `hificnv_xy` | A BED file containing expected copy number regions for XY samples. | `string` |  |  |  |
 | `hificnv_xx` | A BED file containing expected copy number regions for XX samples. | `string` |  |  |  |
 | `hificnv_exclude` | A BED file specifying regions to exclude with HiFiCNV, such as centromeres. | `string` |  |  |  |
 | `reduced_penetrance` | A file with gene ids that have reduced penetrance. For use with genmod. | `string` |  |  |  |
 | `score_config_snv` | A SNV rank model config file for genmod. | `string` |  |  |  |
+| `score_config_svs` | A SV rank model config file for genmod. | `string` |  |  |  |
 | `somalier_sites` | A VCF of known polymorphic sites for somalier | `string` |  |  |  |
-| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/ |  | True |
+| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/ |  | True |
 
 ## Reference genome options
 
@@ -106,7 +108,7 @@ Workflow options specific to genomic-medicine-sweden/nallo
 | `vep_cache_version` | VEP cache version | `integer` | 110 |  |  |
 | `vep_plugin_files` | A csv file with vep_plugins as header, and then paths to vep plugin files. Paths to pLI_values.txt and LoFtool_scores.txt are required. | `string` |  |  |  |
 | `deepvariant_model_type` | Sets the model type used for DeepVariant. This is set automatically using `--preset` by default. | `string` | PACBIO |  | True |
-| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` |  |  | True |
+| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` | map-hifi |  | True |
 | `extra_modkit_options` | Extra options to modkit, used for test profile. | `string` |  |  | True |
 | `extra_vep_options` | Extra options to VEP, used for test profile. | `string` |  |  | True |
 | `extra_paraphase_options` | Extra options to Paraphase, used for test profile. | `string` |  |  | True |

diff --git a/docs/usage.md b/docs/usage.md
@@ -298,6 +298,17 @@ These databases could for example come from [CoLoRSdb](https://zenodo.org/record
 
 Turned off with `--skip_sv_annotation`.
 
+### Rank SVs
+
+This subworkflow ranks SVs, and relies on the mapping, SV calling and SV annotation subworkflows, and requires the following additional files:
+
+| Parameter            | Description                                                                                                                                                                                                                                                 |
+| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `score_config_svs`   | Used by GENMOD when ranking variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/rank_model_sv.ini).                                                                                                            |
+| `reduced_penetrance` | A list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv) |
+
+Turned off with `--skip_rank_variants`.
+
 ## Other highlighted parameters
 
 - Limit SNV calling to regions in BED file (`--bed`).

diff --git a/nextflow.config b/nextflow.config
@@ -20,9 +20,11 @@ params {
     variant_catalog              = null
     reduced_penetrance           = null
     score_config_snv             = null
+    score_config_svs             = null
     snp_db                       = null
     svdb_dbs                     = null
     variant_consequences_snv     = null
+    variant_consequences_svs     = null
     vep_cache                    = null
     vep_plugin_files             = null
     hificnv_xy                   = null
@@ -86,7 +88,7 @@ params {
     help_full                    = false
     show_hidden                  = false
     version                      = false
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
 
     // Config options
     config_profile_name        = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -189,6 +189,11 @@
                     "description": "File containing list of SO terms listed in the order of severity from most severe to lease severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
                     "fa_icon": "fas fa-file-csv"
                 },
+                "variant_consequences_svs": {
+                    "type": "string",
+                    "description": "File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
+                    "fa_icon": "fas fa-file-csv"
+                },
                 "vep_cache": {
                     "type": "string",
                     "description": "A path to the VEP cache location",
@@ -234,6 +239,13 @@
                     "fa_icon": "fas fa-file",
                     "description": "A SNV rank model config file for genmod."
                 },
+                "score_config_svs": {
+                    "type": "string",
+                    "exists": true,
+                    "format": "path",
+                    "fa_icon": "fas fa-file",
+                    "description": "A SV rank model config file for genmod."
+                },
                 "somalier_sites": {
                     "type": "string",
                     "pattern": "^\\S+\\.vcf(\\.gz)?$",
@@ -245,7 +257,7 @@
                     "type": "string",
                     "fa_icon": "far fa-check-circle",
                     "description": "Base URL or local path to location of pipeline test dataset files",
-                    "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/",
+                    "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/",
                     "hidden": true
                 }
             }
@@ -467,7 +479,8 @@
                     "type": "string",
                     "description": "Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default.",
                     "hidden": true,
-                    "enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"]
+                    "enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"],
+                    "default": "map-hifi"
                 },
                 "extra_modkit_options": {
                     "type": "string",

diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -73,9 +73,9 @@ def fileDependencies = [
     assembly         : ["fasta", "par_regions"], // The assembly workflow should be split into two - assembly and variant calling (requires ref)
     snv_calling      : ["fasta", "par_regions"],
     snv_annotation   : ["snp_db", "vep_cache", "vep_plugin_files", "variant_consequences_snv"],
-    sv_annotation    : ["svdb_dbs", "vep_cache", "vep_plugin_files"],
+    sv_annotation    : ["svdb_dbs", "vep_cache", "vep_plugin_files", "variant_consequences_svs"],
     cnv_calling      : ["hificnv_xy", "hificnv_xx", "hificnv_exclude"],
-    rank_variants    : ["reduced_penetrance", "score_config_snv"],
+    rank_variants    : ["reduced_penetrance", "score_config_snv", "score_config_svs"],
     repeat_calling   : ["trgt_repeats"],
     repeat_annotation: ["variant_catalog"],
 ]
@@ -108,10 +108,11 @@ def parameterStatus = [
         fasta                   : params.fasta,
         trgt_repeats            : params.trgt_repeats,
         variant_catalog         : params.variant_catalog,
-        score_config_snv        : params.score_config_snv,
         reduced_penetrance      : params.reduced_penetrance,
         score_config_snv        : params.score_config_snv,
+        score_config_svs        : params.score_config_svs,
         variant_consequences_snv: params.variant_consequences_snv,
+        variant_consequences_svs: params.variant_consequences_svs,
     ]
 ]
 

diff --git a/tests/samplesheet.nf.test b/tests/samplesheet.nf.test
@@ -9,7 +9,7 @@ nextflow_pipeline {
 
         when {
             params {
-                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
+                pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
                 input                        = params.pipelines_testdata_base_path + 'testdata/samplesheet.csv'
                 outdir                       = "$outputDir"
             }