Annotate SVs (#408)

* wip * working * fix snv annotation * docs and split * update docs * add back regulatory * review suggestions * prettier
genomic-medicine-sweden · Oct 14, 2024 · 777cd7b · 777cd7b
1 parent 1263d72
commit 777cd7b
Show file tree

Hide file tree

Showing 34 changed files with 749 additions and 85 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -37,6 +37,7 @@ jobs:
           - "SHORT_VARIANT_CALLING"
           - "SNV_ANNOTATION"
           - "CALL_SVS"
+          - "ANNOTATE_SVS"
         profile:
           - "docker"
 

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -10,6 +10,7 @@ lint:
     - .github/workflows/awstest.yml
     - .github/workflows/awsfulltest.yml
     - conf/modules.config
+    - conf/igenomes_ignored.config
   files_unchanged:
     - CODE_OF_CONDUCT.md
     - assets/nf-core-nallo_logo_light.png

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#388](https://github.com/genomic-medicine-sweden/nallo/pull/388) - Added single-sample tbi output to the short variant calling subworkflow
 - [#393](https://github.com/genomic-medicine-sweden/nallo/pull/393) - Added a new `--minimap2_read_mapping_preset` parameter
 - [#403](https://github.com/genomic-medicine-sweden/nallo/pull/403) - Added `FOUND_IN=hificnv` tags to CNV calling output
+- [#408](https://github.com/genomic-medicine-sweden/nallo/pull/408) - Added a new subworkflow to annotate SVs
 - [#417](https://github.com/genomic-medicine-sweden/nallo/pull/417) - Added `FOUND_IN=deepvariant` tags to SNV calling output
 - [#419](https://github.com/genomic-medicine-sweden/nallo/pull/419) - Added support for SV filtering using input BED file ([#348](https://github.com/genomic-medicine-sweden/nallo/issues/348))
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -110,6 +110,10 @@
 
   > Nilsson D, Magnusson M. moonso/stranger v0.7.1. Published online February 18, 2021. doi:10.5281/ZENODO.4548873
 
+- [SVDB](https://github.com/J35P312/SVDB)
+
+  > Eisfeldt et al., 2017.
+
 - [Tabix](https://academic.oup.com/bioinformatics/article/27/5/718/262743)
 
   > Li H. Tabix: fast retrieval of sequence features from generic TAB-delimited files. Bioinformatics. 2011;27(5):718-719. doi:10.1093/bioinformatics/btq671

diff --git a/README.md b/README.md
@@ -46,6 +46,7 @@
 
 - Annotate SNVs and INDELs with databases of choice, i.e. [gnomAD](https://gnomad.broadinstitute.org), [CADD](https://cadd.gs.washington.edu) etc. with [echtvar](https://github.com/brentp/echtvar) and [VEP](https://github.com/Ensembl/ensembl-vep)
 - Annotate repeat expansions with [stranger](https://github.com/Clinical-Genomics/stranger)
+- Annotate SVs with [SVDB](https://github.com/J35P312/SVDB) and [VEP](https://github.com/Ensembl/ensembl-vep)
 
 ##### Ranking
 

diff --git a/assets/svdb_query_vcf_schema.json b/assets/svdb_query_vcf_schema.json
@@ -0,0 +1,40 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/svdb_query_vcf_schema.json",
+    "title": "Schema for SVDB query - VCF",
+    "description": "Schema for the SVDB query database input, VCF version",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "filename": {
+                "type": "string",
+                "format": "file-path",
+                "exists": true,
+                "pattern": "^\\S+\\.vcf(\\.gz)?$",
+                "errorMessage": "Path to query database cannot contain spaces and must be a vcf file"
+            },
+            "in_freq_info_key": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "errorMessage": "In frequency key cannot contain spaces"
+            },
+            "in_allele_count_info_key": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "errorMessage": "In allele count key cannot contain spaces"
+            },
+            "out_freq_info_key": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "errorMessage": "Out frequency key must be provided and cannot contain spaces"
+            },
+            "out_allele_count_info_key": {
+                "type": "string",
+                "pattern": "^\\S+$",
+                "errorMessage": "Out allele count key must be provided and cannot contain spaces"
+            }
+        },
+        "required": ["filename", "out_freq_info_key", "out_allele_count_info_key"]
+    }
+}
diff --git a/conf/modules/annotate_svs.config b/conf/modules/annotate_svs.config
@@ -0,0 +1,57 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args   = Additional arguments appended to command in module.
+        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix = File name prefix for output files.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    /*
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Annotate SVs
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    */
+
+    // Default for every process inside the ANNOTATE_SVS scope: publish nothing.
+    // The selectors further down re-enable publishing for the final outputs only.
+    withName: '.*:ANNOTATE_SVS:.*' {
+        publishDir = [
+            enabled: false,
+        ]
+    }
+
+    // VEP annotation of SVs. The selector is anchored on ':ANNOTATE_SVS:' so it
+    // follows the same scoping convention as the catch-all selector above.
+    withName: '.*:ANNOTATE_SVS:ENSEMBLVEP_SV' {
+        // Closure so that params.extra_vep_options is resolved when the task is created;
+        // user-supplied extra options come first, followed by the fixed option set.
+        ext.args = { [
+            "${params.extra_vep_options}",
+            "--dir_plugins .",
+            '--plugin pLI,pLI_values.txt',
+            '--appris --biotype --buffer_size 100 --canonical --cache --ccds',
+            '--compress_output bgzip --distance 5000 --domains',
+            '--exclude_predicted --force_overwrite --format vcf',
+            '--hgvs --humdiv --max_sv_size 248387328',
+            '--no_progress --numbers --per_gene --polyphen p',
+            '--protein --offline --sift p --regulatory',
+            '--symbol --tsl --uniprot --vcf',
+            '--no_stats'
+        ].join(' ') }
+        // Output files are named <meta.id>_svs_annotated.*
+        ext.prefix = { "${meta.id}_svs_annotated" }
+        // Publish everything except versions.yml to the multi-sample SV directory.
+        publishDir = [
+            path: { "${params.outdir}/svs/multi_sample/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    // Index of the VEP-annotated SV VCF; published to the same directory as the VCF itself.
+    withName: '.*:ANNOTATE_SVS:TABIX_ENSEMBLVEP_SV' {
+        publishDir = [
+            path: { "${params.outdir}/svs/multi_sample/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+}
diff --git a/conf/modules/call_svs.config b/conf/modules/call_svs.config
@@ -57,7 +57,7 @@ process {
         publishDir = [
             path: { "${params.outdir}/svs/multi_sample/${meta.id}" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+            saveAs: { filename -> filename.equals('versions.yml') || !params.skip_sv_annotation ? null : filename }
         ]
     }
 
@@ -67,10 +67,5 @@ process {
             '--output-type z',
             '--write-index=tbi'
         ].join(' ')
-        publishDir = [
-            path: { "${params.outdir}/svs/single_sample/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
     }
 }
diff --git a/conf/modules/general.config b/conf/modules/general.config
@@ -57,7 +57,7 @@ process {
         ]
     }
 
-    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT' {
+    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT_SNVS' {
         ext.args = [
             '-i \'GT="alt"\'',
             '--output-type z',
@@ -81,6 +81,30 @@ process {
         ]
     }
 
+    // Options and publishing rules for the process matched by BCFTOOLS_PLUGINSPLIT_SVS.
+    // Outputs are published per sample under svs/single_sample/<sample>/.
+    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT_SVS' {
+        // Include expression GT="alt", output type 'z' and a tbi index written alongside the output.
+        ext.args = [
+            '-i \'GT="alt"\'',
+            '--output-type z',
+            '--write-index=tbi'
+        ].join(' ')
+        publishDir = [
+            path: { "${params.outdir}/svs/single_sample/" },
+            mode: params.publish_dir_mode,
+            // Can't use prefix as it would come from the original file
+            saveAs: { filename ->
+                // versions.yml is never published.
+                if (filename.equals('versions.yml')) {
+                    null
+                } else {
+                    // Split "<sample>.vcf.gz" / "<sample>.vcf.gz.tbi" into the sample part and the extension.
+                    // NOTE(review): matcher[0] assumes the file name always ends in .vcf.gz or .vcf.gz.tbi;
+                    // any other name would make this lookup fail - confirm that ext.args is never overridden
+                    // with a different output type.
+                    def matcher = filename =~ /(.+)(\.vcf\.gz(?:\.tbi)?)$/
+                    def sample = matcher[0][1]
+                    def extension = matcher[0][2]
+                    // Files carry the "_annotated" suffix unless SV annotation is skipped.
+                    def annotated = params.skip_sv_annotation ? "" : "_annotated"
+                    // Published as <sample>/<sample>_svs[_annotated]<extension> below the publishDir path.
+                    "${sample}/${sample}_svs${annotated}${extension}"
+                }
+            }
+        ]
+    }
+
     withName: '.*:NALLO:SAMPLESHEET_PED' {
         publishDir = [
             enabled: false

diff --git a/conf/modules/snv_annotation.config b/conf/modules/snv_annotation.config
@@ -36,7 +36,7 @@ process {
         ].join(' ')
     }
 
-    withName: '.*:SNV_ANNOTATION:ENSEMBLVEP_VEP' {
+    withName: '.*:SNV_ANNOTATION:ENSEMBLVEP_SNV' {
         ext.prefix = { "${meta.id}_vep" }
         ext.args = { [
             "${params.extra_vep_options}",

diff --git a/conf/test.config b/conf/test.config
@@ -10,14 +10,6 @@
 ----------------------------------------------------------------------------------------
 */
 
-process {
-    resourceLimits = [
-        cpus: 4,
-        memory: '15.GB',
-        time: '1.h'
-    ]
-}
-
 params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
@@ -26,7 +18,7 @@ params {
     modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'
 
     // Base directory for genomic-medicine-sweden/nallo test data
-    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/1f4e062926fc10f70a38e917e5771edb333e89bf/'
+    pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
 
     // References
     fasta                    = params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz'
@@ -41,6 +33,7 @@ params {
     vep_cache                = params.pipelines_testdata_base_path + 'reference/vep_cache_test_data.tar.gz'
     vep_plugin_files         = params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv'
     snp_db                   = params.pipelines_testdata_base_path + 'testdata/snp_dbs.csv'
+    svdb_dbs                 = params.pipelines_testdata_base_path + 'testdata/svdb_dbs.csv'
     reduced_penetrance       = params.pipelines_testdata_base_path + 'reference/reduced_penetrance.tsv'
     score_config_snv         = params.pipelines_testdata_base_path + 'reference/rank_model_snv.ini'
     variant_consequences_snv = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
@@ -59,7 +52,7 @@ params {
 
 // Impose same minimum Nextflow version as in nextflow.config
 manifest {
-    nextflowVersion = '!>=23.04.0'
+    nextflowVersion = '!>=24.04.2'
 }
 
 // Disable all Nextflow reporting options
@@ -69,16 +62,25 @@ trace    { enabled = false }
 dag      { enabled = false }
 
 process {
+    resourceLimits = [
+        cpus: 4,
+        memory: '15.GB',
+        time: '1.h'
+    ]
     withLabel: 'process_high' {
         cpus = 4
+        memory = '15.GB'
     }
     withLabel: 'process_medium' {
         cpus = 2
+        memory = '7.GB'
     }
     withLabel: 'process_low' {
         cpus = 1
+        memory = '3.GB'
     }
     withLabel: 'process_single' {
         cpus = 1
+        memory = '3.GB'
     }
 }
diff --git a/docs/output.md b/docs/output.md
@@ -27,6 +27,7 @@
       - [Ranking](#ranking)
     - [Ranked Variants](#ranked-variants)
     - [SV Calling](#sv-calling)
+    - [SV Annotation](#sv-annotation)
 
 ## Pipeline overview
 
@@ -348,3 +349,18 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
   - `*.vcf.gz`: VCF with variants per sample
   - `*.vcf.gz.tbi`: Index of the corresponding VCF file
   </details>
+
+### SV Annotation
+
+[SVDB](https://github.com/J35P312/SVDB) and [VEP](https://www.ensembl.org/vep) are used to annotate SVs.
+
+<details markdown="1">
+<summary>Output files from SV Annotation</summary>
+
+- `{outputdir}/svs/multi_sample/{project}`
+  - `{project}_svs_annotated.vcf.gz`: VCF file with annotated merged variants
+  - `{project}_svs_annotated.vcf.gz.tbi`: Index of the corresponding VCF file
+- `{outputdir}/svs/single_sample/{sample}`
+  - `{sample}_svs_annotated.vcf.gz`: VCF with annotated variants per sample
+  - `{sample}_svs_annotated.vcf.gz.tbi`: Index of the corresponding VCF file
+  </details>
diff --git a/docs/parameters.md b/docs/parameters.md
@@ -17,6 +17,7 @@ Allows skipping certain parts of the pipeline
 | `skip_repeat_annotation` | Skip tandem repeat annotation | `boolean` | False |  |  |
 | `skip_phasing_wf` | Skip phasing of variants and haplotagging of reads | `boolean` | False |  |  |
 | `skip_snv_annotation` | Skip short variant annotation | `boolean` | False |  |  |
+| `skip_sv_annotation` | Skip structural variant annotation | `boolean` | False |  |  |
 | `skip_cnv_calling` | Skip CNV calling | `boolean` | False |  |  |
 | `skip_call_paralogs` | Skip the calling of specific paralogous genes | `boolean` | False |  |  |
 | `skip_rank_variants` | Skip ranking of short variants | `boolean` | False |  |  |
@@ -37,6 +38,7 @@ Define where the pipeline should find input data and save output data.
 | `tandem_repeats` | A tandem repeat BED file for sniffles | `string` |  |  |  |
 | `trgt_repeats` | A BED file with repeats to be genotyped with TRGT | `string` |  |  |  |
 | `snp_db` | A csv file with echtvar databases to annotate SNVs with | `string` |  |  |  |
+| `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details>| `string` |  |  |  |
 | `variant_catalog` | A variant catalog json-file for stranger | `string` |  |  |  |
 | `variant_consequences_snv` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
 | `vep_cache` | A path to the VEP cache location | `string` |  |  |  |
@@ -47,7 +49,7 @@ Define where the pipeline should find input data and save output data.
 | `reduced_penetrance` | A file with gene ids that have reduced penetrance. For use with genmod. | `string` |  |  |  |
 | `score_config_snv` | A SNV rank model config file for genmod. | `string` |  |  |  |
 | `somalier_sites` | A VCF of known polymorphic sites for somalier | `string` |  |  |  |
-| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/1f4e062926fc10f70a38e917e5771edb333e89bf/ |  | True |
+| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/ |  | True |
 
 ## Reference genome options
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -236,9 +236,26 @@ cadd,/path/to/cadd.v1.6.hg38.zip
 > [!NOTE]
 > Optionally, to calculate CADD scores for small indels, supply a path to a folder containing cadd annotations with `--cadd_resources` and prescored indels with `--cadd_prescored`. Equivalent of the `data/annotations/` and `data/prescored/` folders described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation). CADD scores for SNVs can be annotated through echtvar and `--snp_db`.
 
+### SV annotation (`--skip_sv_annotation`)
+
+This subworkflow relies on the mapping subworkflow, and requires the following additional files:
+
+| Parameter               | Description                                                                                                                                                                                                                                            |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `svdb_dbs` <sup>1</sup> | Csv file with databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details> |
+
+<sup>1</sup> Example file for input with `--svdb_dbs`:
+
+```
+filename,in_freq_info_key,in_allele_count_info_key,out_freq_info_key,out_allele_count_info_key
+https://github.com/genomic-medicine-sweden/test-datasets/raw/b9ff54b59cdd39df5b6e278a30b08d94075a644c/reference/colorsdb.test_data.vcf.gz,AF,AC,colorsdb_af,colorsdb_ac
+```
+
+These databases could for example come from [CoLoRSdb](https://zenodo.org/records/13145123).
+
 ### Rank variants (`--skip_rank_variants`)
 
-This subworkflow relies on the mapping, short variant calling and SNV annotation subworkflows, and requires the following additional files:
+This subworkflow ranks SNVs, and relies on the mapping, short variant calling and SNV annotation subworkflows, and requires the following additional files:
 
 | Parameter            | Description                                                                                                                                                                                                                                                 |
 | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

diff --git a/modules.json b/modules.json
@@ -244,6 +244,11 @@
                         "git_sha": "4806239588f35d27a95b187b4000d80e15152022",
                         "installed_by": ["modules"]
                     },
+                    "svdb/query": {
+                        "branch": "master",
+                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "installed_by": ["modules"]
+                    },
                     "tabix/bgzip": {
                         "branch": "master",
                         "git_sha": "b20be35facfc5acdc1259f132ed79339d79e989f",

diff --git a/modules/nf-core/svdb/query/environment.yml b/modules/nf-core/svdb/query/environment.yml