Skip to content

Commit

Permalink
Rank SVs (#450)
Browse files Browse the repository at this point in the history
* wip

* working

* Update CHANGELOG

* Update CHANGELOG again
  • Loading branch information
fellen31 authored Oct 30, 2024
1 parent 9000326 commit 558312d
Show file tree
Hide file tree
Showing 19 changed files with 162 additions and 41 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#431](https://github.com/genomic-medicine-sweden/nallo/pull/431) - Added files needed to automatically build and publish docs to GitHub Pages
- [#435](https://github.com/genomic-medicine-sweden/nallo/pull/435) - Added nf-test to rank variants
- [#445](https://github.com/genomic-medicine-sweden/nallo/pull/445) - Added FOUND_IN tag and nf-test to rank variants
- [#450](https://github.com/genomic-medicine-sweden/nallo/pull/450) - Added ranking of SVs (and CNVs)

### `Changed`

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

##### Ranking

- Rank SNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)
- Rank SNVs, INDELs and SVs with [GENMOD](https://github.com/Clinical-Genomics/genmod)

## Usage

Expand Down
25 changes: 25 additions & 0 deletions conf/modules/annotate_consequence_pli.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
*/

process {

//
// SNVs
//
withName: '.*:ANN_CSQ_PLI_SNV:.*' {
publishDir = [
enabled: false
Expand All @@ -29,4 +33,25 @@ process {
withName: '.*ANN_CSQ_PLI_SNV:TABIX_BGZIPTABIX' {
ext.prefix = { "${meta.id}_snv_csq_pli" }
}

//
// SVs
//
// Blanket rule: disable publishing for every process whose fully-qualified
// name contains ':ANN_CSQ_PLI_SVS:'.
withName: '.*:ANN_CSQ_PLI_SVS:.*' {
publishDir = [
enabled: false
]
}

// Output prefix for ADD_MOST_SEVERE_CSQ: "<meta.id>_svs_csq".
withName: '.*ANN_CSQ_PLI_SVS:ADD_MOST_SEVERE_CSQ' {
ext.prefix = { "${meta.id}_svs_csq" }
}

// Output prefix for ADD_MOST_SEVERE_PLI: "<meta.id>_svs_csq_pli".
withName: '.*ANN_CSQ_PLI_SVS:ADD_MOST_SEVERE_PLI' {
ext.prefix = { "${meta.id}_svs_csq_pli" }
}

// Output prefix for TABIX_BGZIPTABIX; this is the same prefix string as
// ADD_MOST_SEVERE_PLI above.
// NOTE(review): presumably the two do not collide because the file
// extensions differ — confirm against the module definitions.
withName: '.*ANN_CSQ_PLI_SVS:TABIX_BGZIPTABIX' {
ext.prefix = { "${meta.id}_svs_csq_pli" }
}
}
4 changes: 2 additions & 2 deletions conf/modules/annotate_svs.config
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ process {
publishDir = [
path: { "${params.outdir}/svs/family/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
]
}

withName: '.*ANNOTATE_SVS:TABIX_ENSEMBLVEP_SV' {
publishDir = [
path: { "${params.outdir}/svs/family/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
saveAs: { filename -> filename.equals('versions.yml') || !params.skip_rank_variants ? null : filename }
]
}

Expand Down
47 changes: 39 additions & 8 deletions conf/modules/rank_variants.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,16 @@
----------------------------------------------------------------------------------------
*/

//
// Score and rank SNVs
//

process {

//
// Score and rank SNVs
//
withName: '.*:RANK_VARIANTS_SNV:.*' {
publishDir = [
enabled: false,
]
}

withName: '.*:RANK_VARIANTS_SNV:GENMOD_ANNOTATE' {
ext.prefix = { "${meta.id}_snv_genmod_annotate" }
ext.args = { [
Expand All @@ -31,20 +29,53 @@ process {
'--temp_dir ./'
].join(' ') }
}

withName: '.*:RANK_VARIANTS_SNV:GENMOD_MODELS' {
ext.prefix = { "${meta.id}_snv_genmod_models" }
ext.args = "--whole_gene --temp_dir ./"
}

withName: '.*:RANK_VARIANTS_SNV:GENMOD_SCORE' {
ext.prefix = { "${meta.id}_snv_genmod_score" }
ext.args = "--rank_results"
}

withName: '.*:RANK_VARIANTS_SNV:GENMOD_COMPOUND' {
ext.prefix = { "${meta.id}_snv_genmod_compound" }
ext.args = "--temp_dir ./"
}

//
// Score and rank SVs
//
// Blanket rule: disable publishing for every process whose fully-qualified
// name contains ':RANK_VARIANTS_SVS:'. The TABIX_BGZIPTABIX selector further
// down sets its own publishDir.
// NOTE(review): relies on Nextflow letting the later, more specific selector
// take precedence over this one — confirm.
withName: '.*:RANK_VARIANTS_SVS:.*' {
publishDir = [
enabled: false,
]
}
// GENMOD_ANNOTATE: output prefix "<meta.id>_svs_genmod_annotate"; the
// arguments are joined into a single space-separated string.
withName: '.*:RANK_VARIANTS_SVS:GENMOD_ANNOTATE' {
ext.prefix = { "${meta.id}_svs_genmod_annotate" }
ext.args = { [
'--annotate_regions',
'--genome-build 38',
'--temp_dir ./'
].join(' ') }
}
// GENMOD_MODELS: output prefix "<meta.id>_svs_genmod_models".
withName: '.*:RANK_VARIANTS_SVS:GENMOD_MODELS' {
ext.prefix = { "${meta.id}_svs_genmod_models" }
ext.args = "--whole_gene --temp_dir ./"
}
// GENMOD_SCORE: output prefix "<meta.id>_svs_genmod_score".
withName: '.*:RANK_VARIANTS_SVS:GENMOD_SCORE' {
ext.prefix = { "${meta.id}_svs_genmod_score" }
ext.args = "--rank_results"
}
// GENMOD_COMPOUND: output prefix "<meta.id>_svs_genmod_compound".
withName: '.*:RANK_VARIANTS_SVS:GENMOD_COMPOUND' {
ext.prefix = { "${meta.id}_svs_genmod_compound" }
ext.args = "--temp_dir ./"
}
// TABIX_BGZIPTABIX: the only process in this group with a publishDir path.
// The prefix depends on params.skip_cnv_calling:
//   true  -> "<meta.id>_svs_merged_annotated_ranked"
//   false -> "<meta.id>_svs_cnvs_merged_annotated_ranked"
// Files are published to <outdir>/svs/family/<meta.id>; saveAs returns null
// for 'versions.yml', which tells Nextflow not to publish that file.
withName: '.*:RANK_VARIANTS_SVS:TABIX_BGZIPTABIX' {
ext.prefix = { params.skip_cnv_calling ? "${meta.id}_svs_merged_annotated_ranked" : "${meta.id}_svs_cnvs_merged_annotated_ranked" }
publishDir = [
path: { "${params.outdir}/svs/family/${meta.id}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
}
4 changes: 3 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ params {
modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'

// Base directory for genomic-medicine-sweden/nallo test data
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'

// References
fasta = params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz'
Expand All @@ -36,7 +36,9 @@ params {
svdb_dbs = params.pipelines_testdata_base_path + 'testdata/svdb_dbs.csv'
reduced_penetrance = params.pipelines_testdata_base_path + 'reference/reduced_penetrance.tsv'
score_config_snv = params.pipelines_testdata_base_path + 'reference/rank_model_snv.ini'
score_config_svs = params.pipelines_testdata_base_path + 'reference/rank_model_svs.ini'
variant_consequences_snv = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
variant_consequences_svs = params.pipelines_testdata_base_path + 'reference/variant_consequences_v2.txt'
somalier_sites = params.pipelines_testdata_base_path + 'reference/somalier_sites.vcf.gz'

// Pipeline options
Expand Down
12 changes: 11 additions & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,11 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
[Severus](https://github.com/KolmogorovLab/Severus) or [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call structural variants.
[HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs. It also produces copy number, depth, and MAF [visualization tracks](#visualization-tracks).
[SVDB](https://github.com/J35P312/SVDB) is used to combine and merge SVs and CNVs within and between samples.
[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.

!!!note

Variants are only output without annotation if that subworkflow is turned off.
Variants are only output without annotation and/or ranking if the corresponding subworkflows are turned off.

!!!note

Expand Down Expand Up @@ -237,6 +238,15 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin
| `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz` | VCF file with merged and annotated SVs per family (output if CNV-calling is off) |
| `svs/family/{family_id}/{family_id}_svs_merged_annotated.vcf.gz.tbi` | Index of the merged VCF file |

[GENMOD](https://github.com/Clinical-Genomics/genmod) is used to rank the annotated SVs.

| Path | Description |
| -------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz` | VCF file with merged, annotated and ranked CNVs and SVs per family |
| `svs/family/{family_id}/{family_id}_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi` | Index of the merged VCF file |
| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz` | VCF file with merged, annotated and ranked SVs per family (output if CNV-calling is off) |
| `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz.tbi` | Index of the merged VCF file |

## Visualization Tracks

[HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs, but it also produces copy number, depth, and MAF tracks that can be visualized in for example IGV.
Expand Down
6 changes: 4 additions & 2 deletions docs/parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,17 @@ Define where the pipeline should find input data and save output data.
| `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details>| `string` | | | |
| `variant_catalog` | A variant catalog json-file for stranger | `string` | | | |
| `variant_consequences_snv` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` | | | |
| `variant_consequences_svs` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` | | | |
| `vep_cache` | A path to the VEP cache location | `string` | | | |
| `bed` | A BED file with regions of interest, used to limit short variant calling. | `string` | | | |
| `hificnv_xy` | A BED file containing expected copy number regions for XY samples. | `string` | | | |
| `hificnv_xx` | A BED file containing expected copy number regions for XX samples. | `string` | | | |
| `hificnv_exclude` | A BED file specifying regions to exclude with HiFiCNV, such as centromeres. | `string` | | | |
| `reduced_penetrance` | A file with gene ids that have reduced penetrance. For use with genmod. | `string` | | | |
| `score_config_snv` | A SNV rank model config file for genmod. | `string` | | | |
| `score_config_svs` | A SV rank model config file for genmod. | `string` | | | |
| `somalier_sites` | A VCF of known polymorphic sites for somalier | `string` | | | |
| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/ | | True |
| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/ | | True |

## Reference genome options

Expand Down Expand Up @@ -106,7 +108,7 @@ Workflow options specific to genomic-medicine-sweden/nallo
| `vep_cache_version` | VEP cache version | `integer` | 110 | | |
| `vep_plugin_files` | A csv file with vep_plugins as header, and then paths to vep plugin files. Paths to pLI_values.txt and LoFtool_scores.txt are required. | `string` | | | |
| `deepvariant_model_type` | Sets the model type used for DeepVariant. This is set automatically using `--preset` by default. | `string` | PACBIO | | True |
| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` | | | True |
| `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` | map-hifi | | True |
| `extra_modkit_options` | Extra options to modkit, used for test profile. | `string` | | | True |
| `extra_vep_options` | Extra options to VEP, used for test profile. | `string` | | | True |
| `extra_paraphase_options` | Extra options to Paraphase, used for test profile. | `string` | | | True |
Expand Down
11 changes: 11 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,17 @@ These databases could for example come from [CoLoRSdb](https://zenodo.org/record

Turned off with `--skip_sv_annotation`.

### Rank SVs

This subworkflow ranks SVs, and relies on the mapping, SV calling and SV annotation subworkflows, and requires the following additional files:

| Parameter | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `score_config_svs`   | Used by GENMOD when ranking variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/rank_model_sv.ini). |
| `reduced_penetrance` | A list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv) |

Turned off with `--skip_rank_variants`.

## Other highlighted parameters

- Limit SNV calling to regions in BED file (`--bed`).
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ params {
variant_catalog = null
reduced_penetrance = null
score_config_snv = null
score_config_svs = null
snp_db = null
svdb_dbs = null
variant_consequences_snv = null
variant_consequences_svs = null
vep_cache = null
vep_plugin_files = null
hificnv_xy = null
Expand Down Expand Up @@ -86,7 +88,7 @@ params {
help_full = false
show_hidden = false
version = false
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'

// Config options
config_profile_name = null
Expand Down
17 changes: 15 additions & 2 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,11 @@
"description": "File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
"fa_icon": "fas fa-file-csv"
},
"variant_consequences_svs": {
"type": "string",
"description": "File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html",
"fa_icon": "fas fa-file-csv"
},
"vep_cache": {
"type": "string",
"description": "A path to the VEP cache location",
Expand Down Expand Up @@ -234,6 +239,13 @@
"fa_icon": "fas fa-file",
"description": "A SNV rank model config file for genmod."
},
"score_config_svs": {
"type": "string",
"exists": true,
"format": "path",
"fa_icon": "fas fa-file",
"description": "A SV rank model config file for genmod."
},
"somalier_sites": {
"type": "string",
"pattern": "^\\S+\\.vcf(\\.gz)?$",
Expand All @@ -245,7 +257,7 @@
"type": "string",
"fa_icon": "far fa-check-circle",
"description": "Base URL or local path to location of pipeline test dataset files",
"default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/",
"default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/",
"hidden": true
}
}
Expand Down Expand Up @@ -467,7 +479,8 @@
"type": "string",
"description": "Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default.",
"hidden": true,
"enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"]
"enum": ["map-hifi", "map-ont", "lr:hq", "lr:hqae"],
"default": "map-hifi"
},
"extra_modkit_options": {
"type": "string",
Expand Down
7 changes: 4 additions & 3 deletions subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,9 @@ def fileDependencies = [
assembly : ["fasta", "par_regions"], // The assembly workflow should be split into two - assembly and variant calling (requires ref)
snv_calling : ["fasta", "par_regions"],
snv_annotation : ["snp_db", "vep_cache", "vep_plugin_files", "variant_consequences_snv"],
sv_annotation : ["svdb_dbs", "vep_cache", "vep_plugin_files"],
sv_annotation : ["svdb_dbs", "vep_cache", "vep_plugin_files", "variant_consequences_svs"],
cnv_calling : ["hificnv_xy", "hificnv_xx", "hificnv_exclude"],
rank_variants : ["reduced_penetrance", "score_config_snv"],
rank_variants : ["reduced_penetrance", "score_config_snv", "score_config_svs"],
repeat_calling : ["trgt_repeats"],
repeat_annotation: ["variant_catalog"],
]
Expand Down Expand Up @@ -108,10 +108,11 @@ def parameterStatus = [
fasta : params.fasta,
trgt_repeats : params.trgt_repeats,
variant_catalog : params.variant_catalog,
score_config_snv : params.score_config_snv,
reduced_penetrance : params.reduced_penetrance,
score_config_snv : params.score_config_snv,
score_config_svs : params.score_config_svs,
variant_consequences_snv: params.variant_consequences_snv,
variant_consequences_svs: params.variant_consequences_svs,
]
]

Expand Down
2 changes: 1 addition & 1 deletion tests/samplesheet.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ nextflow_pipeline {

when {
params {
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/7be7114cb132be8cae9343f225bcf42ec11ecc1b/'
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/'
input = params.pipelines_testdata_base_path + 'testdata/samplesheet.csv'
outdir = "$outputDir"
}
Expand Down
Loading

0 comments on commit 558312d

Please sign in to comment.