docs and split

genomic-medicine-sweden · Oct 7, 2024 · d3a25c2 · d3a25c2
1 parent 62c068b
commit d3a25c2
Show file tree

Hide file tree

Showing 10 changed files with 108 additions and 38 deletions.
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -110,6 +110,10 @@
 
   > Nilsson D, Magnusson M. moonso/stranger v0.7.1. Published online February 18, 2021. doi:10.5281/ZENODO.4548873
 
+- [SVDB](https://github.com/J35P312/SVDB)
+
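+  > Eisfeldt J, Vezzi F, Olason P, Nilsson D, Lindstrand A. TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000Res. 2017;6:664. doi:10.12688/f1000research.11168.2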
+
 - [Tabix](https://academic.oup.com/bioinformatics/article/27/5/718/262743)
 
   > Li H. Tabix: fast retrieval of sequence features from generic TAB-delimited files. Bioinformatics. 2011;27(5):718-719. doi:10.1093/bioinformatics/btq671

diff --git a/conf/modules/call_svs.config b/conf/modules/call_svs.config
@@ -67,10 +67,5 @@ process {
             '--output-type z',
             '--write-index=tbi'
         ].join(' ')
-        publishDir = [
-            path: { "${params.outdir}/svs/single_sample/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
     }
 }
diff --git a/conf/modules/general.config b/conf/modules/general.config
@@ -57,7 +57,7 @@ process {
         ]
     }
 
-    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT' {
+    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT_SNVS' {
         ext.args = [
             '-i \'GT="alt"\'',
             '--output-type z',
@@ -81,6 +81,30 @@ process {
         ]
     }
 
+    withName: '.*:NALLO:BCFTOOLS_PLUGINSPLIT_SVS' {
+        // Options passed to the split plugin: a GT="alt" include filter,
+        // bgzipped VCF output and a tabix index written alongside each file.
+        ext.args = [
+            '-i \'GT="alt"\'',
+            '--output-type z',
+            '--write-index=tbi'
+        ].join(' ')
+        publishDir = [
+            path: { "${params.outdir}/svs/single_sample/" },
+            mode: params.publish_dir_mode,
+            // Can't use prefix as it would come from the original file, so the
+            // published name is rebuilt from each output filename instead:
+            //   <sample>.vcf.gz[.tbi] -> <sample>/<sample>_svs[_annotated].vcf.gz[.tbi]
+            saveAs: { filename ->
+                // Never publish the versions file
+                if (filename.equals('versions.yml')) {
+                    return null
+                }
+                def matcher = filename =~ /(.+)(\.vcf\.gz(?:\.tbi)?)$/
+                if (!matcher.find()) {
+                    // Not a bgzipped VCF or its index: publish it under its
+                    // original name rather than failing on an empty match.
+                    return filename
+                }
+                def sample = matcher.group(1)
+                def extension = matcher.group(2)
+                // The suffix reflects whether the split input went through SV annotation
+                def annotated = params.skip_sv_annotation ? "" : "_annotated"
+                return "${sample}/${sample}_svs${annotated}${extension}"
+            }
+        ]
+    }
+
     withName: '.*:NALLO:SAMPLESHEET_PED' {
         publishDir = [
             enabled: false

diff --git a/docs/output.md b/docs/output.md
@@ -27,6 +27,7 @@
       - [Ranking](#ranking)
     - [Ranked Variants](#ranked-variants)
     - [SV Calling](#sv-calling)
+    - [SV Annotation](#sv-annotation)
 
 ## Pipeline overview
 
@@ -348,3 +349,18 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
   - `*.vcf.gz`: VCF with variants per sample
   - `*.vcf.gz.tbi`: Index of the corresponding VCF file
   </details>
+
+### SV Annotation
+
+[SVDB](https://github.com/J35P312/SVDB) and [VEP](https://www.ensembl.org/vep) are used to annotate SVs.
+
+<details markdown="1">
+<summary>Output files from SV Annotation</summary>
+
+- `{outputdir}/svs/multi_sample/{project}`
+  - `{project}_svs_annotated.vcf.gz`: VCF file with annotated merged variants
+  - `{project}_svs_annotated.vcf.gz.tbi`: Index of the corresponding VCF file
+- `{outputdir}/svs/single_sample/{sample}`
+  - `{sample}_svs_annotated.vcf.gz`: VCF with annotated variants per sample
+  - `{sample}_svs_annotated.vcf.gz.tbi`: Index of the corresponding VCF file
+  </details>
diff --git a/docs/parameters.md b/docs/parameters.md
@@ -17,6 +17,7 @@ Allows skipping certain parts of the pipeline
 | `skip_repeat_annotation` | Skip tandem repeat annotation | `boolean` | False |  |  |
 | `skip_phasing_wf` | Skip phasing of variants and haplotagging of reads | `boolean` | False |  |  |
 | `skip_snv_annotation` | Skip short variant annotation | `boolean` | False |  |  |
+| `skip_sv_annotation` | Skip structural variant annotation | `boolean` | False |  |  |
 | `skip_cnv_calling` | Skip CNV calling | `boolean` | False |  |  |
 | `skip_call_paralogs` | Skip the calling of specific paralogous genes | `boolean` | False |  |  |
 | `skip_rank_variants` | Skip ranking of short variants | `boolean` | False |  |  |
@@ -122,6 +123,7 @@ The different files that are required. Some are only required by certain workflo
 | `tandem_repeats` | A tandem repeat BED file for sniffles | `string` |  |  |  |
 | `trgt_repeats` | A BED file with repeats to be genotyped with TRGT | `string` |  |  |  |
 | `snp_db` | A csv file with echtvar databases to annotate SNVs with | `string` |  |  |  |
+| `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details>| `string` |  |  |  |
 | `variant_catalog` | A variant catalog json-file for stranger | `string` |  |  |  |
 | `variant_consequences_snv` | File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SNVs. For more information check https://ensembl.org/info/genome/variation/prediction/predicted_data.html | `string` |  |  |  |
 | `vep_cache` | A path to the VEP cache location | `string` |  |  |  |

diff --git a/docs/usage.md b/docs/usage.md
@@ -236,9 +236,17 @@ cadd,/path/to/cadd.v1.6.hg38.zip
 > [!NOTE]
 > Optionally, to calculate CADD scores for small indels, supply a path to a folder containing cadd annotations with `--cadd_resources` and prescored indels with `--cadd_prescored`. Equivalent of the `data/annotations/` and `data/prescored/` folders described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation). CADD scores for SNVs can be annotated through echtvar and `--snp_db`.
 
+### SV annotation (`--skip_sv_annotation`)
+
+This subworkflow relies on the mapping subworkflow, and requires the following additional files:
+
+| Parameter  | Description                                                                                                                                                                                                                              |
+| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `svdb_dbs` | Databases used for structural variant annotation in vcf format. <details><summary>Help</summary><small>Path to comma-separated file containing information about the databases used for structural variant annotation.</small></details> |
+
 ### Rank variants (`--skip_rank_variants`)
 
-This subworkflow relies on the mapping, short variant calling and SNV annotation subworkflows, and requires the following additional files:
+This subworkflow ranks SNVs and INDELs. It relies on the mapping, short variant calling and SNV annotation subworkflows, and requires the following additional files:
 
 | Parameter            | Description                                                                                                                                                                                                                                                 |
 | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

diff --git a/subworkflows/local/annotate_svs/main.nf b/subworkflows/local/annotate_svs/main.nf
@@ -15,8 +15,6 @@ workflow ANNOTATE_SVS {
     main:
     ch_versions = Channel.empty()
 
-    ch_sv_dbs.view()
-
     ch_sv_dbs
         .map { meta, csv -> csv }
         .splitCsv ( header:true )

diff --git a/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf b/subworkflows/local/utils_nfcore_nallo_pipeline/main.nf
@@ -365,6 +365,12 @@ def toolCitationText() {
                 "GLnexus (Yun et al. 2021)",
             ]
         }
+        if (!params.skip_sv_annotation) {
+            citation_text = citation_text + [
+                "VEP (McLaren et al. 2016)",
+                "SVDB (Eisfeldt et al. 2017)",
+            ]
+        }
         if (!params.skip_snv_annotation) {
             citation_text = citation_text + [
                 "CADD (Rentzsch et al. 2019, Rentzsch et al. 2021)",

diff --git a/tests/main.nf.test b/tests/main.nf.test
@@ -110,8 +110,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio/HG002_Revio_spanning_sorted.bam.bai").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz.tbi").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio/HG002_Revio_severus.vcf.gz").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio/HG002_Revio_severus.vcf.gz.tbi").exists() }
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio/HG002_Revio_svs_annotated.vcf.gz").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio/HG002_Revio_svs_annotated.vcf.gz.tbi").exists() }
             )
         }
     }
@@ -261,8 +261,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_A/HG002_Revio_A_spanning_sorted.bam.bai").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz.tbi").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio_A/HG002_Revio_A_severus.vcf.gz").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio_A/HG002_Revio_A_severus.vcf.gz.tbi").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio_A/HG002_Revio_A_svs_annotated.vcf.gz").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio_A/HG002_Revio_A_svs_annotated.vcf.gz.tbi").exists() },
                 // Assert exists HG002_Revio_B
                 { assert new File("$outputDir/aligned_reads/HG002_Revio_B/HG002_Revio_B_haplotagged.bam.bai").exists() },
                 { assert new File("$outputDir/assembly_variant_calling/dipcall/HG002_Revio_B/HG002_Revio_B.hap1.bam.bai").exists() },
@@ -294,8 +294,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/repeat_calling/trgt/single_sample/HG002_Revio_B/HG002_Revio_B_spanning_sorted.bam.bai").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_Revio_B/HG002_Revio_B_snv_annotated_ranked.vcf.gz.tbi").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio_B/HG002_Revio_B_severus.vcf.gz").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_Revio_B/HG002_Revio_B_severus.vcf.gz.tbi").exists() }
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio_B/HG002_Revio_B_svs_annotated.vcf.gz").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_Revio_B/HG002_Revio_B_svs_annotated.vcf.gz.tbi").exists() }
             )
         }
     }
@@ -400,8 +400,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/qc/fastqc/HG002_ONT_A/HG002_ONT_A_fastqc.zip").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz.tbi").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_ONT_A/HG002_ONT_A_severus.vcf.gz").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_ONT_A/HG002_ONT_A_severus.vcf.gz.tbi").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_ONT_A/HG002_ONT_A_svs_annotated.vcf.gz").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_ONT_A/HG002_ONT_A_svs_annotated.vcf.gz.tbi").exists() },
                 // Assert exists HG002_ONT_B
                 { assert new File("$outputDir/aligned_reads/HG002_ONT_B/HG002_ONT_B_haplotagged.bam.bai").exists() },
                 { assert new File("$outputDir/cnv_calling/hificnv/HG002_ONT_B/HG002_ONT_B.log").exists() },
@@ -426,8 +426,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/qc/fastqc/HG002_ONT_B/HG002_ONT_B_fastqc.zip").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz").exists() },
                 { assert new File("$outputDir/snvs/single_sample/HG002_ONT_B/HG002_ONT_B_snv_annotated_ranked.vcf.gz.tbi").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_ONT_B/HG002_ONT_B_severus.vcf.gz").exists() },
-                { assert new File("$outputDir/svs/single_sample/HG002_ONT_B/HG002_ONT_B_severus.vcf.gz.tbi").exists() }
+                { assert new File("$outputDir/svs/single_sample/HG002_ONT_B/HG002_ONT_B_svs_annotated.vcf.gz").exists() },
+                { assert new File("$outputDir/svs/single_sample/HG002_ONT_B/HG002_ONT_B_svs_annotated.vcf.gz.tbi").exists() }
             )
         }
     }

diff --git a/workflows/nallo.nf b/workflows/nallo.nf
@@ -33,23 +33,24 @@ include { SNV_ANNOTATION                          } from '../subworkflows/local/
 */
 
 // local
-include { CREATE_PEDIGREE_FILE as SAMPLESHEET_PED } from '../modules/local/create_pedigree_file/main'
-include { CREATE_PEDIGREE_FILE as SOMALIER_PED    } from '../modules/local/create_pedigree_file/main'
-include { ECHTVAR_ENCODE                          } from '../modules/local/echtvar/encode/main'
-include { SAMTOOLS_MERGE                          } from '../modules/nf-core/samtools/merge/main'
+include { CREATE_PEDIGREE_FILE as SAMPLESHEET_PED           } from '../modules/local/create_pedigree_file/main'
+include { CREATE_PEDIGREE_FILE as SOMALIER_PED              } from '../modules/local/create_pedigree_file/main'
+include { ECHTVAR_ENCODE                                    } from '../modules/local/echtvar/encode/main'
+include { SAMTOOLS_MERGE                                    } from '../modules/nf-core/samtools/merge/main'
 
 // nf-core
-include { BCFTOOLS_CONCAT                         } from '../modules/nf-core/bcftools/concat/main'
-include { BCFTOOLS_PLUGINSPLIT                    } from '../modules/nf-core/bcftools/pluginsplit/main'
-include { BCFTOOLS_SORT                           } from '../modules/nf-core/bcftools/sort/main'
-include { BCFTOOLS_STATS                          } from '../modules/nf-core/bcftools/stats/main'
-include { MINIMAP2_ALIGN                          } from '../modules/nf-core/minimap2/align/main'
-include { MULTIQC                                 } from '../modules/nf-core/multiqc/main'
-include { SPLITUBAM                               } from '../modules/nf-core/splitubam/main'
-include { paramsSummaryMap                        } from 'plugin/nf-validation'
-include { paramsSummaryMultiqc                    } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { softwareVersionsToYAML                  } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { methodsDescriptionText                  } from '../subworkflows/local/utils_nfcore_nallo_pipeline'
+include { BCFTOOLS_CONCAT                                   } from '../modules/nf-core/bcftools/concat/main'
+include { BCFTOOLS_PLUGINSPLIT as BCFTOOLS_PLUGINSPLIT_SNVS } from '../modules/nf-core/bcftools/pluginsplit/main'
+include { BCFTOOLS_PLUGINSPLIT as BCFTOOLS_PLUGINSPLIT_SVS  } from '../modules/nf-core/bcftools/pluginsplit/main'
+include { BCFTOOLS_SORT                                     } from '../modules/nf-core/bcftools/sort/main'
+include { BCFTOOLS_STATS                                    } from '../modules/nf-core/bcftools/stats/main'
+include { MINIMAP2_ALIGN                                    } from '../modules/nf-core/minimap2/align/main'
+include { MULTIQC                                           } from '../modules/nf-core/multiqc/main'
+include { SPLITUBAM                                         } from '../modules/nf-core/splitubam/main'
+include { paramsSummaryMap                                  } from 'plugin/nf-validation'
+include { paramsSummaryMultiqc                              } from '../subworkflows/nf-core/utils_nfcore_pipeline'
+include { softwareVersionsToYAML                            } from '../subworkflows/nf-core/utils_nfcore_pipeline'
+include { methodsDescriptionText                            } from '../subworkflows/local/utils_nfcore_nallo_pipeline'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -313,7 +314,9 @@ workflow NALLO {
         )
         ch_versions = ch_versions.mix(CALL_SVS.out.versions)
 
-
+        CALL_SVS.out.ch_multisample_vcf
+            .join( CALL_SVS.out.ch_multisample_tbi )
+            .set { ch_split_svs_in }
         //
         // Annotate structural variants
         //
@@ -326,8 +329,22 @@ workflow NALLO {
                 params.vep_cache_version,
                 ch_vep_extra_files
             )
+
+            ANNOTATE_SVS.out.vcf
+                .join( ANNOTATE_SVS.out.tbi )
+                .set { ch_split_svs_in }
         }
 
+        // Split the multisample SV VCF to also publish an (annotated) VCF per sample
+        BCFTOOLS_PLUGINSPLIT_SVS (
+            ch_split_svs_in,
+            [],
+            [],
+            [],
+            []
+        )
+        ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSPLIT_SVS.out.versions)
+
         //
         // Call (and annotate and rank) SNVs
         //
@@ -440,10 +457,10 @@ workflow NALLO {
             ch_versions = ch_versions.mix(ECHTVAR_ENCODE.out.versions)
 
             // Split multisample VCF to also publish a VCF per sample
-            BCFTOOLS_PLUGINSPLIT ( BCFTOOLS_SORT.out.vcf.join(BCFTOOLS_SORT.out.tbi ), [], [], [], [] )
-            ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSPLIT.out.versions)
+            BCFTOOLS_PLUGINSPLIT_SNVS ( BCFTOOLS_SORT.out.vcf.join(BCFTOOLS_SORT.out.tbi ), [], [], [], [] )
+            ch_versions = ch_versions.mix(BCFTOOLS_PLUGINSPLIT_SNVS.out.versions)
 
-            BCFTOOLS_PLUGINSPLIT.out.vcf
+            BCFTOOLS_PLUGINSPLIT_SNVS.out.vcf
                 .transpose()
                 .map { meta, vcf -> [ meta, vcf, [] ] }
                 .set { ch_bcftools_stats_snv_in }