Merge pull request #483 from genomic-medicine-sweden/fp_cnvkit_html

feat: Added an FP-flag to the html report
genomic-medicine-sweden · Sep 23, 2024 · d92c538 · d92c538
2 parents 0c4a350 + ef5d439
commit d92c538
Show file tree

Hide file tree

Showing 12 changed files with 1,095 additions and 39 deletions.
diff --git a/.tests/units/test_cnv_report.py b/.tests/units/test_cnv_report.py
@@ -16,6 +16,7 @@ def test_create_tsv_report(self):
         cnv = tempfile.mkdtemp() + "/tcnv.txt"
         out_additional_only = open(tempfile.mkdtemp() + "/out_additional_only.txt", "w")
         out_tsv_chrom_arms = tempfile.mkdtemp() + "/cnv_chromosome_arms.tsv"
+        out_vcf_filename = tempfile.mkdtemp() + "/out_vcf.vcf"
         create_tsv_report(
             [".tests/units/vcf/test.cnv1.vcf"],
             [".tests/units/vcf/test.cnv2.vcf"],
@@ -27,6 +28,7 @@ def test_create_tsv_report(self):
             cnv,
             out_additional_only,
             out_tsv_chrom_arms,
+            out_vcf_filename,
             1.5,
             0.5,
             0.3,
@@ -39,6 +41,7 @@ def test_create_tsv_report(self):
             0.2,
             0.2,
             0.5,
+            15000000,
         )
 
         @dataclass
@@ -53,23 +56,23 @@ class TestCase:
                 ),
                 TestCase(
                     name="variant 1",
-                    expected=("FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59")
+                    expected=("FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59", "-")
                 ),
                 TestCase(
                     name="variant 2",
-                    expected=("FGFR1,MYC", "chr8", "35008818-146144253", "gatk", "0.01", "7.01")
+                    expected=("FGFR1,MYC", "chr8", "35008818-146144253", "gatk", "0.01", "7.01", "-")
                 ),
                 TestCase(
                     name="variant 3",
-                    expected=("MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06")
+                    expected=("MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06", "-")
                 ),
                 TestCase(
                     name="small deletion",
-                    expected=("CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28")
+                    expected=("CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28", "-")
                 ),
                 TestCase(
                     name="small amplification",
-                    expected=("MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29")
+                    expected=("MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29", "-")
                 ),
         ]
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -86,20 +86,20 @@ cnv_html_report:
         Also warnings of baseline skewness and detection of polyploidy in the sample.
       path: "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.cnv_loh_genes_all.cnv_chromosome_arms.tsv"
 
-
 cnv_tsv_report:
   amp_cn_limit: 6.0
   baseline_fraction_limit: 0.2
   del_1p19q_cn_limit: 1.4
   del_1p19q_chr_arm_fraction: 0.3
   chr_arm_fraction: 0.3
   del_chr_arm_cn_limit: 1.4
-  amp_chr_arm_cn_limit: 2.5
-  normal_baf_lower_limit: 0.4
-  normal_baf_upper_limit: 0.6
+  amp_chr_arm_cn_limit: 2.6
+  normal_baf_lower_limit: 0.3
+  normal_baf_upper_limit: 0.7
   normal_cn_lower_limit: 1.7
   normal_cn_upper_limit: 2.25
   polyploidy_fraction_limit: 0.2
+  max_cnv_fp_size: 15000000
 
 fastp_pe:
   container: "docker://hydragenetics/fastp:0.20.1"
@@ -204,12 +204,18 @@ manta_run_workflow_t:
 
 merge_cnv_json:
   filtered_cnv_vcfs:
-    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.filter.cnv_hard_filter_amp.vcf.gz
-    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.filter.cnv_hard_filter_loh.vcf.gz
-  germline_vcf: snv_indels/bcbio_variation_recall_ensemble/{sample}_{type}.ensembled.vep_annotated.filter.germline.exclude.blacklist.vcf.gz
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.filter.cnv_hard_filter_amp.fp_tag.vcf.gz
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.filter.cnv_hard_filter_loh.fp_tag.annotate_fp.vcf.gz
+  filtered_cnv_vcfs_tbi:
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.filter.cnv_hard_filter_amp.fp_tag.vcf.gz.tbi
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.filter.cnv_hard_filter_loh.fp_tag.annotate_fp.vcf.gz.tbi
   unfiltered_cnv_vcfs:
-    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.vcf.gz
-    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.vcf.gz
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.fp_tag.vcf.gz
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.fp_tag.vcf.gz
+  unfiltered_cnv_vcfs_tbi:
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.fp_tag.vcf.gz.tbi
+    - cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_loh_genes_all.fp_tag.vcf.gz.tbi
+  germline_vcf: snv_indels/bcbio_variation_recall_ensemble/{sample}_{type}.ensembled.vep_annotated.filter.germline.exclude.blacklist.vcf.gz
 
 mosdepth:
   container: "docker://hydragenetics/mosdepth:0.3.2"

diff --git a/docs/softwares.md b/docs/softwares.md
@@ -71,7 +71,7 @@ The CNVkit and GATK CNV caller often miss small deletions of four exons or small
 ---
 
 ## cnv_tsv_report
-Collect all CNV calls into an excel friendly text file. Adds potential 1p19q calls. See further [CNV tsv report info](dna_cnvs.md#cnv-report).
+Collect all CNV calls into an excel friendly text file. Adds potential 1p19q calls. Also add a FP flag for variants called by CNVkit that in GATK CNV does not have any signal in that region. See further [CNV tsv report info](dna_cnvs.md#cnv-report).
 
 ### :snake: Rule
 

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -619,15 +619,25 @@ module reports:
         config
 
 
-use rule * from reports exclude all as reports_*
+use rule * from reports exclude all, merge_cnv_json as reports_*
 
 
 use rule cnv_html_report from reports as reports_cnv_html_report with:
     input:
         json=workflow.get_rule("reports_cnv_html_report").input.json,
         html_template=workflow.get_rule("reports_cnv_html_report").input.html_template,
-        js_files=workflow.get_rule("reports_cnv_html_report").input.js_files,
-        css_files=workflow.get_rule("reports_cnv_html_report").input.css_files,
+        js_files=[
+            workflow.get_rule("reports_cnv_html_report").input.js_files[0],
+            workflow.get_rule("reports_cnv_html_report").input.js_files[1],
+            workflow.get_rule("reports_cnv_html_report").input.js_files[2],
+            workflow.source_path("templates/cnv_html_report/03-results-table.js"),
+            workflow.get_rule("reports_cnv_html_report").input.js_files[4],
+            workflow.get_rule("reports_cnv_html_report").input.js_files[5],
+        ],
+        css_files=[
+            workflow.get_rule("reports_cnv_html_report").input.css_files[0],
+            workflow.source_path("templates/cnv_html_report/style.css"),
+        ],
         tc_file=get_tc_file,
         extra_table_files=[t["path"] for t in config.get("cnv_html_report", {}).get("extra_tables", [])],
     params:
@@ -638,6 +648,44 @@ use rule cnv_html_report from reports as reports_cnv_html_report with:
         tc_method=lambda wildcards: wildcards.tc_method,
 
 
+rule merge_cnv_json:
+    input:
+        json=reports.get_json_for_merge_cnv_json,
+        fai=config.get("reference", {}).get("fai", ""),
+        annotation_bed=config.get("merge_cnv_json", {}).get("annotations", []),
+        germline_vcf=config.get("merge_cnv_json", {}).get("germline_vcf", []),
+        filtered_cnv_vcfs=config.get("merge_cnv_json", {}).get("filtered_cnv_vcfs", []),
+        filtered_cnv_vcfs_tbi=config.get("merge_cnv_json", {}).get("filtered_cnv_vcfs_tbi", []),
+        cnv_vcfs=config.get("merge_cnv_json", {}).get("unfiltered_cnv_vcfs", []),
+        cnv_vcfs_tbi=config.get("merge_cnv_json", {}).get("unfiltered_cnv_vcfs_tbi", []),
+        cytobands=config.get("merge_cnv_json", {}).get("cytobands", []),
+    output:
+        json=temp("reports/cnv_html_report/{sample}_{type}.{tc_method}.merged.json"),
+    params:
+        skip_chromosomes=config.get("reference", {}).get("skip_chrs", []),
+        cytobands=config.get("cnv_html_report", {}).get("cytobands", False),
+    log:
+        "reports/cnv_html_report/{sample}_{type}.{tc_method}.merged.json.log",
+    benchmark:
+        repeat(
+            "reports/cnv_html_report/{sample}_{type}.{tc_method}.merged.json.benchmark.tsv",
+            config.get("merge_cnv_json", {}).get("benchmark_repeats", 1),
+        )
+    threads: config.get("merge_cnv_json", {}).get("threads", config["default_resources"]["threads"])
+    resources:
+        mem_mb=config.get("merge_cnv_json", {}).get("mem_mb", config["default_resources"]["mem_mb"]),
+        mem_per_cpu=config.get("merge_cnv_json", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]),
+        partition=config.get("merge_cnv_json", {}).get("partition", config["default_resources"]["partition"]),
+        threads=config.get("merge_cnv_json", {}).get("threads", config["default_resources"]["threads"]),
+        time=config.get("merge_cnv_json", {}).get("time", config["default_resources"]["time"]),
+    container:
+        config.get("merge_cnv_json", {}).get("container", config["default_container"])
+    message:
+        "{rule}: Merge CNV JSON data for {wildcards.sample}_{wildcards.type}"
+    script:
+        "scripts/merge_cnv_json.py"
+
+
 module misc:
     snakefile:
         get_module_snakefile(config, "hydra-genetics/misc", path="workflow/Snakefile", tag="v0.2.0")

diff --git a/workflow/rules/cnv_tsv_report.smk b/workflow/rules/cnv_tsv_report.smk
@@ -11,18 +11,19 @@ rule cnv_tsv_report:
         deletions="cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv",
         gatk_cnr="cnv_sv/gatk_denoise_read_counts/{sample}_{type}.clean.denoisedCR.tsv",
         org_vcfs=[
-            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.vcf",
-            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{tag}.vcf",
+            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.fp_tag.vcf",
+            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{tag}.fp_tag.vcf",
         ],
         tc_file=get_tc_file,
         vcfs=[
-            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.filter.cnv_hard_filter_amp.vcf",
-            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{tag}.filter.cnv_hard_filter_loh.vcf",
+            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.cnv_amp_genes.filter.cnv_hard_filter_amp.fp_tag.vcf",
+            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{tag}.filter.cnv_hard_filter_loh.fp_tag.vcf",
         ],
     output:
         tsv=temp("cnv_sv/svdb_query/{sample}_{type}.{tc_method}.{tag}.cnv_report.tsv"),
         tsv_additional_only=temp("cnv_sv/svdb_query/{sample}_{type}.{tc_method}.{tag}.cnv_additional_variants_only.tsv"),
         tsv_chrom_arms=temp("cnv_sv/svdb_query/{sample}_{type}.{tc_method}.{tag}.cnv_chromosome_arms.tsv"),
+        vcf_del="cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{tag}.filter.cnv_hard_filter_loh.fp_tag.annotate_fp.vcf",
     params:
         amp_chr_arm_cn_limit=config.get("cnv_tsv_report", {}).get("amp_chr_arm_cn_limit", ""),
         baseline_fraction_limit=config.get("cnv_tsv_report", {}).get("baseline_fraction_limit", ""),
@@ -31,6 +32,7 @@ rule cnv_tsv_report:
         del_chr_arm_cn_limit=config.get("cnv_tsv_report", {}).get("del_chr_arm_cn_limit", ""),
         del_1p19q_cn_limit=config.get("cnv_tsv_report", {}).get("del_1p19q_cn_limit", ""),
         del_1p19q_chr_arm_fraction=config.get("cnv_tsv_report", {}).get("del_1p19q_chr_arm_fraction", ""),
+        max_cnv_fp_size=config.get("cnv_tsv_report", {}).get("max_cnv_fp_size", ""),
         normal_cn_lower_limit=config.get("cnv_tsv_report", {}).get("normal_cn_lower_limit", ""),
         normal_cn_upper_limit=config.get("cnv_tsv_report", {}).get("normal_cn_upper_limit", ""),
         normal_baf_lower_limit=config.get("cnv_tsv_report", {}).get("normal_baf_lower_limit", ""),
@@ -57,3 +59,31 @@ rule cnv_tsv_report:
         "{rule}: Convert cnv vcf to a tsv file: {output.tsv}"
     script:
         "../scripts/cnv_report.py"
+
+
+rule cnv_add_fp_header:
+    input:
+        vcf="cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{post_fix}.vcf.gz",
+        vcf_tbi="cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{post_fix}.vcf.gz.tbi",
+    output:
+        vcf="cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{post_fix}.fp_tag.vcf",
+    log:
+        "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{post_fix}.fp_tag.vcf.log",
+    benchmark:
+        repeat(
+            "cnv_sv/svdb_query/{sample}_{type}.{tc_method}.svdb_query.annotate_cnv.{post_fix}.fp_tag.vcf.benchmark.tsv",
+            config.get("cnv_add_fp_header", {}).get("benchmark_repeats", 1),
+        )
+    threads: config.get("cnv_add_fp_header", {}).get("threads", config["default_resources"]["threads"])
+    resources:
+        mem_mb=config.get("cnv_add_fp_header", {}).get("mem_mb", config["default_resources"]["mem_mb"]),
+        mem_per_cpu=config.get("cnv_add_fp_header", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]),
+        partition=config.get("cnv_add_fp_header", {}).get("partition", config["default_resources"]["partition"]),
+        threads=config.get("cnv_add_fp_header", {}).get("threads", config["default_resources"]["threads"]),
+        time=config.get("cnv_add_fp_header", {}).get("time", config["default_resources"]["time"]),
+    container:
+        config.get("cnv_add_fp_header", {}).get("container", config["default_container"])
+    message:
+        "{rule}: Add FP_FLAG to cnv vcf header in {output.vcf}"
+    script:
+        "../scripts/cnv_add_fp_header.py"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -125,6 +125,9 @@ properties:
       del_chr_arm_cn_limit:
         type: number
         description: Upper limit of copy number to be included in calculation of chr_arm_fraction for deletions
+      max_cnv_fp_size:
+        type: number
+        description: Max size of cnvkit cnv to be tested for false positive
       normal_cn_lower_limit:
         type: number
         description: Lower limit of copy number to be counted as a normal segment

diff --git a/workflow/schemas/rules.schema.yaml b/workflow/schemas/rules.schema.yaml
@@ -28,7 +28,7 @@ properties:
         properties:
           vcf:
             type: string
-            description: Vcf with pileup informations for all positions in input bed file
+            description: Vcf with pileup information for all positions in input bed file
 
   call_small_cnv_amplifications:
     type: object
@@ -50,7 +50,7 @@ properties:
         properties:
           amplifications:
             type: string
-            description: Text file with the called amplifictions
+            description: Text file with the called amplifications
 
   call_small_cnv_deletions:
     type: object

diff --git a/workflow/scripts/cnv_add_fp_header.py b/workflow/scripts/cnv_add_fp_header.py
@@ -0,0 +1,18 @@
+from pysam import VariantFile
+
+
+def add_fp_header(in_vcf_filename, out_vcf_filename):
+    variants = VariantFile(in_vcf_filename)
+    header = variants.header
+    header.add_meta(
+        'INFO', items=[('ID', "FP_FLAG"), ('Number', "1"), ('Type', 'String'), ('Description', 'CNV false positive flag')]
+    )
+    out_vcf = VariantFile(out_vcf_filename, "w", header=header)
+    for variant in variants.fetch():
+        out_vcf.write(variant)
+
+
+if __name__ == "__main__":
+    in_vcf_filename = snakemake.input.vcf
+    out_vcf_filename = snakemake.output.vcf
+    add_fp_header(in_vcf_filename, out_vcf_filename)