Merge pull request #517 from genomic-medicine-sweden/develop

chore: dev to master
genomic-medicine-sweden · Oct 24, 2024 · bc606ce · bc606ce
2 parents f9c4859 + 5872f10
commit bc606ce
Show file tree

Hide file tree

Showing 7 changed files with 40 additions and 17 deletions.
diff --git a/config/output_reference_files.yaml b/config/output_reference_files.yaml
@@ -5,8 +5,8 @@ files:
     types:
       - N
   - name: jumble_pon
-    input: references/jumble_reference/design.bed.reference.RDS
-    output: result/jumble.PoN.RDS
+    input: references/jumble_reference/{design}.reference.RDS
+    output: result/jumble.{design}.PoN.RDS
     types:
       - N
   - name: gatk_pon

diff --git a/config/reports/multiqc_config_dna.yaml b/config/reports/multiqc_config_dna.yaml
@@ -1,3 +1,13 @@
+title: "Clinical Genomics MultiQC Report"
+subtitle: "Reference used: GRCh37"
+intro_text: "The MultiQC DNA report summarises analysis results from GMS560 panel data that has been analysed by the Twist Solid pipeline (https://github.com/genomic-medicine-sweden/Twist_Solid)."
+
+report_header_info:
+  - Contact E-mail: "[email protected]"
+  - Application Type: "Bioinformatic analysis of GMS560 panel for solid cancers"
+
+show_analysis_paths: True
+
 #decimalPoint_format: ','
 extra_fn_clean_exts: ##from this until end
     - '.duplication_metrics'

diff --git a/config/reports/multiqc_config_rna.yaml b/config/reports/multiqc_config_rna.yaml
@@ -1,10 +1,21 @@
+title: "Clinical Genomics MultiQC Report"
+subtitle: "Reference used: GRCh37"
+intro_text: "The MultiQC RNA report summarises analysis results from GMS560 panel data that has been analysed by the Twist Solid pipeline (https://github.com/genomic-medicine-sweden/Twist_Solid)."
+
+report_header_info:
+  - Contact E-mail: "[email protected]"
+  - Application Type: "Bioinformatic analysis of GMS560 panel for solid cancers"
+
+show_analysis_paths: True
+
+
 #decimalPoint_format: ','
 extra_fn_clean_exts: ##from this until end
     - '.duplication_metrics'
     - '.HsMetrics'
     - '.alignment_summary_metrics'
-    - type: regex
-      pattern: '_fastq[12]'
+    - type: regex_keep
+      pattern: '[0-9A-Z-]+'
 #extra_fn_clean_trim:   #if found in beginning or end
 #fn_ignore_dirs:
 #fn_ignore_files:
@@ -33,8 +44,8 @@ table_columns_visible:
   "Samtools: stats":
     error_rate: False
     non-primary_alignments: False
-    reads_mapped: False
-    reads_mapped_percent: False
+    reads_mapped: True
+    reads_mapped_percent: True
     reads_properly_paired_percent: False
     reads_MQ0_percent: False
     raw_total_sequences: False

diff --git a/workflow/Snakefile_references.smk b/workflow/Snakefile_references.smk
@@ -54,7 +54,7 @@ use rule cnvkit_batch from cnv_sv as cnv_sv_cnvkit_batch with:
     input:
         bam="alignment/samtools_merge_bam/{sample}_{type}.bam",
         bai="alignment/samtools_merge_bam/{sample}_{type}.bam.bai",
-        cnv_reference="references/cnvkit_build_normal_reference/cnvkit.PoN.cnn",
+        reference="references/cnvkit_build_normal_reference/cnvkit.PoN.cnn",
 
 
 use rule background_annotation from annotation as annotation_background_annotation with:

diff --git a/workflow/rules/common_references.smk b/workflow/rules/common_references.smk
@@ -54,7 +54,9 @@ def compile_output_list(wildcards):
     for filedef in output_spec["files"]:
         output_files += set(
             [
-                filedef["output"].format(sample=sample, type=unit_type, caller=caller)
+                filedef["output"].format(
+                    sample=sample, type=unit_type, caller=caller, design=config["reference"]["design_bed"].split("/")[-1]
+                )
                 for sample in get_samples(samples)
                 for unit_type in get_unit_types(units, sample)
                 if unit_type in set(filedef["types"]).intersection(types)

diff --git a/workflow/scripts/report_fusions.py b/workflow/scripts/report_fusions.py
@@ -190,16 +190,16 @@
         if int(Junction_read_count) < housekeeping_genes[gene2][0]:
             continue
     # Min AF for frequent FP gene fusions and housekeeping gene
-    if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1]):
+    if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1] and artefact_gene_dict[gene1][gene2][3] > 0):
         if int(Junction_read_count) / artefact_gene_dict[gene1][gene2][3] < artefact_gene_dict[gene1][gene2][2]:
             continue
-    if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2]):
+    if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2] and artefact_gene_dict[gene2][gene1][3] > 0):
         if int(Junction_read_count) / artefact_gene_dict[gene2][gene1][3] < artefact_gene_dict[gene2][gene1][2]:
             continue
-    if gene1 in housekeeping_genes:
+    if gene1 in housekeeping_genes and housekeeping_genes[gene1][3] > 0:
         if int(Junction_read_count) / housekeeping_genes[gene1][3] < housekeeping_genes[gene1][2]:
             continue
-    if gene2 in housekeeping_genes:
+    if gene2 in housekeeping_genes and housekeeping_genes[gene2][3] > 0:
         if int(Junction_read_count) / housekeeping_genes[gene2][3] < housekeeping_genes[gene2][2]:
             continue
     breakpoint1 = lline[7][:-2]
@@ -282,16 +282,16 @@
         if int(Spanning_reads_unique) < housekeeping_genes[gene2][1]:
             continue
     # Min AF for frequent FP gene fusions and housekeeping gene
-    if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1]):
+    if (gene1 in artefact_gene_dict and gene2 in artefact_gene_dict[gene1] and artefact_gene_dict[gene1][gene2][3] > 0):
         if int(Spanning_reads_unique) / artefact_gene_dict[gene1][gene2][3] < artefact_gene_dict[gene1][gene2][2]:
             continue
-    if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2]):
+    if (gene2 in artefact_gene_dict and gene1 in artefact_gene_dict[gene2] and artefact_gene_dict[gene2][gene1][3] > 0):
         if int(Spanning_reads_unique) / artefact_gene_dict[gene2][gene1][3] < artefact_gene_dict[gene2][gene1][2]:
             continue
-    if gene1 in housekeeping_genes:
+    if gene1 in housekeeping_genes and housekeeping_genes[gene1][3] > 0:
         if int(Spanning_reads_unique) / housekeeping_genes[gene1][3] < housekeeping_genes[gene1][2]:
             continue
-    if gene2 in housekeeping_genes:
+    if gene2 in housekeeping_genes and housekeeping_genes[gene2][3] > 0:
         if int(Spanning_reads_unique) / housekeeping_genes[gene2][3] < housekeeping_genes[gene2][2]:
             continue
     # Flag fusions annotated that are fusions with very high probability

diff --git a/workflow/scripts/sample_mixup_check.py b/workflow/scripts/sample_mixup_check.py
@@ -65,7 +65,7 @@ def read_vcf(vcf_filename, vcf_dict, samples):
         if rna_samples[rna_sample][dna_sample] > best_gt_match:
             best_dna_sample = dna_sample
             best_gt_match = rna_samples[rna_sample][dna_sample]
-    p_match = round(best_gt_match / 42.0, 1)
+    p_match = round(best_gt_match * 100 / 42.0, 1)
     report.write(f"{rna_sample}\t{best_dna_sample}\t{best_gt_match}\t{p_match}%\t")
     if p_match > match_cutoff:
         report.write(f"yes\n")