Merge branch 'dev'
TimD1 committed Nov 9, 2023
2 parents 82aee34 + fcb41be commit 62704ee
Showing 63 changed files with 2,630 additions and 827 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,11 +11,12 @@
*.vcf
*.gz
*.gprof
*.png
src/vcfdist

# ignore analysis
analysis/*/
analysis/*.png
analysis-v2/*/*/

# ignore input
data/*/*
2 changes: 1 addition & 1 deletion README.md
@@ -70,7 +70,7 @@ If you do already have HTSlib installed elsewhere, make sure you've added it to
> cd vcfdist/src
> make
> ./vcfdist --version
vcfdist v2.2.1
vcfdist v2.3.0
```

### Option 2: Docker Image
42 changes: 42 additions & 0 deletions analysis-v2/clustering/1_plot_beds.py
@@ -0,0 +1,42 @@
import matplotlib.pyplot as plt
import numpy as np
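
# For each benchmark BED, plot a log-scale histogram of region sizes (bins from
# 1 bp to 1 Gbp), annotated with the region count, total bases covered, and sum
# of squared region sizes; figures are saved to img/bed-<name>.png.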

ds_names = ["t2t-q100", "nist", "giab-tr", "cmrg"]
ds_versions = ["v0.9", "v4.2.1","v4.20", "v1.00"]
ds_beds = ["GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed",
"HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed",
"GIABTR.HG002.benchmark.regions.bed",
"HG002_GRCh38_CMRG_smallvar_v1.00.bed"]
data = "/home/timdunn/vcfdist/data"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000,
100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000,
100_000_000, 200000000, 500000000, 1_000_000_000]
for name, version, bed in zip(ds_names, ds_versions, ds_beds):
print(f"parsing {name} BED")
counts = [0]*len(bins)
bases, bases2 = 0, 0
with open(f"{data}/{name}-{version}/{bed}", "r") as bedfile:
for line in bedfile:
fields = line.split()
ctg, start, stop = fields[0], int(fields[1]), int(fields[2])

for bin_idx, bin_val in enumerate(bins):
if stop - start < bin_val:
counts[bin_idx] += 1
bases += stop-start
bases2 += (stop-start)*(stop-start)
break
fig, ax = plt.subplots(1,1)
ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
ax.set_xlim(0,len(bins))
ax.set_xticks(range(0, len(bins), 3))
ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
ax.set_yscale('log')
ax.set_ylim(0.5, 2000000)
ax.set_xlabel("Region Size")
ax.set_ylabel("Counts")
ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes)
ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
plt.savefig(f"img/bed-{name}.png")
38 changes: 38 additions & 0 deletions analysis-v2/clustering/2_plot_gap_clusters.py
@@ -0,0 +1,38 @@
import matplotlib.pyplot as plt
import numpy as np
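
# For each swept gap length, plot a log-scale histogram of vcfdist supercluster
# sizes (read from gaps/<gap>.superclusters.tsv), with the same bins and
# annotations as 1_plot_beds.py; figures are saved to img/gaps-<gap>.png.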

gaps = [10, 20, 50, 100, 200] # 500, 1000 didn't finish
data = "/home/timdunn/vcfdist/analysis-v2/clustering"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 20_000, 50_000,
        100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000,
        50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000]
for gap in gaps:
    print(f"parsing gap {gap} superclusters")
    counts = [0]*len(bins)
    bases, bases2 = 0, 0
    with open(f"{data}/gaps/{gap}.superclusters.tsv", "r") as bedfile:
        next(bedfile)  # skip header
        for line in bedfile:
            fields = line.split()
            ctg, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3])

            # bin each supercluster by size, tracking total bases and sum of squared sizes
            for bin_idx, bin_val in enumerate(bins):
                if size < bin_val:
                    counts[bin_idx] += 1
                    bases += size
                    bases2 += size*size
                    break
    fig, ax = plt.subplots(1, 1)
    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
    ax.set_xlim(0, len(bins))
    ax.set_xticks(range(0, len(bins), 3))
    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
    ax.set_yscale('log')
    ax.set_ylim(0.5, 20000000)
    ax.set_xlabel("Region Size")
    ax.set_ylabel("Counts")
    ax.text(0.5, 0.8, rf"$\sum$bases: {bases:,}", transform=ax.transAxes)
    ax.text(0.5, 0.75, rf"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
    plt.savefig(f"img/gaps-{gap}.png")
29 changes: 29 additions & 0 deletions analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh
@@ -0,0 +1,29 @@
#!/bin/bash

source globals.sh

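# supercluster gap lengths to sweep (only gap 10 is currently enabled)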
dists=(
"10"
# "20"
# "50"
# "100"
# "200"
)

# vcfdist evaluation
mkdir -p $out/sweep_gaplens
for q in "${!query_names[@]}"; do
for d in "${dists[@]}"; do
echo "vcfdist: evaluating len 1000 '${query_names[q]}' gap $d"
$timer -v ../../src/vcfdist \
$data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
--simple-cluster \
-g $d \
-l 1000 \
-p $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000. \
2> $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000.log
done
done
39 changes: 39 additions & 0 deletions analysis-v2/clustering/3_plot_len_clusters.py
@@ -0,0 +1,39 @@
import matplotlib.pyplot as plt
import numpy as np
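
# For each swept maximum variant length, plot a log-scale histogram of vcfdist
# supercluster sizes from WFA-based superclustering (read from
# sweep_varlens/pav.wfa.len<len>ra.superclusters.tsv); figures are saved to img/lens-<len>.png.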

# lens = [10, 50, 100, 500, 1000] # 5000, 10000 didn't finish
lens = [1000]
data = "/home/timdunn/vcfdist/analysis-v2/clustering"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 20_000, 50_000,
        100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000,
        50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000]
for length in lens:
    print(f"Parsing WFA variant max length {length} superclusters")
    counts = [0]*len(bins)
    bases, bases2 = 0, 0
    with open(f"{data}/sweep_varlens/pav.wfa.len{length}ra.superclusters.tsv", "r") as bedfile:
        next(bedfile)  # skip header
        for line in bedfile:
            fields = line.split()
            ctg, sc, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4])

            # bin each supercluster by size, tracking total bases and sum of squared sizes
            for bin_idx, bin_val in enumerate(bins):
                if size < bin_val:
                    counts[bin_idx] += 1
                    bases += size
                    bases2 += size*size
                    break
    fig, ax = plt.subplots(1, 1)
    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
    ax.set_xlim(0, len(bins))
    ax.set_xticks(range(0, len(bins), 3))
    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
    ax.set_yscale('log')
    ax.set_ylim(0.5, 20000000)
    ax.set_xlabel("Region Size")
    ax.set_ylabel("Counts")
    ax.text(0.5, 0.8, rf"$\sum$bases: {bases:,}", transform=ax.transAxes)
    ax.text(0.5, 0.75, rf"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
    plt.savefig(f"img/lens-{length}.png")
62 changes: 62 additions & 0 deletions analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
@@ -0,0 +1,62 @@
#!/bin/bash

source globals.sh

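# maximum variant lengths (-l) to sweep (only length 50 is currently enabled)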
lens=(
# "10"
"50"
# "100"
# "500"
# "1000"
# "5000"
# "10000"
)

mkdir -p $out/sweep_varlens

# # wfa-based superclustering
# for q in "${!query_names[@]}"; do
# for l in "${lens[@]}"; do
# echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
# $timer -v ../../src/vcfdist \
# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
# $data/refs/$ref_name \
# --bed $data/${truth_name}-${truth_version}/split/bench.bed \
# -l $l \
# -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}. \
# 2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}.log
# done
# done

# wfa-based superclustering
for q in "${!query_names[@]}"; do
for l in "${lens[@]}"; do
echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
$timer -v ../../src/vcfdist \
$data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
-l $l \
-p $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra. \
2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra.log
done
done

# # gap-based superclustering
# for q in "${!query_names[@]}"; do
# for l in "${lens[@]}"; do
# echo "vcfdist: evaluating gap 100 '${query_names[q]}' len $l"
# $timer -v ../../src/vcfdist \
# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
# $data/refs/$ref_name \
# --bed $data/${truth_name}-${truth_version}/split/bench.bed \
# --simple-cluster \
# -g 100 \
# -l $l \
# -p $out/sweep_varlens/${query_names[q]}.gap100.len${l}. \
# 2> $out/sweep_varlens/${query_names[q]}.gap100.len${l}.log
# done
# done
67 changes: 67 additions & 0 deletions analysis-v2/clustering/4_plot_accuracy.py
@@ -0,0 +1,67 @@
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# plt.rcParams.update({"figure.facecolor": (0,0,0,0)})

fig, ax = plt.subplots(1, 2, figsize=(10,6))

partial_credit = True
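# if True, plot the precision/recall values reported by vcfdist (which award
# partial credit); otherwise recompute both metrics strictly from TP/FP/FN counts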

types = ["gap", "gap", "gap", "gap", "gap", "wfa_len"]
vals = [10, 20, 50, 100, 200, 1000]
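# the (type, value) pairs above select which clustering runs to plot: gap-based
# superclustering at gap lengths 10-200, plus WFA-based superclustering with a
# maximum variant length of 1000, read from ./<type>s/<value>.precision-recall.tsv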

for t, v in zip(types, vals):
print(f"plotting {t} {v}")
with open(f"./{t}s/{v}.precision-recall.tsv") as csv:
indel_recall = []
indel_prec = []
snp_recall = []
snp_prec = []
next(csv) # skip header

for line in csv:
typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \
query_tot, query_tp, query_pp, query_fp = line.split('\t')
if partial_credit: # normal vcfdist prec/recall calc, already done
if typ == "INDEL":
indel_recall.append(float(recall))
indel_prec.append(float(prec))
elif typ == "SNP":
snp_recall.append(float(recall))
snp_prec.append(float(prec))
else: # recalculate using TP/FP/FN
if typ == "INDEL":
indel_recall.append(float(truth_tp) / float(truth_tot))
if int(truth_tp) + int(query_fp) == 0:
indel_prec.append(1)
else:
indel_prec.append(float(truth_tp) /
(float(truth_tp) + float(query_fp)))
elif typ == "SNP":
snp_recall.append(float(truth_tp) / float(truth_tot))
if int(truth_tp) + int(query_fp) == 0:
snp_prec.append(1)
else:
snp_prec.append(float(truth_tp) /
(float(truth_tp) + float(query_fp)))
ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.')
ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.')

# SNP plot
ax[0].set_title("SNPs")
ax[0].set_xlabel("Recall", fontsize=15)
ax[0].set_ylabel("Precision", fontsize=15)
ax[0].set_xlim((0.9,1))
ax[0].set_ylim((0.9,1))

# INDEL plot
ax[1].set_title("INDELs")
ax[1].set_xlabel("Recall", fontsize=15)
ax[1].set_ylabel("Precision", fontsize=15)
ax[1].set_xlim((0.9,1))
ax[1].set_ylim((0.9,1))

ax[0].legend(["Gap 10", "Gap 20", "Gap 50", "Gap 100", "Gap 200", "WFA"])
plt.tight_layout()
plt.savefig('img/accuracy.png', dpi=200)
3 changes: 3 additions & 0 deletions analysis-v2/clustering/README.md
@@ -0,0 +1,3 @@
gaps = sweep the supercluster gap length (gap-based superclustering)
wfa_lens = sweep the maximum variant length, WFA-based superclustering
gap_lens = sweep the maximum variant length, gap-based superclustering
30 changes: 30 additions & 0 deletions analysis-v2/clustering/globals.sh
@@ -0,0 +1,30 @@
#!/bin/bash

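# shared paths, tool locations, and dataset settings for the clustering analysis scripts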
data="/home/timdunn/vcfdist/data"
parallel="/home/timdunn/parallel-20211222/src/parallel"
vcfdist="/home/timdunn/vcfdist/src/vcfdist"
rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
tabix="/home/timdunn/software/htslib-1.16/tabix"
bgzip="/home/timdunn/software/htslib-1.16/bgzip"
timer="/usr/bin/time"

out="/home/timdunn/vcfdist/analysis-v2/clustering"

# define reference FASTA
ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"

# T2T information
truth_name="t2t-q100"
truth_version="v0.9"

# query VCF information
query_names=(
# "hprc"
"pav"
# "giab-tr"
)
query_versions=(
# "v1"
"v4"
# "v4.20"
)
20 changes: 20 additions & 0 deletions analysis-v2/multi_match/1_eval_vcfs.sh
@@ -0,0 +1,20 @@
#!/bin/bash

source globals.sh

# --bed $data/cmrg-v1.00/HG002_GRCh38_CMRG_smallvar_v1.00.bed \
# --max-threads 1 \
# vcfdist evaluation
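# outputs and logs for each query are written to $out/evals/vcfdist/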
mkdir -p $out/evals/vcfdist
for i in "${!query_names[@]}"; do
echo "vcfdist: evaluating '${query_names[i]}'"
$timer -v ../../src/vcfdist \
$data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
--keep-query --keep-truth \
-l 1000 \
-p $out/evals/vcfdist/${query_names[i]}. \
2> $out/evals/vcfdist/${query_names[i]}.log
done