From f7c7c4592d47286975aa8784e039d637f5efd202 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Mon, 6 Nov 2023 12:50:18 -0500
Subject: [PATCH 01/16] Minor fixes to Docker release instructions.

---
 docs/release.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/docs/release.md b/docs/release.md
index 684c78e..7557f31 100644
--- a/docs/release.md
+++ b/docs/release.md
@@ -1,6 +1,13 @@
 1. Update `vcfdist` version in `globals.h`
-2. Replace `vcfdist --version` text in `src/README.md`
+
+2. Update `vcfdist` help text in `src/README.md`
+```bash
+./vcfdist >> src/README.md
+```
+Manually delete previous text
+
 3. Update `vcfdist` version in `README.md`
+
 4. Make final Git commit of changes
 ```bash
 git commit -m "version bump (vX.X.X)"
@@ -9,13 +16,16 @@ git merge dev
 git push origin master
 git checkout dev
 ```
+
 5. Make new release and tag on Github
+
 6. Build and deploy new Docker image
 ```bash
 cd ~/vcfdist/src
 sudo docker login -u timd1
-sudo docker build --no-cache -t vcfdist .
+sudo docker build --no-cache -t timd1/vcfdist .
 sudo docker image ls
 sudo docker image tag timd1/vcfdist:latest timd1/vcfdist:vX.X.X
 sudo docker image push timd1/vcfdist:vX.X.X
+sudo docker image push timd1/vcfdist:latest
 ```

From 8b3e734517a17d3af88a484eef138f34261be401 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Mon, 6 Nov 2023 12:54:55 -0500
Subject: [PATCH 02/16] Added new analysis-v2 directory

---
 .gitignore                                    |   3 +-
 analysis-v2/clustering/1_plot_beds.py         |  42 +++++
 analysis-v2/clustering/2_plot_gap_clusters.py |  38 ++++
 .../clustering/2_vcfdist_sweep_gaplens.sh     |  29 +++
 analysis-v2/clustering/3_plot_len_clusters.py |  39 ++++
 .../clustering/3_vcfdist_sweep_varlens.sh     |  62 +++++++
 analysis-v2/clustering/4_plot_accuracy.py     |  67 +++++++
 analysis-v2/clustering/README.md              |   3 +
 analysis-v2/clustering/globals.sh             |  30 +++
 analysis-v2/multi_match/1_eval_vcfs.sh        |  20 ++
 analysis-v2/multi_match/2_plot_vcf_multi.py   | 137 ++++++++++++++
 analysis-v2/multi_match/globals.sh            |  29 +++
 analysis-v2/phab_cm/1_eval_vcfs.sh            | 127 +++++++++++++
 analysis-v2/phab_cm/globals.sh                |  29 +++
 .../phab_cm/vcfdist_truvari_compare.py        |   7 +-
 analysis-v2/phasing/0_create_filtered_vcfs.sh |  13 ++
 .../phasing}/14_phase_analysis.py             |   0
 analysis-v2/phasing/1_whatshap_compare.sh     |   8 +
 analysis-v2/phasing/parse_vcfs.py             |  64 +++++++
 analysis-v2/small_sv_all/0_split.sh           |  47 +++++
 analysis-v2/small_sv_all/1_eval_vcfs.sh       |  20 ++
 analysis-v2/small_sv_all/2_stratify_vcfs.sh   |  23 +++
 analysis-v2/small_sv_all/3_plot_vcf_tp.py     | 122 ++++++++++++
 analysis-v2/small_sv_all/3_test_plot_tsv.py   |  85 +++++++++
 analysis-v2/small_sv_all/4_plot_vcf_fp.py     | 135 ++++++++++++++
 analysis-v2/small_sv_all/5_extract_diffs.py   | 175 ++++++++++++++++++
 analysis-v2/small_sv_all/globals.sh           |  45 +++++
 analysis-v2/variants/1_plot_vars.py           |  36 ++++
 analysis-v2/vs_prior_work/1_eval_vcfs.sh      |  89 +++++++++
 analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh   |  85 +++++++++
 analysis-v2/vs_prior_work/README.md           |   1 +
 analysis-v2/vs_prior_work/globals.sh          |  29 +++
 analysis/{12_plot_ram.py => 10_plot_ram.py}   |   0
 analysis/10_vcfdist_sv.sh                     |  36 ----
 analysis/11_truvari_sv.sh                     |  83 ---------
 analysis/2_general_sw_space_plot.py           |  93 +++++-----
 analysis/4_vcfdist_output.py                  |  81 +++++---
 analysis/7_vcfeval_pr_plot.py                 |  60 ++++--
 analysis/8_vcfdist_pr_plot.py                 |  81 +++++---
 analysis/9_f1_ed_plot.py                      |  43 +++--
 analysis/9_f1_pr_plot.py                      | 116 +++++++-----
 41 files changed, 1931 insertions(+), 301 deletions(-)
 create mode 100644 analysis-v2/clustering/1_plot_beds.py
 create mode 100644 analysis-v2/clustering/2_plot_gap_clusters.py
 create mode 100755 analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh
 create mode 100644 analysis-v2/clustering/3_plot_len_clusters.py
 create mode 100755 analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
 create mode 100644 analysis-v2/clustering/4_plot_accuracy.py
 create mode 100644 analysis-v2/clustering/README.md
 create mode 100755 analysis-v2/clustering/globals.sh
 create mode 100755 analysis-v2/multi_match/1_eval_vcfs.sh
 create mode 100644 analysis-v2/multi_match/2_plot_vcf_multi.py
 create mode 100755 analysis-v2/multi_match/globals.sh
 create mode 100755 analysis-v2/phab_cm/1_eval_vcfs.sh
 create mode 100755 analysis-v2/phab_cm/globals.sh
 rename analysis/13_truvari_compare.py => analysis-v2/phab_cm/vcfdist_truvari_compare.py (97%)
 create mode 100755 analysis-v2/phasing/0_create_filtered_vcfs.sh
 rename {analysis => analysis-v2/phasing}/14_phase_analysis.py (100%)
 create mode 100755 analysis-v2/phasing/1_whatshap_compare.sh
 create mode 100644 analysis-v2/phasing/parse_vcfs.py
 create mode 100755 analysis-v2/small_sv_all/0_split.sh
 create mode 100755 analysis-v2/small_sv_all/1_eval_vcfs.sh
 create mode 100755 analysis-v2/small_sv_all/2_stratify_vcfs.sh
 create mode 100644 analysis-v2/small_sv_all/3_plot_vcf_tp.py
 create mode 100644 analysis-v2/small_sv_all/3_test_plot_tsv.py
 create mode 100644 analysis-v2/small_sv_all/4_plot_vcf_fp.py
 create mode 100644 analysis-v2/small_sv_all/5_extract_diffs.py
 create mode 100755 analysis-v2/small_sv_all/globals.sh
 create mode 100644 analysis-v2/variants/1_plot_vars.py
 create mode 100755 analysis-v2/vs_prior_work/1_eval_vcfs.sh
 create mode 100755 analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh
 create mode 100644 analysis-v2/vs_prior_work/README.md
 create mode 100755 analysis-v2/vs_prior_work/globals.sh
 rename analysis/{12_plot_ram.py => 10_plot_ram.py} (100%)
 delete mode 100755 analysis/10_vcfdist_sv.sh
 delete mode 100755 analysis/11_truvari_sv.sh

diff --git a/.gitignore b/.gitignore
index 88d5a6b..660b761 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,11 +11,12 @@
 *.vcf
 *.gz
 *.gprof
+*.png
 src/vcfdist
 
 # ignore analysis
 analysis/*/
-analysis/*.png
+analysis-v2/*/*/
 
 # ignore input
 data/*/*
diff --git a/analysis-v2/clustering/1_plot_beds.py b/analysis-v2/clustering/1_plot_beds.py
new file mode 100644
index 0000000..b009d37
--- /dev/null
+++ b/analysis-v2/clustering/1_plot_beds.py
@@ -0,0 +1,42 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+ds_names = ["t2t-q100", "nist", "giab-tr", "cmrg"]
+ds_versions = ["v0.9", "v4.2.1","v4.20", "v1.00"]
+ds_beds = ["GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed",
+        "HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed",
+        "GIABTR.HG002.benchmark.regions.bed",
+        "HG002_GRCh38_CMRG_smallvar_v1.00.bed"]
+data = "/home/timdunn/vcfdist/data"
+
+bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, 
+        100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, 
+        100_000_000, 200000000, 500000000, 1_000_000_000]
+for name, version, bed in zip(ds_names, ds_versions, ds_beds):
+    print(f"parsing {name} BED")
+    counts = [0]*len(bins)
+    bases, bases2 = 0, 0
+    with open(f"{data}/{name}-{version}/{bed}", "r") as bedfile:
+        for line in bedfile:
+            fields = line.split()
+            ctg, start, stop = fields[0], int(fields[1]), int(fields[2])
+
+            for bin_idx, bin_val in enumerate(bins):
+                if stop - start < bin_val:
+                    counts[bin_idx] += 1
+                    bases += stop-start
+                    bases2 += (stop-start)*(stop-start)
+                    break
+    fig, ax = plt.subplots(1,1)
+    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
+    ax.set_xlim(0,len(bins))
+    ax.set_xticks(range(0, len(bins), 3))
+    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
+    ax.set_yscale('log')
+    ax.set_ylim(0.5, 2000000)
+    ax.set_xlabel("Region Size")
+    ax.set_ylabel("Counts")
+    ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
+    plt.savefig(f"img/bed-{name}.png")
diff --git a/analysis-v2/clustering/2_plot_gap_clusters.py b/analysis-v2/clustering/2_plot_gap_clusters.py
new file mode 100644
index 0000000..8c7a290
--- /dev/null
+++ b/analysis-v2/clustering/2_plot_gap_clusters.py
@@ -0,0 +1,38 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+gaps = [10, 20, 50, 100, 200] # 500, 1000 didn't finish
+data = "/home/timdunn/vcfdist/analysis-v2/clustering"
+
+bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, 
+        100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, 
+        100_000_000, 200000000, 500000000, 1_000_000_000]
+for gap in gaps:
+    print(f"parsing gap {gap} superclusters")
+    counts = [0]*len(bins)
+    bases, bases2 = 0, 0
+    with open(f"{data}/gaps/{gap}.superclusters.tsv", "r") as bedfile:
+        next(bedfile)
+        for line in bedfile:
+            fields = line.split()
+            ctg, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3])
+
+            for bin_idx, bin_val in enumerate(bins):
+                if size < bin_val:
+                    counts[bin_idx] += 1
+                    bases += size
+                    bases2 += size*size
+                    break
+    fig, ax = plt.subplots(1,1)
+    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
+    ax.set_xlim(0,len(bins))
+    ax.set_xticks(range(0, len(bins), 3))
+    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
+    ax.set_yscale('log')
+    ax.set_ylim(0.5, 20000000)
+    ax.set_xlabel("Region Size")
+    ax.set_ylabel("Counts")
+    ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
+    plt.savefig(f"img/gaps-{gap}.png")
diff --git a/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh b/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh
new file mode 100755
index 0000000..f4e7c9d
--- /dev/null
+++ b/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+source globals.sh
+
+dists=( 
+        "10"
+        # "20"
+        # "50"
+        # "100"
+        # "200"
+)
+
+# vcfdist evaluation
+mkdir -p $out/sweep_gaplens
+for q in "${!query_names[@]}"; do
+    for d in "${dists[@]}"; do
+        echo "vcfdist: evaluating len 1000 '${query_names[q]}' gap $d"
+        $timer -v ../../src/vcfdist \
+            $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
+            $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+            $data/refs/$ref_name \
+            --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+            --simple-cluster \
+            -g $d \
+            -l 1000 \
+            -p $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000. \
+            2> $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000.log
+    done
+done
diff --git a/analysis-v2/clustering/3_plot_len_clusters.py b/analysis-v2/clustering/3_plot_len_clusters.py
new file mode 100644
index 0000000..3bcf336
--- /dev/null
+++ b/analysis-v2/clustering/3_plot_len_clusters.py
@@ -0,0 +1,39 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+# lens = [10, 50, 100, 500, 1000] # 5000, 10000 didn't finish
+lens = [1000]
+data = "/home/timdunn/vcfdist/analysis-v2/clustering"
+
+bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, 
+        100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, 
+        100_000_000, 200000000, 500000000, 1_000_000_000]
+for length in lens:
+    print(f"Parsing WFA variant max length {length} superclusters")
+    counts = [0]*len(bins)
+    bases, bases2 = 0, 0
+    with open(f"{data}/sweep_varlens/pav.wfa.len{length}ra.superclusters.tsv", "r") as bedfile:
+        next(bedfile)
+        for line in bedfile:
+            fields = line.split()
+            ctg, sc, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4])
+
+            for bin_idx, bin_val in enumerate(bins):
+                if size < bin_val:
+                    counts[bin_idx] += 1
+                    bases += size
+                    bases2 += size*size
+                    break
+    fig, ax = plt.subplots(1,1)
+    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
+    ax.set_xlim(0,len(bins))
+    ax.set_xticks(range(0, len(bins), 3))
+    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
+    ax.set_yscale('log')
+    ax.set_ylim(0.5, 20000000)
+    ax.set_xlabel("Region Size")
+    ax.set_ylabel("Counts")
+    ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
+    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
+    plt.savefig(f"img/lens-{length}.png")
diff --git a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
new file mode 100755
index 0000000..3e3442b
--- /dev/null
+++ b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+source globals.sh
+
+lens=( 
+        # "10"
+        # "50"
+        # "100"
+        # "500"
+        "1000"
+        # "5000"
+        # "10000"
+)
+
+mkdir -p $out/sweep_varlens
+
+# # wfa-based superclustering
+# for q in "${!query_names[@]}"; do
+#     for l in "${lens[@]}"; do
+#         echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
+#         $timer -v ../../src/vcfdist \
+#             $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
+#             $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#             $data/refs/$ref_name \
+#             --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+#             -l $l \
+#             -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}. \
+#             2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}.log
+#     done
+# done
+
+# wfa-based superclustering
+for q in "${!query_names[@]}"; do
+    for l in "${lens[@]}"; do
+        echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
+        $timer -v ../../src/vcfdist \
+            $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
+            $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+            $data/refs/$ref_name \
+            --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+            -l $l \
+            -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra. \
+            2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra.log
+    done
+done
+
+# # gap-based superclustering
+# for q in "${!query_names[@]}"; do
+#     for l in "${lens[@]}"; do
+#         echo "vcfdist: evaluating gap 100 '${query_names[q]}' len $l"
+#         $timer -v ../../src/vcfdist \
+#             $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
+#             $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#             $data/refs/$ref_name \
+#             --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+#             --simple-cluster \
+#             -g 100 \
+#             -l $l \
+#             -p $out/sweep_varlens/${query_names[q]}.gap100.len${l}. \
+#             2> $out/sweep_varlens/${query_names[q]}.gap100.len${l}.log
+#     done
+# done
diff --git a/analysis-v2/clustering/4_plot_accuracy.py b/analysis-v2/clustering/4_plot_accuracy.py
new file mode 100644
index 0000000..8f71a7f
--- /dev/null
+++ b/analysis-v2/clustering/4_plot_accuracy.py
@@ -0,0 +1,67 @@
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+# plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
+
+fig, ax = plt.subplots(1, 2, figsize=(10,6))
+
+partial_credit = True
+
+types = ["gap", "gap", "gap", "gap", "gap", "wfa_len"]
+vals = [10, 20, 50, 100, 200, 1000]
+
+for t, v in zip(types, vals):
+    print(f"plotting {t} {v}")
+    with open(f"./{t}s/{v}.precision-recall.tsv") as csv:
+        indel_recall = []
+        indel_prec = []
+        snp_recall = []
+        snp_prec = []
+        next(csv) # skip header
+
+        for line in csv:
+            typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \
+                query_tot, query_tp, query_pp, query_fp = line.split('\t')
+            if partial_credit: # normal vcfdist prec/recall calc, already done
+                if typ == "INDEL":
+                    indel_recall.append(float(recall))
+                    indel_prec.append(float(prec))
+                elif typ == "SNP":
+                    snp_recall.append(float(recall))
+                    snp_prec.append(float(prec))
+            else: # recalculate using TP/FP/FN
+                if typ == "INDEL":
+                    indel_recall.append(float(truth_tp) / float(truth_tot))
+                    if int(truth_tp) + int(query_fp) == 0:
+                        indel_prec.append(1)
+                    else:
+                        indel_prec.append(float(truth_tp) / 
+                                (float(truth_tp) + float(query_fp)))
+                elif typ == "SNP":
+                    snp_recall.append(float(truth_tp) / float(truth_tot))
+                    if int(truth_tp) + int(query_fp) == 0:
+                        snp_prec.append(1)
+                    else:
+                        snp_prec.append(float(truth_tp) / 
+                                (float(truth_tp) + float(query_fp)))
+        ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.')
+        ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.')
+         
+# SNP plot
+ax[0].set_title("SNPs")
+ax[0].set_xlabel("Recall", fontsize=15)
+ax[0].set_ylabel("Precision", fontsize=15)
+ax[0].set_xlim((0.9,1))
+ax[0].set_ylim((0.9,1))
+
+# INDEL plot
+ax[1].set_title("INDELs")
+ax[1].set_xlabel("Recall", fontsize=15)
+ax[1].set_ylabel("Precision", fontsize=15)
+ax[1].set_xlim((0.9,1))
+ax[1].set_ylim((0.9,1))
+
+ax[0].legend(["Gap 10", "Gap 20", "Gap 50", "Gap 100", "Gap 200", "WFA"])
+plt.tight_layout()
+plt.savefig('img/accuracy.png', dpi=200)
diff --git a/analysis-v2/clustering/README.md b/analysis-v2/clustering/README.md
new file mode 100644
index 0000000..5599f1d
--- /dev/null
+++ b/analysis-v2/clustering/README.md
@@ -0,0 +1,3 @@
+gaps = modify gap length
+wfa_lens = modify variant lengths, wfa superclustering
+gap_lens = modify variant lengths, gap superclustering
diff --git a/analysis-v2/clustering/globals.sh b/analysis-v2/clustering/globals.sh
new file mode 100755
index 0000000..5f80b9d
--- /dev/null
+++ b/analysis-v2/clustering/globals.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+data="/home/timdunn/vcfdist/data"
+parallel="/home/timdunn/parallel-20211222/src/parallel"
+vcfdist="/home/timdunn/vcfdist/src/vcfdist"
+rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
+tabix="/home/timdunn/software/htslib-1.16/tabix"
+bgzip="/home/timdunn/software/htslib-1.16/bgzip"
+timer="/usr/bin/time"
+
+out="/home/timdunn/vcfdist/analysis-v2/clustering"
+
+# define reference FASTA
+ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
+
+# T2T information
+truth_name="t2t-q100"
+truth_version="v0.9"
+
+# query VCF information
+query_names=(
+    # "hprc"
+    "pav"
+    # "giab-tr"
+)
+query_versions=(
+    # "v1"
+    "v4"
+    # "v4.20"
+)
diff --git a/analysis-v2/multi_match/1_eval_vcfs.sh b/analysis-v2/multi_match/1_eval_vcfs.sh
new file mode 100755
index 0000000..5be781b
--- /dev/null
+++ b/analysis-v2/multi_match/1_eval_vcfs.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+source globals.sh
+
+        # --bed $data/cmrg-v1.00/HG002_GRCh38_CMRG_smallvar_v1.00.bed \
+        # --max-threads 1 \
+# vcfdist evaluation
+mkdir -p $out/evals/vcfdist
+for i in "${!query_names[@]}"; do
+    echo "vcfdist: evaluating '${query_names[i]}'"
+    $timer -v ../../src/vcfdist \
+        $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+        $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+        $data/refs/$ref_name \
+        --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+        --keep-query --keep-truth \
+        -l 1000 \
+        -p $out/evals/vcfdist/${query_names[i]}. \
+        2> $out/evals/vcfdist/${query_names[i]}.log
+done
diff --git a/analysis-v2/multi_match/2_plot_vcf_multi.py b/analysis-v2/multi_match/2_plot_vcf_multi.py
new file mode 100644
index 0000000..843cf41
--- /dev/null
+++ b/analysis-v2/multi_match/2_plot_vcf_multi.py
@@ -0,0 +1,137 @@
+from collections import defaultdict
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+
+query_data, truth_data = {}, {}
+BKs = ["TP", "PP", "FP", "FN"]
+for BK in BKs:
+    query_data[BK] = defaultdict(int)
+    truth_data[BK] = defaultdict(int)
+keys = set()
+
+QUERY = 0
+TRUTH = 1
+SINGLE = 0
+MULTI = 1
+
+TP = 0
+PP = 1
+FP = 2
+FN = 3
+results = np.zeros((4,2,2), dtype=float)
+
+results_fn = f"/home/timdunn/vcfdist/analysis-v2/multi_match/evals/vcfdist/giab-tr.summary.vcf"
+with open(results_fn, 'r') as results_file:
+    line_ct = 0
+    for line in results_file:
+        if line[0] == "#": # header
+            continue
+        contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t')
+        tGT, tBD, tBC, tBK, tQQ, tSC, tSG, tPB, tPS, tPF = truth.split(":")
+        qGT, qBD, qBC, qBK, qQQ, qSC, qSG, qPB, qPS, qPF = query.split(":")
+
+        # flip qGT if necessary, to compare to correct truth variant
+        if qGT == "0|1":
+            if (int(qPS) + int(qPF)) % 2:
+                qGT = "1|0"
+        elif qGT == "1|0":
+            if (int(qPS) + int(qPF)) % 2:
+                qGT = "0|1"
+
+        # parse truth variant
+        if tBC == ".":
+            pass
+        elif float(tBC) == 0:
+            truth_data["FN"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1
+            keys.add(f"{contig}:{tGT}:{tSC}:{tSG}")
+            # print(f"Truth FN, {contig}:{tGT}:{tSC}:{tSG}")
+        elif float(tBC) < 1:
+            truth_data["PP"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1
+            keys.add(f"{contig}:{tGT}:{tSC}:{tSG}")
+            # print(f"Truth PP, {contig}:{tGT}:{tSC}:{tSG}")
+        elif float(tBC) == 1:
+            truth_data["TP"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1
+            keys.add(f"{contig}:{tGT}:{tSC}:{tSG}")
+            # print(f"Truth TP, {contig}:{tGT}:{tSC}:{tSG}")
+        else:
+            print("ERROR: unexpected tBC =", tBC)
+
+        # parse query variant
+        if qBC == ".":
+            pass
+        elif float(qBC) == 0:
+            query_data["FP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1
+            keys.add(f"{contig}:{qGT}:{qSC}:{qSG}")
+            # print(f"Query FP, {contig}:{qGT}:{qSC}:{qSG}")
+        elif float(qBC) < 1:
+            query_data["PP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1
+            keys.add(f"{contig}:{qGT}:{qSC}:{qSG}")
+            # print(f"Query PP, {contig}:{qGT}:{qSC}:{qSG}")
+        elif float(qBC) == 1:
+            query_data["TP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1
+            keys.add(f"{contig}:{qGT}:{qSC}:{qSG}")
+            # print(f"Query TP, {contig}:{qGT}:{qSC}:{qSG}")
+        else:
+            print("ERROR: unexpected qBC =", qBC)
+
+        line_ct += 1
+        # if line_ct > 1000: break
+
+# plot results
+for key in keys:
+    for bk, BK in enumerate(BKs):
+        # not in either
+        if truth_data[BK][key] == 0 and query_data[BK][key] == 0:
+            pass
+        # PP/TP, should be present in both
+        elif truth_data[BK][key] > 1 and query_data[BK][key] > 1:
+            results[bk][MULTI][MULTI] += truth_data[BK][key] + query_data[BK][key]
+        elif truth_data[BK][key] > 1 and query_data[BK][key] == 1:
+            results[bk][SINGLE][MULTI] += truth_data[BK][key] + query_data[BK][key]
+        elif truth_data[BK][key] == 1 and query_data[BK][key] > 1:
+            results[bk][MULTI][SINGLE] += truth_data[BK][key] + query_data[BK][key]
+        elif truth_data[BK][key] == 1 and query_data[BK][key] == 1:
+            results[bk][SINGLE][SINGLE] += truth_data[BK][key] + query_data[BK][key]
+        else: # present in one
+            if BK == "FP":
+                results[bk][SINGLE][SINGLE] += 1
+                if truth_data[BK][key] > 0:
+                    print("ERROR: FP key present in truth")
+                    print(BK, key)
+
+            elif BK == "FN":
+                results[bk][SINGLE][SINGLE] += 1
+                if query_data[BK][key] > 0:
+                    print("ERROR: FN key present in query")
+                    print(BK, key)
+            else:
+                print("ERROR: PP/TP key not present for truth and query")
+                print(BK, key)
+
+fracs = [100*results[x].sum(axis=None) / results.sum(axis=None) for x in range(4)]
+props = [100*results[x] / results[x].sum(axis=None) for x in range(4)]
+# results = 100 * results / results.sum(axis=None)
+# print(results)
+cmaps = ["Greens", "Oranges", "Reds", "Blues"]
+fig, ax = plt.subplots(1, 5, figsize=(15,4))
+for x in range(4):
+    ax[x].matshow(results[x], cmap=cmaps[x])
+    ax[x].set_title(f"{BKs[x]} ({fracs[x]:.3f}%)")
+    ax[x].set_yticks([0, 1])
+    ax[x].set_xticks([0, 1])
+    ax[x].set_yticklabels(["SINGLE", "MULTI"])
+    ax[x].set_xticklabels(["SINGLE", "MULTI"])
+    ax[x].set_xlabel("Query")
+    ax[x].set_ylabel("Truth")
+
+    for (i,j), z in np.ndenumerate(props[x]):
+        ax[x].text(j, i, f"{z:.3f}%", ha='center', va='center',
+            bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
+
+ax[4].set_title("Variant Calls")
+ax[4].pie(fracs, labels=BKs, autopct='%1.1f%%', explode=(0, 0.6, 0.3, 0),
+        colors=['green', 'orange', 'red', 'blue'])
+plt.tight_layout()
+plt.savefig("results.png")
diff --git a/analysis-v2/multi_match/globals.sh b/analysis-v2/multi_match/globals.sh
new file mode 100755
index 0000000..4b26904
--- /dev/null
+++ b/analysis-v2/multi_match/globals.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+data="/home/timdunn/vcfdist/data"
+out="/home/timdunn/vcfdist/analysis-v2/multi_match"
+parallel="/home/timdunn/parallel-20211222/src/parallel"
+vcfdist="/home/timdunn/vcfdist/src/vcfdist"
+rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
+tabix="/home/timdunn/software/htslib-1.16/tabix"
+bgzip="/home/timdunn/software/htslib-1.16/bgzip"
+timer="/usr/bin/time"
+
+# define reference FASTA
+ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
+
+# T2T information
+truth_name="t2t-q100"
+truth_version="v0.9"
+
+# query VCF information
+query_names=(
+    "hprc"
+    "pav"
+    "giab-tr"
+)
+query_versions=(
+    "v1"
+    "v4"
+    "v4.20"
+)
diff --git a/analysis-v2/phab_cm/1_eval_vcfs.sh b/analysis-v2/phab_cm/1_eval_vcfs.sh
new file mode 100755
index 0000000..b628873
--- /dev/null
+++ b/analysis-v2/phab_cm/1_eval_vcfs.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+
+source globals.sh
+
+# # truvari phab VCF normalization v4.1.0 (doesn't work)
+# mkdir -p $out/phab-mafft
+# mkdir -p $out/phab-wfa
+# source ~/truvari/venv3.10/bin/activate
+# for i in "${!query_names[@]}"; do
+#     echo "truvari v4.1: phab wfa for '${query_names[i]}'"
+#     truvari phab \
+#         -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         --bSamples HG002 \
+#         --cSamples HG002 \
+#         -f $data/refs/$ref_name \
+#         --align wfa \
+#         -t 64 \
+#         -o $out/phab-wfa/${query_names[i]}.vcf.gz \
+#         2> $out/phab-wfa/${query_names[i]}.log
+#     echo "truvari v4.1: phab mafft for '${query_names[i]}'"
+#     truvari phab \
+#         -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         --bSamples HG002 \
+#         --cSamples HG002 \
+#         -f $data/refs/$ref_name \
+#         --align mafft \
+#         -t 64 \
+#         -o $out/phab-mafft/${query_names[i]}.vcf.gz \
+#         2> $out/phab-mafft/${query_names[i]}.log
+# done
+# deactivate
+
+# # truvari phab VCF normalization v4.0
+# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
+# mkdir -p $out/phab
+# for i in "${!query_names[@]}"; do
+#     echo "truvari v4.0: phab mafft for '${query_names[i]}'"
+#     truvari phab \
+#         -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         --bSamples HG002 \
+#         --cSamples HG002 \
+#         -f $data/refs/$ref_name \
+#         -t 64 \
+#         -o $out/phab/${query_names[i]}.vcf.gz \
+#         2> $out/phab/${query_names[i]}.log
+# done
+# deactivate
+
+# # split phab normalized VCF into truth/query
+# in_samples=(
+#     "HG002"
+#     "p:HG002"
+# )
+# out_samples=(
+#     "TRUTH"
+#     "QUERY"
+# )
+# for s in "${out_samples[@]}"; do 
+#     echo $s > "${s}.txt" 
+# done
+# for i in "${!query_names[@]}"; do
+#     in_file="$out/phab/${query_names[i]}.vcf.gz"
+#     for s in "${!in_samples[@]}"; do
+#         bcftools view -c1 -s ${in_samples[s]} $in_file | bcftools reheader -s ${out_samples[s]}.txt | 
+#         sed "s/\//|/g" | bcftools view -Oz &> $out/phab/${query_names[i]}.${out_samples[s]}.vcf.gz
+#         tabix -p vcf $out/phab/${query_names[i]}.${out_samples[s]}.vcf.gz
+#     done
+# done
+
+# # vcfdist evaluation
+# mkdir -p $out/vcfdist
+# for i in "${!query_names[@]}"; do
+#     echo "vcfdist: evaluating '${query_names[i]}'"
+#     $timer -v /home/timdunn/vcfdist/src/vcfdist \
+#         $out/phab/${query_names[i]}.QUERY.vcf.gz \
+#         $out/phab/${query_names[i]}.TRUTH.vcf.gz \
+#         $data/refs/$ref_name \
+#         --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         -l 1000 \
+#         -p $out/vcfdist/${query_names[i]}. \
+#         2> $out/vcfdist/${query_names[i]}.log
+# done
+
+# # vcfeval evaluation
+# mkdir -p $out/vcfeval
+# for i in "${!query_names[@]}"; do
+#     echo "vcfeval: evaluating '${query_names[i]}'"
+#     $timer -v $rtg vcfeval \
+#         -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
+#         -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
+#         -t $data/refs/${ref_name}.sdf \
+#         -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         --threads 64 \
+#         --all-records \
+#         -o $out/vcfeval/${query_names[i]} \
+#         2> $out/vcfeval/${query_names[i]}.log
+# done
+
+# Truvari evaluation
+mkdir -p $out/truvari
+source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
+for i in "${!query_names[@]}"; do
+    echo "Truvari: evaluating '${query_names[i]}'"
+    $timer -v truvari bench \
+        -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
+        -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
+        -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
+        --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+        --no-ref a \
+        --sizemin 1 --sizefilt 1 --sizemax 1000 \
+        --pick multi \
+        --typeignore \
+        --dup-to-ins \
+        -o $out/truvari/${query_names[i]}
+
+    laytr tru2ga \
+        -i $out/truvari/${query_names[i]}/ \
+        -o $out/truvari/${query_names[i]}/result
+    gunzip $out/truvari/${query_names[i]}/result*.gz
+done
+deactivate
diff --git a/analysis-v2/phab_cm/globals.sh b/analysis-v2/phab_cm/globals.sh
new file mode 100755
index 0000000..48a102a
--- /dev/null
+++ b/analysis-v2/phab_cm/globals.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+data="/home/timdunn/vcfdist/data"
+out="/home/timdunn/vcfdist/analysis-v2/phab_cm"
+parallel="/home/timdunn/parallel-20211222/src/parallel"
+vcfdist="/home/timdunn/vcfdist/src/vcfdist"
+rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
+tabix="/home/timdunn/software/htslib-1.16/tabix"
+bgzip="/home/timdunn/software/htslib-1.16/bgzip"
+timer="/usr/bin/time"
+
+# define reference FASTA
+ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
+
+# T2T information
+truth_name="t2t-q100"
+truth_version="v0.9"
+
+# query VCF information
+query_names=(
+    "hprc"
+    "pav"
+    "giab-tr"
+)
+query_versions=(
+    "v1"
+    "v4"
+    "v4.20"
+)
diff --git a/analysis/13_truvari_compare.py b/analysis-v2/phab_cm/vcfdist_truvari_compare.py
similarity index 97%
rename from analysis/13_truvari_compare.py
rename to analysis-v2/phab_cm/vcfdist_truvari_compare.py
index 1fcae53..7c768d7 100644
--- a/analysis/13_truvari_compare.py
+++ b/analysis-v2/phab_cm/vcfdist_truvari_compare.py
@@ -2,12 +2,9 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
-# full genome, no phab
-# truvari_prefix = "../out/giabtr/truvari-init/result_"
-# vcfdist_prefix = "../out/giabtr/vcfdist-keep/"
 # chr20, with phab
-truvari_prefix = "../out/giabtr/truvari-norm-phab-chr20/result_"
-vcfdist_prefix = "../out/giabtr/vcfdist-norm-phab-chr20/"
+truvari_prefix = "./truvari/pav/result_"
+vcfdist_prefix = "./vcfdist/pav."
 do_print = True
 
 SIZE             = 0
diff --git a/analysis-v2/phasing/0_create_filtered_vcfs.sh b/analysis-v2/phasing/0_create_filtered_vcfs.sh
new file mode 100755
index 0000000..8238a28
--- /dev/null
+++ b/analysis-v2/phasing/0_create_filtered_vcfs.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo "filtering pav"
+bcftools filter \
+    ~/vcfdist/data/pav-v4/split/pav.all.vcf.gz \
+    -R ~/vcfdist/data/t2t-q100-v0.9/split/bench.bed \
+    -o ./vcfs/pav.vcf
+
+echo "filtering t2t-q100"
+bcftools filter \
+    ~/vcfdist/data/t2t-q100-v0.9/split/t2t-q100.all.vcf.gz \
+    -R ~/vcfdist/data/t2t-q100-v0.9/split/bench.bed \
+    -o ./vcfs/t2t-q100.vcf
diff --git a/analysis/14_phase_analysis.py b/analysis-v2/phasing/14_phase_analysis.py
similarity index 100%
rename from analysis/14_phase_analysis.py
rename to analysis-v2/phasing/14_phase_analysis.py
diff --git a/analysis-v2/phasing/1_whatshap_compare.sh b/analysis-v2/phasing/1_whatshap_compare.sh
new file mode 100755
index 0000000..398f260
--- /dev/null
+++ b/analysis-v2/phasing/1_whatshap_compare.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+/home/timdunn/truvari/venv3.10/bin/whatshap compare \
+    --names pav,t2t-q100 \
+    ./vcfs/pav.vcf \
+    ./vcfs/t2t-q100.vcf \
+    --tsv-pairwise pav.tsv \
+    --switch-error-bed switches.bed
diff --git a/analysis-v2/phasing/parse_vcfs.py b/analysis-v2/phasing/parse_vcfs.py
new file mode 100644
index 0000000..04f584b
--- /dev/null
+++ b/analysis-v2/phasing/parse_vcfs.py
@@ -0,0 +1,64 @@
+
+truth_vcf_fn = "./vcfs/t2t-q100.vcf"
+query_vcf_fn = "./vcfs/pav.vcf"
+out = open("diffs.txt", "w")
+
+# filter positions with multiple variants (like WhatsHap)
+print("Finding query duplicates")
+query_pos = set()
+query_dups = set()
+with open(query_vcf_fn, "r") as query_vcf:
+    for line in query_vcf:
+        if line[0] == "#": continue # skip header
+        fields = line.split()
+        gt = fields[9].split(":")[0]
+        pos = f"{fields[0]}:{fields[1]}:{gt}"
+        if pos in query_pos:
+            query_dups.add(pos)
+        query_pos.add(pos)
+
+print("Finding truth duplicates")
+truth_pos = set()
+truth_dups = set()
+with open(truth_vcf_fn, "r") as truth_vcf:
+    for line in truth_vcf:
+        if line[0] == "#": continue # skip header
+        fields = line.split()
+        gt = fields[9].split(":")[0]
+        pos = f"{fields[0]}:{fields[1]}:{gt}"
+        print(pos)
+        if pos in truth_pos:
+            truth_dups.add(pos)
+        truth_pos.add(pos)
+
+query_vars = []
+query_gts = {}
+truth_gts = {}
+count = 0
+print("Parsing query VCF")
+with open(query_vcf_fn, "r") as query_vcf:
+    for line in query_vcf:
+        if line[0] == "#": continue # skip header
+        fields = line.split()
+        gt = fields[9].split(":")[0]
+        if gt == "1|1": continue
+        if f"{fields[0]}:{fields[1]}" in query_dups: continue
+        var_id = f"{fields[0]}:{fields[1]}={fields[3]}>{fields[4]}"
+        query_vars.append(var_id)
+        query_gts[var_id] = gt
+
+print("Parsing truth VCF")
+with open(truth_vcf_fn, "r") as truth_vcf:
+    for line in truth_vcf:
+        if line[0] == "#": continue # skip header
+        fields = line.split()
+        gt = fields[9].split(":")[0]
+        if gt == "1|1": continue
+        if f"{fields[0]}:{fields[1]}" in truth_dups: continue
+        var_id = f"{fields[0]}:{fields[1]}={fields[3]}>{fields[4]}"
+        truth_gts[var_id] = gt
+
+print("Printing GT mismatches")
+for var in query_vars:
+    if var in truth_gts.keys() and query_gts[var] != truth_gts[var]:
+        print(f"variant: {var}\tquery:{query_gts[var]}\ttruth:{truth_gts[var]}", file=out)
diff --git a/analysis-v2/small_sv_all/0_split.sh b/analysis-v2/small_sv_all/0_split.sh
new file mode 100755
index 0000000..0a6eea0
--- /dev/null
+++ b/analysis-v2/small_sv_all/0_split.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+data="/home/timdunn/vcfdist/data"
+src="pav"
+version="v4"
+base="${data}/${src}-${version}/split/${src}"
+
+# split VCF into sub-VCFs of SNPs, INDELs, SVs
+
+echo "$src snp"
+bcftools view \
+    -i 'TYPE=="SNP"'\
+    -Oz \
+    -o ${base}.snv.vcf.gz \
+    ${base}.all.vcf.gz
+tabix -p vcf ${base}.snv.vcf.gz
+
+echo "$src indel"
+bcftools view \
+    -i 'ILEN < 50 && ILEN > -50'\
+    -Oz \
+    -o ${base}.indel.vcf.gz \
+    ${base}.all.vcf.gz
+tabix -p vcf ${base}.indel.vcf.gz
+
+echo "$src sv"
+bcftools view \
+    -i 'ILEN <= -50 || ILEN >= 50'\
+    -Oz \
+    -o ${base}.sv.vcf.gz \
+    ${base}.all.vcf.gz
+tabix -p vcf ${base}.sv.vcf.gz
+
+echo "$src small"
+bcftools view \
+    -i 'TYPE=="SNP" || (ILEN < 50 && ILEN > -50)'\
+    -Oz \
+    -o ${base}.small.vcf.gz \
+    ${base}.all.vcf.gz
+tabix -p vcf ${base}.small.vcf.gz
+
+echo "$src large"
+bcftools view \
+    -i 'ILEN <= -1 || ILEN >= 1'\
+    -Oz \
+    -o ${base}.large.vcf.gz \
+    ${base}.all.vcf.gz
+tabix -p vcf ${base}.large.vcf.gz
diff --git a/analysis-v2/small_sv_all/1_eval_vcfs.sh b/analysis-v2/small_sv_all/1_eval_vcfs.sh
new file mode 100755
index 0000000..5b1ab6c
--- /dev/null
+++ b/analysis-v2/small_sv_all/1_eval_vcfs.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+source globals.sh
+
+# vcfdist evaluation
+mkdir -p vcfdist
+for cat in "${splits[@]}"; do
+    for i in "${!ds_names[@]}"; do
+        echo "vcfdist: evaluating '${ds_names[i]}' ${cat}"
+        $timer -v $vcfdist \
+            $data/${ds_names[i]}-${ds_versions[i]}/split/${ds_names[i]}.${cat}.vcf.gz \
+            $data/t2t-q100-v0.9/split/t2t-q100.${cat}.vcf.gz \
+            $ref_name \
+            --bed $data/t2t-q100-v0.9/GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed \
+            --keep-query --keep-truth \
+            -l 1000 \
+            -p $dir/vcfdist/${ds_names[i]}.${cat}. \
+            2> $dir/vcfdist/log_${ds_names[i]}.${cat}.txt
+    done
+done
diff --git a/analysis-v2/small_sv_all/2_stratify_vcfs.sh b/analysis-v2/small_sv_all/2_stratify_vcfs.sh
new file mode 100755
index 0000000..3a1a2b7
--- /dev/null
+++ b/analysis-v2/small_sv_all/2_stratify_vcfs.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+source globals.sh
+
+$parallel \
+    "bgzip ./vcfdist/{1}.{2}.summary.vcf" ::: \
+    ${ds_names[@]} ::: ${splits[@]}
+
+$parallel \
+    "tabix -p vcf ./vcfdist/{1}.{2}.summary.vcf.gz" ::: \
+    ${ds_names[@]} ::: ${splits[@]}
+
+$parallel \
+    "bcftools view \
+        -R $data/genome-stratifications-v3.3/GRCh38_{3}.bed.gz \
+        -Ou \
+        ./vcfdist/{1}.{2}.summary.vcf.gz \
+        -o ./vcfdist/{1}.{2}.{3}.vcf" ::: \
+        ${ds_names[@]} ::: ${splits[@]} ::: ${bednames[@]}
+
+$parallel \
+    "gunzip ./vcfdist/{1}.{2}.summary.vcf.gz" ::: \
+    ${ds_names[@]} ::: ${splits[@]}
diff --git a/analysis-v2/small_sv_all/3_plot_vcf_tp.py b/analysis-v2/small_sv_all/3_plot_vcf_tp.py
new file mode 100644
index 0000000..bd04c8b
--- /dev/null
+++ b/analysis-v2/small_sv_all/3_plot_vcf_tp.py
@@ -0,0 +1,122 @@
+from collections import defaultdict
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+
+vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"]
+variant_types = ["snv", "indel", "sv"]
+vcf_types = ["snv", "indel", "sv", "small", "large", "all"]
+regions = [
+    "summary",
+    "alldifficultregions",
+    "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
+    "AllTandemRepeats",
+    "AllTandemRepeatsandHomopolymers_slop5",
+    "MHC",
+    "satellites_slop5",
+    "segdups",
+    "alllowmapandsegdupregions"
+]
+
+for region in regions:
+# initialize counts
+    print(region)
+    counts = {}
+    partial = {}
+    for vcf in vcfs:
+        counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+        partial[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+    colors = ["yellow", "blue", "red", "green", "orange", "purple"]
+
+# count variants
+    for vcf in vcfs:
+        print(f"    {vcf}")
+        for typ in vcf_types:
+            results_fn = f"vcfdist/{vcf}.{typ}.{region}.vcf"
+            with open(results_fn, 'r') as results_file:
+                line_ct = 0
+                for line in results_file:
+                    if line[0] == "#": # header
+                        continue
+                    contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t')
+                    gt, decision, credit, ga4gh_cat, qual2, sc, swap = truth.split(":")
+                    if credit == '.': continue
+                    haps = sum([0 if x == '.' else int(x) for x in gt.split('|')])
+                    if decision == "TP" and float(credit) == 1:
+                        if len(ref) == len(alt) == 1: # snp
+                            counts[vcf]["snv"][typ] += haps
+                        elif len(ref) == 1 and len(alt) > 1: # insertion
+                            if len(alt) > 50:
+                                counts[vcf]["sv"][typ] += haps
+                            else:
+                                counts[vcf]["indel"][typ] += haps
+                        elif len(alt) == 1 and len(ref) > 1: # deletion
+                            if len(ref) > 50:
+                                counts[vcf]["sv"][typ] += haps
+                            else:
+                                counts[vcf]["indel"][typ] += haps
+                        else:
+                            print("ERROR: unexpected variant type")
+                    elif float(credit) > 0:
+                        if len(ref) == len(alt) == 1: # snp
+                            partial[vcf]["snv"][typ] += haps
+                        elif len(ref) == 1 and len(alt) > 1: # insertion
+                            if len(alt) > 50:
+                                partial[vcf]["sv"][typ] += haps
+                            else:
+                                partial[vcf]["indel"][typ] += haps
+                        elif len(alt) == 1 and len(ref) > 1: # deletion
+                            if len(ref) > 50:
+                                partial[vcf]["sv"][typ] += haps
+                            else:
+                                partial[vcf]["indel"][typ] += haps
+                        else:
+                            print("ERROR: unexpected variant type")
+                    line_ct += 1
+                    # if line_ct > 1000: break
+    print(json.dumps(counts, indent=4))
+
+    fig, ax = plt.subplots(1, 3, figsize=(15,6))
+    indices = np.arange(len(vcfs)-1)
+    width = 0.1
+    yquals = [0, 3.01, 6.99, 10, 13.01, 16.99, 20, 23.01, 26.99, 30, # 33.01, 36.99, 40, 43.01, 46.99, 50
+            ]
+    ylabels = ["0%", "50%", "80%", "90%", "95%", "98%", "99%", "99.5%", "99.8%", "99.9%", # "99.95%", "99.98%", "99.99%", "99.995%", "99.998%", "99.999%"
+            ]
+    for var_type_idx, var_type in enumerate(variant_types):
+        for vcf_type_idx, vcf_type in enumerate(vcf_types):
+
+            # skip certain plots (few variants due to complex not getting filtered)
+            if var_type == "snv" and vcf_type in ["indel", "large", "sv"]: continue
+            if var_type == "indel" and vcf_type in ["snv", "sv"]: continue
+            if var_type == "sv" and vcf_type in ["snv", "indel", "small"]: continue
+
+            tp_fracs = [counts[vcf][var_type][vcf_type] / 
+                    max(1, counts["t2t-q100"][var_type][vcf_type]) for vcf in vcfs[1:]]
+            tp_qscores = [50 if frac == 1 else -10*np.log10(1-frac) for frac in tp_fracs]
+            pp_fracs = [partial[vcf][var_type][vcf_type] /
+                    max(1, counts["t2t-q100"][var_type][vcf_type]) for vcf in vcfs[1:]]
+            pptp_fracs = [tp_frac + pp_frac for (pp_frac, tp_frac) in zip(pp_fracs, tp_fracs)]
+            pptp_qscores = [50 if frac == 1 else -10*np.log10(1-frac) for frac in pptp_fracs]
+
+            ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                    tp_qscores, width, color=colors[vcf_type_idx])
+            ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                    [pptpq-tpq for (pptpq, tpq) in zip(pptp_qscores, tp_qscores)], 
+                        width, color=colors[vcf_type_idx],
+                    bottom=tp_qscores, alpha=0.5)
+        for yqual in yquals:
+            ax[var_type_idx].axhline(y=yqual, color='k', alpha=0.5, linestyle=':', zorder=-1)
+        ax[var_type_idx].set_title(var_type)
+        ax[var_type_idx].set_xlabel("VCFs")
+        ax[var_type_idx].set_xticks(indices)
+        ax[var_type_idx].set_xticklabels(vcfs[1:])
+        ax[var_type_idx].set_yticks(yquals)
+        ax[var_type_idx].set_yticklabels(ylabels, fontsize=8)
+        ax[var_type_idx].set_ylim(0,30)
+    patches = [mpatches.Patch(color=c, label=l)for c,l in zip(colors, vcf_types)]
+    ax[2].legend(handles=patches, loc=(0.6,0.6))
+    ax[0].set_ylabel("True Positive Recall")
+    plt.suptitle(f"{region}")
+    plt.savefig(f"counts-{region}.png", dpi=300)
diff --git a/analysis-v2/small_sv_all/3_test_plot_tsv.py b/analysis-v2/small_sv_all/3_test_plot_tsv.py
new file mode 100644
index 0000000..7880544
--- /dev/null
+++ b/analysis-v2/small_sv_all/3_test_plot_tsv.py
@@ -0,0 +1,85 @@
+from collections import defaultdict
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+
+vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"]
+variant_types = ["snv", "indel", "sv"]
+vcf_types = ["snv", "indel", "sv", "small", "large", "all"]
+
+# initialize counts
+counts = {}
+partial = {}
+for vcf in vcfs:
+    counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+    partial[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+colors = ["yellow", "blue", "red", "green", "orange", "purple"]
+
+# count variants
+for vcf in vcfs:
+    for typ in vcf_types:
+        print(f"{vcf} {typ}")
+        results_fn = f"vcfdist/{vcf}.{typ}.query.tsv"
+        with open(results_fn, 'r') as results_file:
+            next(results_file) # skip header
+            line_ct = 0
+            for line in results_file:
+                contig, pos, hap, ref, alt, qual, var_type, err_type, credit, clust, sc, loc = line.strip().split('\t')
+                if err_type == "TP":
+                    if len(ref) == len(alt) == 1 and var_type == "SNP": # snp
+                        counts[vcf]["snv"][typ] += 1
+                    elif len(ref) == 0 and var_type == "INS": # insertion
+                        if len(alt) >= 50:
+                            counts[vcf]["sv"][typ] += 1
+                        else:
+                            counts[vcf]["indel"][typ] += 1
+                    elif len(alt) == 0 and var_type == "DEL": # deletion
+                        if len(ref) >= 50:
+                            counts[vcf]["sv"][typ] += 1
+                        else:
+                            counts[vcf]["indel"][typ] += 1
+                    else:
+                        print("ERROR: unexpected variant type")
+                if err_type == "PP":
+                    if len(ref) == len(alt) == 1 and var_type == "SNP": # snp
+                        partial[vcf]["snv"][typ] += 1
+                    elif len(ref) == 0 and var_type == "INS": # insertion
+                        if len(alt) >= 50:
+                            partial[vcf]["sv"][typ] += 1
+                        else:
+                            partial[vcf]["indel"][typ] += 1
+                    elif len(alt) == 0 and var_type == "DEL": # deletion
+                        if len(ref) >= 50:
+                            partial[vcf]["sv"][typ] += 1
+                        else:
+                            partial[vcf]["indel"][typ] += 1
+                    else:
+                        print("ERROR: unexpected variant type")
+                line_ct += 1
+                # if line_ct > 1000: break
+print(json.dumps(counts, indent=4))
+
+fig, ax = plt.subplots(1, 3, figsize=(15,5))
+indices = np.arange(len(vcfs))
+width = 0.1
+for var_type_idx, var_type in enumerate(variant_types):
+    for vcf_type_idx, vcf_type in enumerate(vcf_types):
+        ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                [counts[vcf][var_type][vcf_type] for vcf in vcfs], width, color=colors[vcf_type_idx])
+        ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                [partial[vcf][var_type][vcf_type] for vcf in vcfs], width, color=colors[vcf_type_idx],
+                bottom=[counts[vcf][var_type][vcf_type] for vcf in vcfs], alpha=0.5)
+    ax[var_type_idx].set_title(var_type)
+    ax[var_type_idx].set_ylabel("Counts")
+    ax[var_type_idx].set_xlabel("VCFs")
+    ax[var_type_idx].set_xticks(indices)
+    ax[var_type_idx].set_xticklabels(vcfs)
+ax[0].set_ylim(0, 6_000_000)
+ax[1].set_ylim(0, 1_400_000)
+ax[2].set_ylim(0, 35_000)
+fig.tight_layout()
+ax[2].legend(vcf_types, loc=(0.6,0.6))
+leg = ax[2].get_legend()
+for i in range(len(colors)):
+    leg.legendHandles[i].set_color(colors[i])
+plt.savefig("counts.png", dpi=300)
diff --git a/analysis-v2/small_sv_all/4_plot_vcf_fp.py b/analysis-v2/small_sv_all/4_plot_vcf_fp.py
new file mode 100644
index 0000000..67c60fa
--- /dev/null
+++ b/analysis-v2/small_sv_all/4_plot_vcf_fp.py
@@ -0,0 +1,135 @@
+from collections import defaultdict
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+
+vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"]
+variant_types = ["snv", "indel", "sv"]
+vcf_types = ["snv", "indel", "sv", "small", "large", "all"]
+regions = [
+    "summary",
+    "alldifficultregions",
+    "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
+    "AllTandemRepeats",
+    "AllTandemRepeatsandHomopolymers_slop5",
+    "MHC",
+    "satellites_slop5",
+    "segdups",
+    "alllowmapandsegdupregions"
+]
+
+for region in regions:
+# initialize fp_counts
+    print(region)
+    fp_counts = {}
+    pp_counts = {}
+    tp_counts = {}
+    for vcf in vcfs:
+        fp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+        pp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+        tp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)}
+    colors = ["yellow", "blue", "red", "green", "orange", "purple"]
+
+# count variants
+    for vcf in vcfs:
+        print(f"    {vcf}")
+        for typ in vcf_types:
+            results_fn = f"vcfdist/{vcf}.{typ}.{region}.vcf"
+            with open(results_fn, 'r') as results_file:
+                line_ct = 0
+                for line in results_file:
+                    if line[0] == "#": # header
+                        continue
+                    contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t')
+                    gt, decision, credit, ga4gh_cat, qual2, sc, swap = query.split(":")
+                    if credit == '.': continue
+                    haps = sum([0 if x == '.' else int(x) for x in gt.split('|')])
+                    if decision == "FP" and float(credit) == 0:
+                        if len(ref) == len(alt) == 1: # snp
+                            fp_counts[vcf]["snv"][typ] += haps
+                        elif len(ref) == 1 and len(alt) > 1: # insertion
+                            if len(alt) > 50:
+                                fp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                fp_counts[vcf]["indel"][typ] += haps
+                        elif len(alt) == 1 and len(ref) > 1: # deletion
+                            if len(ref) > 50:
+                                fp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                fp_counts[vcf]["indel"][typ] += haps
+                        else:
+                            print("ERROR: unexpected variant type")
+                    elif float(credit) < 1:
+                        if len(ref) == len(alt) == 1: # snp
+                            pp_counts[vcf]["snv"][typ] += haps
+                        elif len(ref) == 1 and len(alt) > 1: # insertion
+                            if len(alt) > 50:
+                                pp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                pp_counts[vcf]["indel"][typ] += haps
+                        elif len(alt) == 1 and len(ref) > 1: # deletion
+                            if len(ref) > 50:
+                                pp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                pp_counts[vcf]["indel"][typ] += haps
+                        else:
+                            print("ERROR: unexpected variant type")
+                    else: # credit == 1
+                        if len(ref) == len(alt) == 1: # snp
+                            tp_counts[vcf]["snv"][typ] += haps
+                        elif len(ref) == 1 and len(alt) > 1: # insertion
+                            if len(alt) > 50:
+                                tp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                tp_counts[vcf]["indel"][typ] += haps
+                        elif len(alt) == 1 and len(ref) > 1: # deletion
+                            if len(ref) > 50:
+                                tp_counts[vcf]["sv"][typ] += haps
+                            else:
+                                tp_counts[vcf]["indel"][typ] += haps
+                        else:
+                            print("ERROR: unexpected variant type")
+                    line_ct += 1
+                    # if line_ct > 1000: break
+    print(json.dumps(fp_counts, indent=4))
+
+    fig, ax = plt.subplots(1, 3, figsize=(15,6))
+    indices = np.arange(len(vcfs)-1)
+    width = 0.1
+    thirty_less_yquals = [0, 3.01, 6.99, 10, 13.01, 16.99, 20, 23.01, 26.99, 30] 
+    ylabels = ["0.1%", "0.2%", "0.5%", "1%", "2%", "5%", "10%", "20%", "50%", "100%"]
+    for var_type_idx, var_type in enumerate(variant_types):
+        for vcf_type_idx, vcf_type in enumerate(vcf_types):
+
+            # skip certain plots (few variants due to complex not getting filtered)
+            if var_type == "snv" and vcf_type in ["indel", "large", "sv"]: continue
+            if var_type == "indel" and vcf_type in ["snv", "sv"]: continue
+            if var_type == "sv" and vcf_type in ["snv", "indel", "small"]: continue
+
+            fp = [fp_counts[vcf][var_type][vcf_type]/max(1, tp_counts[vcf][var_type][vcf_type]) for vcf in vcfs[1:]]
+            fpq = [0 if not frac else -10*np.log10(frac) for frac in fp]
+            pp = [pp_counts[vcf][var_type][vcf_type]/max(1, tp_counts[vcf][var_type][vcf_type]) for vcf in vcfs[1:]]
+            ppq = [0 if not frac else -10*np.log10(frac) for frac in pp]
+            ppfp = [fpf + ppf for (fpf, ppf) in zip(fp, pp)]
+            ppfpq = [0 if not frac else -10*np.log10(frac) for frac in ppfp]
+
+            ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                    [30-fpqual for fpqual in fpq], width, color=colors[vcf_type_idx])
+            ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, 
+                    [-(ppfpqual-fpqual) for (ppfpqual, fpqual) in zip(ppfpq, fpq)],
+                        width, color=colors[vcf_type_idx],
+                    bottom=[30-fpqual for fpqual in fpq], alpha=0.5)
+        for yqual in thirty_less_yquals:
+            ax[var_type_idx].axhline(y=yqual, color='k', alpha=0.5, linestyle=':', zorder=-1)
+        ax[var_type_idx].set_title(var_type)
+        ax[var_type_idx].set_xlabel("VCFs")
+        ax[var_type_idx].set_xticks(indices)
+        ax[var_type_idx].set_xticklabels(vcfs[1:])
+        ax[var_type_idx].set_yticks(thirty_less_yquals)
+        ax[var_type_idx].set_yticklabels(ylabels, fontsize=8)
+    patches = [mpatches.Patch(color=c, label=l)for c,l in zip(colors, vcf_types)]
+    ax[2].legend(handles=patches, loc=(0.6,0.6))
+    ax[0].set_ylabel("False Positive Rate")
+    plt.suptitle(f"{region}")
+    plt.savefig(f"counts-{region}-fp.png", dpi=300)
diff --git a/analysis-v2/small_sv_all/5_extract_diffs.py b/analysis-v2/small_sv_all/5_extract_diffs.py
new file mode 100644
index 0000000..9ec1dc2
--- /dev/null
+++ b/analysis-v2/small_sv_all/5_extract_diffs.py
@@ -0,0 +1,175 @@
+import sys
+
+datasets = ["hprc", "pav", "giab-tr"]
+vcf_types = ["small", "sv"]
+
+def RED(text):
+    return "\x1b[0;31;49m" + text + "\033[0m"
+def YLW(text):
+    return "\x1b[0;33;49m" + text + "\033[0m"
+def GRN(text):
+    return "\x1b[0;32;49m" + text + "\033[0m"
+# def RED(text):
+#     return "\x1b[7;49;31m" + text + "\033[0m"
+# def YLW(text):
+#     return "\x1b[7;49;33m" + text + "\033[0m"
+# def GRN(text):
+#     return "\x1b[7;49;32m" + text + "\033[0m"
+
+all_sc_dict = {}
+for ds in datasets:
+    for vcf in vcf_types:
+        print(f"DATASET: {ds}, SUBSET: {vcf}")
+        # subset, not substitution mutation
+        sub_file = open(f"vcfdist/{ds}.{vcf}.summary.vcf", 'r')
+        all_file = open(f"vcfdist/{ds}.all.summary.vcf", 'r')
+        sub_vars = [l for l in sub_file.readlines() if l[0] != "#"]
+        all_vars = [l for l in all_file.readlines() if l[0] != "#"]
+        out_file = open(f"diffs/{ds}.{vcf}_vs_all.txt", 'w')
+
+        # get all differences in subset (per supercluster)
+        sub_scs = []
+        all_scs = []
+        sc_ctgs = []
+        svi = 0
+        for avi, all_line in enumerate(all_vars):
+            # extract fields
+            all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \
+                    all_info, all_fmt, all_truth, all_query = \
+                    all_line.strip().split('\t')
+            if svi >= len(sub_vars): break
+            sub_line = sub_vars[svi]
+            sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \
+                    sub_info, sub_fmt, sub_truth, sub_query = \
+                    sub_line.strip().split('\t')
+
+            # extract tags
+            all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":")
+            all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":")
+            all_SC = all_tSC if all_qSC == "." else all_qSC
+
+            sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":")
+            sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":")
+            sub_SC = sub_tSC if sub_qSC == "." else sub_qSC
+
+            # same variant
+            if sub_ctg == all_ctg and sub_pos == all_pos and \
+                    sub_ref == all_ref and sub_alt == all_alt:
+
+                # create many-to-one mapping from sub_scs to all_scs (larger due to more variants)
+                if sub_SC != "." and all_SC != ".":
+                    all_sc_dict[sub_ctg+":"+sub_SC] = all_SC
+
+                # skip exact matches
+                if sub_qBD == all_qBD and sub_qBC == all_qBC:
+                    svi += 1
+                    continue
+
+                # different classification
+                if not len(all_scs) or int(all_SC) != all_scs[-1]:
+                    sub_scs.append(int(sub_SC))
+                    all_scs.append(int(all_SC))
+                    sc_ctgs.append(sub_ctg)
+                    print(sub_ctg, sub_SC, all_SC)
+                svi += 1
+
+            else: # different variant
+                if sub_ctg == all_ctg and int(sub_pos) < int(all_pos):
+                    # this only occurs with variant missing from 'all' file due to overlap
+                    svi += 1
+
+        # print superclusters
+        svi = 0
+        avi = 0
+        for sc_ctg, (sub_sc, all_sc) in zip(sc_ctgs, zip(sub_scs, all_scs)):
+            print(f"ctg: {sc_ctg}\t{vcf.upper()} supercluster: {sub_sc}\tALL supercluster: {all_sc}\t{RED('FP/FN')} {YLW('PP')} {GRN('TP')}", file=out_file)
+            print("\tCONTIG\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTRUTH\tQUERY", file=out_file)
+
+            # get info for all vars
+            all_line = all_vars[avi]
+            all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \
+                    all_info, all_fmt, all_truth, all_query = \
+                    all_line.strip().split('\t')
+            all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":")
+            all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":")
+            all_SC = all_tSC if all_qSC == "." else all_qSC
+            all_BC = all_tBC if all_qBC == "." else all_qBC
+            all_BC = max(float(0 if all_tBC == "." else all_tBC), \
+                         float(0 if all_qBC == "." else all_qBC))
+
+            # find supercluster start
+            while int(all_SC) < all_sc or all_ctg != sc_ctg:
+                avi += 1
+                all_line = all_vars[avi]
+                all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \
+                        all_info, all_fmt, all_truth, all_query = \
+                        all_line.strip().split('\t')
+                all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":")
+                all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":")
+                all_SC = all_tSC if all_qSC == "." else all_qSC
+                all_BC = max(float(0 if all_tBC == "." else all_tBC), \
+                             float(0 if all_qBC == "." else all_qBC))
+
+            # get info for sub vars
+            sub_line = sub_vars[svi]
+            sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \
+                    sub_info, sub_fmt, sub_truth, sub_query = \
+                    sub_line.strip().split('\t')
+            sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":")
+            sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":")
+            sub_SC = sub_tSC if sub_qSC == "." else sub_qSC
+            sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \
+                         float(0 if sub_qBC == "." else sub_qBC))
+
+            # find supercluster start
+            while int(sub_SC) < sub_sc or sub_ctg != sc_ctg:
+                svi += 1
+                sub_line = sub_vars[svi]
+                sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \
+                        sub_info, sub_fmt, sub_truth, sub_query = \
+                        sub_line.strip().split('\t')
+                sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":")
+                sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":")
+                sub_SC = sub_tSC if sub_qSC == "." else sub_qSC
+                sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \
+                             float(0 if sub_qBC == "." else sub_qBC))
+
+            # print supercluster
+            while sub_ctg+":"+sub_SC in all_sc_dict and all_sc_dict[sub_ctg+":"+sub_SC] == all_SC:
+                if float(sub_BC) == 0:
+                    print(f"{vcf.upper().rjust(6)}: {RED(sub_line)}", end="", file=out_file)
+                elif float(sub_BC) == 1:
+                    print(f"{vcf.upper().rjust(6)}: {GRN(sub_line)}", end="", file=out_file)
+                else:
+                    print(f"{vcf.upper().rjust(6)}: {YLW(sub_line)}", end="", file=out_file)
+                svi += 1
+                sub_line = sub_vars[svi]
+                sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \
+                        sub_info, sub_fmt, sub_truth, sub_query = \
+                        sub_line.strip().split('\t')
+                sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":")
+                sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":")
+                sub_SC = sub_tSC if sub_qSC == "." else sub_qSC
+                sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \
+                             float(0 if sub_qBC == "." else sub_qBC))
+
+            # print supercluster
+            print(" ", file=out_file)
+            while int(all_SC) == all_sc and all_ctg == sc_ctg:
+                if float(all_BC) == 0:
+                    print(f"   ALL: {RED(all_line)}", end="", file=out_file)
+                elif float(all_BC) == 1:
+                    print(f"   ALL: {GRN(all_line)}", end="", file=out_file)
+                else:
+                    print(f"   ALL: {YLW(all_line)}", end="", file=out_file)
+                avi += 1
+                all_line = all_vars[avi]
+                all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \
+                        all_info, all_fmt, all_truth, all_query = \
+                        all_line.strip().split('\t')
+                all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":")
+                all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":")
+                all_SC = all_tSC if all_qSC == "." else all_qSC
+                all_BC = max(float(0 if all_tBC == "." else all_tBC), \
+                             float(0 if all_qBC == "." else all_qBC))
+            print("\n", file=out_file)
diff --git a/analysis-v2/small_sv_all/globals.sh b/analysis-v2/small_sv_all/globals.sh
new file mode 100755
index 0000000..f3388b2
--- /dev/null
+++ b/analysis-v2/small_sv_all/globals.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+data="/home/timdunn/vcfdist/data"
+dir="/home/timdunn/vcfdist/analysis-v2/small_sv_all"
+parallel="/home/timdunn/parallel-20211222/src/parallel"
+vcfdist="/home/timdunn/vcfdist/src/vcfdist"
+rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
+tabix="/home/timdunn/software/htslib-1.16/tabix"
+bgzip="/home/timdunn/software/htslib-1.16/bgzip"
+timer="/usr/bin/time"
+
+# define reference FASTA
+ref_name="${data}/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
+
+# filename information for each dataset
+ds_names=(
+    # "t2t-q100" # must always be included as first VCF
+    # "hprc"
+    "pav"
+    # "giab-tr"
+)
+ds_versions=(
+    # "v0.9"
+    # "v1"
+    "v4"
+    # "v4.20"
+)
+splits=(
+    "snv"
+    "indel"
+    "sv"
+    "small"
+    "large"
+    "all"
+)
+bednames=(
+    "alldifficultregions"
+    "AllHomopolymers_ge7bp_imperfectge11bp_slop5"
+    "AllTandemRepeats"
+    "AllTandemRepeatsandHomopolymers_slop5"
+    "MHC"
+    "satellites_slop5"
+    "segdups"
+    "alllowmapandsegdupregions"
+)
diff --git a/analysis-v2/variants/1_plot_vars.py b/analysis-v2/variants/1_plot_vars.py
new file mode 100644
index 0000000..6c8a954
--- /dev/null
+++ b/analysis-v2/variants/1_plot_vars.py
@@ -0,0 +1,36 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from bisect import bisect_right
+
+vcf_fn = "/home/timdunn/vcfdist/data/t2t-q100-v0.9/split/all.vcf"
+
+bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, 
+        100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, 
+        100_000_000, 200000000, 500000000, 1_000_000_000]
+labels = [1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"]
+all_bins = [-x for x in bins[::-1]] + [0] + bins
+all_labels = [f"-{l}" for l in labels[::-1]] + [0] + labels
+
+counts = [0]*len(all_bins)
+with open(vcf_fn, "r") as vcf:
+    for line in vcf:
+        if line[0] == "#": continue
+        fields = line.split()
+        ctg, pos, uid, ref, alt = fields[:5]
+        varlen = len(alt) - len(ref)
+        idx = bisect_right(all_bins, varlen)
+        counts[idx] += 1
+
+fig, ax = plt.subplots(1,1, figsize=(15,5))
+for x in range(56):
+    plt.axvline(x, color='k', linestyle=':')
+
+ax.bar(np.arange(-0.5, len(all_bins)-0.5), counts, width=1)
+ax.set_xlim(0,len(all_bins))
+ax.set_xticks([0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 28, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56])
+ax.set_xticklabels(all_labels)
+ax.set_yscale('log')
+ax.set_ylim(0.5, 2000000)
+ax.set_xlabel("Variant Size")
+ax.set_ylabel("Counts")
+plt.savefig(f"img/var-lengths.png")
diff --git a/analysis-v2/vs_prior_work/1_eval_vcfs.sh b/analysis-v2/vs_prior_work/1_eval_vcfs.sh
new file mode 100755
index 0000000..9580a1c
--- /dev/null
+++ b/analysis-v2/vs_prior_work/1_eval_vcfs.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+source globals.sh
+
+# # vcfdist evaluation
+# mkdir -p $out/evals/vcfdist
+# for i in "${!query_names[@]}"; do
+#     echo "vcfdist: evaluating '${query_names[i]}'"
+#     $timer -v ../../src/vcfdist \
+#         $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         $data/refs/$ref_name \
+#         --bed $data/${truth_name}-${truth_version}/split/bench.bed \
+#         --keep-query --keep-truth \
+#         -l 1000 \
+#         -p $out/evals/vcfdist/${query_names[i]}. \
+#         2> $out/evals/vcfdist/${query_names[i]}.log
+# done
+
+# # vcfeval evaluation
+# mkdir -p $out/evals/vcfeval
+# for i in "${!query_names[@]}"; do
+#     echo "vcfeval: evaluating '${query_names[i]}'"
+#     $timer -v $rtg vcfeval \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         -t $data/refs/${ref_name}.sdf \
+#         -e $data/${truth_name}-${truth_version}/split/bench.bed \
+#         --threads 64 \
+#         --all-records \
+#         -o $out/evals/vcfeval/${query_names[i]} \
+#         2> $out/evals/vcfeval/${query_names[i]}.log
+# done
+
+source ~/truvari/venv3.10/bin/activate
+
+# # truvari evaluation
+# mkdir -p $out/evals/truvari
+# for i in "${!query_names[@]}"; do
+#     echo "truvari: evaluating '${query_names[i]}'"
+#     truvari bench \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         --bSample HG002 \
+#         --cSample HG002 \
+#         --includebed $data/${truth_name}-${truth_version}/split/bench.bed \
+#         -f $data/refs/$ref_name \
+#         -s 1 -S 1 \
+#         --sizemax 1000 \
+#         --no-ref a \
+#         --pick multi \
+#         -o $out/evals/truvari/${query_names[i]} \
+#         2> $out/evals/truvari/${query_names[i]}.log
+# done
+
+# wfa refine
+for i in "${!query_names[@]}"; do
+    if [[ $i -eq  1 ]]; then
+        continue
+    fi
+    echo "truvari: wfa refine '${query_names[i]}'"
+    rm -rf $out/evals/truvari/${query_names[i]}_wfa
+    cp -r $out/evals/truvari/${query_names[i]}_\
+          $out/evals/truvari/${query_names[i]}_wfa
+    truvari refine \
+        -f $data/refs/$ref_name \
+        -t 64 \
+        --use-original-vcfs \
+        --align wfa \
+        $out/evals/truvari/${query_names[i]}_wfa \
+        2> $out/evals/truvari/${query_names[i]}_wfa.log
+done
+
+# for rep in "${reps[@]}"; do
+#     for i in "${!ds_names[@]}"; do
+#         echo "truvari: mafft refine $rep '${ds_names[i]}'"
+#         rm -rf $out/evals/$rep/truvari/${ds_names[i]}_mafft
+#         cp -r $out/evals/$rep/truvari/${ds_names[i]} \
+#               $out/evals/$rep/truvari/${ds_names[i]}_mafft
+#         truvari refine \
+#             -f $data/refs/$ref_name \
+#             -t 64 \
+#             --use-original-vcfs \
+#             --align mafft \
+#             $out/evals/$rep/truvari/${ds_names[i]}_mafft
+#     done
+# done
+
+deactivate
diff --git a/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh b/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh
new file mode 100755
index 0000000..d606f60
--- /dev/null
+++ b/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+source globals.sh
+
+# # vcfdist evaluation
+# mkdir -p $out/evals/vcfdist_tr
+# for i in "${!query_names[@]}"; do
+#     echo "vcfdist: evaluating '${query_names[i]}'"
+#     $timer -v ../../src/vcfdist \
+#         $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         $data/refs/$ref_name \
+#         --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \
+#         -l 1000 \
+#         -p $out/evals/vcfdist_tr/${query_names[i]}. \
+#         2> $out/evals/vcfdist_tr/${query_names[i]}.log
+# done
+
+# # vcfeval evaluation
+# mkdir -p $out/evals/vcfeval_tr
+# for i in "${!query_names[@]}"; do
+#     echo "vcfeval: evaluating '${query_names[i]}'"
+#     $timer -v $rtg vcfeval \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         -t $data/refs/${ref_name}.sdf \
+#         -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \
+#         --threads 64 \
+#         --all-records \
+#         -o $out/evals/vcfeval_tr/${query_names[i]} \
+#         2> $out/evals/vcfeval_tr/${query_names[i]}.log
+# done
+
+source ~/truvari/venv3.10/bin/activate
+
+# # truvari evaluation
+# mkdir -p $out/evals/truvari_tr
+# for i in "${!query_names[@]}"; do
+#     echo "truvari: evaluating '${query_names[i]}'"
+#     truvari bench \
+#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
+#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
+#         --bSample HG002 \
+#         --cSample HG002 \
+#         --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \
+#         -f $data/refs/$ref_name \
+#         -s 1 -S 1 \
+#         --sizemax 1000 \
+#         --no-ref a \
+#         --pick multi \
+#         -o $out/evals/truvari_tr/${query_names[i]} \
+#         2> $out/evals/truvari_tr/${query_names[i]}.log
+# done
+
+# # wfa refine
+# for i in "${!query_names[@]}"; do
+#     echo "truvari: wfa refine '${query_names[i]}'"
+#     rm -rf $out/evals/truvari_tr/${query_names[i]}_wfa
+#     cp -r $out/evals/truvari_tr/${query_names[i]} \
+#           $out/evals/truvari_tr/${query_names[i]}_wfa
+#     truvari refine \
+#         -f $data/refs/$ref_name \
+#         -t 64 \
+#         --use-original-vcfs \
+#         --align wfa \
+#         $out/evals/truvari_tr/${query_names[i]}_wfa \
+#         2> $out/evals/truvari_tr/${query_names[i]}_wfa.log
+# done
+
+# mafft refine
+for i in "${!query_names[@]}"; do
+    echo "truvari: mafft refine '${query_names[i]}'"
+    rm -rf $out/evals/truvari_tr/${query_names[i]}_mafft
+    cp -r $out/evals/truvari_tr/${query_names[i]} \
+          $out/evals/truvari_tr/${query_names[i]}_mafft
+    truvari refine \
+        -f $data/refs/$ref_name \
+        -t 64 \
+        --use-original-vcfs \
+        --align mafft \
+        $out/evals/truvari_tr/${query_names[i]}_mafft \
+        2> $out/evals/truvari_tr/${query_names[i]}_mafft.log
+done
+
+deactivate
diff --git a/analysis-v2/vs_prior_work/README.md b/analysis-v2/vs_prior_work/README.md
new file mode 100644
index 0000000..3a7642d
--- /dev/null
+++ b/analysis-v2/vs_prior_work/README.md
@@ -0,0 +1 @@
+Directory for comparing vcfdist, vcfeval, and Truvari
diff --git a/analysis-v2/vs_prior_work/globals.sh b/analysis-v2/vs_prior_work/globals.sh
new file mode 100755
index 0000000..8062207
--- /dev/null
+++ b/analysis-v2/vs_prior_work/globals.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+data="/home/timdunn/vcfdist/data"
+out="/home/timdunn/vcfdist/analysis-v2/vs_prior_work"
+parallel="/home/timdunn/parallel-20211222/src/parallel"
+vcfdist="/home/timdunn/vcfdist/src/vcfdist"
+rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
+tabix="/home/timdunn/software/htslib-1.16/tabix"
+bgzip="/home/timdunn/software/htslib-1.16/bgzip"
+timer="/usr/bin/time"
+
+# define reference FASTA
+ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"
+
+# T2T information
+truth_name="t2t-q100"
+truth_version="v0.9"
+
+# query VCF information
+query_names=(
+    "hprc"
+    "pav"
+    "giab-tr"
+)
+query_versions=(
+    "v1"
+    "v4"
+    "v4.20"
+)
diff --git a/analysis/12_plot_ram.py b/analysis/10_plot_ram.py
similarity index 100%
rename from analysis/12_plot_ram.py
rename to analysis/10_plot_ram.py
diff --git a/analysis/10_vcfdist_sv.sh b/analysis/10_vcfdist_sv.sh
deleted file mode 100755
index d460c38..0000000
--- a/analysis/10_vcfdist_sv.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-source ../pipeline/includes.sh
-
-# # Zook GIAB TR dataset (0:56, 3:40, 7:30)
-# $timer -v ../src/vcfdist \
-#     $data/zook/verkko.vcf.gz \
-#     $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \
-#     $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-#     -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \
-#     -l 10000 \
-#     -t -q \
-#     -p ../out/giabtr/vcfdist-keep/
-
-# Zook GIAB TR dataset
-$timer -v ../src/vcfdist \
-    $data/zook/verkko.vcf.gz \
-    $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \
-    $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-    -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-    -l 10000 \
-    -p ../out/giabtr/vcfdist-chr20
-
-# Benchmark on Truvari phab variants (chr20)
-# file="../out/giabtr/phab-chr20/output.vcf.gz"
-# for sample in `bcftools query -l $file`; do
-#     bcftools view -c1 -Oz -s $sample -o ${file/.vcf*/.$sample.vcf.gz} $file
-# done
-# $timer -v ../src/vcfdist \
-#     ../out/giabtr/phab-chr20/norm.QUERY.vcf.gz \
-#     ../out/giabtr/phab-chr20/norm.TRUTH.vcf.gz \
-#     $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-#     -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-#     -l 10000 \
-#     -t -q \
-#     -p ../out/giabtr/vcfdist-norm-phab-chr20/
diff --git a/analysis/11_truvari_sv.sh b/analysis/11_truvari_sv.sh
deleted file mode 100755
index 4e0c929..0000000
--- a/analysis/11_truvari_sv.sh
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/bin/bash
-
-source ~/truvari/venv3.10/bin/activate
-source ../pipeline/includes.sh
-
-# bcftools annotate -x FMT/AD GRCh38_HG2-verrkoV1.1-V0.7_dipcall-z2k.vcf.gz > verkko0.vcf.gz
-# bcftools norm -m-any --check-ref -w -f GCA... verkko0.vcf.gz -o verkko-norm.vcf.gz
-# bcftools sort verkko-norm.vcf.gz -o verkko.vcf.gz
-# tabix -p vcf verkko.vcf.gz
-
-# # TruVari without harmonization
-# rm -r ../out/giabtr/truvari-init
-# truvari bench \
-#     -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \
-#     -c $data/zook/verkko.vcf.gz \
-#     -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-#     --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.lregions.bed \
-#     --no-ref a \
-#     --sizemin 1 --sizefilt 1 --sizemax 10000 \
-#     --pick multi \
-#     --typeignore \
-#     --dup-to-ins \
-#     -o ../out/giabtr/truvari-init/
-
-# laytr tru2ga \
-#     -i ../out/giabtr/truvari-init/ \
-    # -o ../out/giabtr/truvari-init/result
-
-# Truvari benchmarking chr20
-giab_dir="../out/giabtr"
-phab_dir="$giab_dir/phab-chr20"
-truvari_phab_dir="$giab_dir/truvari-norm-phab-chr20"
-
-mkdir -p $phab_dir
-$timer -v truvari phab \
-    -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-    -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \
-    -c $data/zook/verkko.vcf.gz \
-    --bSamples HG002\
-    --cSamples syndip\
-    --align wfa \
-    -t 56 \
-    -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-    -o $phab_dir/output.vcf.gz
-
-# file="../out/giabtr/phab-chr20/output.vcf.gz"
-# for sample in `bcftools query -l $file`; do
-#     bcftools view -c1 -Oz -s $sample -o ${file/.vcf*/.$sample.vcf.gz} $file
-#     tabix -p vcf ${file/.vcf*/.$sample.vcf.gz}
-# done
-samples=(
-    "QUERY"
-    "TRUTH"
-)
-# for sample in ${samples[@]}; do
-#     bcftools norm \
-#         -a \
-#         -m-any \
-#         --check-ref w \
-#         -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-#         $phab_dir/output.${sample}.vcf.gz \
-#         -o $phab_dir/norm.${sample}.vcf.gz
-#     tabix -p vcf $phab_dir/norm.${sample}.vcf.gz
-# done
-
-# $timer -v truvari bench \
-#     -b $phab_dir/norm.TRUTH.vcf.gz \
-#     -c $phab_dir/norm.QUERY.vcf.gz \
-#     -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-#     --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-#     --no-ref a \
-#     --sizemin 1 --sizefilt 1 --sizemax 10000 \
-#     --pick multi \
-#     --typeignore \
-#     --dup-to-ins \
-#     -o $truvari_phab_dir
-
-# laytr tru2ga \
-#     -i $truvari_phab_dir/ \
-#     -o $truvari_phab_dir/result
-# gunzip $truvari_phab_dir/result*.gz
-
-# deactivate
diff --git a/analysis/2_general_sw_space_plot.py b/analysis/2_general_sw_space_plot.py
index bf1324f..902cd05 100644
--- a/analysis/2_general_sw_space_plot.py
+++ b/analysis/2_general_sw_space_plot.py
@@ -6,7 +6,10 @@
 mpl.rcParams.update(mpl.rcParamsDefault)
 mpl.rcParams['text.usetex'] = True
 
-plt.figure(figsize=(12, 5))
+def mm2in(x):
+    return x / 25.4
+
+plt.figure(figsize=(mm2in(180), mm2in(75)))
 xmin, xmax = 0, 3.2
 ymin, ymax = -0.1, 1.5
 label_pad = 0.03
@@ -15,11 +18,11 @@
 x = list(np.arange(xmin, xmax+1, 0.1))
 
 # G_open > 0
-plt.annotate("$o < 0$", (1.32,1.37), rotation=35, color="white")
+plt.annotate("$o < 0$", (1.31,1.36), rotation=35, color="white", fontsize=7)
 plt.fill_between(x, x, ymax, alpha=0.8, facecolor='k')
 
 # vertical
-plt.annotate("$2(o+e) < x$", (0.46,1.18), rotation=90, color="white")
+plt.annotate("$2(o+e) < x$", (0.45,1.08), rotation=90, color="white", fontsize=7)
 plt.axvline(0.5, alpha=0.8, color='k', zorder=-100)
 # plt.annotate("$3x < o+e$", (3.01,1.22), rotation=90, color="black")
 # plt.axvspan(3, 3.2, alpha=0.1, color='k', zorder=-100)
@@ -27,35 +30,35 @@
 # horizontal
 # plt.annotate("$x < 2e$", (label_pad,0.57), ha="left", va="top", color="white")
 # plt.plot(x, [0.5]*len(x), color='k', alpha=0.8)
-plt.annotate("$e < 0$", (label_pad,-0.03), ha="left", va="top", color="white")
+plt.annotate("$e < 0$", (label_pad,-0.03), ha="left", va="top", color="white", fontsize=7)
 plt.fill_between(x, -0.2, 0, facecolor='k', alpha=0.8)
 
 # min ED
 i = 0
-plt.scatter(1,1, marker="X")
-plt.annotate("edlib", (1+label_pad,1), ha="left", va="center", color=f"C{i}")
+plt.scatter(1,1, marker="X", s=7)
+plt.annotate("edlib", (1+label_pad,1), ha="left", va="center", color=f"C{i}", fontsize=7)
 i += 1
 
 # minimap2
 mm2_presets = {
         r"mm2 \texttt{sr}":     { "a": 0, "b": 10, "o1": 13, "e1":3, "o2":25,  "e2": 2, "m":"^" },
-        r"\noindent mm2 \texttt{map-ont}\\winnowmap":{ "a": 0, "b": 6,  "o1": 5,  "e1":3, "o2":25, "e2": 2, "m":"s" },
+        r"\noindent mm2 \texttt{map-ont}\vspace{-2mm}\\winnowmap":{ "a": 0, "b": 6,  "o1": 5,  "e1":3, "o2":25, "e2": 2, "m":"s" },
         r"mm2 \texttt{map-pb}": { "a": 0, "b": 10, "o1": 13, "e1":5, "o2":53, "e2": 3, "m":"s" },
         r"mm2 \texttt{asm5}":   { "a": 0, "b": 40, "o1": 79, "e1":7, "o2":163, "e2": 3, "m":"P" },
         r"mm2 \texttt{asm10}":  { "a": 0, "b": 20, "o1": 33, "e1":5, "o2":83, "e2": 3, "m":"P" },
     "pbmm2":                { "a": 0, "b": 7, "o1": 6, "e1":5, "o2":57, "e2": 3, "m":"s" },
 }
 for name, x in mm2_presets.items():
-    start = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"], color=f"C{i}", marker=x["m"])
-    end = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e2"]/x["b"], color=f"C{i}", marker=x["m"])
+    start = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"], color=f"C{i}", marker=x["m"], s=7)
+    end = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e2"]/x["b"], color=f"C{i}", marker=x["m"], s=7)
     plt.arrow((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"]-0.02, 0, x["e2"]/x["b"]-x["e1"]/x["b"]+0.04, color='k',
             length_includes_head=True, head_width=0.02, head_length = 0.016)
-    plt.annotate(name, ((x["o1"] + x["e1"])/x["b"]+label_pad, x["e1"]/x["b"]), ha="left", va="center", color=f"C{i}")
+    plt.annotate(name, ((x["o1"] + x["e1"])/x["b"]+label_pad, x["e1"]/x["b"]), ha="left", va="center", color=f"C{i}", fontsize=7)
     i += 1
 
 # BWA, GRAF, DRAGEN
-plt.scatter(1.6, 0.3, color=f"C{i}", marker="^", zorder=100)
-plt.annotate(r"\noindent DRAGEN\\ BWA\\GRAF", (1.6+label_pad, 0.275), ha="right", va="center", color=f"C{i}")
+plt.scatter(1.6, 0.3, color=f"C{i}", marker="^", zorder=100, s=7)
+plt.annotate(r"\noindent DRAGEN\vspace{-2mm}\\BWA\vspace{-2mm}\\GRAF", (1.6+label_pad, 0.275), ha="right", va="center", color=f"C{i}", fontsize=7)
 i += 1
 
 # ngmlr
@@ -65,8 +68,8 @@
 }
 for name, x in ngmlr_presets.items():
     for e in np.arange(x["e_max"], x["e_min"], -x["e_dec"]):
-        plt.scatter(x["o"]/x["b"], e/x["b"], color=f"C{i}")
-    plt.annotate(name, (x["o"]/x["b"]+label_pad, (x["e_max"]+x["e_min"])/(2*x["b"])), ha="left", va="center", color=f"C{i}")
+        plt.scatter(x["o"]/x["b"], e/x["b"], color=f"C{i}", s=7)
+    plt.annotate(name, (x["o"]/x["b"]+label_pad, (x["e_max"]+x["e_min"])/(2*x["b"])), ha="left", va="center", color=f"C{i}", fontsize=7)
     i += 1
     plt.arrow(x["o"]/x["b"], x["e_max"]/x["b"], 0, x["e_min"]/x["b"]-x["e_max"]/x["b"], color='k',
             length_includes_head=True, head_width=0.02, head_length = 0.016)
@@ -74,35 +77,35 @@
 # npore
 points = [[1.7, 0.5], [1.7,0.7], [1.7, 1], [1.7, 0.8], [1.7, 1.1], [2.3, 1.8], [2.8, 1], [3.2, 1.1], [3.2, 2.1], [3.2, 1], [3.7, 1.9], [3.7, 2], [3.7, 1], [3.7, 0.6], [3.8, 2.4], [3.8, 1.7], [3.8, 0.8], [4.6, 2.5], [4.6, 2.3], [4.6, 1.5]]
 for p in points:
-    plt.scatter(p[0]/5.72, p[1]/5.72, color=f"purple")
-plt.annotate("npore", (0.5, 0.25), ha="left", va="center", color=f"purple")
-plt.scatter(6/5.72, 1/5.72, marker="s", color=f"purple")
-plt.annotate("npore", (6/5.72+label_pad, 1/5.72), ha="left", va="center", color=f"purple")
+    plt.scatter(p[0]/5.72, p[1]/5.72, color=f"purple", s=7)
+plt.annotate("npore", (0.5, 0.25), ha="left", va="center", color=f"purple", fontsize=7)
+plt.scatter(6/5.72, 1/5.72, marker="s", color=f"purple", s=7)
+plt.annotate("npore", (6/5.72+label_pad, 1/5.72), ha="left", va="center", color=f"purple", fontsize=7)
 i += 1
 
 # verkko
-plt.scatter(0, 0, color=f"goldenrod", zorder=100)
-plt.annotate("verkko", (label_pad, 0), ha="left", va="bottom", color=f"goldenrod")
+plt.scatter(0, 0, color=f"goldenrod", zorder=100, s=7)
+plt.annotate("verkko", (label_pad, 0), ha="left", va="bottom", color=f"goldenrod", fontsize=7)
 
 # points
-plt.scatter(0.4, 0.3, marker="*", color=f"k")
-plt.annotate("A", (0.4+label_pad, 0.3), ha="left", va="center", color="k")
-plt.scatter(1, 0.333, marker="*", color=f"k")
-plt.annotate("B", (1+label_pad, 0.333), ha="left", va="center", color="k")
-plt.scatter(1.6, 0.4, marker="*", color=f"k")
-plt.annotate("C", (1.6+label_pad, 0.4), ha="left", va="center", color="k")
-plt.scatter(2, 0.2, marker="*", color=f"k")
-plt.annotate("D", (2+label_pad, 0.2), ha="left", va="center", color="k")
+plt.scatter(0.4, 0.3, marker="*", color=f"k", s=7)
+plt.annotate("A", (0.4+label_pad, 0.3), ha="left", va="center", color="k", fontsize=7)
+plt.scatter(1, 0.333, marker="*", color=f"k", s=7)
+plt.annotate("B", (1+label_pad, 0.333), ha="left", va="center", color="k", fontsize=7)
+plt.scatter(1.6, 0.4, marker="*", color=f"k", s=7)
+plt.annotate("C", (1.6+label_pad, 0.4), ha="left", va="center", color="k", fontsize=7)
+plt.scatter(2, 0.2, marker="*", color=f"k", s=7)
+plt.annotate("D", (2+label_pad, 0.2), ha="left", va="center", color="k", fontsize=7)
 
 # arrows
 plt.arrow(2.1, 1.1,  0.1,  0.05, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # right
-plt.annotate("prefer\nsubstitutions", (2.2+label_pad, 1.15), ha="left", va="center")
+plt.annotate("prefer\nsubstitutions", (2.2+label_pad, 1.15), ha="left", va="center", fontsize=7)
 plt.arrow(2.1, 1.1, -0.1, -0.05, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # left
-plt.annotate("prefer\nINDELs", (2.0-label_pad, 1.05), ha="right", va="center")
+plt.annotate("prefer\nINDELs", (2.0-label_pad, 1.05), ha="right", va="center", fontsize=7)
 plt.arrow(2.065, 1.175, -0.01,  0.01, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # up
-plt.annotate("prefer\nseparate\ngaps", (2.05, 1.2+label_pad), ha="center", va="bottom")
+plt.annotate("prefer\nseparate\ngaps", (2.05, 1.2+label_pad), ha="center", va="bottom", fontsize=7)
 plt.arrow(2.13, 1.05,  0, -0.02, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # down
-plt.annotate("prefer\nmerged\ngaps", (2.15, 1-label_pad), ha="center", va="top")
+plt.annotate("prefer\nmerged\ngaps", (2.15, 1-label_pad), ha="center", va="top", fontsize=7)
 arc_angles = linspace(0 * pi, pi/4, 20)
 r = 0.2
 arc_xs = r * cos(arc_angles)
@@ -111,29 +114,31 @@
 
 # legend
 sra = mlines.Line2D([], [], color='k', marker='^', linestyle='None',
-        markersize=7, label = 'Short Read Aligners')
+        markersize=6, label = 'Short Read Aligners')
 lra = mlines.Line2D([], [], color='k', marker='s', linestyle='None',
-        markersize=7, label = 'Long Read Aligners')
+        markersize=6, label = 'Long Read Aligners')
 aa = mlines.Line2D([], [], color='k', marker='P', linestyle='None',
-        markersize=7, label = 'Assembly Aligners')
+        markersize=6, label = 'Assembly Aligners')
 sva = mlines.Line2D([], [], color='k', marker='o', linestyle='None',
-        markersize=7, label = 'SV/CNV Aligners')
+        markersize=6, label = 'SV/CNV Aligners')
 ed = mlines.Line2D([], [], color='k', marker='X', linestyle='None',
-        markersize=7, label = 'Edit Distance Align')
+        markersize=6, label = 'Edit Distance Align')
 points = mlines.Line2D([], [], color='k', marker='*', linestyle='None',
-        markersize=9, label = 'Select Design Points')
-leg1 = plt.legend(handles = [sra, lra, aa, sva, ed, points], loc = (0.8,0.2))
+        markersize=8, label = 'Select Design Points')
+leg1 = plt.legend(handles = [sra, lra, aa, sva, ed, points], loc = (0.78,0.15), fontsize=7)
 
 m = mlines.Line2D([], [], linestyle='None', label = r'$m$ : match penalty ($0$)')
 x = mlines.Line2D([], [], linestyle='None', label = r'$x$ : mismatch penalty')
 o = mlines.Line2D([], [], linestyle='None', label = r'$o$ : gap opening penalty')
 e = mlines.Line2D([], [], linestyle='None', label = r'$e$ : gap extension penalty')
-plt.legend(handles = [m, x, o, e], loc=(0.78, 0.6), frameon=False)
+plt.legend(handles = [m, x, o, e], loc=(0.75, 0.6), frameon=False, fontsize=7)
 plt.gca().add_artist(leg1)
 
 # labels
-plt.xlabel(r'\LARGE{Starting a Gap: $(o+e) / x$}')
-plt.ylabel(r'\LARGE{Extending a Gap: $e / x$}')
+plt.xlabel(r'\textbf{Starting a Gap:} $(o+e) / x$', fontsize=7)
+plt.ylabel(r'\textbf{Extending a Gap:} $e / x$', fontsize=7)
+plt.xticks(fontsize=7)
+plt.yticks(fontsize=7)
 plt.tight_layout()
-# plt.savefig("img/2_general_sw_space.pdf", format="pdf")
-plt.savefig("img/2_general_sw_space.png", dpi=200)
+plt.savefig("img/2_general_sw_space.pdf", format="pdf")
+# plt.savefig("img/2_general_sw_space.png", dpi=200)
diff --git a/analysis/4_vcfdist_output.py b/analysis/4_vcfdist_output.py
index 5f6ffa8..f8d78f4 100644
--- a/analysis/4_vcfdist_output.py
+++ b/analysis/4_vcfdist_output.py
@@ -2,11 +2,26 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib as mpl
+mpl.rcParams.update(mpl.rcParamsDefault)
+mpl.rcParams['text.usetex'] = True
+plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
+
+def mm2in(x):
+    return x / 25.4
 
 names = ["O"]
 max_qual = 31
 
-fig, ax = plt.subplots(3, 2, figsize=(8,9))
+fig, ax = plt.subplots(3, 2, figsize=(mm2in(80),mm2in(90)))
+
+# source data headers
+out_5bi = open("tsv/5bi.tsv", "w")
+out_5bii = open("tsv/5bii.tsv", "w")
+out_5biii = open("tsv/5biii.tsv", "w")
+print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tPRECISION\tRECALL", file=out_5bi)
+print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tEDIT_DISTANCE\tDISTINCT_EDITS", file=out_5bii)
+print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tEDIT_DISTANCE\tDISTINCT_EDITS", file=out_5biii)
 
 # EDIT DISTANCE
 sub_des = []
@@ -33,25 +48,38 @@
         ins_eds.append(int(ins_ed))
         del_eds.append(int(del_ed))
         eds.append(int(ed))
+        print(f"NIST\tK4GT3\tSUB\t{qual}\t{sub_ed}\t{sub_de}", file=out_5bii)
+        print(f"NIST\tK4GT3\tINS\t{qual}\t{ins_ed}\t{ins_de}", file=out_5biii)
+        print(f"NIST\tK4GT3\tDEL\t{qual}\t{del_ed}\t{del_de}", file=out_5biii)
 
-ax[1][0].plot(quals[:max_qual], sub_des[:max_qual], marker='.', linestyle='', color=f"C0", label=label)
-ax[2][0].plot(quals[:max_qual], ins_des[:max_qual], marker='.', linestyle='', color=f"C1", label=label)
-ax[2][0].plot(quals[:max_qual], del_des[:max_qual], marker='.', linestyle='', color=f"C2", label=label)
+ax[1][0].plot(quals[:max_qual], sub_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C0", label=label)
+ax[2][0].plot(quals[:max_qual], ins_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C1", label=label)
+ax[2][0].plot(quals[:max_qual], del_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C2", label=label)
 
-ax[1][1].plot(quals[:max_qual], sub_des[:max_qual], marker='.', linestyle='', color=f"C0", label=label)
-ax[2][1].plot(quals[:max_qual], ins_eds[:max_qual], marker='.', linestyle='', color=f"C1", label=label)
-ax[2][1].plot(quals[:max_qual], del_eds[:max_qual], marker='.', linestyle='', color=f"C2", label=label)
+ax[1][1].plot(quals[:max_qual], sub_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C0", label=label)
+ax[2][1].plot(quals[:max_qual], ins_eds[:max_qual], marker='.', markersize=1, linestyle='', color=f"C1", label=label)
+ax[2][1].plot(quals[:max_qual], del_eds[:max_qual], marker='.', markersize=1, linestyle='', color=f"C2", label=label)
  
 for i in range(2):
-    ax[1][i].set_xlabel("Quality Threshold")
-    ax[1][i].set_ylabel("Distinct Edits (DE)")
+    ax[1][i].set_xlabel(r"\textbf{Quality Threshold}", fontsize=7)
+    if i % 2:
+        ax[1][i].set_ylabel(r"\textbf{Edit Distance}", fontsize=7)
+    else:
+        ax[1][i].set_ylabel(r"\textbf{Distinct Edits}", fontsize=7)
     ax[1][i].set_yscale('log')
-    ax[1][i].legend(["SNP"], loc="upper left") 
+    ax[1][i].yaxis.set_minor_formatter(mpl.ticker.ScalarFormatter())
+    ax[1][i].tick_params(axis='both', which='both', labelsize=5)
+    ax[1][i].legend(["substitution"], markerscale=3, loc="upper left", fontsize=5) 
 for i in range(2):
-    ax[2][i].set_xlabel("Quality Threshold")
-    ax[2][i].set_ylabel("Edit Distance (ED)")
+    ax[2][i].set_xlabel(r"\textbf{Quality Threshold}", fontsize=7)
+    if i % 2:
+        ax[2][i].set_ylabel(r"\textbf{Edit Distance}", fontsize=7)
+    else:
+        ax[2][i].set_ylabel(r"\textbf{Distinct Edits}", fontsize=7)
     ax[2][i].set_yscale('log')
-    ax[2][i].legend(["INS", "DEL" ], loc="upper left")
+    ax[2][i].yaxis.set_minor_formatter(mpl.ticker.ScalarFormatter())
+    ax[2][i].tick_params(axis='both', which='both', labelsize=5)
+    ax[2][i].legend(["insertion", "deletion" ], markerscale=3, loc="upper left", fontsize=5)
 
 # PRECISION RECALL
 with open(f"/x/vcfdist/data/pfda-v2/nist_vcfdist/K4GT3_HG002_O.precision-recall.tsv") as csv:
@@ -72,6 +100,7 @@
             else:
                 indel_prec.append(float(truth_tp) / 
                         (float(truth_tp) + float(query_pp) + float(query_fp)))
+            print(f"NIST\tK4GT3\tINDEL\t{qual}\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out_5bi)
         elif typ == "SNP":
             snp_recall.append(float(truth_tp) / float(truth_tot))
             if int(truth_tp) + int(query_pp) + int(query_fp) == 0:
@@ -79,26 +108,32 @@
             else:
                 snp_prec.append(float(truth_tp) / 
                         (float(truth_tp) + float(query_pp) + float(query_fp)))
+            print(f"NIST\tK4GT3\tSNP\t{qual}\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out_5bi)
 
-ax[0][0].set_title("SNPs")
-ax[0][0].set_xlabel("Recall")
-ax[0][0].set_ylabel("Precision")
+ax[0][0].set_title(r"\textbf{SNPs}", fontsize=7)
+ax[0][0].set_xlabel(r"\textbf{Recall}", fontsize=7)
+ax[0][0].set_ylabel(r"\textbf{Precision}", fontsize=7)
 ax[0][0].set_xlim(0.995, 1)
-ax[0][0].set_xticks(np.arange(0.995, 1.0001, 0.001))
+ax[0][0].set_xticks(np.arange(0.995, 1.0001, 0.0025))
 ax[0][0].set_ylim(0.995, 1)
 ax[0][0].set_yticks(np.arange(0.995, 1.0001, 0.001))
+ax[0][0].tick_params(axis='both', which='both', labelsize=5)
 ax[0][0].plot(snp_recall, snp_prec, linestyle='', 
-        marker='.', color=f"C0", label=label)
+        marker='.', color=f"C0", label=label, markersize=1)
 
-ax[0][1].set_title("INDELs")
-ax[0][1].set_xlabel("Recall")
-ax[0][1].set_ylabel("Precision")
+ax[0][1].set_title(r"\textbf{INDELs}", fontsize=7)
+ax[0][1].set_xlabel(r"\textbf{Recall}", fontsize=7)
+ax[0][1].set_ylabel(r"\textbf{Precision}", fontsize=7)
 ax[0][1].set_xlim(0.98, 1)
-ax[0][1].set_xticks(np.arange(0.98, 1.001, 0.005))
+ax[0][1].set_xticks(np.arange(0.98, 1.001, 0.01))
 ax[0][1].set_ylim(0.98, 1)
 ax[0][1].set_yticks(np.arange(0.98, 1.001, 0.005))
+ax[0][1].tick_params(axis='both', which='both', labelsize=5)
 ax[0][1].plot(indel_recall, indel_prec, linestyle='', 
-        marker='.', color=f"C0", label=label)
+        marker='.', color=f"C0", label=label, markersize=1)
 
 plt.tight_layout()
+out_5bi.close()
+out_5bii.close()
+out_5biii.close()
 plt.savefig('img/4_vcfdist_plot.pdf', format="pdf")
diff --git a/analysis/7_vcfeval_pr_plot.py b/analysis/7_vcfeval_pr_plot.py
index 13735f0..9cc03fe 100644
--- a/analysis/7_vcfeval_pr_plot.py
+++ b/analysis/7_vcfeval_pr_plot.py
@@ -2,8 +2,14 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib as mpl
+mpl.rcParams.update(mpl.rcParamsDefault)
+mpl.rcParams['text.usetex'] = True
 plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
 
+def mm2in(x):
+    return x / 25.4
+
 datasets = ["nist", "cmrg"]
 names = [
     ["original", "bwa", "mm2-ont", "mm2-pb", "pbmm2"],
@@ -17,10 +23,12 @@
 ]
 sub_id = "K4GT3"
 
-fig, ax = plt.subplots(3, 4, figsize=(20,12))
+fig, ax = plt.subplots(3, 4, figsize=(mm2in(180), mm2in(108)))
 
 for di, dataset in enumerate(datasets):
     for ei, eval_list in enumerate(names):
+        out3 = open(f"tsv/3{chr(ord('a')+ei)}.tsv", "w")
+        print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tPRECISION\tRECALL", file=out3)
         for ni, name in enumerate(eval_list):
             filename = name.replace("original", "O")
             try:
@@ -35,6 +43,7 @@
                             indel_recall.append(float(line.split(',')[7]))
                             indel_prec.append(float(line.split(',')[8]))
                             indel_f1 = line.split(',')[10]
+                            print(f"{dataset.upper()}\t{sub_id}\t{label.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out3)
                         elif line[:19] == "SNP,*,*,PASS,*,QUAL":
                             recall = float(line.split(',')[7])
                             snp_recall.append(recall)
@@ -42,47 +51,62 @@
                             if prec == 0: prec = 1
                             snp_prec.append(prec)
                             snp_f1 = line.split(',')[10]
+                            print(f"{dataset.upper()}\t{sub_id}\t{label.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out3)
                     ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', 
-                            marker='.', color=colors[ei][ni], label=label)
+                            marker='.', markersize=1, color=colors[ei][ni], label=label)
                     ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', 
-                            marker='.', color=colors[ei][ni], label=label)
+                            marker='.', markersize=1, color=colors[ei][ni], label=label)
             except FileNotFoundError:
                 print(f"'/x/vcfdist/data/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}.roc.all.csv' not found.")
                 continue
      
         # SNP plot
-        ax[ei][di*2].set_xlabel("Recall", fontsize=15)
-        ax[ei][di*2].set_ylabel("Precision", fontsize=15)
+        ax[ei][0].set_ylabel(r"\textbf{Precision}", fontsize=7)
+        ax[2][di*2].set_xlabel(r"\textbf{Recall}", fontsize=7)
+        ax[2][di*2+1].set_xlabel(r"\textbf{Recall}", fontsize=7)
         if dataset == "nist":
             ax[ei][di*2].set_xlim(0.99, 1)
-            ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.002))
-            ax[ei][di*2].set_ylim(0.999, 1.00001)
-            ax[ei][di*2].set_yticks(np.arange(0.999, 1.0001, 0.0002))
+            ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.005))
+            ax[ei][di*2].set_xticklabels([f"{x:.3f}" for x in 
+                np.arange(0.99, 1.001, 0.005)], fontsize=5)
+            ax[ei][di*2].set_ylim(0.999, 1.00002)
+            ax[ei][di*2].set_yticks(np.arange(0.999, 1.00002, 0.0002))
             ax[ei][di*2].set_yticklabels([f"{x:.4f}" for x in 
-                np.arange(0.999, 1.00001, 0.0002)])
+                np.arange(0.999, 1.00002, 0.0002)], fontsize=5)
         if dataset == "cmrg":
             ax[ei][di*2].set_xlim(0.9, 1)
             ax[ei][di*2].set_xticks(np.arange(0.9, 1.01, 0.02))
-            ax[ei][di*2].set_ylim(0.95, 1.0001)
+            ax[ei][di*2].set_xticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.01, 0.02)], fontsize=5)
+            ax[ei][di*2].set_ylim(0.95, 1.001)
             ax[ei][di*2].set_yticks(np.arange(0.95, 1.001, 0.01))
             ax[ei][di*2].set_yticklabels([f"{x:.3f}" for x in 
-                np.arange(0.95, 1.001, 0.01)])
+                np.arange(0.95, 1.001, 0.01)], fontsize=5)
         if di == 0:
-            ax[ei][di*2].legend(loc="lower left", fontsize=15, markerscale=3)
+            ax[ei][di*2].legend(loc="lower left", fontsize=5, markerscale=4)
 
         # INDEL plot
-        ax[ei][di*2+1].set_xlabel("Recall", fontsize=15)
-        ax[ei][di*2+1].set_ylabel("Precision", fontsize=15)
+        # ax[ei][0].set_xlabel("Recall", fontsize=7)
+        # ax[ei][di*2+1].set_ylabel("Precision", fontsize=7)
         if dataset == "nist":
             ax[ei][di*2+1].set_xlim(0.99, 1)
-            ax[ei][di*2+1].set_xticks(np.arange(0.99, 1.001, 0.002))
+            ax[ei][di*2+1].set_xticks(np.arange(0.99, 1.001, 0.005))
+            ax[ei][di*2+1].set_xticklabels([f"{x:.3f}" for x in 
+                np.arange(0.99, 1.001, 0.005)], fontsize=5)
             ax[ei][di*2+1].set_ylim(0.995, 1.0001)
-            ax[ei][di*2+1].set_yticks(np.arange(0.995, 1.001, 0.001))
+            ax[ei][di*2+1].set_yticks(np.arange(0.995, 1.0001, 0.001))
+            ax[ei][di*2+1].set_yticklabels([f"{x:.3f}" for x in 
+                np.arange(0.995, 1.0001, 0.001)], fontsize=5)
         if dataset == "cmrg":
             ax[ei][di*2+1].set_xlim(0.9, 1)
             ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.01, 0.02))
-            ax[ei][di*2+1].set_ylim(0.9, 1.001)
-            ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.01, 0.02))
+            ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.01, 0.02)], fontsize=5)
+            ax[ei][di*2+1].set_ylim(0.9, 1.002)
+            ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.002, 0.02))
+            ax[ei][di*2+1].set_yticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.01, 0.02)], fontsize=5)
+        out3.close()
 
 plt.tight_layout()
 plt.savefig('img/7_vcfeval_pr_plot.pdf', format="pdf")
diff --git a/analysis/8_vcfdist_pr_plot.py b/analysis/8_vcfdist_pr_plot.py
index 892e21d..f76bcdd 100644
--- a/analysis/8_vcfdist_pr_plot.py
+++ b/analysis/8_vcfdist_pr_plot.py
@@ -2,21 +2,30 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
+import matplotlib as mpl
+mpl.rcParams.update(mpl.rcParamsDefault)
+mpl.rcParams['text.usetex'] = True
 plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
 
+def mm2in(x):
+    return x / 25.4
+
 datasets = ["nist", "cmrg"]
 names = ["original", "A", "B", "C", "D"]
-evals = ["vcfeval", "nopc", "vcfdist", "standard"]
+evals = ["vcfeval", "vcfdist_no_partial", "vcfdist", "vcfdist_standard"]
 colors = ["k", "C0", "C1", "C2", "C3"]
 
-fig, ax = plt.subplots(4, 4, figsize=(20,16))
+fig, ax = plt.subplots(4, 4, figsize=(mm2in(180), mm2in(144)))
 sub_id = "K4GT3"
 
 for di, dataset in enumerate(datasets):
     for ei, evaluation in enumerate(evals):
+        out4 = open(f"tsv/4{chr(ord('a')+ei)}.tsv", "w")
+        print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tPRECISION\tRECALL", file=out4)
+
         for ni, name in enumerate(names):
             filename = "O" if name == "original" else name
-            evalstring = "std" if evaluation == "standard" else ""
+            evalstring = "std" if evaluation == "vcfdist_standard" else ""
             if evaluation == "vcfeval":
                 with open(f"/x/vcfdist/data/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}std.roc.all.csv") as csv:
                     indel_recall = []
@@ -28,17 +37,19 @@
                             indel_recall.append(float(line.split(',')[7]))
                             indel_prec.append(float(line.split(',')[8]))
                             indel_f1 = line.split(',')[10]
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4)
                         elif line[:19] == "SNP,*,*,PASS,*,QUAL":
                             recall = float(line.split(',')[7])
                             snp_recall.append(recall)
                             prec = float(line.split(',')[8])
                             if prec == 0: prec = 1
                             snp_prec.append(prec)
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4)
                             snp_f1 = line.split(',')[10]
                     ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', 
-                            marker='.', color=colors[ni], label=name)
+                            marker='.', markersize=1, color=colors[ni], label=name)
                     ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', 
-                            marker='.', color=colors[ni], label=name)
+                            marker='.', markersize=1, color=colors[ni], label=name)
             else:
                 with open(f"/x/vcfdist/data/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}{evalstring}.precision-recall.tsv") as csv:
                     label = f"{name}"
@@ -51,7 +62,7 @@
                     for line in csv:
                         typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \
                             query_tot, query_tp, query_pp, query_fp = line.split('\t')
-                        if evaluation == "nopc":
+                        if evaluation == "vcfdist_no_partial":
                             if typ == "INDEL":
                                 indel_recall.append(float(truth_tp) / float(truth_tot))
                                 if int(truth_tp) + int(query_fp) == 0:
@@ -59,6 +70,7 @@
                                 else:
                                     indel_prec.append(float(truth_tp) / 
                                             (float(truth_tp) + float(query_fp)))
+                                print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4)
                             elif typ == "SNP":
                                 snp_recall.append(float(truth_tp) / float(truth_tot))
                                 if int(truth_tp) + int(query_fp) == 0:
@@ -66,51 +78,68 @@
                                 else:
                                     snp_prec.append(float(truth_tp) / 
                                             (float(truth_tp) + float(query_fp)))
+                                print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out4)
                         else: # normal vcfdist prec/recall calc, already done
                             if typ == "INDEL":
                                 indel_recall.append(float(recall))
                                 indel_prec.append(float(prec))
+                                print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4)
                             elif typ == "SNP":
                                 snp_recall.append(float(recall))
                                 snp_prec.append(float(prec))
+                                print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out4)
                     ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', 
-                            marker='.', color=colors[ni], label=label)
+                            marker='.', markersize=1, color=colors[ni], label=label)
                     ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', 
-                            marker='.', color=colors[ni], label=label)
+                            marker='.', markersize=1, color=colors[ni], label=label)
      
         # SNP plot
-        ax[ei][di*2].set_xlabel("Recall", fontsize=15)
-        ax[ei][di*2].set_ylabel("Precision", fontsize=15)
+        ax[ei][0].set_ylabel(r"\textbf{Precision}", fontsize=7)
+        ax[3][di*2].set_xlabel(r"\textbf{Recall}", fontsize=7)
+        ax[3][di*2+1].set_xlabel(r"\textbf{Recall}", fontsize=7)
         if dataset == "nist":
             ax[ei][di*2].set_xlim(0.99, 1)
-            ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.002))
-            ax[ei][di*2].set_ylim(0.998, 1.00001)
-            ax[ei][di*2].set_yticks(np.arange(0.998, 1.0001, 0.0004))
+            ax[ei][di*2].set_xticks(np.arange(0.99, 1.0002, 0.005))
+            ax[ei][di*2].set_xticklabels([f"{x:.3f}" for x in 
+                np.arange(0.99, 1.0002, 0.005)], fontsize=5)
+            ax[ei][di*2].set_ylim(0.998, 1.00004)
+            ax[ei][di*2].set_yticks(np.arange(0.998, 1.00004, 0.0004))
             ax[ei][di*2].set_yticklabels([f"{x:.4f}" for x in 
-                np.arange(0.998, 1.00001, 0.0004)])
+                np.arange(0.998, 1.00004, 0.0004)], fontsize=5)
         if dataset == "cmrg":
             ax[ei][di*2].set_xlim(0.9, 1)
-            ax[ei][di*2].set_xticks(np.arange(0.9, 1.01, 0.02))
-            ax[ei][di*2].set_ylim(0.95, 1.0001)
+            ax[ei][di*2].set_xticks(np.arange(0.9, 1.002, 0.02))
+            ax[ei][di*2].set_xticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.002, 0.02)], fontsize=5)
+            ax[ei][di*2].set_ylim(0.95, 1.001)
             ax[ei][di*2].set_yticks(np.arange(0.95, 1.001, 0.01))
-            ax[ei][di*2].set_yticklabels([f"{x:.3f}" for x in 
-                np.arange(0.95, 1.001, 0.01)])
+            ax[ei][di*2].set_yticklabels([f"{x:.2f}" for x in 
+                np.arange(0.95, 1.001, 0.01)], fontsize=5)
         if di == 0:
-            ax[ei][di*2].legend(loc="lower left", fontsize=15, markerscale=3)
+            ax[ei][di*2].legend(loc="lower left", fontsize=5, markerscale=4)
 
         # INDEL plot
-        ax[ei][di*2+1].set_xlabel("Recall", fontsize=15)
-        ax[ei][di*2+1].set_ylabel("Precision", fontsize=15)
+        # ax[ei][di*2+1].set_xlabel("Recall", fontsize=15)
+        # ax[ei][di*2+1].set_ylabel("Precision", fontsize=15)
         if dataset == "nist":
             ax[ei][di*2+1].set_xlim(0.95, 1)
             ax[ei][di*2+1].set_xticks(np.arange(0.95, 1.001, 0.01))
-            ax[ei][di*2+1].set_ylim(0.98, 1.001)
-            ax[ei][di*2+1].set_yticks(np.arange(0.98, 1.001, 0.004))
+            ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in 
+                np.arange(0.95, 1.001, 0.01)], fontsize=5)
+            ax[ei][di*2+1].set_ylim(0.98, 1.0004)
+            ax[ei][di*2+1].set_yticks(np.arange(0.98, 1.0004, 0.004))
+            ax[ei][di*2+1].set_yticklabels([f"{x:.3f}" for x in 
+                np.arange(0.98, 1.0004, 0.004)], fontsize=5)
         if dataset == "cmrg":
             ax[ei][di*2+1].set_xlim(0.9, 1)
-            ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.01, 0.02))
-            ax[ei][di*2+1].set_ylim(0.9, 1.001)
-            ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.01, 0.02))
+            ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.002, 0.02))
+            ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.002, 0.02)], fontsize=5)
+            ax[ei][di*2+1].set_ylim(0.9, 1.002)
+            ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.002, 0.02))
+            ax[ei][di*2+1].set_yticklabels([f"{x:.2f}" for x in 
+                np.arange(0.9, 1.002, 0.02)], fontsize=5)
+        out4.close()
 
 plt.tight_layout()
 plt.savefig('img/8_vcfdist_pr_plot.pdf', format="pdf")
diff --git a/analysis/9_f1_ed_plot.py b/analysis/9_f1_ed_plot.py
index 8102989..1d555bd 100644
--- a/analysis/9_f1_ed_plot.py
+++ b/analysis/9_f1_ed_plot.py
@@ -11,11 +11,17 @@
 plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
 mpl.rcParams['text.usetex'] = True
 
+def mm2in(x):
+    return x / 25.4
+
 data = "/home/timdunn/vcfdist/data"
 datasets = ["nist", "cmrg"]
 names = ["original", "A", "B", "C", "D"]
 colors = ["k", "C0", "C1", "C2", "C3"]
 
+out6c = open("tsv/S5c.tsv", 'w')
+print("DATASET\tSUBMISSION\tREPRESENTATION\tED_QSCORE\tDE_QSCORE\tALN_QSCORE", file=out6c)
+
 sub_ids = [
     "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789",
     "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27",
@@ -33,7 +39,7 @@
 # ]
 # markers = ['s', 'o', '^']
 
-fig, ax = plt.subplots(1, 6, figsize=(20,4))
+fig, ax = plt.subplots(1, 6, figsize=(mm2in(180), mm2in(36)))
 
 for di, dataset in enumerate(datasets):
 
@@ -66,6 +72,7 @@
                     de_f1_list.append(de_f1q)
                     aln_f1_list.append(aln_f1q)
                     print(f"{filename}={ed_f1q:.4f}\t", end="")
+                    print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\t{ed_f1q}\t{de_f1q}\t{aln_f1q}", file=out6c)
             except FileNotFoundError:
                 # print(f"File '{data}/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}.distance-summary.tsv' not found.")
                 break
@@ -78,11 +85,11 @@
         aln_f1_mean = np.mean(aln_f1_list)
         for ni, name in enumerate(names):
             ax[di*3].plot(ed_f1_mean, ed_f1_list[ni], linestyle='', 
-                    marker=markers[si], color=colors[ni], label=label)
+                    marker=markers[si], markersize=1, color=colors[ni], label=label)
             ax[di*3+1].plot(de_f1_mean, de_f1_list[ni], linestyle='', 
-                    marker=markers[si], color=colors[ni], label=label)
+                    marker=markers[si], markersize=1, color=colors[ni], label=label)
             ax[di*3+2].plot(aln_f1_mean, aln_f1_list[ni], linestyle='', 
-                    marker=markers[si], color=colors[ni], label=label)
+                    marker=markers[si], markersize=1, color=colors[ni], label=label)
         all_ed_f1_y.extend(ed_f1_list)
         all_ed_f1_x.extend([ed_f1_mean]*5)
         all_ed_f1_meds.append(statistics.median(ed_f1_list))
@@ -95,22 +102,25 @@
 
     print(" ")
     m, b, r, p, std_err = scipy.stats.linregress(all_ed_f1_x, all_ed_f1_y)
-    ax[di*3].set_xlabel("Avg ED Q-score", fontsize=15)
-    ax[di*3].set_ylabel("ED Q-score", fontsize=15)
-    ax[di*3].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, 
+    ax[di*3].set_xlabel(r"\textbf{Avg ED Q-score}", fontsize=7)
+    ax[di*3].set_ylabel(r"\textbf{ED Q-score}", fontsize=7)
+    ax[di*3].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, 
             transform=ax[di*3].transAxes)
+    ax[di*3].tick_params(axis='both', which='major', labelsize=5)
 
     m, b, r, p, std_err = scipy.stats.linregress(all_de_f1_x, all_de_f1_y)
-    ax[di*3+1].set_xlabel("Avg DE Q-score", fontsize=15)
-    ax[di*3+1].set_ylabel("DE Q-score", fontsize=15)
-    ax[di*3+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, 
+    ax[di*3+1].set_xlabel(r"\textbf{Avg DE Q-score}", fontsize=7)
+    ax[di*3+1].set_ylabel(r"\textbf{DE Q-score}", fontsize=7)
+    ax[di*3+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, 
             transform=ax[di*3+1].transAxes)
+    ax[di*3+1].tick_params(axis='both', which='major', labelsize=5)
 
     m, b, r, p, std_err = scipy.stats.linregress(all_aln_f1_x, all_aln_f1_y)
-    ax[di*3+2].set_xlabel("Avg ALN Q-score", fontsize=15)
-    ax[di*3+2].set_ylabel("ALN Q-score", fontsize=15)
-    ax[di*3+2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, 
+    ax[di*3+2].set_xlabel(r"\textbf{Avg ALN Q-score}", fontsize=7)
+    ax[di*3+2].set_ylabel(r"\textbf{ALN Q-score}", fontsize=7)
+    ax[di*3+2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, 
             transform=ax[di*3+2].transAxes)
+    ax[di*3+2].tick_params(axis='both', which='major', labelsize=5)
 
     # second pass, calculate average max rank change
     all_ed_f1_meds.sort()
@@ -163,9 +173,10 @@
     ed_amrc = np.mean(ed_rank_change)
     de_amrc = np.mean(de_rank_change)
     aln_amrc = np.mean(aln_rank_change)
-    ax[di*3].text(0.05,0.8,f"AMRC: {ed_amrc:.2f}", fontsize=15, transform=ax[di*3].transAxes)
-    ax[di*3+1].text(0.05,0.8,f"AMRC: {de_amrc:.2f}", fontsize=15, transform=ax[di*3+1].transAxes)
-    ax[di*3+2].text(0.05,0.8,f"AMRC: {aln_amrc:.2f}", fontsize=15, transform=ax[di*3+2].transAxes)
+    ax[di*3].text(0.05,0.8,f"AMRC: {ed_amrc:.2f}", fontsize=5, transform=ax[di*3].transAxes)
+    ax[di*3+1].text(0.05,0.8,f"AMRC: {de_amrc:.2f}", fontsize=5, transform=ax[di*3+1].transAxes)
+    ax[di*3+2].text(0.05,0.8,f"AMRC: {aln_amrc:.2f}", fontsize=5, transform=ax[di*3+2].transAxes)
 
 plt.tight_layout()
+out6c.close()
 plt.savefig('img/9_f1_ed_plot.pdf', format="pdf")
diff --git a/analysis/9_f1_pr_plot.py b/analysis/9_f1_pr_plot.py
index 4beed7d..18fabad 100644
--- a/analysis/9_f1_pr_plot.py
+++ b/analysis/9_f1_pr_plot.py
@@ -11,30 +11,38 @@
 mpl.rcParams['text.usetex'] = True
 plt.rcParams.update({"figure.facecolor": (0,0,0,0)})
 
+def mm2in(x):
+    return x / 25.4
+
 data = "/home/timdunn/vcfdist/data"
 datasets = ["nist", "cmrg"]
 tools = ["vcfeval", "vcfdist"]
 names = ["original", "A", "B", "C", "D"]
 colors = ["k", "C0", "C1", "C2", "C3"]
 
-# sub_ids = [
-#     "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789",
-#     "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27",
-#     "4HL0B", "7NJ5C", "9DGOR", "BARQS", "CN36L", "ES3XW", "HB8P3", "ISKOS", "JIPSI", "KFA0T",
-#     "PGXA4", "RU88N", "TG5TE", "VES2R", "WGQ43", "XV7ZN", "YGOTK", "13678", "32LOW", "60Z59", 
-#     "JBBK0", "K4GT3", "7RR8Z", "ASJT6", "BSODP", "CZA1Y", "0O7FL", "2OT9Q", "FFFGB", "HF8CT", "Y8QR8", "YJN61", "LR1FD", "MT57N",
-#     "J04SL", "K33QJ", "KPXT2", "M9KLP", "NFT0L", "QUE7Q", "S7K7S", "TZMTX", "W607K", "WX8VK"
-# ]
-# markers = ['.']*len(sub_ids)
+out6a = open("tsv/6a.tsv", 'w')
+out6b = open("tsv/6b.tsv", 'w')
+print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tF1_QSCORE", file=out6a)
+print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tF1_QSCORE", file=out6b)
 
 sub_ids = [
-    "K4GT3", # Google Health
-    "W91C1", # Sentieon
-    "IA789"  # Roche
+    "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789",
+    "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27",
+    "4HL0B", "7NJ5C", "9DGOR", "BARQS", "CN36L", "ES3XW", "HB8P3", "ISKOS", "JIPSI", "KFA0T",
+    "PGXA4", "RU88N", "TG5TE", "VES2R", "WGQ43", "XV7ZN", "YGOTK", "13678", "32LOW", "60Z59", 
+    "JBBK0", "K4GT3", "7RR8Z", "ASJT6", "BSODP", "CZA1Y", "0O7FL", "2OT9Q", "FFFGB", "HF8CT", "Y8QR8", "YJN61", "LR1FD", "MT57N",
+    "J04SL", "K33QJ", "KPXT2", "M9KLP", "NFT0L", "QUE7Q", "S7K7S", "TZMTX", "W607K", "WX8VK"
 ]
-markers = ['s', 'o', '^']
+markers = ['.']*len(sub_ids)
+
+# sub_ids = [
+#     "K4GT3", # Google Health
+#     "W91C1", # Sentieon
+#     "IA789"  # Roche
+# ]
+# markers = ['s', 'o', '^']
 
-fig, ax = plt.subplots(2, 4, figsize=(20,8))
+fig, ax = plt.subplots(2, 4, figsize=(mm2in(180), mm2in(72)))
 
 for di, dataset in enumerate(datasets):
     for ti, tool in enumerate(tools):
@@ -72,6 +80,8 @@
                             snp_f1_list.append(snp_f1q)
                             indel_f1_list.append(indel_f1q)
                             print(f"{filename}={snp_f1q:6.4f},{indel_f1q:6.4f}\t", end="")
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_f1q}", file=out6a)
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_f1q}", file=out6a)
                     except FileNotFoundError:
                         print(f"File '{data}/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}.summary.csv' not found.")
                         continue
@@ -80,9 +90,9 @@
                 indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list)
                 for ni, name in enumerate(names):
                     ax[ti][di*2].plot(snp_f1_mean, snp_f1_list[ni], linestyle='', 
-                            marker=markers[si], color=colors[ni], label=label)
+                            marker=markers[si], markersize=1, color=colors[ni], label=label)
                     ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='', 
-                            marker=markers[si], color=colors[ni], label=label)
+                            marker=markers[si], markersize=1, color=colors[ni], label=label)
                 all_snp_f1_y.extend(snp_f1_list)
                 all_snp_f1_x.extend([snp_f1_mean]*len(names))
                 all_snp_f1_meds.append(statistics.median(snp_f1_list))
@@ -91,14 +101,16 @@
                 all_indel_f1_meds.append(statistics.median(indel_f1_list))
 
             m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y)
-            ax[ti][di*2].set_xlabel("Avg SNP F1 Q-score", fontsize=15)
-            ax[ti][di*2].set_ylabel("SNP F1 Q-score", fontsize=15)
-            ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2].set_xlabel(r"\textbf{Avg SNP F1 Q-score}", fontsize=7)
+            ax[ti][di*2].set_ylabel(r"\textbf{SNP F1 Q-score}", fontsize=7)
+            ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2].tick_params(axis='both', which='major', labelsize=5)
 
             m, b, r, p, std_err = scipy.stats.linregress(all_indel_f1_x, all_indel_f1_y)
-            ax[ti][di*2+1].set_xlabel("Avg INDEL F1 Q-score", fontsize=15)
-            ax[ti][di*2+1].set_ylabel("INDEL F1 Q-score", fontsize=15)
-            ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2+1].set_xlabel(r"\textbf{Avg INDEL F1 Q-score}", fontsize=7)
+            ax[ti][di*2+1].set_ylabel(r"\textbf{INDEL F1 Q-score}", fontsize=7)
+            ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2+1].tick_params(axis='both', which='major', labelsize=5)
 
             # second pass, calculate average max rank change
             all_snp_f1_meds.sort()
@@ -146,8 +158,8 @@
                 indel_rank_change.append(indel_max_rank - indel_min_rank)
             snp_amrc = np.mean(snp_rank_change)
             indel_amrc = np.mean(indel_rank_change)
-            ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=15, transform=ax[ti][di*2].transAxes)
-            ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=15, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=5, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=5, transform=ax[ti][di*2+1].transAxes)
 
         # VCFDIST
         elif tool == "vcfdist":
@@ -174,6 +186,8 @@
                             snp_f1_list.append(snp_f1q)
                             indel_f1_list.append(indel_f1q)
                             print(f"{filename}={snp_f1q:6.4f},{indel_f1q:6.4f}\t", end="")
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_f1q}", file=out6b)
+                            print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_f1q}", file=out6b)
                     except FileNotFoundError:
                         print(f"File '{data}/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}.precision-recall-summary.tsv' not found.")
                         continue
@@ -182,9 +196,9 @@
                 indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list)
                 for ni, name in enumerate(names):
                     ax[ti][di*2].plot(snp_f1_mean, snp_f1_list[ni], linestyle='', 
-                            marker=markers[si], color=colors[ni], label=label)
+                            marker=markers[si], markersize=1, color=colors[ni], label=label)
                     ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='', 
-                            marker=markers[si], color=colors[ni], label=label)
+                            marker=markers[si], markersize=1, color=colors[ni], label=label)
                 all_snp_f1_y.extend(snp_f1_list)
                 all_snp_f1_x.extend([snp_f1_mean]*len(names))
                 all_snp_f1_meds.append(statistics.median(snp_f1_list))
@@ -193,14 +207,16 @@
                 all_indel_f1_meds.append(statistics.median(indel_f1_list))
 
             m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y)
-            ax[ti][di*2].set_xlabel("Avg SNP F1 Q-score", fontsize=15)
-            ax[ti][di*2].set_ylabel("SNP F1 Q-score", fontsize=15)
-            ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2].set_xlabel(r"\textbf{Avg SNP F1 Q-score}", fontsize=7)
+            ax[ti][di*2].set_ylabel(r"\textbf{SNP F1 Q-score}", fontsize=7)
+            ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2].tick_params(axis='both', which='major', labelsize=5)
 
             m, b, r, p, std_err = scipy.stats.linregress(all_indel_f1_x, all_indel_f1_y)
-            ax[ti][di*2+1].set_xlabel("Avg INDEL F1 Q-score", fontsize=15)
-            ax[ti][di*2+1].set_ylabel("INDEL F1 Q-score", fontsize=15)
-            ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2+1].set_xlabel(r"\textbf{Avg INDEL F1 Q-score}", fontsize=7)
+            ax[ti][di*2+1].set_ylabel(r"\textbf{INDEL F1 Q-score}", fontsize=7)
+            ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2+1].tick_params(axis='both', which='major', labelsize=5)
 
             # second pass, calculate average max rank change
             all_snp_f1_meds.sort()
@@ -240,32 +256,34 @@
                 indel_rank_change.append(indel_max_rank - indel_min_rank)
             snp_amrc = np.mean(snp_rank_change)
             indel_amrc = np.mean(indel_rank_change)
-            ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=15, transform=ax[ti][di*2].transAxes)
-            ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=15, transform=ax[ti][di*2+1].transAxes)
+            ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=5, transform=ax[ti][di*2].transAxes)
+            ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=5, transform=ax[ti][di*2+1].transAxes)
 
-square = mlines.Line2D([], [], linestyle="", color='k', marker='s',
-                                  markersize=12, label='Google Health')
-circle = mlines.Line2D([], [], linestyle="", color='k', marker='o',
-                                  markersize=12, label='Sentieon')
-triangle = mlines.Line2D([], [], linestyle="", color='k', marker='^',
-                                  markersize=12, label='Roche')
-ax[0][0].legend(handles=[square, circle, triangle], loc="center right", fontsize=15)
+# square = mlines.Line2D([], [], linestyle="", color='k', marker='s',
+#                                   markersize=12, label='Google Health')
+# circle = mlines.Line2D([], [], linestyle="", color='k', marker='o',
+#                                   markersize=12, label='Sentieon')
+# triangle = mlines.Line2D([], [], linestyle="", color='k', marker='^',
+#                                   markersize=12, label='Roche')
+# ax[0][0].legend(handles=[square, circle, triangle], loc="center right", fontsize=15)
 
 O = mlines.Line2D([], [], linestyle="", color='k', marker='.',
-                                  markersize=12, label='original')
+                                  markersize=5, label='original')
 A = mlines.Line2D([], [], linestyle="", color='C0', marker='.',
-                                  markersize=12, label='A')
+                                  markersize=5, label='A')
 B = mlines.Line2D([], [], linestyle="", color='C1', marker='.',
-                                  markersize=12, label='B')
+                                  markersize=5, label='B')
 C = mlines.Line2D([], [], linestyle="", color='C2', marker='.',
-                                  markersize=12, label='C')
+                                  markersize=5, label='C')
 D = mlines.Line2D([], [], linestyle="", color='C3', marker='.',
-                                  markersize=12, label='D')
-ax2 = ax[0][0].twinx()
+                                  markersize=5, label='D')
+# ax2 = ax[0][0].twinx()
 
-# ax2.legend(handles=[O,A,B,C,D], loc="center right", fontsize=15)
-ax2.legend(handles=[O,A,B,C,D], loc="center left", fontsize=15)
+ax[0][0].legend(handles=[O,A,B,C,D], loc="lower right", fontsize=5)
+# ax2.legend(handles=[O,A,B,C,D], loc="center left", fontsize=5)
 
 
 plt.tight_layout()
+out6a.close()
+out6b.close()
 plt.savefig('img/9_f1_pr_plot.pdf', format="pdf")

From 7dd52b58bec48a1244a4a3263843989a0131fc28 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Tue, 7 Nov 2023 16:53:26 -0500
Subject: [PATCH 03/16] Added FLIP_ERROR column to 'superclusters.tsv'

---
 src/print.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/print.cpp b/src/print.cpp
index af0d6a7..33da71b 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -742,7 +742,7 @@ void write_results(
         FILE* out_clusterings = fopen(out_clusterings_fn.data(), "w");
         if (g.verbosity >= 1) INFO("  Writing superclustering results to '%s'", out_clusterings_fn.data());
         fprintf(out_clusterings, "CONTIG\tSUPERCLUSTER\tSTART\tSTOP\tSIZE\tQUERY1_VARS\tQUERY2_VARS"
-                "\tTRUTH1_VARS\tTRUTH2_VARS\tORIG_ED\tSWAP_ED\tPHASE\tPHASE_SET\tPHASE_BLOCK\n");
+                "\tTRUTH1_VARS\tTRUTH2_VARS\tORIG_ED\tSWAP_ED\tPHASE\tPHASE_SET\tPHASE_BLOCK\tFLIP_ERROR\n");
         for (std::string ctg : phasedata_ptr->contigs) {
             std::shared_ptr<ctgPhaseblocks> ctg_pbs = phasedata_ptr->phase_blocks[ctg];
             std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
@@ -754,6 +754,16 @@ void write_results(
                     phase_block_idx++;
                 }
 
+                // set supercluster flip/swap based on phaseblock and sc phasing
+                bool phase_switch = ctg_pbs->block_states[phase_block_idx];
+                int phase_sc = ctg_scs->phase[i];
+                bool flip_error;
+                if (phase_switch) {
+                    flip_error = phase_sc == PHASE_ORIG;
+                } else { // no phase switch
+                    flip_error = phase_sc == PHASE_SWAP;
+                }
+
                 // count query vars, allowing empty haps
                 int query1_vars = ctg_scs->ctg_variants[QUERY][HAP1]->clusters.size() ?
                     ctg_scs->ctg_variants[QUERY][HAP1]->clusters[
@@ -777,12 +787,12 @@ void write_results(
                             ctg_scs->superclusters[TRUTH][HAP2][i]] : 0;
 
                 // print data
-                fprintf(out_clusterings, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\n", 
+                fprintf(out_clusterings, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", 
                     ctg.data(), i, ctg_scs->begs[i], ctg_scs->ends[i],
                     ctg_scs->ends[i] - ctg_scs->begs[i],
                     query1_vars, query2_vars, truth1_vars, truth2_vars,
                     ctg_scs->orig_phase_dist[i], ctg_scs->swap_phase_dist[i],
-                    phase_strs[ctg_scs->phase[i]].data(), ctg_scs->phase_sets[i], phase_block_idx
+                    phase_strs[phase_sc].data(), ctg_scs->phase_sets[i], phase_block_idx, flip_error ? "TRUE" : "FALSE"
                );
             }
         }

From 5c30ff6afcc1cb5aa0d8d71f1718c90197369490 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Tue, 7 Nov 2023 17:26:07 -0500
Subject: [PATCH 04/16] Added phasing threshold

 - instead of considering superclusters to be phased when the swapped
   edit distance is less than the original, --phasing-threshold requires
   that the edit distance is reduced by this fraction, 1.0 indicating
   that there is an exact match and 0.5 indicating it is halved
---
 src/dist.cpp    | 15 ++++++++++++---
 src/globals.cpp | 20 ++++++++++++++++++++
 src/globals.h   |  3 +++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/dist.cpp b/src/dist.cpp
index 22c5d73..7c7a4b1 100644
--- a/src/dist.cpp
+++ b/src/dist.cpp
@@ -452,9 +452,18 @@ int store_phase(
     // calculate best phasing
     int orig_phase_dist = s[QUERY1_TRUTH1] + s[QUERY2_TRUTH2];
     int swap_phase_dist = s[QUERY2_TRUTH1] + s[QUERY1_TRUTH2];
-    int phase = PHASE_NONE; // default either way if equal dist
-    if (orig_phase_dist < swap_phase_dist) phase = PHASE_ORIG;
-    if (swap_phase_dist < orig_phase_dist) phase = PHASE_SWAP;
+    int phase = PHASE_NONE; // default either way if insufficient evidence
+    if (orig_phase_dist == swap_phase_dist) { // allow either phasing if equal dist
+        phase = PHASE_NONE;
+    } else if (orig_phase_dist == 0) { // protect division by zero
+        phase = PHASE_ORIG;
+    } else if (swap_phase_dist == 0) { // protect division by zero
+        phase = PHASE_SWAP;
+    } else if (1 - swap_phase_dist / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping
+        phase = PHASE_SWAP;
+    } else if (1 - orig_phase_dist / swap_phase_dist > g.phase_threshold) { // significant reduction with orig
+        phase = PHASE_ORIG;
+    }
 
     // save alignment information
     clusterdata_ptr->superclusters[ctg]->set_phase(sc_idx,
diff --git a/src/globals.cpp b/src/globals.cpp
index 2278391..0c8fdfc 100644
--- a/src/globals.cpp
+++ b/src/globals.cpp
@@ -340,6 +340,21 @@ void Globals::parse_args(int argc, char ** argv) {
             if (this->max_threads < 1) {
                 ERROR("Max threads must be positive");
             }
+/*******************************************************************************/
+        } else if (std::string(argv[i]) == "-pt" ||
+                std::string(argv[i]) == "--phasing-threshold") {
+            i++;
+            if (i == argc) {
+                ERROR("Option '--phasing-threshold' used without providing value");
+            }
+            try {
+                this->phase_threshold = std::stod(argv[i++]);
+            } catch (const std::exception & e) {
+                ERROR("Invalid phasing threshold provided");
+            }
+            if (this->phase_threshold <= 0 || this->phase_threshold > 1) {
+                ERROR("Provided phasing threshold must be on the interval (0,1]");
+            }
 /*******************************************************************************/
         } else if (std::string(argv[i]) == "-r" ||
                 std::string(argv[i]) == "--max-ram") {
@@ -498,6 +513,11 @@ void Globals::print_usage() const
     printf("      minimum gap between independent clusters and superclusters (in bases),\n");
     printf("      only applicable if used with '--simple-cluster' option\n");
 
+    printf("\n  Phasing:\n");
+    printf("  -pt, --phasing-threshold <FLOAT> [%.2f]\n", g.phase_threshold);
+    printf("      minimum fractional reduction in edit distance over other phasing\n");
+    printf("      in order to consider this supercluster phased\n");
+
     printf("\n  Distance:\n");
     printf("  -ex, --eval-mismatch-penalty <INTEGER> [%d]\n", g.eval_sub);
     printf("      mismatch penalty (distance evaluation)\n");
diff --git a/src/globals.h b/src/globals.h
index 349828e..421e540 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -36,6 +36,9 @@ class Globals {
     int reach_min_gap = 10;
     int max_cluster_itrs = 4;
 
+    // phasing options
+    double phase_threshold = 0.5;
+
     // memory params
     int max_threads = 64;
     double max_ram = 64; // GB

From 2a9b41fa62d601cda1c4630d55b36b16ca643ac7 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Wed, 8 Nov 2023 11:15:38 -0500
Subject: [PATCH 05/16] Added g.phase_threshold to parameters.txt

---
 src/print.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/print.cpp b/src/print.cpp
index 33da71b..8f17c84 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -41,14 +41,14 @@ void write_params() {
         "program = '%s'\nversion = '%s'\nout_prefix = '%s'\ncommand = '%s'\nreference_fasta = '%s'\n"
         "query_vcf = '%s'\ntruth_vcf = '%s'\nbed_file = '%s'\nwrite_outputs = %s\nfilters = '%s'\n"
         "min_var_qual = %d\nmax_var_qual = %d\nmin_var_size = %d\nmax_var_size = %d\n"
-        "realign_truth = %s\nrealign_query = %s\n"
+        "phase_threshold=%f\nrealign_truth = %s\nrealign_query = %s\n"
         "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n"
         "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n"
         "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n",
         g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), 
         g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), 
         filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size,
-        b2s(g.realign_truth).data(), b2s(g.realign_query).data(),
+        g.phase_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(),
         b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap,
         g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram,
         g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend);

From a5b4ab6c4ac38f95b50b21554369286536945335 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Wed, 8 Nov 2023 11:16:04 -0500
Subject: [PATCH 06/16] Output no longer always 'gm'

 - 'gm' for TP, 'lm' for PP, '.' for FP/FN
---
 src/phase.cpp   |  2 +-
 src/variant.cpp | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/phase.cpp b/src/phase.cpp
index 8b9c71f..4b92f0d 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -25,7 +25,7 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
     fprintf(out_vcf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"GenoType\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=BD,Number=1,Type=String,Description=\"Benchmark Decision for call (TP/FP/FN). PP conversion: TP if BC >= 0.5 else FP/FN (hap.py compatibility)\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=BC,Number=1,Type=String,Description=\"Benchmark Credit (on the interval [0,1], based on sync group edit distance)\">\n");
-    fprintf(out_vcf, "##FORMAT=<ID=BK,Number=1,Type=String,Description=\"BenchmarK category (for hap.py compatibility, always genotype match 'gm')\">\n");
+    fprintf(out_vcf, "##FORMAT=<ID=BK,Number=1,Type=String,Description=\"BenchmarK category (for hap.py compatibility, 'gm' for TP, 'lm' if PP, else '.')\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=QQ,Number=1,Type=Float,Description=\"variant Quality\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=SC,Number=1,Type=Integer,Description=\"SuperCluster (index in contig)\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=SG,Number=1,Type=Integer,Description=\"Sync Group (index in supercluster, for credit assignment)\">\n");
diff --git a/src/variant.cpp b/src/variant.cpp
index 98db573..a3119b5 100644
--- a/src/variant.cpp
+++ b/src/variant.cpp
@@ -259,17 +259,20 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt,
 
     // get categorization
     std::string errtype;
+    std::string match_type;
     switch (this->errtypes[swap][idx]) {
-        case ERRTYPE_TP: errtype = "TP"; break;
-        case ERRTYPE_FP: errtype = "FP"; break;
-        case ERRTYPE_FN: errtype = "FN"; break;
+        case ERRTYPE_TP: errtype = "TP"; match_type = "gm"; break;
+        case ERRTYPE_FP: errtype = "FP"; match_type = "."; break;
+        case ERRTYPE_FN: errtype = "FN"; match_type = "."; break;
         case ERRTYPE_PP: // for hap.py compatibility
              errtype = this->credit[swap][idx] >= 0.5 ? "TP" : 
-                 (query ? "FP" : "FN"); break;
+                 (query ? "FP" : "FN"); 
+             match_type = "lm";
+             break;
     }
 
-    fprintf(out_fp, "\t%s:%s:%f:gm:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), 
-            this->credit[swap][idx], int(this->var_quals[idx]), sc_idx, 
+    fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), 
+            this->credit[swap][idx], match_type.data(), int(this->var_quals[idx]), sc_idx, 
             int(this->sync_group[swap][idx]), this->phase_sets[idx], phase_block,
             query ? (phase_switch ? "1" : "0") : "." , 
             query ? (phase_flip ? "1" : "0") : "." , 

From 6f5aa100afcf0c5e32276cb1e26d6f7b5fba45fc Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Wed, 8 Nov 2023 13:21:41 -0500
Subject: [PATCH 07/16] Replaced partial credit with credit threshold.

---
 src/defs.h      |  9 +++--
 src/dist.cpp    | 30 +++-------------
 src/globals.cpp | 19 ++++++++++
 src/globals.h   |  3 ++
 src/main.cpp    |  2 +-
 src/phase.cpp   |  4 +--
 src/print.cpp   | 94 +++++++++++++++++++------------------------------
 src/variant.cpp | 17 +++++----
 src/variant.h   |  6 ++--
 9 files changed, 81 insertions(+), 103 deletions(-)

diff --git a/src/defs.h b/src/defs.h
index a31995b..16ec893 100644
--- a/src/defs.h
+++ b/src/defs.h
@@ -60,11 +60,10 @@ class idx2;
 #define ERRTYPE_TP 0 // true positive
 #define ERRTYPE_FP 1 // false positive
 #define ERRTYPE_FN 2 // false negative
-#define ERRTYPE_PP 3 // partial positive (reduces ED, but not TP)
-#define ERRTYPE_PE 4 // phase error (0|1 -> 1|0)
-#define ERRTYPE_GE 5 // genotype error (0|1 -> 1|1)
-#define ERRTYPE_UN 6 // unknown
-#define ERRTYPES   7
+#define ERRTYPE_PE 3 // phase error (0|1 -> 1|0)
+#define ERRTYPE_GE 4 // genotype error (0|1 -> 1|1)
+#define ERRTYPE_UN 5 // unknown
+#define ERRTYPES   6
 
 // runtime timers
 #define TIME_READ     0
diff --git a/src/dist.cpp b/src/dist.cpp
index 7c7a4b1..c55bf77 100644
--- a/src/dist.cpp
+++ b/src/dist.cpp
@@ -1269,7 +1269,7 @@ void calc_prec_recall(
                     float credit = 1 - float(new_ed)/old_ed;
                     // don't overwrite FPs
                     if (query_vars->errtypes[swap][query_var_idx] == ERRTYPE_UN) {
-                        if (new_ed == 0) { // TP
+                        if (credit >= g.credit_threshold) { // TP
                             query_vars->errtypes[swap][query_var_idx] = ERRTYPE_TP;
                             query_vars->sync_group[swap][query_var_idx] = sync_group;
                             query_vars->credit[swap][query_var_idx] = credit;
@@ -1279,7 +1279,7 @@ void calc_prec_recall(
                                     query_vars->refs[query_var_idx].data(),
                                     query_vars->alts[query_var_idx].data(), 
                                     "TP", credit);
-                        } else if (new_ed == old_ed) { // FP
+                        } else { // FP
                             query_vars->errtypes[swap][query_var_idx] = ERRTYPE_FP;
                             query_vars->sync_group[swap][query_var_idx] = sync_group;
                             query_vars->credit[swap][query_var_idx] = 0;
@@ -1289,16 +1289,6 @@ void calc_prec_recall(
                                     query_vars->refs[query_var_idx].data(),
                                     query_vars->alts[query_var_idx].data(), 
                                     "FP", 0.0f);
-                        } else { // PP
-                            query_vars->errtypes[swap][query_var_idx] = ERRTYPE_PP;
-                            query_vars->sync_group[swap][query_var_idx] = sync_group;
-                            query_vars->credit[swap][query_var_idx] = credit;
-                            query_vars->callq[swap][query_var_idx] = callq;
-                            if (print) printf("%s:%d QUERY REF='%s'\tALT='%s'\t%s\t%f\n", 
-                                    ctg.data(), query_vars->poss[query_var_idx],
-                                    query_vars->refs[query_var_idx].data(),
-                                    query_vars->alts[query_var_idx].data(), 
-                                    "PP", credit);
                         }
                     }
                 }
@@ -1307,7 +1297,7 @@ void calc_prec_recall(
                 for (int truth_var_idx = prev_truth_var_ptr; 
                         truth_var_idx > truth_var_ptr; truth_var_idx--) {
                     float credit = 1 - float(new_ed)/old_ed;
-                    if (new_ed == 0) { // TP
+                    if (credit >= g.credit_threshold) { // TP
                         truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_TP;
                         truth_vars->sync_group[swap][truth_var_idx] = sync_group;
                         truth_vars->credit[swap][truth_var_idx] = credit;
@@ -1317,7 +1307,7 @@ void calc_prec_recall(
                                 truth_vars->refs[truth_var_idx].data(),
                                 truth_vars->alts[truth_var_idx].data(), 
                                 "TP", credit);
-                    } else if (new_ed == old_ed) { // FP call, FN truth
+                    } else { // FP call, FN truth
                         truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_FN;
                         truth_vars->sync_group[swap][truth_var_idx] = sync_group;
                         truth_vars->credit[swap][truth_var_idx] = credit;
@@ -1327,17 +1317,7 @@ void calc_prec_recall(
                                 truth_vars->refs[truth_var_idx].data(),
                                 truth_vars->alts[truth_var_idx].data(), 
                                 "FN", credit);
-                    } else { // PP
-                        truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_PP;
-                        truth_vars->sync_group[swap][truth_var_idx] = sync_group;
-                        truth_vars->credit[swap][truth_var_idx] = credit;
-                        truth_vars->callq[swap][truth_var_idx] = callq;
-                        if (print) printf("%s:%d TRUTH REF='%s'\tALT='%s'\t%s\t%f\n", 
-                                ctg.data(), truth_vars->poss[truth_var_idx],
-                                truth_vars->refs[truth_var_idx].data(),
-                                truth_vars->alts[truth_var_idx].data(), 
-                                "PP", credit);
-                    }
+                    } 
                 }
 
                 if (print) printf("%s: SYNC @%s(%2d,%2d) = REF(%2d,%2d), ED %d->%d, QUERY %d-%d, TRUTH %d-%d\n\n", ctg.data(),
diff --git a/src/globals.cpp b/src/globals.cpp
index 0c8fdfc..6244811 100644
--- a/src/globals.cpp
+++ b/src/globals.cpp
@@ -355,6 +355,21 @@ void Globals::parse_args(int argc, char ** argv) {
             if (this->phase_threshold <= 0 || this->phase_threshold > 1) {
                 ERROR("Provided phasing threshold must be on the interval (0,1]");
             }
+/*******************************************************************************/
+        } else if (std::string(argv[i]) == "-ct" ||
+                std::string(argv[i]) == "--credit-threshold") {
+            i++;
+            if (i == argc) {
+                ERROR("Option '--credit-threshold' used without providing value");
+            }
+            try {
+                this->credit_threshold = std::stod(argv[i++]);
+            } catch (const std::exception & e) {
+                ERROR("Invalid credit threshold provided");
+            }
+            if (this->credit_threshold <= 0 || this->credit_threshold > 1) {
+                ERROR("Provided credit threshold must be on the interval (0,1]");
+            }
 /*******************************************************************************/
         } else if (std::string(argv[i]) == "-r" ||
                 std::string(argv[i]) == "--max-ram") {
@@ -485,6 +500,10 @@ void Globals::print_usage() const
     printf("  -e, --gap-extend-penalty <INTEGER> [%d]\n", g.eval_extend);
     printf("      Smith-Waterman gap extension penalty\n");
 
+    printf("\n  Precision-Recall:\n");
+    printf("  -ct, --credit-threshold <FLOAT> [%.2f]\n", g.credit_threshold);
+    printf("      minimum partial credit to consider variant a true positive\n");
+
     printf("\n  Utilization:\n");
     printf("  -t, --max-threads <INTEGER> [%d]\n", g.max_threads);
     printf("      maximum threads to use for clustering and precision/recall alignment\n");
diff --git a/src/globals.h b/src/globals.h
index 421e540..53c30ae 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -39,6 +39,9 @@ class Globals {
     // phasing options
     double phase_threshold = 0.5;
 
+    // precision-recall options
+    double credit_threshold = 0.7;
+
     // memory params
     int max_threads = 64;
     double max_ram = 64; // GB
diff --git a/src/main.cpp b/src/main.cpp
index d2ec555..c32d09e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,7 +14,7 @@ Globals g;
 std::vector<std::string> type_strs = {"REF", "SNP", "INS", "DEL", "CPX"};
 std::vector<std::string> type_strs2 = {"ALL", "SNP", "INS", "DEL", "INDEL"};
 std::vector<std::string> vartype_strs = {"SNP", "INDEL"};
-std::vector<std::string> error_strs = {"TP", "FP", "FN", "PP", "PE", "GE", "??"};
+std::vector<std::string> error_strs = {"TP", "FP", "FN", "PE", "GE", "??"};
 std::vector<std::string> gt_strs = {
         "0", "1", "0|0", "0|1", "1|0", "1|1", "1|2", "2|1", ".|.", "M|N" };
 std::vector<std::string> region_strs = {"OUTSIDE", "INSIDE ", "BORDER ", "OFF CTG"};
diff --git a/src/phase.cpp b/src/phase.cpp
index 4b92f0d..67324a1 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -23,9 +23,9 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
     }
     fprintf(out_vcf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"GenoType\">\n");
-    fprintf(out_vcf, "##FORMAT=<ID=BD,Number=1,Type=String,Description=\"Benchmark Decision for call (TP/FP/FN). PP conversion: TP if BC >= 0.5 else FP/FN (hap.py compatibility)\">\n");
+    fprintf(out_vcf, "##FORMAT=<ID=BD,Number=1,Type=String,Description=\"Benchmark Decision for call (TP/FP/FN).\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=BC,Number=1,Type=String,Description=\"Benchmark Credit (on the interval [0,1], based on sync group edit distance)\">\n");
-    fprintf(out_vcf, "##FORMAT=<ID=BK,Number=1,Type=String,Description=\"BenchmarK category (for hap.py compatibility, 'gm' for TP, 'lm' if PP, else '.')\">\n");
+    fprintf(out_vcf, "##FORMAT=<ID=BK,Number=1,Type=String,Description=\"BenchmarK category ('gm' if credit == 1, 'lm' if credit > 0, else '.')\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=QQ,Number=1,Type=Float,Description=\"variant Quality\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=SC,Number=1,Type=Integer,Description=\"SuperCluster (index in contig)\">\n");
     fprintf(out_vcf, "##FORMAT=<ID=SG,Number=1,Type=Integer,Description=\"Sync Group (index in supercluster, for credit assignment)\">\n");
diff --git a/src/print.cpp b/src/print.cpp
index 8f17c84..3d36abd 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -41,14 +41,14 @@ void write_params() {
         "program = '%s'\nversion = '%s'\nout_prefix = '%s'\ncommand = '%s'\nreference_fasta = '%s'\n"
         "query_vcf = '%s'\ntruth_vcf = '%s'\nbed_file = '%s'\nwrite_outputs = %s\nfilters = '%s'\n"
         "min_var_qual = %d\nmax_var_qual = %d\nmin_var_size = %d\nmax_var_size = %d\n"
-        "phase_threshold=%f\nrealign_truth = %s\nrealign_query = %s\n"
+        "phase_threshold = %f\ncredit_threshold = %f\nrealign_truth = %s\nrealign_query = %s\n"
         "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n"
         "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n"
         "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n",
         g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), 
         g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), 
         filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size,
-        g.phase_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(),
+        g.phase_threshold, g.credit_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(),
         b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap,
         g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram,
         g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend);
@@ -321,13 +321,13 @@ void print_ptrs(const std::vector< std::vector<uint8_t> > & ptrs,
 
 void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
 
-    // init counters; ax0: SUB/INDEL, ax1: TP,FP,FN,PP,PP_FRAC, ax2: QUAL
-    int PP_FRAC = 4;
+    // for each class, store variant counts above each quality threshold
+    // init counters; ax0: SUB/INDEL, ax1: TP,FP,FN ax2: QUAL
     std::vector< std::vector< std::vector<float> > > query_counts(VARTYPES,
-            std::vector< std::vector<float> >(5, 
+            std::vector< std::vector<float> >(3, 
             std::vector<float>(g.max_qual-g.min_qual+1, 0.0))) ;
     std::vector< std::vector< std::vector<float> > > truth_counts(VARTYPES,
-            std::vector< std::vector<float> >(5, 
+            std::vector< std::vector<float> >(3, 
             std::vector<float>(g.max_qual-g.min_qual+1, 0.0))) ;
 
     // calculate summary statistics
@@ -374,9 +374,6 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
                         }
                         for (int qual = g.min_qual; qual <= q; qual++) {
                             query_counts[t][ ctg_vars[QUERY][h]->errtypes[swap][i] ][qual-g.min_qual]++;
-                            if (ctg_vars[QUERY][h]->errtypes[swap][i] == ERRTYPE_PP) {
-                                query_counts[t][PP_FRAC][qual-g.min_qual] += ctg_vars[QUERY][h]->credit[swap][i];
-                            }
                         }
                     }
                 }
@@ -419,11 +416,13 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
                             WARN("Unknown error type at TRUTH %s:%d", ctg.data(), ctg_vars[TRUTH][h]->poss[i]);
                             continue;
                         }
+                        // corresponding query call is only correct until its Qscore, after which it falls below
+                        // the quality threshold, is filtered, and becomes a false negative
                         for (int qual = g.min_qual; qual <= q; qual++) {
                             truth_counts[t][ ctg_vars[TRUTH][h]->errtypes[swap][i] ][qual-g.min_qual]++;
-                            if (ctg_vars[TRUTH][h]->errtypes[swap][i] == ERRTYPE_PP) {
-                                truth_counts[t][PP_FRAC][qual-g.min_qual] += ctg_vars[TRUTH][h]->credit[swap][i];
-                            }
+                        }
+                        for (int qual = q+1; qual < g.max_qual; qual++) {
+                            truth_counts[t][ERRTYPE_FN][qual-g.min_qual]++;
                         }
                     }
                 }
@@ -438,7 +437,7 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
         if (g.verbosity >= 1) INFO("  Writing precision-recall results to '%s'", out_pr_fn.data());
         out_pr = fopen(out_pr_fn.data(), "w");
         fprintf(out_pr, "VAR_TYPE\tMIN_QUAL\tPREC\tRECALL\tF1_SCORE\tF1_QSCORE\t"
-                "TRUTH_TOTAL\tTRUTH_TP\tTRUTH_PP\tTRUTH_FN\tQUERY_TOTAL\tQUERY_TP\tQUERY_PP\tQUERY_FP\n");
+                "TRUTH_TOTAL\tTRUTH_TP\tTRUTH_FN\tQUERY_TOTAL\tQUERY_TP\tQUERY_FP\n");
     }
     std::vector<float> max_f1_score = {0, 0};
     std::vector<int> max_f1_qual = {0, 0};
@@ -448,36 +447,25 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
         for (int qual = g.min_qual; qual <= g.max_qual; qual++) {
             int qidx = qual - g.min_qual;
 
-            int query_tot = query_counts[type][ERRTYPE_TP][qidx] +
-                        query_counts[type][ERRTYPE_FP][qidx] +
-                        query_counts[type][ERRTYPE_PP][qidx];
-            float query_tp_f = query_counts[type][ERRTYPE_TP][qidx] +
-                        query_counts[type][PP_FRAC][qidx];
-            float query_fp_f = query_tot - query_tp_f;
-
-            int truth_tot = truth_counts[type][ERRTYPE_TP][0] +
-                        truth_counts[type][ERRTYPE_PP][0] +
-                        truth_counts[type][ERRTYPE_FN][0];
-            float truth_tp_f = truth_counts[type][ERRTYPE_TP][qidx] +
-                         truth_counts[type][PP_FRAC][qidx];
-            if (truth_tot == 0) break;
-            if (truth_tp_f + query_fp_f == 0) break;
-
-            // ignore PP, this is only for summary output, not calculations
-            int truth_fn = truth_counts[type][ERRTYPE_FN][0] +
-                         truth_counts[type][ERRTYPE_TP][0] -
-                         truth_counts[type][ERRTYPE_TP][qidx];
-
-            float precision = truth_tp_f / (truth_tp_f + query_fp_f);
-            float recall = truth_tp_f / truth_tot;
-            float f1_score = 2*precision*recall / (precision + recall);
+            // define helper variables
+            int query_tp = query_counts[type][ERRTYPE_TP][qidx];
+            int query_fp = query_counts[type][ERRTYPE_FP][qidx];
+            int query_tot = query_tp + query_fp;
+            int truth_tp = truth_counts[type][ERRTYPE_TP][qidx];
+            int truth_fn = truth_counts[type][ERRTYPE_FN][qidx];
+            int truth_tot = truth_tp + truth_fn;
+
+            // calculate summary metrics
+            float precision = query_tot == 0 ? 1 : float(query_tp) / query_tot;
+            float recall = truth_tot == 0 ? 1 : float(truth_tp) / truth_tot;
+            float f1_score = precision+recall ? 2*precision*recall / (precision + recall) : 0;
             if (f1_score > max_f1_score[type]) {
                 max_f1_score[type] = f1_score;
                 max_f1_qual[type] = qual;
             }
 
             if (g.write) fprintf(out_pr, 
-                    "%s\t%d\t%f\t%f\t%f\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
+                    "%s\t%d\t%f\t%f\t%f\t%f\t%d\t%d\t%d\t%d\t%d\t%d\n",
                     vartype_strs[type].data(),
                     qual,
                     precision,
@@ -486,11 +474,9 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
                     qscore(1-f1_score),
                     truth_tot,
                     int(truth_counts[type][ERRTYPE_TP][qidx]),
-                    int(truth_counts[type][ERRTYPE_PP][qidx]),
                     truth_fn,
                     query_tot, 
                     int(query_counts[type][ERRTYPE_TP][qidx]),
-                    int(query_counts[type][ERRTYPE_PP][qidx]),
                     int(query_counts[type][ERRTYPE_FP][qidx])
            );
         }
@@ -518,28 +504,20 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
             // redo calculations for these two
             int qidx = qual - g.min_qual;
 
-            int query_tot = query_counts[type][ERRTYPE_TP][qidx] +
-                        query_counts[type][ERRTYPE_FP][qidx] +
-                        query_counts[type][ERRTYPE_PP][qidx];
-            float query_tp_f = query_counts[type][ERRTYPE_TP][qidx] +
-                         query_counts[type][PP_FRAC][qidx];
-            float query_fp_f = query_tot - query_tp_f;
+            // define helper variables
+            int query_tp = query_counts[type][ERRTYPE_TP][qidx];
+            int query_fp = query_counts[type][ERRTYPE_FP][qidx];
+            int query_tot = query_tp + query_fp;
             if (query_tot == 0) WARN("No QUERY %s variant calls.", vartype_strs[type].data());
-
-            int truth_tot = truth_counts[type][ERRTYPE_TP][0] +
-                        truth_counts[type][ERRTYPE_PP][0] +
-                        truth_counts[type][ERRTYPE_FN][0];
-            int truth_fn = truth_counts[type][ERRTYPE_FN][0] +
-                             truth_counts[type][ERRTYPE_TP][0] -
-                             truth_counts[type][ERRTYPE_TP][qidx];
-            float truth_tp_f = truth_counts[type][ERRTYPE_TP][qidx] +
-                             truth_counts[type][PP_FRAC][qidx];
+            int truth_tp = truth_counts[type][ERRTYPE_TP][qidx];
+            int truth_fn = truth_counts[type][ERRTYPE_FN][qidx];
+            int truth_tot = truth_tp + truth_fn;
             if (truth_tot == 0) WARN("No TRUTH %s variant calls.", vartype_strs[type].data());
 
-            float precision = truth_tp_f + query_fp_f == 0 ? 
-                1.0f : truth_tp_f / (truth_tp_f + query_fp_f);
-            float recall = truth_tot == 0 ? 1.0f : truth_tp_f / truth_tot;
-            float f1_score = 2*precision*recall / (precision + recall);
+            // calculate summary metrics
+            float precision = query_tot == 0 ? 1 : float(query_tp) / query_tot;
+            float recall = truth_tot == 0 ? 1 : float(truth_tp) / truth_tot;
+            float f1_score = precision+recall ? 2*precision*recall / (precision + recall) : 0;
 
             // print summary
             INFO("%s%s\tQ >= %d\t\t%-16d%-16d%-16d%-16d%f\t%f\t%f\t%f%s",
diff --git a/src/variant.cpp b/src/variant.cpp
index a3119b5..999d8e5 100644
--- a/src/variant.cpp
+++ b/src/variant.cpp
@@ -260,15 +260,14 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt,
     // get categorization
     std::string errtype;
     std::string match_type;
-    switch (this->errtypes[swap][idx]) {
-        case ERRTYPE_TP: errtype = "TP"; match_type = "gm"; break;
-        case ERRTYPE_FP: errtype = "FP"; match_type = "."; break;
-        case ERRTYPE_FN: errtype = "FN"; match_type = "."; break;
-        case ERRTYPE_PP: // for hap.py compatibility
-             errtype = this->credit[swap][idx] >= 0.5 ? "TP" : 
-                 (query ? "FP" : "FN"); 
-             match_type = "lm";
-             break;
+    if (this->credit[swap][idx] == 1) {
+        errtype = "TP"; match_type = "gm";
+    } else if (this->credit[swap][idx] == 0) {
+        errtype = "FP"; match_type = ".";
+    } else if (this->credit[swap][idx] >= g.credit_threshold) {
+        errtype = "TP"; match_type = "lm";
+    } else {
+        errtype = "FP"; match_type = "lm";
     }
 
     fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), 
diff --git a/src/variant.h b/src/variant.h
index 011dc54..635554e 100644
--- a/src/variant.h
+++ b/src/variant.h
@@ -47,14 +47,14 @@ class ctgVariants {
     std::vector<int> right_reaches; // cluster rightmost reach
 
     // set during prec_recall_aln()
-    // error type: TP, FP, FN, PP
+    // error type: TP, FP, FN
     std::vector< std::vector<uint8_t> > errtypes;  
-    // group of variants that participate in TP/PP
+    // group of variants that participate in credit
     std::vector< std::vector<int> > sync_group;    
     // call quality (for truth, of associated call)
     // or for call, min quality in sync group
     std::vector< std::vector<float> > callq;   
-    // fraction of TP for partial positive (PP)
+    // percentage reduction in edit dist of sync group with variants
     std::vector< std::vector<float> > credit;  
 };
 

From adc796755868589a153a6e429eb850c5800b512d Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Wed, 8 Nov 2023 17:52:10 -0500
Subject: [PATCH 08/16] Fixed definitions of NG50 and NGC50

 - phase blocks are now based on phase sets, not backtracking
 - calculate full list of switch and flip positions
 - removed per-phaseblock block_state, added per-supercluster pb_phase
---
 src/cluster.cpp |  17 ++--
 src/cluster.h   |   3 +-
 src/defs.h      |   8 ++
 src/dist.cpp    |   2 +-
 src/globals.h   |   1 +
 src/main.cpp    |   1 +
 src/phase.cpp   | 205 +++++++++++++++++++++++++-----------------------
 src/phase.h     |  10 +--
 src/print.cpp   |  87 ++++++++++----------
 9 files changed, 180 insertions(+), 154 deletions(-)

diff --git a/src/cluster.cpp b/src/cluster.cpp
index 7554df4..53b8152 100644
--- a/src/cluster.cpp
+++ b/src/cluster.cpp
@@ -19,7 +19,8 @@ void ctgSuperclusters::add_supercluster(
         this->superclusters[i>>1][i&1].push_back(brks[i]);
     this->begs.push_back(beg);
     this->ends.push_back(end);
-    this->phase.push_back(PHASE_NONE);
+    this->sc_phase.push_back(PHASE_NONE);
+    this->pb_phase.push_back(PHASE_NONE);
     this->orig_phase_dist.push_back(-1);
     this->swap_phase_dist.push_back(-1);
     this->n++;
@@ -29,7 +30,7 @@ void ctgSuperclusters::set_phase(
            int sc_idx, int phase, 
            int orig_phase_dist, 
            int swap_phase_dist) {
-    this->phase[sc_idx] = phase;
+    this->sc_phase[sc_idx] = phase;
     this->orig_phase_dist[sc_idx] = orig_phase_dist;
     this->swap_phase_dist[sc_idx] = swap_phase_dist;
 }
@@ -379,8 +380,8 @@ void superclusterData::transfer_phase_sets() {
         total_bases += lengths[i];
     }
 
-    int query_ps_ng50 = calc_ng50(query_phase_set_sizes, total_bases);
-    int truth_ps_ng50 = calc_ng50(truth_phase_set_sizes, total_bases);
+    int query_pb_ng50 = calc_ng50(query_phase_set_sizes, total_bases);
+    int truth_pb_ng50 = calc_ng50(truth_phase_set_sizes, total_bases);
 
     // print
     if (total_multiple_phase_sets)
@@ -391,10 +392,10 @@ void superclusterData::transfer_phase_sets() {
         WARN("%d total phase sets (PS) are in non-increasing order",
                 total_non_increasing)
 
-    if (g.verbosity >= 1) INFO("        Input QUERY phase sets: %d", total_query_phase_sets);
-    if (g.verbosity >= 1) INFO("    Input QUERY phase set NG50: %d", query_ps_ng50);
-    if (g.verbosity >= 1) INFO("        Input TRUTH phase sets: %d", total_truth_phase_sets);
-    if (g.verbosity >= 1) INFO("    Input TRUTH phase set NG50: %d", truth_ps_ng50);
+    if (g.verbosity >= 1) INFO("              QUERY phase sets: %d", total_query_phase_sets);
+    if (g.verbosity >= 1) INFO("        QUERY phase block NG50: %d", query_pb_ng50);
+    if (g.verbosity >= 1) INFO("              TRUTH phase sets: %d", total_truth_phase_sets);
+    if (g.verbosity >= 1) INFO("        TRUTH phase block NG50: %d", truth_pb_ng50);
 }
 
 
diff --git a/src/cluster.h b/src/cluster.h
index d157780..ee04a98 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -36,7 +36,8 @@ class ctgSuperclusters {
     // data (length n)
     int n = 0;                                         // number of superclusters
     std::vector<int> begs, ends;                       // ref beg/end pos across haps and truth/query
-    std::vector<int> phase;                            // keep/swap/unknown, from alignment
+    std::vector<int> sc_phase;                         // keep/swap/unknown, from alignment
+    std::vector<int> pb_phase;                         // keep/swap, from phasing algorithm
     std::vector<int> phase_sets;                       // input, from first variant in sc
     std::vector<int> orig_phase_dist, swap_phase_dist; // alignment distances
 };
diff --git a/src/defs.h b/src/defs.h
index 16ec893..707a5ab 100644
--- a/src/defs.h
+++ b/src/defs.h
@@ -65,6 +65,14 @@ class idx2;
 #define ERRTYPE_UN 5 // unknown
 #define ERRTYPES   6
 
+#define SWITCHTYPE_FLIP       0
+#define SWITCHTYPE_SWITCH     1
+#define SWITCHTYPE_SWITCH_ERR 2
+#define SWITCHTYPE_FLIP_BEG   3
+#define SWITCHTYPE_FLIP_END   4
+#define SWITCHTYPE_NONE       5
+#define SWITCHTYPES           6
+
 // runtime timers
 #define TIME_READ     0
 #define TIME_CLUST    1
diff --git a/src/dist.cpp b/src/dist.cpp
index c55bf77..4422b26 100644
--- a/src/dist.cpp
+++ b/src/dist.cpp
@@ -1944,7 +1944,7 @@ editData edits_wrapper(std::shared_ptr<superclusterData> clusterdata_ptr) {
                     sc->begs[sc_idx], sc->ends[sc_idx], clusterdata_ptr->ref, ctg
             );
 
-            int phase = sc->phase[sc_idx];
+            int phase = sc->sc_phase[sc_idx];
 
             /////////////////////////////////////////////////////////////////////
             // SMITH-WATERMAN DISTANCE: don't allow skipping called variants     
diff --git a/src/globals.h b/src/globals.h
index 53c30ae..e609859 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -92,5 +92,6 @@ extern std::vector<std::string> gt_strs;
 extern std::vector<std::string> region_strs;
 extern std::vector<std::string> aln_strs;
 extern std::vector<std::string> phase_strs;
+extern std::vector<std::string> switch_strs;
 
 #endif
diff --git a/src/main.cpp b/src/main.cpp
index c32d09e..1bd5438 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -23,6 +23,7 @@ std::vector<std::string> callset_strs = {"QUERY", "TRUTH"};
 std::vector<std::string> phase_strs = {"=", "X", "?"};
 std::vector<std::string> timer_strs = {"reading", "clustering", "realigning", 
     "reclustering", "superclustering", "precision/recall", "edit distance", "phasing", "writing", "total"};
+std::vector<std::string> switch_strs = {"FLIP", "SWITCH", "SWITCH_ERR", "FLIP_BEG", "FLIP_END", "NONE"};
  
 int main(int argc, char **argv) {
 
diff --git a/src/phase.cpp b/src/phase.cpp
index 67324a1..fce761e 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -51,8 +51,8 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
 
         // set supercluster flip/swap based on phaseblock and sc phasing
         int phase_block = 0;
-        bool phase_switch = ctg_pbs->block_states[phase_block];
-        int phase_sc = ctg_scs->phase[sc_idx];
+        bool phase_switch = ctg_scs->pb_phase[sc_idx];
+        int phase_sc = ctg_scs->sc_phase[sc_idx];
         bool phase_flip, swap;
         if (phase_switch) {
             if (phase_sc == PHASE_ORIG) { // flip error
@@ -103,12 +103,12 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
             if (pos >= ctg_scs->ends[sc_idx]) {
                 // update supercluster
                 sc_idx++;
-                phase_sc = ctg_scs->phase[sc_idx];
+                phase_sc = ctg_scs->sc_phase[sc_idx];
 
                 // update phase block
                 if (sc_idx >= ctg_pbs->phase_blocks[phase_block+1])
                     phase_block++;
-                phase_switch = ctg_pbs->block_states[phase_block];
+                phase_switch = ctg_scs->pb_phase[sc_idx];
 
                 // update switch/flip status
                 if (phase_switch) {
@@ -240,9 +240,25 @@ phaseblockData::phaseblockData(std::shared_ptr<superclusterData> clusterdata_ptr
     this->ref = clusterdata_ptr->ref;
 
     // add pointers to clusters
-    for (std::string ctg : clusterdata_ptr->contigs) {
+    for (std::string ctg : this->contigs) {
         this->phase_blocks[ctg]->ctg_superclusters = clusterdata_ptr->superclusters[ctg];
     }
+
+    // add phase blocks based on phase sets for each contig
+    for (std::string ctg : this->contigs) {
+        std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
+        std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
+        int curr_phase_set = -1;
+        for (int sc_idx = 0; sc_idx < ctg_scs->n; sc_idx++) {
+            if (ctg_scs->phase_sets[sc_idx] != curr_phase_set) {
+                ctg_pbs->phase_blocks.push_back(sc_idx);
+                ctg_pbs->n++;
+                curr_phase_set = ctg_scs->phase_sets[sc_idx];
+            }
+        }
+        ctg_pbs->phase_blocks.push_back(ctg_scs->n);
+    }
+    
     this->phase();
 }
 
@@ -268,7 +284,7 @@ void phaseblockData::phase()
 
             // determine costs (penalized if this phasing deemed incorrect)
             std::vector<int> costs = {0, 0};
-            switch (ctg_scs->phase[i]) {
+            switch (ctg_scs->sc_phase[i]) {
                 case PHASE_NONE: 
                     costs[PHASE_PTR_KEEP] = 0; 
                     costs[PHASE_PTR_SWAP] = 0; 
@@ -282,7 +298,7 @@ void phaseblockData::phase()
                     costs[PHASE_PTR_SWAP] = 0; 
                      break;
                 default:
-                    ERROR("Unexpected phase (%d)", ctg_scs->phase[i]);
+                    ERROR("Unexpected phase (%d)", ctg_scs->sc_phase[i]);
                     break;
             }
 
@@ -313,28 +329,27 @@ void phaseblockData::phase()
                 phase = 1;
 
             int i = ctg_scs->n-1;
-            ctg_pbs->phase_blocks.push_back(i+1);
             while (i > 0) {
-                if (ptrs[phase][i] == PHASE_PTR_SWAP) { // new phase block
-                    ctg_pbs->n++;
+                ctg_scs->pb_phase[i] = phase;
+                if (ptrs[phase][i] == PHASE_PTR_SWAP) {
 
                     // not a switch error if between phase sets
-                    if (i+1 < ctg_scs->n && ctg_scs->phase_sets[i] == ctg_scs->phase_sets[i+1])
+                    if (i+1 < ctg_scs->n && ctg_scs->phase_sets[i] == ctg_scs->phase_sets[i+1]) {
+                        ctg_pbs->switches.push_back(i+1);
                         ctg_pbs->nswitches++;
+                    }
 
-                    ctg_pbs->phase_blocks.push_back(i+1);
-                    ctg_pbs->block_states.push_back(phase);
                     phase ^= 1;
                 } else if (ptrs[phase][i] == PHASE_PTR_KEEP) { // within phase block
-                    if (ctg_scs->phase[i] != PHASE_NONE && ctg_scs->phase[i] != phase)
+                    if (ctg_scs->sc_phase[i] != PHASE_NONE && ctg_scs->sc_phase[i] != phase) {
+                        ctg_pbs->flips.push_back(i);
                         ctg_pbs->nflips++;
+                    }
                 }
                 i--;
             }
-            ctg_pbs->phase_blocks.push_back(0);
-            std::reverse(ctg_pbs->phase_blocks.begin(), ctg_pbs->phase_blocks.end());
-            ctg_pbs->block_states.push_back(phase);
-            std::reverse(ctg_pbs->block_states.begin(), ctg_pbs->block_states.end());
+            std::reverse(ctg_pbs->flips.begin(), ctg_pbs->flips.end());
+            std::reverse(ctg_pbs->switches.begin(), ctg_pbs->switches.end());
         }
     }
     
@@ -346,28 +361,10 @@ void phaseblockData::phase()
     int superclusters = 0;
     if (g.verbosity >= 1) INFO("  Contigs:");
     int id = 0;
-    bool print = false;
     for (std::string ctg : this->contigs) {
         std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
         std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
 
-        // verbose, print phase of each supercluster
-        if (print) {
-            int s = 0;
-            std::vector<std::string> colors {"\033[34m", "\033[32m"};
-            int color_idx = 0;
-            for (int i = 0; i < ctg_scs->n; i++) {
-                if (i == ctg_pbs->phase_blocks[s]) {
-                    printf("%s", colors[color_idx].data());
-                    color_idx ^= 1;
-                    s++;
-                }
-                int phase = ctg_scs->phase[i];
-                printf("%s", phase_strs[phase].data());
-            }
-            printf("\033[0m\n");
-        }
-
         // print errors per contig
         if (g.verbosity >= 1) {
             INFO("    [%2d] %s: %d switch errors, %d flip errors, %d phase blocks", id, ctg.data(), 
@@ -379,26 +376,28 @@ void phaseblockData::phase()
         phase_blocks += ctg_pbs->n;
         id++;
     }
-    int ng50 = this->calculate_ng50();
-    int ngc50 = this->calculate_ngc50();
+    int ng50 = this->calculate_ng50(false, false);
+    int s_ngc50 = this->calculate_ng50(true, false);
+    int sf_ngc50 = this->calculate_ng50(true, true);
 
     if (g.verbosity >= 1) INFO(" ");
-    if (g.verbosity >= 1) INFO("   Total switch errors: %d", switch_errors);
-    if (g.verbosity >= 1) INFO("   Total   flip errors: %d", flip_errors);
-    if (g.verbosity >= 1) INFO("   Total  phase blocks: %d", phase_blocks);
-    if (g.verbosity >= 1) INFO("     Phase block  NG50: %d", ng50);
-    if (g.verbosity >= 1) INFO("     Phase block NGC50: %d", ngc50);
+    if (g.verbosity >= 1) INFO("             Total  phase blocks: %d", phase_blocks);
+    if (g.verbosity >= 1) INFO("             Total switch errors: %d", switch_errors);
+    if (g.verbosity >= 1) INFO("             Total   flip errors: %d", flip_errors);
     if (g.verbosity >= 1 && superclusters) 
-        INFO("  SC Switch error rate: %.6f%%", 100*switch_errors/float(superclusters));
+        INFO("  Supercluster switch error rate: %.6f%%", 100*switch_errors/float(superclusters));
     if (g.verbosity >= 1 && superclusters) 
-        INFO("    SC Flip error rate: %.6f%%", 100*flip_errors/float(superclusters));
+        INFO("    Supercluster flip error rate: %.6f%%", 100*flip_errors/float(superclusters));
+    if (g.verbosity >= 1) INFO("               Phase block  NG50: %d", ng50);
+    if (g.verbosity >= 1) INFO("  (switch)     Phase block NGC50: %d", s_ngc50);
+    if (g.verbosity >= 1) INFO("  (switchflip) Phase block NGC50: %d", sf_ngc50);
 }
 
 
 /*******************************************************************************/
 
 
-int phaseblockData::calculate_ngc50() {
+int phaseblockData::calculate_ng50(bool break_on_switch, bool break_on_flip) {
 
     // get total bases in genome
     size_t total_bases = 0;
@@ -412,23 +411,69 @@ int phaseblockData::calculate_ngc50() {
         
         std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
         std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
-        for (int pb = 0; pb < ctg_pbs->n && ctg_scs->n > 0; pb++) { // each phase block
-
-            // init start of this correct block
-            int sc = ctg_pbs->phase_blocks[pb];
-            int beg = ctg_scs->begs[sc];
-            int end = 0;
-            for (; sc < ctg_pbs->phase_blocks[pb+1]; sc++) { // each superclsuter
-                if (ctg_scs->phase[sc] != PHASE_NONE &&  // flip error
-                        ctg_scs->phase[sc] != ctg_pbs->block_states[pb]) {
-                    end = ctg_scs->ends[sc-1];
-                    correct_blocks.push_back(end-beg);
-                    beg = ctg_scs->begs[sc]; // reset
-                }
+
+        int switch_idx = 0;
+        int flip_idx = 0;
+        int pb_idx = 1;
+        if (ctg_scs->n == 0) continue;
+
+        // init start of this correct block
+        int sc = 0;
+        int next_sc = ctg_scs->n; // default to last
+        int beg = ctg_scs->begs[0];
+        int end = 0;
+        int type = SWITCHTYPE_NONE;
+
+        while (true) {
+
+            // get type and supercluster of next switch/flip/phaseset
+            // check flip first because it will cause a second switch
+            if (break_on_flip && flip_idx < ctg_pbs->nflips && ctg_pbs->flips[flip_idx] < next_sc) {
+                type = SWITCHTYPE_FLIP;
+                next_sc = ctg_pbs->flips[flip_idx];
+            }
+            if (pb_idx < ctg_pbs->n && ctg_pbs->phase_blocks[pb_idx] < next_sc) {
+                type = SWITCHTYPE_SWITCH;
+                next_sc = ctg_pbs->phase_blocks[pb_idx];
+            }
+            if (break_on_switch && switch_idx < ctg_pbs->nswitches && ctg_pbs->switches[switch_idx] < next_sc) {
+                type = SWITCHTYPE_SWITCH_ERR;
+                next_sc = ctg_pbs->switches[switch_idx];
+            }
+            if (type == SWITCHTYPE_NONE) { // all out-of-bounds
+                break;
             }
-            end = ctg_scs->ends[sc-1];
-            correct_blocks.push_back(end-beg);
+            if (next_sc <= sc) ERROR("Next SC is not after current SC");
+
+
+            // get block(s)
+            if (type == SWITCHTYPE_FLIP) {
+                end = ctg_scs->ends[next_sc-1];
+                correct_blocks.push_back(end-beg);
+                beg = ctg_scs->begs[next_sc];
+
+                end = ctg_scs->ends[next_sc];
+                correct_blocks.push_back(end-beg);
+                beg = ctg_scs->begs[next_sc+1];
+                flip_idx++;
+            } else if (type == SWITCHTYPE_SWITCH) {
+                end = ctg_scs->ends[next_sc-1];
+                correct_blocks.push_back(end-beg);
+                beg = ctg_scs->begs[next_sc];
+                pb_idx++;
+            } else if (type == SWITCHTYPE_SWITCH_ERR) {
+                end = ctg_scs->ends[next_sc-1];
+                correct_blocks.push_back(end-beg);
+                beg = ctg_scs->begs[next_sc];
+                switch_idx++;
+            }
+
+            sc = next_sc;
+            next_sc = ctg_scs->n; // reset to end
+            type = SWITCHTYPE_NONE;
         }
+        end = ctg_scs->ends[ctg_scs->n-1];
+        correct_blocks.push_back(end-beg);
     }
 
     // return NGC50
@@ -441,41 +486,3 @@ int phaseblockData::calculate_ngc50() {
     }
     return 0;
 }
-
-
-/*******************************************************************************/
-
-
-int phaseblockData::calculate_ng50() {
-
-    // get total bases in genome
-    size_t total_bases = 0;
-    for (size_t i = 0; i < this->contigs.size(); i++) {
-        total_bases += lengths[i];
-    }
-
-    // get sizes of each phase block
-    std::vector<int> phase_blocks;
-    for (std::string ctg: this->contigs) {
-        
-        std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
-        std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
-        for (int i = 0; i < ctg_pbs->n && ctg_scs->n > 0; i++) {
-            int beg_idx = ctg_pbs->phase_blocks[i];
-            int end_idx = ctg_pbs->phase_blocks[i+1]-1;
-            int beg = ctg_scs->begs[beg_idx];
-            int end = ctg_scs->ends[end_idx];
-            phase_blocks.push_back(end-beg);
-        }
-    }
-
-    // return NG50
-    size_t total_phased = 0;
-    std::sort(phase_blocks.begin(), phase_blocks.end(), std::greater<>());
-    for (size_t i = 0; i < phase_blocks.size(); i++) {
-        total_phased += phase_blocks[i];
-        if (total_phased >= total_bases / 2)
-            return phase_blocks[i];
-    }
-    return 0;
-}
diff --git a/src/phase.h b/src/phase.h
index 1b346b8..962b037 100644
--- a/src/phase.h
+++ b/src/phase.h
@@ -13,11 +13,12 @@ class ctgPhaseblocks {
 
     std::shared_ptr<ctgSuperclusters> ctg_superclusters = nullptr;
 
-    int n = 1;                       // number of phase blocks
+    int n = 0;                       // number of phase blocks
 
     // data, filled out during phase()
-    std::vector<int> phase_blocks;   // n+1, indices of superclusters where phasing switches
-    std::vector<int> block_states;   // n, phasing for each phase block (keep/swap)
+    std::vector<int> phase_blocks;   // n+1, supercluster indices of new phase sets
+    std::vector<int> switches;       // switch occurs before this supercluster
+    std::vector<int> flips;          // indices of flipped superclusters
     int nswitches = 0;               // number of phase switch errors
     int nflips = 0;                  // number of supercluster phase flip errors
 };
@@ -28,8 +29,7 @@ class phaseblockData {
 
     void write_summary_vcf(std::string vcf_fn);
     void phase();
-    int calculate_ng50();
-    int calculate_ngc50();
+    int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false);
 
     std::shared_ptr<fastaData> ref;
     std::vector<std::string> contigs;
diff --git a/src/print.cpp b/src/print.cpp
index 3d36abd..52b6f59 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -346,10 +346,10 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
 
                     // get swap info
                     bool swap;
-                    switch (ctg_scs->phase[sci]) {
+                    switch (ctg_scs->sc_phase[sci]) {
                         case PHASE_ORIG: swap = false; break;
                         case PHASE_SWAP: swap = true; break;
-                        default: swap = ctg_pbs->block_states[pbi]; break;
+                        default: swap = ctg_scs->pb_phase[sci]; break;
                     }
 
                     for (int i = ctg_vars[QUERY][h]->clusters[
@@ -390,10 +390,10 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
 
                     // get swap info
                     bool swap;
-                    switch (ctg_scs->phase[sci]) {
+                    switch (ctg_scs->sc_phase[sci]) {
                         case PHASE_ORIG: swap = false; break;
                         case PHASE_SWAP: swap = true; break;
-                        default: swap = ctg_pbs->block_states[pbi]; break;
+                        default: swap = ctg_scs->pb_phase[sci]; break;
                     }
 
                     for (int i = ctg_vars[TRUTH][h]->clusters[
@@ -699,7 +699,7 @@ void write_results(
         std::string out_phaseblocks_fn = g.out_prefix + "phase-blocks.tsv";
         FILE* out_phaseblocks = fopen(out_phaseblocks_fn.data(), "w");
         if (g.verbosity >= 1) INFO("  Writing phasing results to '%s'", out_phaseblocks_fn.data());
-        fprintf(out_phaseblocks, "CONTIG\tPHASE_BLOCK\tSTART\tSTOP\tSIZE\tSUPERCLUSTERS\tBLOCK_STATE\n");
+        fprintf(out_phaseblocks, "CONTIG\tPHASE_BLOCK\tSTART\tSTOP\tSIZE\tSUPERCLUSTERS\tFLIP_ERRORS\tSWITCH_ERRORS\n");
         for (std::string ctg : phasedata_ptr->contigs) {
             std::shared_ptr<ctgPhaseblocks> ctg_pbs = phasedata_ptr->phase_blocks[ctg];
             std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
@@ -708,9 +708,16 @@ void write_results(
                 int end_idx = ctg_pbs->phase_blocks[i+1]-1;
                 int beg = ctg_scs->begs[beg_idx];
                 int end = ctg_scs->ends[end_idx];
-                int phase = ctg_pbs->block_states[i];
-                fprintf(out_phaseblocks, "%s\t%d\t%d\t%d\t%d\t%d\t%d\n", 
-                        ctg.data(), i, beg, end, end-beg, end_idx-beg_idx+1, phase);
+                int nswitches = 0;
+                for (int si = 0; si < ctg_pbs->nswitches; si++) {
+                    if (ctg_pbs->switches[si] > beg_idx && ctg_pbs->switches[si] <= end_idx) nswitches++;
+                }
+                int nflips = 0;
+                for (int fi = 0; fi < ctg_pbs->nflips; fi++) {
+                    if (ctg_pbs->flips[fi] > beg_idx && ctg_pbs->flips[fi] <= end_idx) nflips++;
+                }
+                fprintf(out_phaseblocks, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", 
+                        ctg.data(), i, beg, end, end-beg, end_idx-beg_idx+1, nflips, nswitches);
             }
         }
         fclose(out_phaseblocks);
@@ -733,8 +740,8 @@ void write_results(
                 }
 
                 // set supercluster flip/swap based on phaseblock and sc phasing
-                bool phase_switch = ctg_pbs->block_states[phase_block_idx];
-                int phase_sc = ctg_scs->phase[i];
+                bool phase_switch = ctg_scs->pb_phase[i];
+                int phase_sc = ctg_scs->sc_phase[i];
                 bool flip_error;
                 if (phase_switch) {
                     flip_error = phase_sc == PHASE_ORIG;
@@ -790,7 +797,7 @@ void write_results(
             std::shared_ptr<ctgVariants> query1_vars = ctg_scs->ctg_variants[QUERY][HAP1];
             std::shared_ptr<ctgVariants> query2_vars = ctg_scs->ctg_variants[QUERY][HAP2];
 
-            int supercluster_idx = 0;
+            int sci = 0;
             int var1_idx = 0;
             int var2_idx = 0;
             int cluster1_idx = 0;
@@ -804,16 +811,16 @@ void write_results(
                     if (cluster1_idx+1 >= int(query1_vars->clusters.size())) 
                         ERROR("Out of bounds cluster during write_results(): query1")
                     if (query1_vars->clusters[cluster1_idx+1] <= var1_idx) cluster1_idx++;
-                    if (supercluster_idx >= int(ctg_scs->begs.size())) 
+                    if (sci >= int(ctg_scs->begs.size())) 
                         ERROR("Out of bounds supercluster during write_results(): query1")
-                    while (query1_vars->poss[var1_idx] >= ctg_scs->ends[supercluster_idx])
-                        supercluster_idx++;
+                    while (query1_vars->poss[var1_idx] >= ctg_scs->ends[sci])
+                        sci++;
 
                     // update phasing info
-                    if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1])
+                    if (sci >= ctg_pbs->phase_blocks[phase_block+1])
                         phase_block++;
-                    bool phase_switch = ctg_pbs->block_states[phase_block];
-                    int phase_sc = ctg_scs->phase[supercluster_idx];
+                    bool phase_switch = ctg_scs->pb_phase[sci];
+                    int phase_sc = ctg_scs->sc_phase[sci];
                     bool swap;
                     switch (phase_sc) {
                         case PHASE_ORIG: swap = false; break;
@@ -832,7 +839,7 @@ void write_results(
                             error_strs[query1_vars->errtypes[swap][var1_idx]].data(),
                             query1_vars->credit[swap][var1_idx],
                             cluster1_idx,
-                            supercluster_idx,
+                            sci,
                             query1_vars->sync_group[swap][var1_idx],
                             region_strs[query1_vars->locs[var1_idx]].data()
                            );
@@ -841,16 +848,16 @@ void write_results(
                     if (cluster2_idx+1 >= int(query2_vars->clusters.size()))
                         ERROR("Out of bounds cluster during write_results(): query2")
                     if (query2_vars->clusters[cluster2_idx+1] <= var2_idx) cluster2_idx++;
-                    if (supercluster_idx >= int(ctg_scs->begs.size())) 
+                    if (sci >= int(ctg_scs->begs.size())) 
                         ERROR("Out of bounds supercluster during write_results(): query2")
-                    while (query2_vars->poss[var2_idx] >= ctg_scs->ends[supercluster_idx])
-                        supercluster_idx++;
+                    while (query2_vars->poss[var2_idx] >= ctg_scs->ends[sci])
+                        sci++;
 
                     // update phasing info
-                    if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1])
+                    if (sci >= ctg_pbs->phase_blocks[phase_block+1])
                         phase_block++;
-                    bool phase_switch = ctg_pbs->block_states[phase_block];
-                    int phase_sc = ctg_scs->phase[supercluster_idx];
+                    bool phase_switch = ctg_scs->pb_phase[sci];
+                    int phase_sc = ctg_scs->sc_phase[sci];
                     bool swap;
                     switch (phase_sc) {
                         case PHASE_ORIG: swap = false; break;
@@ -869,7 +876,7 @@ void write_results(
                             error_strs[query2_vars->errtypes[swap][var2_idx]].data(),
                             query2_vars->credit[swap][var2_idx],
                             cluster2_idx,
-                            supercluster_idx,
+                            sci,
                             query2_vars->sync_group[swap][var2_idx],
                             region_strs[query2_vars->locs[var2_idx]].data()
                            );
@@ -896,7 +903,7 @@ void write_results(
             int var2_idx = 0;
             int cluster1_idx = 0;
             int cluster2_idx = 0;
-            int supercluster_idx = 0;
+            int sci = 0;
             int phase_block = 0;
             while (var1_idx < truth1_vars->n || var2_idx < truth2_vars->n) {
                 if (var2_idx >= truth2_vars->n || // only truth1 has remaining vars
@@ -904,16 +911,16 @@ void write_results(
                     if (cluster1_idx+1 >= int(truth1_vars->clusters.size()))
                         ERROR("Out of bounds cluster during write_results(): truth1")
                     if (truth1_vars->clusters[cluster1_idx+1] <= var1_idx) cluster1_idx++;
-                    if (supercluster_idx >= int(ctg_scs->begs.size())) 
+                    if (sci >= int(ctg_scs->begs.size())) 
                         ERROR("Out of bounds supercluster during write_results(): truth1")
-                    while (truth1_vars->poss[var1_idx] >= ctg_scs->ends[supercluster_idx])
-                        supercluster_idx++;
+                    while (truth1_vars->poss[var1_idx] >= ctg_scs->ends[sci])
+                        sci++;
 
                     // update phasing info
-                    if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1])
+                    if (sci >= ctg_pbs->phase_blocks[phase_block+1])
                         phase_block++;
-                    bool phase_switch = ctg_pbs->block_states[phase_block];
-                    int phase_sc = ctg_scs->phase[supercluster_idx];
+                    bool phase_switch = ctg_scs->pb_phase[sci];
+                    int phase_sc = ctg_scs->sc_phase[sci];
                     bool swap;
                     switch (phase_sc) {
                         case PHASE_ORIG: swap = false; break;
@@ -932,7 +939,7 @@ void write_results(
                             error_strs[truth1_vars->errtypes[swap][var1_idx]].data(),
                             truth1_vars->credit[swap][var1_idx],
                             cluster1_idx,
-                            supercluster_idx,
+                            sci,
                             truth1_vars->sync_group[swap][var1_idx],
                             region_strs[truth1_vars->locs[var1_idx]].data()
                            );
@@ -942,16 +949,16 @@ void write_results(
                         ERROR("Out of bounds cluster during write_results(): truth2")
                     }
                     if (truth2_vars->clusters[cluster2_idx+1] <= var2_idx) cluster2_idx++;
-                    if (supercluster_idx >= int(ctg_scs->begs.size())) 
+                    if (sci >= int(ctg_scs->begs.size())) 
                         ERROR("Out of bounds supercluster during write_results(): truth2")
-                    while (truth2_vars->poss[var2_idx] >= ctg_scs->ends[supercluster_idx])
-                        supercluster_idx++;
+                    while (truth2_vars->poss[var2_idx] >= ctg_scs->ends[sci])
+                        sci++;
 
                     // update phasing info
-                    if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1])
+                    if (sci >= ctg_pbs->phase_blocks[phase_block+1])
                         phase_block++;
-                    bool phase_switch = ctg_pbs->block_states[phase_block];
-                    int phase_sc = ctg_scs->phase[supercluster_idx];
+                    bool phase_switch = ctg_scs->pb_phase[sci];
+                    int phase_sc = ctg_scs->sc_phase[sci];
                     bool swap;
                     switch (phase_sc) {
                         case PHASE_ORIG: swap = false; break;
@@ -970,7 +977,7 @@ void write_results(
                             error_strs[truth2_vars->errtypes[swap][var2_idx]].data(),
                             truth2_vars->credit[swap][var2_idx],
                             cluster2_idx,
-                            supercluster_idx,
+                            sci,
                             truth2_vars->sync_group[swap][var2_idx],
                             region_strs[truth2_vars->locs[var2_idx]].data()
                            );

From 4fce6a220dac5a722bfb84b2a6e9324f8b530822 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Wed, 8 Nov 2023 18:11:32 -0500
Subject: [PATCH 09/16] Only write truth/query VCF if realign selected.

---
 src/main.cpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 1bd5438..e0f2a67 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -49,15 +49,17 @@ int main(int argc, char **argv) {
 
     // write results
     if (g.write) {
-        g.timers[TIME_WRITE].start();
-        if (g.verbosity >= 1) INFO(" ");
-        if (g.verbosity >= 1) INFO("  Writing original query VCF to '%s'", 
-                std::string(g.out_prefix + "orig-query.vcf").data());
+        if (g.realign_query) {
+            g.timers[TIME_WRITE].start();
+            if (g.verbosity >= 1) INFO("  Writing original query VCF to '%s'", 
+                    std::string(g.out_prefix + "orig-query.vcf").data());
             query_ptr->write_vcf(g.out_prefix + "orig-query.vcf");
-        if (g.verbosity >= 1) INFO("  Writing original truth VCF to '%s'", 
-                std::string(g.out_prefix + "orig-truth.vcf").data());
+        } else if (g.realign_truth) {
+            if (g.verbosity >= 1) INFO("  Writing original truth VCF to '%s'", 
+                    std::string(g.out_prefix + "orig-truth.vcf").data());
             truth_ptr->write_vcf(g.out_prefix + "orig-truth.vcf");
-        g.timers[TIME_WRITE].stop();
+            g.timers[TIME_WRITE].stop();
+        }
     }
 
     // ensure each input contains all contigs in BED
@@ -94,7 +96,7 @@ int main(int argc, char **argv) {
         g.timers[TIME_REALN].stop();
     }
 
-    if (g.realign_only && g.write) { // realign only, exit early
+    if (g.realign_only && g.realign_query && g.write) { // realign only, exit early
         g.timers[TIME_WRITE].start();
         query_ptr->write_vcf(g.out_prefix + "query.vcf");
         g.timers[TIME_WRITE].stop();
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
     }
 
     if (g.realign_only) { // realign only, exit early
-        if (g.write) {
+        if (g.realign_truth && g.write) {
             g.timers[TIME_WRITE].start();
             truth_ptr->write_vcf(g.out_prefix + "truth.vcf");
             g.timers[TIME_WRITE].stop();

From a20c152ee4d223529ef889c69e17e00571210738 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 09:57:40 -0500
Subject: [PATCH 10/16] Added phasing summary TSV.

---
 src/phase.cpp | 18 ++++++++++++++++++
 src/phase.h   |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/src/phase.cpp b/src/phase.cpp
index fce761e..6cbd1d4 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -391,6 +391,24 @@ void phaseblockData::phase()
     if (g.verbosity >= 1) INFO("               Phase block  NG50: %d", ng50);
     if (g.verbosity >= 1) INFO("  (switch)     Phase block NGC50: %d", s_ngc50);
     if (g.verbosity >= 1) INFO("  (switchflip) Phase block NGC50: %d", sf_ngc50);
+    this->write_phasing_summary(phase_blocks, switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50);
+}
+
+
+/*******************************************************************************/
+
+
+void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors,
+        int flip_errors, int ng50, int s_ngc50, int sf_ngc50) {
+    std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv";
+    if (g.verbosity >= 1) INFO("  Writing phasing summary to '%s'", 
+            out_phasing_summary_fn.data());
+    FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w");
+    fprintf(out_phasing_summary,
+            "PHASE_BLOCKS\tFLIP_ERRORS\tSWITCH_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n");
+    fprintf(out_phasing_summary, "%d\t%d\t%d\t%d\t%d\t%d", phase_blocks, 
+            switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50);
+    fclose(out_phasing_summary);
 }
 
 
diff --git a/src/phase.h b/src/phase.h
index 962b037..b91f359 100644
--- a/src/phase.h
+++ b/src/phase.h
@@ -28,6 +28,8 @@ class phaseblockData {
     phaseblockData(std::shared_ptr<superclusterData> clusterdata_ptr); 
 
     void write_summary_vcf(std::string vcf_fn);
+    void write_phasing_summary(int phase_blocks, int switch_errors,
+        int flip_errors, int ng50, int s_ngc50, int sf_ngc50);
     void phase();
     int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false);
 

From 9cf032ac1ddb40a44d1b1b95327ad3e427bd4790 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 10:45:06 -0500
Subject: [PATCH 11/16] Distance calculations are now optional, skipped by
 default.

---
 src/edit.cpp    | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 src/edit.h      |   2 +
 src/globals.cpp |  32 ++++++++----
 src/globals.h   |  42 ++++++++-------
 src/main.cpp    |  26 +++++++---
 src/phase.cpp   |   1 +
 src/print.cpp   | 135 ++----------------------------------------------
 src/print.h     |   2 +-
 8 files changed, 203 insertions(+), 168 deletions(-)

diff --git a/src/edit.cpp b/src/edit.cpp
index 18667e0..2e56ced 100644
--- a/src/edit.cpp
+++ b/src/edit.cpp
@@ -142,3 +142,134 @@ int editData::get_score(int qual) const {
     }
     return score;
 }
+
+
+/*******************************************************************************/
+
+
+void editData::write_distance() {
+
+    // log all distance results
+    std::string dist_fn = g.out_prefix + "distance.tsv";
+    FILE* out_dists = 0;
+    if (g.write) {
+        out_dists = fopen(dist_fn.data(), "w");
+        if (g.verbosity >= 1) INFO(" ");
+        if (g.verbosity >= 1) INFO("  Writing distance results to '%s'", dist_fn.data());
+        fprintf(out_dists, "MIN_QUAL\tSUB_DE\tINS_DE\tDEL_DE\tSUB_ED\tINS_ED\tDEL_ED\t"
+                "DISTINCT_EDITS\tEDIT_DIST\tALN_SCORE\tALN_QSCORE\n");
+    }
+
+    // get original scores / distance (above g.max_qual, no vars applied)
+    std::vector<int> orig_edit_dists(TYPES, 0);
+    std::vector<int> orig_distinct_edits(TYPES, 0);
+    int orig_score = this->get_score(g.max_qual+1);
+    for (int type = 0; type < TYPES; type++) {
+        orig_edit_dists[type] = this->get_ed(g.max_qual+1, type);
+        orig_distinct_edits[type] = this->get_de(g.max_qual+1, type);
+    }
+
+    std::vector<double> best_score(TYPES, std::numeric_limits<double>::max());
+    std::vector<int> best_qual(TYPES, 0);
+    for (int q = g.min_qual; q <= g.max_qual+1; q++) { // all qualities
+
+        // get ED/DE for each Q threshold, for each type
+        std::vector<int> edit_dists(TYPES, 0);
+        std::vector<int> distinct_edits(TYPES, 0);
+        for (int type = 0; type < TYPES; type++) {
+            edit_dists[type] = this->get_ed(q, type);
+            distinct_edits[type] = this->get_de(q, type);
+
+            // save best Q threshold so far
+            double score = double(edit_dists[type]) * distinct_edits[type];
+            if (score < best_score[type]) {
+                best_score[type] = score;
+                best_qual[type] = q;
+            }
+        }
+
+        if (g.write) fprintf(out_dists, 
+                "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n", 
+                q, distinct_edits[TYPE_SUB],
+                distinct_edits[TYPE_INS], distinct_edits[TYPE_DEL],
+                edit_dists[TYPE_SUB], edit_dists[TYPE_INS], edit_dists[TYPE_DEL],
+                distinct_edits[TYPE_ALL], edit_dists[TYPE_ALL],
+                this->get_score(q), qscore(double(this->get_score(q))/orig_score));
+    }
+    if (g.write) fclose(out_dists);
+
+    // summarize distance results
+    std::string dist_summ_fn = g.out_prefix + "distance-summary.tsv";
+    FILE* dists_summ = 0;
+    if (g.write) {
+        dists_summ = fopen(dist_summ_fn.data(), "w");
+        if (g.verbosity >= 1) 
+            INFO("  Writing distance summary to '%s'", dist_summ_fn.data());
+        fprintf(dists_summ, "VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE\n");
+    }
+    INFO(" ");
+    INFO("%sALIGNMENT DISTANCE SUMMARY%s", COLOR_BLUE, COLOR_WHITE);
+    for (int type = 0; type < TYPES; type++) {
+
+        // skip INS/DEL individually unless higher print verbosity
+        if (g.verbosity == 0 && type != TYPE_ALL)
+            continue;
+        if (g.verbosity == 1 && (type == TYPE_INS || type == TYPE_DEL))
+            continue;
+
+        INFO(" ");
+        if (type == TYPE_ALL) {
+            INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE%s",
+                    COLOR_BLUE, COLOR_WHITE);
+        } else {
+            INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE%s",
+                    COLOR_BLUE, COLOR_WHITE);
+        }
+        std::vector<int> quals = {g.min_qual, best_qual[type], g.max_qual+1};
+        for (int q : quals) {
+
+            // fill out ED/DE for selected quals
+            std::vector<int> edit_dists(TYPES, 0);
+            std::vector<int> distinct_edits(TYPES, 0);
+            for (int type = 0; type < TYPES; type++) {
+                edit_dists[type] =this->get_ed(q, type);
+                distinct_edits[type] = this->get_de(q, type);
+            }
+
+            float ed_qscore = qscore(double(edit_dists[type]) / orig_edit_dists[type]);
+            float de_qscore = qscore(double(distinct_edits[type]) / orig_distinct_edits[type]);
+            float all_qscore = type == TYPE_ALL ? qscore(double(this->get_score(q)) / orig_score) : 0;
+
+            // print summary
+            if (g.write) fprintf(dists_summ, "%s\t%d\t%d\t%d\t%f\t%f\t%f\n", type_strs2[type].data(),
+                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore);
+            if (type == TYPE_ALL) {
+                INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f\t%f%s", COLOR_BLUE, type_strs2[type].data(),
+                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore, COLOR_WHITE);
+            } else {
+                INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f%s", COLOR_BLUE, type_strs2[type].data(),
+                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, COLOR_WHITE);
+            }
+        }
+    }
+    INFO(" ");
+    if (g.write) fclose(dists_summ);
+}
+
+
+/*******************************************************************************/
+
+
+void editData::write_edits() {
+    std::string edit_fn = g.out_prefix + "edits.tsv";
+    if (g.verbosity >= 1) INFO("  Writing edit results to '%s'", edit_fn.data());
+    FILE* out_edits = fopen(edit_fn.data(), "w");
+    fprintf(out_edits, "CONTIG\tSTART\tHAP\tTYPE\tSIZE\tSUPERCLUSTER\tMIN_QUAL\tMAX_QUAL\n");
+    for (int i = 0; i < this->n; i++) {
+        fprintf(out_edits, "%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n", 
+                this->ctgs[i].data(), this->poss[i], this->haps[i],
+                type_strs[this->types[i]].data(), this->lens[i],
+                this->superclusters[i], this->min_quals[i], this->max_quals[i]);
+    }
+    fclose(out_edits);
+}
diff --git a/src/edit.h b/src/edit.h
index 49fb0b8..3e56965 100644
--- a/src/edit.h
+++ b/src/edit.h
@@ -24,6 +24,8 @@ class editData {
     int get_ed(int qual, int type=TYPE_ALL) const; // edit distance
     int get_de(int qual, int type=TYPE_ALL) const; // distinct edits
     int get_score(int qual) const;                 // smith-waterman distance
+    void write_distance();
+    void write_edits();
 
     // data (all of size n)
     int n = 0;
diff --git a/src/globals.cpp b/src/globals.cpp
index 6244811..dad8bdd 100644
--- a/src/globals.cpp
+++ b/src/globals.cpp
@@ -178,7 +178,8 @@ void Globals::parse_args(int argc, char ** argv) {
                 ERROR("Invalid maximum variant size provided");
             }
 /*******************************************************************************/
-        } else if (std::string(argv[i]) == "--min-qual") {
+        } else if (std::string(argv[i]) == "-mn" ||
+                std::string(argv[i]) == "--min-qual") {
             i++;
             if (i == argc) {
                 ERROR("Option '--min-qual' used without providing minimum variant quality");
@@ -192,7 +193,8 @@ void Globals::parse_args(int argc, char ** argv) {
                 ERROR("Must provide non-negative minimum variant quality");
             }
 /*******************************************************************************/
-        } else if (std::string(argv[i]) == "--max-qual") {
+        } else if (std::string(argv[i]) == "-mx" ||
+                std::string(argv[i]) == "--max-qual") {
             i++;
             if (i == argc) {
                 ERROR("Option '--max-qual' used without providing maximum variant quality");
@@ -385,6 +387,11 @@ void Globals::parse_args(int argc, char ** argv) {
             if (this->max_ram < 0) {
                 ERROR("Max RAM must be positive");
             }
+/*******************************************************************************/
+        } else if (std::string(argv[i]) == "-d" || 
+                std::string(argv[i]) == "--distance") {
+            i++;
+            g.distance = true;
 /*******************************************************************************/
         } else if (std::string(argv[i]) == "-ro" || 
                 std::string(argv[i]) == "--realign-only") {
@@ -406,7 +413,8 @@ void Globals::parse_args(int argc, char ** argv) {
             i++;
             g.realign_query = true;
 /*******************************************************************************/
-        } else if (std::string(argv[i]) == "--simple-cluster") {
+        } else if (std::string(argv[i]) == "-sc" ||
+                std::string(argv[i]) == "--simple-cluster") {
             i++;
             g.simple_cluster = true;
 /*******************************************************************************/
@@ -481,12 +489,12 @@ void Globals::print_usage() const
     printf("      minimum variant size, smaller variants ignored (SNPs are size 1)\n");
     printf("  -l, --largest-variant <INTEGER> [%d]\n", g.max_size);
     printf("      maximum variant size, larger variants ignored\n");
-    printf("  --min-qual <INTEGER> [%d]\n", g.min_qual);
+    printf("  -mn, --min-qual <INTEGER> [%d]\n", g.min_qual);
     printf("      minimum variant quality, lower qualities ignored\n");
-    printf("  --max-qual <INTEGER> [%d]\n", g.max_qual);
+    printf("  -mx, --max-qual <INTEGER> [%d]\n", g.max_qual);
     printf("      maximum variant quality, higher qualities kept but thresholded\n");
 
-    printf("\n  ReAlignment:\n");
+    printf("\n  Re-Alignment:\n");
     printf("  -rq, --realign-query\n");
     printf("      realign query variants using Smith-Waterman parameters\n");
     printf("  -rt, --realign-truth\n");
@@ -504,6 +512,10 @@ void Globals::print_usage() const
     printf("  -ct, --credit-threshold <FLOAT> [%.2f]\n", g.credit_threshold);
     printf("      minimum partial credit to consider variant a true positive\n");
 
+    printf("\n  Distance:\n");
+    printf("  -d, --distance\n");
+    printf("      flag to include alignment distance calculations, skipped by default\n");
+
     printf("\n  Utilization:\n");
     printf("  -t, --max-threads <INTEGER> [%d]\n", g.max_threads);
     printf("      maximum threads to use for clustering and precision/recall alignment\n");
@@ -526,7 +538,7 @@ void Globals::print_usage() const
     printf("\n  Clustering:\n");
     printf("  -i, --max-iterations <INTEGER> [%d]\n", g.max_cluster_itrs);
     printf("      maximum iterations for expanding/merging clusters\n");
-    printf("  --simple-cluster\n");
+    printf("  -sc, --simple-cluster\n");
     printf("      instead of biWFA-based clustering, use gap-based clustering \n");
     printf("  -g, --cluster-gap <INTEGER> [%d]\n", g.cluster_min_gap);
     printf("      minimum gap between independent clusters and superclusters (in bases),\n");
@@ -539,11 +551,11 @@ void Globals::print_usage() const
 
     printf("\n  Distance:\n");
     printf("  -ex, --eval-mismatch-penalty <INTEGER> [%d]\n", g.eval_sub);
-    printf("      mismatch penalty (distance evaluation)\n");
+    printf("      mismatch penalty (distance evaluation only)\n");
     printf("  -eo, --eval-gap-open-penalty <INTEGER> [%d]\n", g.eval_open);
-    printf("      gap opening penalty (distance evaluation)\n");
+    printf("      gap opening penalty (distance evaluation only)\n");
     printf("  -ee, --eval-gap-extend-penalty <INTEGER> [%d]\n", g.eval_extend);
-    printf("      gap extension penalty (distance evaluation)\n");
+    printf("      gap extension penalty (distance evaluation only)\n");
 
 }
 
diff --git a/src/globals.h b/src/globals.h
index e609859..e2e630e 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -9,6 +9,9 @@
 
 class Globals {
 public:
+    // constructors
+    Globals() {;}
+
     // input files
     std::string ref_fasta_fn;
     FILE* ref_fasta_fp;
@@ -19,7 +22,7 @@ class Globals {
     bool bed_exists = false;
     bool write = true;
 
-    // variant params
+    // variant filtering
     std::vector<std::string> filters;
     std::vector<int> filter_ids;
     int min_qual = 0;
@@ -27,21 +30,32 @@ class Globals {
     int min_size = 1;
     int max_size = 5000;
 
-    // clustering params
-    bool realign_truth = false;
-    bool realign_query = false;
-    bool realign_only = false;
+    // clustering
     bool simple_cluster = false;
     int cluster_min_gap = 50;
     int reach_min_gap = 10;
     int max_cluster_itrs = 4;
 
-    // phasing options
+    // re-alignment
+    bool realign_truth = false;
+    bool realign_query = false;
+    bool realign_only = false;
+    int sub = 5;
+    int open = 6;
+    int extend = 2;
+
+    // phasing
     double phase_threshold = 0.5;
 
-    // precision-recall options
+    // precision-recall
     double credit_threshold = 0.7;
 
+    // edit distance
+    bool distance = false;
+    int eval_sub = 3;
+    int eval_open = 2;
+    int eval_extend = 1;
+
     // memory params
     int max_threads = 64;
     double max_ram = 64; // GB
@@ -55,19 +69,6 @@ class Globals {
     std::string out_prefix;
     std::string cmd;
 
-    // alignment parameters
-    int sub = 5;
-    int open = 6;
-    int extend = 2;
-    int eval_sub = 3;
-    int eval_open = 2;
-    int eval_extend = 1;
-
-    std::vector<timer> timers;
-
-    // constructors
-    Globals() {;}
-
     // member functions
     void parse_args(int argc, char ** argv);
     void print_version() const;
@@ -78,6 +79,7 @@ class Globals {
     // program data
     const std::string VERSION = "2.2.1";
     const std::string PROGRAM = "vcfdist";
+    std::vector<timer> timers;
 };
 
 extern Globals g;
diff --git a/src/main.cpp b/src/main.cpp
index e0f2a67..0d022e7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -210,9 +210,21 @@ int main(int argc, char **argv) {
     g.timers[TIME_PR_ALN].stop();
 
     // calculate edit distance
-    g.timers[TIME_EDITS].start();
-    editData edits = edits_wrapper(clusterdata_ptr);
-    g.timers[TIME_EDITS].stop();
+    editData edits;
+    if (g.distance) {
+        g.timers[TIME_EDITS].start();
+        editData edits = edits_wrapper(clusterdata_ptr);
+        g.timers[TIME_EDITS].stop();
+
+        g.timers[TIME_WRITE].start();
+        edits.write_distance();
+        if (g.write) edits.write_edits();
+        g.timers[TIME_WRITE].stop();
+    } else {
+        if (g.verbosity >= 1) INFO(" ");
+        if (g.verbosity >= 1) INFO("%s[6/8] Skipping distance metrics%s",
+                COLOR_PURPLE, COLOR_WHITE);
+    }
 
     // calculate global phasings
     g.timers[TIME_PHASE].start();
@@ -221,12 +233,14 @@ int main(int argc, char **argv) {
 
     // write supercluster/phaseblock results in CSV format
     g.timers[TIME_WRITE].start();
-    write_results(phasedata_ptr, edits);
+    write_results(phasedata_ptr);
 
     // save new VCF
     if (g.write) {
-        query_ptr->write_vcf(g.out_prefix + "query.vcf");
-        truth_ptr->write_vcf(g.out_prefix + "truth.vcf");
+        if (g.realign_query)
+            query_ptr->write_vcf(g.out_prefix + "query.vcf");
+        if (g.realign_truth)
+            truth_ptr->write_vcf(g.out_prefix + "truth.vcf");
         phasedata_ptr->write_summary_vcf(g.out_prefix + "summary.vcf");
     }
     g.timers[TIME_WRITE].stop();
diff --git a/src/phase.cpp b/src/phase.cpp
index 6cbd1d4..25fc511 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -401,6 +401,7 @@ void phaseblockData::phase()
 void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors,
         int flip_errors, int ng50, int s_ngc50, int sf_ngc50) {
     std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv";
+    if (g.verbosity >= 1) INFO(" ");
     if (g.verbosity >= 1) INFO("  Writing phasing summary to '%s'", 
             out_phasing_summary_fn.data());
     FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w");
diff --git a/src/print.cpp b/src/print.cpp
index 52b6f59..2963412 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -44,14 +44,14 @@ void write_params() {
         "phase_threshold = %f\ncredit_threshold = %f\nrealign_truth = %s\nrealign_query = %s\n"
         "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n"
         "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n"
-        "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n",
+        "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\ndistance = %s",
         g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), 
         g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), 
         filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size,
         g.phase_threshold, g.credit_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(),
         b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap,
         g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram,
-        g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend);
+        g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend, b2s(g.distance).data());
     fclose(out_params);
 }
 
@@ -434,6 +434,7 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
     std::string out_pr_fn = g.out_prefix + "precision-recall.tsv";
     FILE* out_pr = 0;
     if (g.write) {
+        if (g.verbosity >= 1) INFO(" ");
         if (g.verbosity >= 1) INFO("  Writing precision-recall results to '%s'", out_pr_fn.data());
         out_pr = fopen(out_pr_fn.data(), "w");
         fprintf(out_pr, "VAR_TYPE\tMIN_QUAL\tPREC\tRECALL\tF1_SCORE\tF1_QSCORE\t"
@@ -557,143 +558,15 @@ void write_precision_recall(std::unique_ptr<phaseblockData> & phasedata_ptr) {
 /*******************************************************************************/
 
 
-void write_distance(const editData & edits) {
-
-    // log all distance results
-    std::string dist_fn = g.out_prefix + "distance.tsv";
-    FILE* out_dists = 0;
-    if (g.write) {
-        out_dists = fopen(dist_fn.data(), "w");
-        fprintf(out_dists, "MIN_QUAL\tSUB_DE\tINS_DE\tDEL_DE\tSUB_ED\tINS_ED\tDEL_ED\t"
-                "DISTINCT_EDITS\tEDIT_DIST\tALN_SCORE\tALN_QSCORE\n");
-        if (g.verbosity >= 1) INFO("  Writing distance results to '%s'", dist_fn.data());
-    }
-
-    // get original scores / distance (above g.max_qual, no vars applied)
-    std::vector<int> orig_edit_dists(TYPES, 0);
-    std::vector<int> orig_distinct_edits(TYPES, 0);
-    int orig_score = edits.get_score(g.max_qual+1);
-    for (int type = 0; type < TYPES; type++) {
-        orig_edit_dists[type] = edits.get_ed(g.max_qual+1, type);
-        orig_distinct_edits[type] = edits.get_de(g.max_qual+1, type);
-    }
-
-    std::vector<double> best_score(TYPES, std::numeric_limits<double>::max());
-    std::vector<int> best_qual(TYPES, 0);
-    for (int q = g.min_qual; q <= g.max_qual+1; q++) { // all qualities
-
-        // get ED/DE for each Q threshold, for each type
-        std::vector<int> edit_dists(TYPES, 0);
-        std::vector<int> distinct_edits(TYPES, 0);
-        for (int type = 0; type < TYPES; type++) {
-            edit_dists[type] = edits.get_ed(q, type);
-            distinct_edits[type] = edits.get_de(q, type);
-
-            // save best Q threshold so far
-            double score = double(edit_dists[type]) * distinct_edits[type];
-            if (score < best_score[type]) {
-                best_score[type] = score;
-                best_qual[type] = q;
-            }
-        }
-
-        if (g.write) fprintf(out_dists, 
-                "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n", 
-                q, distinct_edits[TYPE_SUB],
-                distinct_edits[TYPE_INS], distinct_edits[TYPE_DEL],
-                edit_dists[TYPE_SUB], edit_dists[TYPE_INS], edit_dists[TYPE_DEL],
-                distinct_edits[TYPE_ALL], edit_dists[TYPE_ALL],
-                edits.get_score(q), qscore(double(edits.get_score(q))/orig_score));
-    }
-    if (g.write) fclose(out_dists);
-
-    // summarize distance results
-    std::string dist_summ_fn = g.out_prefix + "distance-summary.tsv";
-    FILE* dists_summ = 0;
-    if (g.write) {
-        dists_summ = fopen(dist_summ_fn.data(), "w");
-        if (g.verbosity >= 1) 
-            INFO("  Writing distance summary to '%s'", dist_summ_fn.data());
-        fprintf(dists_summ, "VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE\n");
-    }
-    INFO(" ");
-    INFO("%sALIGNMENT DISTANCE SUMMARY%s", COLOR_BLUE, COLOR_WHITE);
-    for (int type = 0; type < TYPES; type++) {
-
-        // skip INS/DEL individually unless higher print verbosity
-        if (g.verbosity == 0 && type != TYPE_ALL)
-            continue;
-        if (g.verbosity == 1 && (type == TYPE_INS || type == TYPE_DEL))
-            continue;
-
-        INFO(" ");
-        if (type == TYPE_ALL) {
-            INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE%s",
-                    COLOR_BLUE, COLOR_WHITE);
-        } else {
-            INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE%s",
-                    COLOR_BLUE, COLOR_WHITE);
-        }
-        std::vector<int> quals = {g.min_qual, best_qual[type], g.max_qual+1};
-        for (int q : quals) {
-
-            // fill out ED/DE for selected quals
-            std::vector<int> edit_dists(TYPES, 0);
-            std::vector<int> distinct_edits(TYPES, 0);
-            for (int type = 0; type < TYPES; type++) {
-                edit_dists[type] = edits.get_ed(q, type);
-                distinct_edits[type] = edits.get_de(q, type);
-            }
-
-            float ed_qscore = qscore(double(edit_dists[type]) / orig_edit_dists[type]);
-            float de_qscore = qscore(double(distinct_edits[type]) / orig_distinct_edits[type]);
-            float all_qscore = type == TYPE_ALL ? qscore(double(edits.get_score(q)) / orig_score) : 0;
-
-            // print summary
-            if (g.write) fprintf(dists_summ, "%s\t%d\t%d\t%d\t%f\t%f\t%f\n", type_strs2[type].data(),
-                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore);
-            if (type == TYPE_ALL) {
-                INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f\t%f%s", COLOR_BLUE, type_strs2[type].data(),
-                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore, COLOR_WHITE);
-            } else {
-                INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f%s", COLOR_BLUE, type_strs2[type].data(),
-                    q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, COLOR_WHITE);
-            }
-        }
-    }
-    INFO(" ");
-    if (g.write) fclose(dists_summ);
-}
-
-
-/*******************************************************************************/
-
-
-void write_results(
-        std::unique_ptr<phaseblockData> & phasedata_ptr, 
-        const editData & edits) {
+void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
     if (g.verbosity >= 1) INFO(" ");
     if (g.verbosity >= 1) INFO("%s[8/8] Writing results%s", COLOR_PURPLE, COLOR_WHITE);
 
     // print summary (precision/recall) information
     write_precision_recall(phasedata_ptr);
 
-    // print distance information
-    write_distance(edits);
-
     // print edit information
     if (g.write) {
-        std::string edit_fn = g.out_prefix + "edits.tsv";
-        FILE* out_edits = fopen(edit_fn.data(), "w");
-        fprintf(out_edits, "CONTIG\tSTART\tHAP\tTYPE\tSIZE\tSUPERCLUSTER\tMIN_QUAL\tMAX_QUAL\n");
-        if (g.verbosity >= 1) INFO("  Writing edit results to '%s'", edit_fn.data());
-        for (int i = 0; i < edits.n; i++) {
-            fprintf(out_edits, "%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n", 
-                    edits.ctgs[i].data(), edits.poss[i], edits.haps[i],
-                    type_strs[edits.types[i]].data(), edits.lens[i],
-                    edits.superclusters[i], edits.min_quals[i], edits.max_quals[i]);
-        }
-        fclose(out_edits);
 
         // print phasing information
         std::string out_phaseblocks_fn = g.out_prefix + "phase-blocks.tsv";
diff --git a/src/print.h b/src/print.h
index 2fbb051..0467a48 100644
--- a/src/print.h
+++ b/src/print.h
@@ -37,7 +37,7 @@ void print_wfa_ptrs(
         const std::vector< std::vector< std::vector<uint8_t> > > & ptrs,
         const std::vector< std::vector< std::vector<int> > > & offs);
 
-void write_results(std::unique_ptr<phaseblockData> & phasings, const editData & edits);
+void write_results(std::unique_ptr<phaseblockData> & phasings);
 void write_params();
 
 #endif

From f169dda9fbb3b6deda9990402aac7e4adbf386da Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 11:26:05 -0500
Subject: [PATCH 12/16] Fixed integer division on phase threshold comparison.

---
 src/dist.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dist.cpp b/src/dist.cpp
index 4422b26..398f06f 100644
--- a/src/dist.cpp
+++ b/src/dist.cpp
@@ -459,9 +459,9 @@ int store_phase(
         phase = PHASE_ORIG;
     } else if (swap_phase_dist == 0) { // protect division by zero
         phase = PHASE_SWAP;
-    } else if (1 - swap_phase_dist / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping
+    } else if (1 - float(swap_phase_dist) / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping
         phase = PHASE_SWAP;
-    } else if (1 - orig_phase_dist / swap_phase_dist > g.phase_threshold) { // significant reduction with orig
+    } else if (1 - float(orig_phase_dist) / swap_phase_dist > g.phase_threshold) { // significant reduction with orig
         phase = PHASE_ORIG;
     }
 

From 6cfc50955c0104c89dde688596ef87631464b8c5 Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 11:52:59 -0500
Subject: [PATCH 13/16] Added detailed switchflip output.

---
 src/globals.h |   2 +-
 src/main.cpp  |   1 +
 src/phase.cpp | 105 +++++++++++++++++++++++++++++++++++++++++++++++++-
 src/phase.h   |   1 +
 4 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/src/globals.h b/src/globals.h
index e2e630e..aac31fb 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -45,7 +45,7 @@ class Globals {
     int extend = 2;
 
     // phasing
-    double phase_threshold = 0.5;
+    double phase_threshold = 0.6;
 
     // precision-recall
     double credit_threshold = 0.7;
diff --git a/src/main.cpp b/src/main.cpp
index 0d022e7..74b3634 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -233,6 +233,7 @@ int main(int argc, char **argv) {
 
     // write supercluster/phaseblock results in CSV format
     g.timers[TIME_WRITE].start();
+    if (g.write) phasedata_ptr->write_switchflips();
     write_results(phasedata_ptr);
 
     // save new VCF
diff --git a/src/phase.cpp b/src/phase.cpp
index 25fc511..c946f19 100644
--- a/src/phase.cpp
+++ b/src/phase.cpp
@@ -398,6 +398,109 @@ void phaseblockData::phase()
 /*******************************************************************************/
 
 
+void phaseblockData::write_switchflips() {
+
+    std::string out_sf_fn = g.out_prefix + "switchflips.tsv";
+    FILE* out_sf = 0;
+    if (g.verbosity >= 1) INFO("  Writing switchflip results to '%s'", out_sf_fn.data());
+    out_sf = fopen(out_sf_fn.data(), "w");
+    fprintf(out_sf, "CONTIG\tSTART\tSTOP\tSWITCH_TYPE\tSUPERCLUSTER\tPHASE_BLOCK\n");
+
+    // get sizes of each correct phase block (split on flips, not just switch)
+    for (std::string ctg: this->contigs) {
+        
+        std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
+        std::shared_ptr<ctgSuperclusters> ctg_scs = ctg_pbs->ctg_superclusters;
+
+        int switch_idx = 0;
+        int flip_idx = 0;
+        int pb_idx = 1;
+        if (ctg_scs->n == 0) continue;
+
+        // init start of this correct block
+        int sc = 0;
+        int next_sc = ctg_scs->n; // default to last
+        int beg = 0; int end = 0;
+        int type = SWITCHTYPE_NONE;
+
+        while (true) {
+
+            // get type and supercluster of next switch/flip/phaseset
+            // check flip first because it will cause a second switch
+            if (flip_idx < ctg_pbs->nflips && ctg_pbs->flips[flip_idx] < next_sc) {
+                type = SWITCHTYPE_FLIP;
+                next_sc = ctg_pbs->flips[flip_idx];
+            }
+            if (pb_idx < ctg_pbs->n && ctg_pbs->phase_blocks[pb_idx] < next_sc) {
+                type = SWITCHTYPE_SWITCH;
+                next_sc = ctg_pbs->phase_blocks[pb_idx];
+            }
+            if (switch_idx < ctg_pbs->nswitches && ctg_pbs->switches[switch_idx] < next_sc) {
+                type = SWITCHTYPE_SWITCH_ERR;
+                next_sc = ctg_pbs->switches[switch_idx];
+            }
+            if (type == SWITCHTYPE_NONE) { // all out-of-bounds
+                break;
+            }
+            if (next_sc <= sc) ERROR("Next SC is not after current SC");
+
+
+            // get block(s)
+            if (type == SWITCHTYPE_FLIP) {
+                // switch could have occurred anywhere after last phased supercluster
+                int left = next_sc-1;
+                while (left > 0 && ctg_scs->sc_phase[left] == PHASE_NONE)
+                    left--;
+                if (left >= 0) {
+                    beg = ctg_scs->ends[left];
+                    end = ctg_scs->begs[next_sc];
+                    fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, 
+                            switch_strs[SWITCHTYPE_FLIP_BEG].data(), next_sc, pb_idx-1);
+                }
+
+                // switch could have occurred anywhere before next phased supercluster
+                int right = next_sc+1;
+                while (right < ctg_scs->n-1 && ctg_scs->sc_phase[right] == PHASE_NONE)
+                    right++;
+                if (right < ctg_scs->n) {
+                    beg = ctg_scs->ends[next_sc];
+                    end = ctg_scs->begs[right];
+                    fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, 
+                            switch_strs[SWITCHTYPE_FLIP_END].data(), next_sc, pb_idx-1);
+                }
+                flip_idx++;
+            } else if (type == SWITCHTYPE_SWITCH) {
+                // end of phase block, don't print anything since not an error
+                pb_idx++;
+            } else if (type == SWITCHTYPE_SWITCH_ERR) {
+                // expand left/right from in between these clusters
+                int left = next_sc-1;
+                while (left > 0 && ctg_scs->sc_phase[left] == PHASE_NONE)
+                    left--;
+                int right = next_sc;
+                while (right < ctg_scs->n-1 && ctg_scs->sc_phase[right] == PHASE_NONE)
+                    right++;
+                if (left >= 0 && right < ctg_scs->n) {
+                    beg = ctg_scs->ends[left];
+                    end = ctg_scs->begs[right];
+                    fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, 
+                            switch_strs[SWITCHTYPE_SWITCH_ERR].data(), next_sc, pb_idx-1);
+                }
+                switch_idx++;
+            }
+
+            sc = next_sc;
+            next_sc = ctg_scs->n; // reset to end
+            type = SWITCHTYPE_NONE;
+        }
+    }
+    fclose(out_sf);
+}
+
+
+/*******************************************************************************/
+
+
 void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors,
         int flip_errors, int ng50, int s_ngc50, int sf_ngc50) {
     std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv";
@@ -406,7 +509,7 @@ void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors,
             out_phasing_summary_fn.data());
     FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w");
     fprintf(out_phasing_summary,
-            "PHASE_BLOCKS\tFLIP_ERRORS\tSWITCH_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n");
+            "PHASE_BLOCKS\tSWITCH_ERRORS\tFLIP_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n");
     fprintf(out_phasing_summary, "%d\t%d\t%d\t%d\t%d\t%d", phase_blocks, 
             switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50);
     fclose(out_phasing_summary);
diff --git a/src/phase.h b/src/phase.h
index b91f359..784eabb 100644
--- a/src/phase.h
+++ b/src/phase.h
@@ -30,6 +30,7 @@ class phaseblockData {
     void write_summary_vcf(std::string vcf_fn);
     void write_phasing_summary(int phase_blocks, int switch_errors,
         int flip_errors, int ng50, int s_ngc50, int sf_ngc50);
+    void write_switchflips();
     void phase();
     int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false);
 

From 7f361237519ed7f992453b8d784e6cc42bc0e0ae Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 13:08:33 -0500
Subject: [PATCH 14/16] Fixed bug labelling truth FNs FPs

---
 src/variant.cpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/variant.cpp b/src/variant.cpp
index 999d8e5..2057c21 100644
--- a/src/variant.cpp
+++ b/src/variant.cpp
@@ -263,11 +263,11 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt,
     if (this->credit[swap][idx] == 1) {
         errtype = "TP"; match_type = "gm";
     } else if (this->credit[swap][idx] == 0) {
-        errtype = "FP"; match_type = ".";
+        errtype = query ? "FP" : "FN"; match_type = ".";
     } else if (this->credit[swap][idx] >= g.credit_threshold) {
         errtype = "TP"; match_type = "lm";
     } else {
-        errtype = "FP"; match_type = "lm";
+        errtype = query ? "FP" : "FN"; match_type = "lm";
     }
 
     fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), 
@@ -625,7 +625,8 @@ variantData::variantData(std::string vcf_fn,
         if (ngt == -1) { // GT not defined in header
             if (!gt_warn) {
                 gt_warn = true;
-                WARN("'GT' tag not defined in header, assuming monoploid");
+                WARN("'GT' tag not defined in %s VCF header, assuming monoploid", 
+                        callset_strs[callset].data());
             }
         } else if (ngt <= 0) { // other error
             ERROR("Failed to read %s GT at %s:%lld", 
@@ -699,7 +700,8 @@ variantData::variantData(std::string vcf_fn,
         if (nPS == -1) { // PS not defined in header
             if (!PS_warn) {
                 PS_warn = true;
-                WARN("'PS' tag not defined in header, assuming one phase set per contig");
+                WARN("'PS' tag not defined in %s VCF header, assuming one phase set per contig",
+                        callset_strs[callset].data());
             }
             phase_set = 0;
 
@@ -885,11 +887,11 @@ variantData::variantData(std::string vcf_fn,
     /*     WARN("%d total missing GQ tags in %s VCF, all considered GQ=0", */
     /*         gq_missing_total, callset_strs[callset].data()); */
 
-    if (failed_filter_total) 
+    if (failed_filter_total && print) 
         INFO("%d variants failed FILTER in %s VCF, skipped",
             failed_filter_total, callset_strs[callset].data());
 
-    if (pass_min_qual[FAIL])
+    if (pass_min_qual[FAIL] && print)
         INFO("%d variants of low quality (<%d) in %s VCF, skipped", 
             pass_min_qual[FAIL], g.min_qual, callset_strs[callset].data());
 
@@ -912,7 +914,7 @@ variantData::variantData(std::string vcf_fn,
 
     multi_total = GT_counts[GT_ALT1_ALT1] + GT_counts[GT_ALT1_ALT2] +
         GT_counts[GT_ALT2_ALT1] + GT_counts[GT_OTHER];
-    if (multi_total)
+    if (multi_total && print)
         INFO("%d homozygous and multi-allelic variants in %s VCF, split for evaluation",
             multi_total, callset_strs[callset].data());
 
@@ -932,12 +934,12 @@ variantData::variantData(std::string vcf_fn,
         WARN("%d reference variants in %s VCF, skipped",
             ref_call_total, callset_strs[callset].data());
 
-    if (nregions[BED_OFFCTG] + nregions[BED_OUTSIDE])
+    if (nregions[BED_OFFCTG] + nregions[BED_OUTSIDE] && print)
         INFO("%d variants outside selected regions in %s VCF, skipped",
                 nregions[BED_OFFCTG] + nregions[BED_OUTSIDE], 
                 callset_strs[callset].data());
 
-    if (nregions[BED_BORDER])
+    if (nregions[BED_BORDER] && print)
         INFO("%d variants on border of selected regions in %s VCF, kept",
                 nregions[BED_BORDER], callset_strs[callset].data());
 

From c2b8c9047dd7c24b23249f61647f8e6ee55958fe Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 13:13:00 -0500
Subject: [PATCH 15/16] Version bump (v2.3.0)

---
 README.md       |  2 +-
 demo/output.txt | 36 ++++++++----------------------------
 demo/pr_plot.py | 34 ++++++++--------------------------
 docs/release.md | 19 ++++++++++++-------
 src/Makefile    |  6 +++---
 src/README.md   | 16 ++++++++++++----
 src/globals.h   |  2 +-
 src/print.cpp   |  2 +-
 src/run         |  2 +-
 9 files changed, 47 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index 0e1a533..a5ca2e5 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ If you do already have HTSlib installed elsewhere, make sure you've added it to
 > cd vcfdist/src
 > make
 > ./vcfdist --version
-vcfdist v2.2.1
+vcfdist v2.3.0
 ```
 
 ### Option 2: Docker Image
diff --git a/demo/output.txt b/demo/output.txt
index 4b46f2f..893553e 100644
--- a/demo/output.txt
+++ b/demo/output.txt
@@ -1,29 +1,9 @@
-
 PRECISION-RECALL SUMMARY
-  
- TYPE   MIN_QUAL        TRUTH_TP        QUERY_TP        TRUTH_FN        QUERY_FP        PREC            RECALL          F1_SCORE        F1_QSCORE
- SNP    Q >= 0          8220            8220            5               6               0.999149        0.999271        0.999210        31.023401
- SNP    Q >= 0          8220            8220            5               6               0.999149        0.999271        0.999210        31.023401
-  
- TYPE   MIN_QUAL        TRUTH_TP        QUERY_TP        TRUTH_FN        QUERY_FP        PREC            RECALL          F1_SCORE        F1_QSCORE
- INDEL  Q >= 0          932             932             3               3               0.996793        0.996261        0.996527        24.592749
- INDEL  Q >= 0          932             932             3               3               0.996793        0.996261        0.996527        24.592749
-  
-  
- ALIGNMENT DISTANCE SUMMARY
-  
- TYPE   MIN_QUAL        EDIT_DIST       DISTINCT_EDITS  ED_QSCORE       DE_QSCORE       ALN_QSCORE
- ALL    Q >= 0          26              16              26.565773       27.579651       27.154125
- ALL    Q >= 0          26              16              26.565773       27.579651       27.154125
- ALL    Q >= 61         11791           9164            0.000000        0.000000        0.000000
-  
- TYPE   MIN_QUAL        EDIT_DIST       DISTINCT_EDITS  ED_QSCORE       DE_QSCORE
- SNP    Q >= 0          10              10              29.130186       29.130186
- SNP    Q >= 0          10              10              29.130186       29.130186
- SNP    Q >= 61         8185            8185            0.000000        0.000000
-  
- TYPE   MIN_QUAL        EDIT_DIST       DISTINCT_EDITS  ED_QSCORE       DE_QSCORE
- INDEL  Q >= 0          16              6               23.529057       22.126314
- INDEL  Q >= 0          16              6               23.529057       22.126314
- INDEL  Q >= 61         3606            979             0.000000        0.000000
-  
+
+TYPE	MIN_QUAL	TRUTH_TP	QUERY_TP	TRUTH_FN	QUERY_FP	PREC		RECALL		F1_SCORE	F1_QSCORE
+SNP	Q >= 0		8222            8222            1               2               0.999757	0.999878	0.999818	37.388565
+SNP	Q >= 0		8222            8222            1               2               0.999757	0.999878	0.999818	37.388565
+
+TYPE	MIN_QUAL	TRUTH_TP	QUERY_TP	TRUTH_FN	QUERY_FP	PREC		RECALL		F1_SCORE	F1_QSCORE
+INDEL	Q >= 0		876             876             51              12              0.986486	0.944984	0.965289	14.595358
+INDEL	Q >= 0		876             876             51              12              0.986486	0.944984	0.965289	14.595358
diff --git a/demo/pr_plot.py b/demo/pr_plot.py
index f0ff0bb..96b7723 100644
--- a/demo/pr_plot.py
+++ b/demo/pr_plot.py
@@ -6,8 +6,6 @@
 
 fig, ax = plt.subplots(1, 2, figsize=(10,8))
 
-partial_credit = True
-
 with open(f"results/precision-recall.tsv") as csv:
     indel_recall = []
     indel_prec = []
@@ -16,30 +14,14 @@
     next(csv) # skip header
 
     for line in csv:
-        typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \
-            query_tot, query_tp, query_pp, query_fp = line.split('\t')
-        if partial_credit: # normal vcfdist prec/recall calc, already done
-            if typ == "INDEL":
-                indel_recall.append(float(recall))
-                indel_prec.append(float(prec))
-            elif typ == "SNP":
-                snp_recall.append(float(recall))
-                snp_prec.append(float(prec))
-        else: # recalculate using TP/FP/FN
-            if typ == "INDEL":
-                indel_recall.append(float(truth_tp) / float(truth_tot))
-                if int(truth_tp) + int(query_fp) == 0:
-                    indel_prec.append(1)
-                else:
-                    indel_prec.append(float(truth_tp) / 
-                            (float(truth_tp) + float(query_fp)))
-            elif typ == "SNP":
-                snp_recall.append(float(truth_tp) / float(truth_tot))
-                if int(truth_tp) + int(query_fp) == 0:
-                    snp_prec.append(1)
-                else:
-                    snp_prec.append(float(truth_tp) / 
-                            (float(truth_tp) + float(query_fp)))
+        typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_fn, \
+            query_tot, query_tp, query_fp = line.split('\t')
+        if typ == "INDEL":
+            indel_recall.append(float(recall))
+            indel_prec.append(float(prec))
+        elif typ == "SNP":
+            snp_recall.append(float(recall))
+            snp_prec.append(float(prec))
     ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.')
     ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.')
      
diff --git a/docs/release.md b/docs/release.md
index 7557f31..7c16437 100644
--- a/docs/release.md
+++ b/docs/release.md
@@ -1,14 +1,19 @@
-1. Update `vcfdist` version in `globals.h`
+1. Benchmark against `run` and `demo` scripts, updating version
+- update `output.txt` for `demo` if necessary
 
-2. Update `vcfdist` help text in `src/README.md`
+2. Update `vcfdist` version in `globals.h`
+
+3. Update `vcfdist` help text in `src/README.md`
 ```bash
-./vcfdist >> src/README.md
+cd src
+make
+./vcfdist >> README.md
 ```
 Manually delete previous text
 
-3. Update `vcfdist` version in `README.md`
+4. Update `vcfdist` version in `README.md`
 
-4. Make final Git commit of changes
+5. Make final Git commit of changes
 ```bash
 git commit -m "version bump (vX.X.X)"
 git checkout master
@@ -17,9 +22,9 @@ git push origin master
 git checkout dev
 ```
 
-5. Make new release and tag on Github
+6. Make new release and tag on Github
 
-6. Build and deploy new Docker image
+7. Build and deploy new Docker image
 ```bash
 cd ~/vcfdist/src
 sudo docker login -u timd1
diff --git a/src/Makefile b/src/Makefile
index 819b6c9..cb9d1f5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -12,7 +12,7 @@ $(TARGET): $(OBJS) main.cpp
 globals.o: globals.cpp globals.h bed.h print.h defs.h timer.h
 	$(CXX) -c $(CXXFLAGS) globals.cpp
 
-print.o: print.cpp print.h globals.h phase.h dist.h edit.h defs.h
+print.o: print.cpp print.h globals.h phase.h dist.h edit.h defs.h variant.h
 	$(CXX) -c $(CXXFLAGS) print.cpp
 
 timer.o: timer.cpp timer.h globals.h defs.h
@@ -30,10 +30,10 @@ bed.o: bed.cpp bed.h print.h defs.h globals.h
 edit.o: edit.cpp edit.h defs.h globals.h
 	$(CXX) -c $(CXXFLAGS) edit.cpp
 
-cluster.o: cluster.cpp cluster.h variant.h globals.h dist.h defs.h
+cluster.o: cluster.cpp cluster.h variant.h fasta.h globals.h dist.h defs.h
 	$(CXX) -c $(CXXFLAGS) cluster.cpp
 
-phase.o: phase.cpp phase.h cluster.h print.h globals.h defs.h
+phase.o: phase.cpp phase.h cluster.h print.h globals.h defs.h variant.h
 	$(CXX) -c $(CXXFLAGS) phase.cpp
 
 clean:
diff --git a/src/README.md b/src/README.md
index ea884b2..57a35ab 100644
--- a/src/README.md
+++ b/src/README.md
@@ -26,12 +26,12 @@ Options:
       minimum variant size, smaller variants ignored (SNPs are size 1)
   -l, --largest-variant <INTEGER> [5000]
       maximum variant size, larger variants ignored
-  --min-qual <INTEGER> [0]
+  -mn, --min-qual <INTEGER> [0]
       minimum variant quality, lower qualities ignored
-  --max-qual <INTEGER> [60]
+  -mx, --max-qual <INTEGER> [60]
       maximum variant quality, higher qualities kept but thresholded
 
-  ReAlignment:
+  Re-Alignment:
   -rq, --realign-query
       realign query variants using Smith-Waterman parameters
   -rt, --realign-truth
@@ -45,6 +45,14 @@ Options:
   -e, --gap-extend-penalty <INTEGER> [1]
       Smith-Waterman gap extension penalty
 
+  Precision-Recall:
+  -ct, --credit-threshold <FLOAT> [0.70]
+      minimum partial credit to consider variant a true positive
+
+  Distance:
+  -d, --distance
+      flag to include alignment distance calculations, skipped by default
+
   Utilization:
   -t, --max-threads <INTEGER> [64]
       maximum threads to use for clustering and precision/recall alignment
@@ -59,5 +67,5 @@ Options:
   -c, --citation
       please cite vcfdist if used in your analyses; thanks :)
   -v, --version
-      print vcfdist version (v2.2.1)
+      print vcfdist version (v2.3.0)
 ```
diff --git a/src/globals.h b/src/globals.h
index aac31fb..1fd9fb6 100644
--- a/src/globals.h
+++ b/src/globals.h
@@ -77,7 +77,7 @@ class Globals {
     void init_timers(std::vector<std::string> timer_strs);
 
     // program data
-    const std::string VERSION = "2.2.1";
+    const std::string VERSION = "2.3.0";
     const std::string PROGRAM = "vcfdist";
     std::vector<timer> timers;
 };
diff --git a/src/print.cpp b/src/print.cpp
index 2963412..8733730 100644
--- a/src/print.cpp
+++ b/src/print.cpp
@@ -658,7 +658,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
 
         // print query variant information
         std::string out_query_fn = g.out_prefix + "query.tsv";
-        if (g.verbosity >= 1) INFO("  Writing call variant results to '%s'", out_query_fn.data());
+        if (g.verbosity >= 1) INFO("  Writing query variant results to '%s'", out_query_fn.data());
         FILE* out_query = fopen(out_query_fn.data(), "w");
         fprintf(out_query, "CONTIG\tPOS\tHAP\tREF\tALT\tQUAL\tTYPE\tERR_TYPE"
                 "\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tLOCATION\n");
diff --git a/src/run b/src/run
index 1ab9bbb..c803706 100755
--- a/src/run
+++ b/src/run
@@ -14,4 +14,4 @@ source ../pipeline/includes.sh
     /x/gm24385/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
     -b ../test/bench20.bed \
     -l 1000 \
-    -p ../out/
+    -p ../out/dev.

From fcb41be9aee60f12deee9150dfa26185fb8790fe Mon Sep 17 00:00:00 2001
From: Tim Dunn <me.timd1@gmail.com>
Date: Thu, 9 Nov 2023 13:14:12 -0500
Subject: [PATCH 16/16] Updated analyses

---
 .../clustering/3_vcfdist_sweep_varlens.sh     |  4 +-
 analysis-v2/phab_cm/1_eval_vcfs.sh            | 89 ++++++-------------
 .../phab_cm/vcfdist_truvari_compare.py        | 48 +++++-----
 analysis-v2/small_sv_all/3_plot_vcf_tp.py     | 18 ++--
 analysis-v2/small_sv_all/4_plot_vcf_fp.py     | 18 ++--
 5 files changed, 76 insertions(+), 101 deletions(-)

diff --git a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
index 3e3442b..fdc4242 100755
--- a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
+++ b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
@@ -4,10 +4,10 @@ source globals.sh
 
 lens=( 
         # "10"
-        # "50"
+        "50"
         # "100"
         # "500"
-        "1000"
+        # "1000"
         # "5000"
         # "10000"
 )
diff --git a/analysis-v2/phab_cm/1_eval_vcfs.sh b/analysis-v2/phab_cm/1_eval_vcfs.sh
index b628873..622626d 100755
--- a/analysis-v2/phab_cm/1_eval_vcfs.sh
+++ b/analysis-v2/phab_cm/1_eval_vcfs.sh
@@ -2,38 +2,6 @@
 
 source globals.sh
 
-# # truvari phab VCF normalization v4.1.0 (doesn't work)
-# mkdir -p $out/phab-mafft
-# mkdir -p $out/phab-wfa
-# source ~/truvari/venv3.10/bin/activate
-# for i in "${!query_names[@]}"; do
-#     echo "truvari v4.1: phab wfa for '${query_names[i]}'"
-#     truvari phab \
-#         -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
-#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
-#         --bSamples HG002 \
-#         --cSamples HG002 \
-#         -f $data/refs/$ref_name \
-#         --align wfa \
-#         -t 64 \
-#         -o $out/phab-wfa/${query_names[i]}.vcf.gz \
-#         2> $out/phab-wfa/${query_names[i]}.log
-#     echo "truvari v4.1: phab mafft for '${query_names[i]}'"
-#     truvari phab \
-#         -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-#         -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
-#         -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
-#         --bSamples HG002 \
-#         --cSamples HG002 \
-#         -f $data/refs/$ref_name \
-#         --align mafft \
-#         -t 64 \
-#         -o $out/phab-mafft/${query_names[i]}.vcf.gz \
-#         2> $out/phab-mafft/${query_names[i]}.log
-# done
-# deactivate
-
 # # truvari phab VCF normalization v4.0
 # source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
 # mkdir -p $out/phab
@@ -87,37 +55,38 @@ source globals.sh
 #         2> $out/vcfdist/${query_names[i]}.log
 # done
 
-# # vcfeval evaluation
-# mkdir -p $out/vcfeval
-# for i in "${!query_names[@]}"; do
-#     echo "vcfeval: evaluating '${query_names[i]}'"
-#     $timer -v $rtg vcfeval \
-#         -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
-#         -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
-#         -t $data/refs/${ref_name}.sdf \
-#         -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-#         --threads 64 \
-#         --all-records \
-#         -o $out/vcfeval/${query_names[i]} \
-#         2> $out/vcfeval/${query_names[i]}.log
-# done
-
-# Truvari evaluation
-mkdir -p $out/truvari
-source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
+# vcfeval evaluation
+mkdir -p $out/vcfeval
 for i in "${!query_names[@]}"; do
-    echo "Truvari: evaluating '${query_names[i]}'"
-    $timer -v truvari bench \
+    echo "vcfeval: evaluating '${query_names[i]}'"
+    $timer -v $rtg vcfeval \
         -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
         -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
-        -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-        --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-        --no-ref a \
-        --sizemin 1 --sizefilt 1 --sizemax 1000 \
-        --pick multi \
-        --typeignore \
-        --dup-to-ins \
-        -o $out/truvari/${query_names[i]}
+        -t $data/refs/${ref_name}.sdf \
+        -m ga4gh \
+        -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+        --threads 64 \
+        --all-records \
+        -o $out/vcfeval/${query_names[i]} \
+        2> $out/vcfeval/${query_names[i]}.log
+done
+
+# # Truvari evaluation
+# mkdir -p $out/truvari
+# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
+# for i in "${!query_names[@]}"; do
+#     echo "Truvari: evaluating '${query_names[i]}'"
+#     $timer -v truvari bench \
+#         -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
+#         -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
+#         -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
+#         --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
+#         --no-ref a \
+#         --sizemin 1 --sizefilt 1 --sizemax 1000 \
+#         --pick multi \
+#         --typeignore \
+#         --dup-to-ins \
+#         -o $out/truvari/${query_names[i]}
 
     laytr tru2ga \
         -i $out/truvari/${query_names[i]}/ \
diff --git a/analysis-v2/phab_cm/vcfdist_truvari_compare.py b/analysis-v2/phab_cm/vcfdist_truvari_compare.py
index 7c768d7..4badc48 100644
--- a/analysis-v2/phab_cm/vcfdist_truvari_compare.py
+++ b/analysis-v2/phab_cm/vcfdist_truvari_compare.py
@@ -3,18 +3,17 @@
 import matplotlib.pyplot as plt
 
 # chr20, with phab
-truvari_prefix = "./truvari/pav/result_"
-vcfdist_prefix = "./vcfdist/pav."
-do_print = True
+truvari_prefix = "./truvari/giab-tr/result_"
+vcfdist_prefix = "./vcfdist/giab-tr."
+do_print = False
 
 SIZE             = 0
 SZ_SNP           = 0
-SZ_INDEL_1_10    = 1
-SZ_INDEL_10_50   = 2
-SZ_INDEL_50_500  = 3
-SZ_INDEL_500PLUS = 4
-SZ_DIMS = 5
-sizes = ["SNP", "INDEL 1-10", "INDEL 10-50", "INDEL 50-500", "INDEL 500+"]
+SZ_INDEL_1_50    = 1
+SZ_INDEL_50_500  = 2
+SZ_INDEL_500PLUS = 3
+SZ_DIMS = 4
+sizes = ["SNP", "INDEL 1-50", "INDEL 50-500", "INDEL 500+"]
 
 VCF_DIST     = 1
 VD_NONE      = 0
@@ -51,10 +50,8 @@ def get_size(ref : str, alt : str):
         if size_diff == 0:
             print("ERROR: size 0 INDEL")
             exit(1)
-        if size_diff <= 10:
-            return SZ_INDEL_1_10
         elif size_diff <= 50:
-            return SZ_INDEL_10_50
+            return SZ_INDEL_1_50
         elif size_diff <= 500:
             return SZ_INDEL_50_500
         else:
@@ -93,7 +90,7 @@ def get_tv_type(tv_type : str):
     truvari_vcf = vcf.Reader(open(f"{truvari_prefix}{callset}.vcf", "r"))
     name = callset.upper()
     print(name)
-    tv_used, vd_used = True, True
+    tv_used, vd_used, vd_2used = True, True, False
     this_ctg = "chr1"
     print(this_ctg)
 
@@ -102,11 +99,12 @@ def get_tv_type(tv_type : str):
         # get next valid records for each
         if tv_used: tv_rec = next(truvari_vcf, None)
         if vd_used: vd_rec = next(vcfdist_vcf, None)
+        if vd_2used: vd_rec = next(vcfdist_vcf, None)
         while vd_rec != None and vd_rec.genotype(name)['BD'] == None: # skip other callset
             vd_rec = next(vcfdist_vcf, None)
         while tv_rec != None and tv_rec.ALT[0] == "*": # skip nested var
             tv_rec = next(truvari_vcf, None)
-        tv_used, vd_used = False, False
+        tv_used, vd_used, vd_2used = False, False, False
 
         if do_print:
             print("============================================================")
@@ -141,6 +139,8 @@ def get_tv_type(tv_type : str):
             if do_print: print(f"TV VD {name} {tv_rec.genotype(name)['GT']} {vd_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}")
             if tv_rec.REF == vd_rec.REF and tv_rec.ALT[0] == vd_rec.ALT[0]: # full match
                 tv_used, vd_used = True, True
+                if tv_rec.genotype(name)['GT'] == '1|1' or tv_rec.genotype(name)['GT'] == '1/1':
+                    vd_2used = True # skip second split of this GT
                 size = get_size(tv_rec.REF, tv_rec.ALT[0])
                 vd_type = get_vd_type(float(vd_rec.genotype(name)['BC']))
                 tv_type = get_tv_type(tv_rec.genotype(name)['BD'])
@@ -200,7 +200,7 @@ def get_tv_type(tv_type : str):
                     counts[size2][vd_type2][tv_type2] += 1
                     tv_used, vd_used = True, True
                 else:
-                    if do_print: print("ERROR: failed to match")
+                    print("ERROR: failed to match")
 
                     # discard current two, pretend we haven't looked at next two
                     counts[size][VD_NONE][TV_NONE] += 2
@@ -213,17 +213,23 @@ def get_tv_type(tv_type : str):
             size = get_size(vd_rec.REF, vd_rec.ALT[0])
             if do_print: 
                 print(f"   VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t")
-                if size != SZ_SNP:
-                    print("WARN: vcfdist only, not SNP")
+            if size != SZ_SNP:
+                print(f"   VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t")
+                print("WARN: vcfdist only, not SNP")
             vd_type = get_vd_type(float(vd_rec.genotype(name)['BC']))
             tv_type = TV_NONE
             counts[size][vd_type][tv_type] += 1
 
         elif tv_pos < vd_pos: # truvari only
-            if do_print:
-                print(f"TV    {name} {tv_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}")
-                print("WARN: Truvari only")
+            print(f"TV    {name} {tv_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}")
+            print("WARN: Truvari only")
             tv_used = True
+
+            # skip unphased truvari variants
+            gt = tv_rec.genotype(name)['GT']
+            if gt[1] == '/' and gt[0] != gt[2]:
+                continue
+
             size = get_size(tv_rec.REF, tv_rec.ALT[0])
             vd_type = VD_NONE
             tv_type = get_tv_type(tv_rec.genotype(name)['BD'])
@@ -257,4 +263,4 @@ def get_tv_type(tv_type : str):
         for (i,j), z in np.ndenumerate(counts[size_idx]):
             ax.text(j, i, f"{int(z)}", ha='center', va='center',
                 bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
-        plt.savefig(f"img/13_{callset}_{size_idx}_cm.png", dpi=200)
+        plt.savefig(f"img/tv_vd_{callset}_{size_idx}_cm.png", dpi=200)
diff --git a/analysis-v2/small_sv_all/3_plot_vcf_tp.py b/analysis-v2/small_sv_all/3_plot_vcf_tp.py
index bd04c8b..10c4644 100644
--- a/analysis-v2/small_sv_all/3_plot_vcf_tp.py
+++ b/analysis-v2/small_sv_all/3_plot_vcf_tp.py
@@ -9,14 +9,14 @@
 vcf_types = ["snv", "indel", "sv", "small", "large", "all"]
 regions = [
     "summary",
-    "alldifficultregions",
-    "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
-    "AllTandemRepeats",
-    "AllTandemRepeatsandHomopolymers_slop5",
-    "MHC",
-    "satellites_slop5",
-    "segdups",
-    "alllowmapandsegdupregions"
+    # "alldifficultregions",
+    # "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
+    # "AllTandemRepeats",
+    # "AllTandemRepeatsandHomopolymers_slop5",
+    # "MHC",
+    # "satellites_slop5",
+    # "segdups",
+    # "alllowmapandsegdupregions"
 ]
 
 for region in regions:
@@ -119,4 +119,4 @@
     ax[2].legend(handles=patches, loc=(0.6,0.6))
     ax[0].set_ylabel("True Positive Recall")
     plt.suptitle(f"{region}")
-    plt.savefig(f"counts-{region}.png", dpi=300)
+    plt.savefig(f"./img/counts-{region}.pdf", format='pdf')
diff --git a/analysis-v2/small_sv_all/4_plot_vcf_fp.py b/analysis-v2/small_sv_all/4_plot_vcf_fp.py
index 67c60fa..0d04a07 100644
--- a/analysis-v2/small_sv_all/4_plot_vcf_fp.py
+++ b/analysis-v2/small_sv_all/4_plot_vcf_fp.py
@@ -9,14 +9,14 @@
 vcf_types = ["snv", "indel", "sv", "small", "large", "all"]
 regions = [
     "summary",
-    "alldifficultregions",
-    "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
-    "AllTandemRepeats",
-    "AllTandemRepeatsandHomopolymers_slop5",
-    "MHC",
-    "satellites_slop5",
-    "segdups",
-    "alllowmapandsegdupregions"
+    # "alldifficultregions",
+    # "AllHomopolymers_ge7bp_imperfectge11bp_slop5",
+    # "AllTandemRepeats",
+    # "AllTandemRepeatsandHomopolymers_slop5",
+    # "MHC",
+    # "satellites_slop5",
+    # "segdups",
+    # "alllowmapandsegdupregions"
 ]
 
 for region in regions:
@@ -132,4 +132,4 @@
     ax[2].legend(handles=patches, loc=(0.6,0.6))
     ax[0].set_ylabel("False Positive Rate")
     plt.suptitle(f"{region}")
-    plt.savefig(f"counts-{region}-fp.png", dpi=300)
+    plt.savefig(f"img/counts-{region}-fp.pdf", format='pdf')