From f7c7c4592d47286975aa8784e039d637f5efd202 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Mon, 6 Nov 2023 12:50:18 -0500 Subject: [PATCH 01/16] Minor fixes to Docker release instructions. --- docs/release.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/release.md b/docs/release.md index 684c78e..7557f31 100644 --- a/docs/release.md +++ b/docs/release.md @@ -1,6 +1,13 @@ 1. Update `vcfdist` version in `globals.h` -2. Replace `vcfdist --version` text in `src/README.md` + +2. Update `vcfdist` help text in `src/README.md` +```bash +./vcfdist >> src/README.md +``` +Manually delete previous text + 3. Update `vcfdist` version in `README.md` + 4. Make final Git commit of changes ```bash git commit -m "version bump (vX.X.X)" @@ -9,13 +16,16 @@ git merge dev git push origin master git checkout dev ``` + 5. Make new release and tag on Github + 6. Build and deploy new Docker image ```bash cd ~/vcfdist/src sudo docker login -u timd1 -sudo docker build --no-cache -t vcfdist . +sudo docker build --no-cache -t timd1/vcfdist . sudo docker image ls sudo docker image tag timd1/vcfdist:latest timd1/vcfdist:vX.X.X sudo docker image push timd1/vcfdist:vX.X.X +sudo docker image push timd1/vcfdist:latest ``` From 8b3e734517a17d3af88a484eef138f34261be401 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Mon, 6 Nov 2023 12:54:55 -0500 Subject: [PATCH 02/16] Added new analysis-v2 directory --- .gitignore | 3 +- analysis-v2/clustering/1_plot_beds.py | 42 +++++ analysis-v2/clustering/2_plot_gap_clusters.py | 38 ++++ .../clustering/2_vcfdist_sweep_gaplens.sh | 29 +++ analysis-v2/clustering/3_plot_len_clusters.py | 39 ++++ .../clustering/3_vcfdist_sweep_varlens.sh | 62 +++++++ analysis-v2/clustering/4_plot_accuracy.py | 67 +++++++ analysis-v2/clustering/README.md | 3 + analysis-v2/clustering/globals.sh | 30 +++ analysis-v2/multi_match/1_eval_vcfs.sh | 20 ++ analysis-v2/multi_match/2_plot_vcf_multi.py | 137 ++++++++++++++ analysis-v2/multi_match/globals.sh | 29 +++ analysis-v2/phab_cm/1_eval_vcfs.sh | 127 +++++++++++++ analysis-v2/phab_cm/globals.sh | 29 +++ .../phab_cm/vcfdist_truvari_compare.py | 7 +- analysis-v2/phasing/0_create_filtered_vcfs.sh | 13 ++ .../phasing}/14_phase_analysis.py | 0 analysis-v2/phasing/1_whatshap_compare.sh | 8 + analysis-v2/phasing/parse_vcfs.py | 64 +++++++ analysis-v2/small_sv_all/0_split.sh | 47 +++++ analysis-v2/small_sv_all/1_eval_vcfs.sh | 20 ++ analysis-v2/small_sv_all/2_stratify_vcfs.sh | 23 +++ analysis-v2/small_sv_all/3_plot_vcf_tp.py | 122 ++++++++++++ analysis-v2/small_sv_all/3_test_plot_tsv.py | 85 +++++++++ analysis-v2/small_sv_all/4_plot_vcf_fp.py | 135 ++++++++++++++ analysis-v2/small_sv_all/5_extract_diffs.py | 175 ++++++++++++++++++ analysis-v2/small_sv_all/globals.sh | 45 +++++ analysis-v2/variants/1_plot_vars.py | 36 ++++ analysis-v2/vs_prior_work/1_eval_vcfs.sh | 89 +++++++++ analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh | 85 +++++++++ analysis-v2/vs_prior_work/README.md | 1 + analysis-v2/vs_prior_work/globals.sh | 29 +++ analysis/{12_plot_ram.py => 10_plot_ram.py} | 0 analysis/10_vcfdist_sv.sh | 36 ---- analysis/11_truvari_sv.sh | 83 --------- analysis/2_general_sw_space_plot.py | 93 +++++----- analysis/4_vcfdist_output.py | 81 +++++--- analysis/7_vcfeval_pr_plot.py | 60 ++++-- analysis/8_vcfdist_pr_plot.py | 81 +++++--- analysis/9_f1_ed_plot.py | 43 +++-- analysis/9_f1_pr_plot.py | 116 +++++++----- 41 files changed, 1931 insertions(+), 301 deletions(-) create mode 100644 analysis-v2/clustering/1_plot_beds.py create mode 100644 analysis-v2/clustering/2_plot_gap_clusters.py create mode 100755 analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh create mode 100644 analysis-v2/clustering/3_plot_len_clusters.py create mode 100755 analysis-v2/clustering/3_vcfdist_sweep_varlens.sh create mode 100644 analysis-v2/clustering/4_plot_accuracy.py create mode 100644 analysis-v2/clustering/README.md create mode 100755 analysis-v2/clustering/globals.sh create mode 100755 analysis-v2/multi_match/1_eval_vcfs.sh create mode 100644 analysis-v2/multi_match/2_plot_vcf_multi.py create mode 100755 analysis-v2/multi_match/globals.sh create mode 100755 analysis-v2/phab_cm/1_eval_vcfs.sh create mode 100755 analysis-v2/phab_cm/globals.sh rename analysis/13_truvari_compare.py => analysis-v2/phab_cm/vcfdist_truvari_compare.py (97%) create mode 100755 analysis-v2/phasing/0_create_filtered_vcfs.sh rename {analysis => analysis-v2/phasing}/14_phase_analysis.py (100%) create mode 100755 analysis-v2/phasing/1_whatshap_compare.sh create mode 100644 analysis-v2/phasing/parse_vcfs.py create mode 100755 analysis-v2/small_sv_all/0_split.sh create mode 100755 analysis-v2/small_sv_all/1_eval_vcfs.sh create mode 100755 analysis-v2/small_sv_all/2_stratify_vcfs.sh create mode 100644 analysis-v2/small_sv_all/3_plot_vcf_tp.py create mode 100644 analysis-v2/small_sv_all/3_test_plot_tsv.py create mode 100644 analysis-v2/small_sv_all/4_plot_vcf_fp.py create mode 100644 analysis-v2/small_sv_all/5_extract_diffs.py create mode 100755 analysis-v2/small_sv_all/globals.sh create mode 100644 analysis-v2/variants/1_plot_vars.py create mode 100755 analysis-v2/vs_prior_work/1_eval_vcfs.sh create mode 100755 analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh create mode 100644 analysis-v2/vs_prior_work/README.md create mode 100755 analysis-v2/vs_prior_work/globals.sh rename analysis/{12_plot_ram.py => 10_plot_ram.py} (100%) delete mode 100755 analysis/10_vcfdist_sv.sh delete mode 100755 analysis/11_truvari_sv.sh diff --git a/.gitignore b/.gitignore index 88d5a6b..660b761 100644 --- a/.gitignore +++ b/.gitignore @@ -11,11 +11,12 @@ *.vcf *.gz *.gprof +*.png src/vcfdist # ignore analysis analysis/*/ -analysis/*.png +analysis-v2/*/*/ # ignore input data/*/* diff --git a/analysis-v2/clustering/1_plot_beds.py b/analysis-v2/clustering/1_plot_beds.py new file mode 100644 index 0000000..b009d37 --- /dev/null +++ b/analysis-v2/clustering/1_plot_beds.py @@ -0,0 +1,42 @@ +import matplotlib.pyplot as plt +import numpy as np + +ds_names = ["t2t-q100", "nist", "giab-tr", "cmrg"] +ds_versions = ["v0.9", "v4.2.1","v4.20", "v1.00"] +ds_beds = ["GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed", + "HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed", + "GIABTR.HG002.benchmark.regions.bed", + "HG002_GRCh38_CMRG_smallvar_v1.00.bed"] +data = "/home/timdunn/vcfdist/data" + +bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, + 100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, + 100_000_000, 200000000, 500000000, 1_000_000_000] +for name, version, bed in zip(ds_names, ds_versions, ds_beds): + print(f"parsing {name} BED") + counts = [0]*len(bins) + bases, bases2 = 0, 0 + with open(f"{data}/{name}-{version}/{bed}", "r") as bedfile: + for line in bedfile: + fields = line.split() + ctg, start, stop = fields[0], int(fields[1]), int(fields[2]) + + for bin_idx, bin_val in enumerate(bins): + if stop - start < bin_val: + counts[bin_idx] += 1 + bases += stop-start + bases2 += (stop-start)*(stop-start) + break + fig, ax = plt.subplots(1,1) + ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1) + ax.set_xlim(0,len(bins)) + ax.set_xticks(range(0, len(bins), 3)) + ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"]) + ax.set_yscale('log') + ax.set_ylim(0.5, 2000000) + ax.set_xlabel("Region Size") + ax.set_ylabel("Counts") + ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes) + ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes) + ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes) + plt.savefig(f"img/bed-{name}.png") diff --git a/analysis-v2/clustering/2_plot_gap_clusters.py b/analysis-v2/clustering/2_plot_gap_clusters.py new file mode 100644 index 0000000..8c7a290 --- /dev/null +++ b/analysis-v2/clustering/2_plot_gap_clusters.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import numpy as np + +gaps = [10, 20, 50, 100, 200] # 500, 1000 didn't finish +data = "/home/timdunn/vcfdist/analysis-v2/clustering" + +bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, + 100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, + 100_000_000, 200000000, 500000000, 1_000_000_000] +for gap in gaps: + print(f"parsing gap {gap} superclusters") + counts = [0]*len(bins) + bases, bases2 = 0, 0 + with open(f"{data}/gaps/{gap}.superclusters.tsv", "r") as bedfile: + next(bedfile) + for line in bedfile: + fields = line.split() + ctg, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3]) + + for bin_idx, bin_val in enumerate(bins): + if size < bin_val: + counts[bin_idx] += 1 + bases += size + bases2 += size*size + break + fig, ax = plt.subplots(1,1) + ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1) + ax.set_xlim(0,len(bins)) + ax.set_xticks(range(0, len(bins), 3)) + ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"]) + ax.set_yscale('log') + ax.set_ylim(0.5, 20000000) + ax.set_xlabel("Region Size") + ax.set_ylabel("Counts") + ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes) + ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes) + ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes) + plt.savefig(f"img/gaps-{gap}.png") diff --git a/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh b/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh new file mode 100755 index 0000000..f4e7c9d --- /dev/null +++ b/analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +source globals.sh + +dists=( + "10" + # "20" + # "50" + # "100" + # "200" +) + +# vcfdist evaluation +mkdir -p $out/sweep_gaplens +for q in "${!query_names[@]}"; do + for d in "${dists[@]}"; do + echo "vcfdist: evaluating len 1000 '${query_names[q]}' gap $d" + $timer -v ../../src/vcfdist \ + $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \ + $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ + $data/refs/$ref_name \ + --bed $data/${truth_name}-${truth_version}/split/bench.bed \ + --simple-cluster \ + -g $d \ + -l 1000 \ + -p $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000. \ + 2> $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000.log + done +done diff --git a/analysis-v2/clustering/3_plot_len_clusters.py b/analysis-v2/clustering/3_plot_len_clusters.py new file mode 100644 index 0000000..3bcf336 --- /dev/null +++ b/analysis-v2/clustering/3_plot_len_clusters.py @@ -0,0 +1,39 @@ +import matplotlib.pyplot as plt +import numpy as np + +# lens = [10, 50, 100, 500, 1000] # 5000, 10000 didn't finish +lens = [1000] +data = "/home/timdunn/vcfdist/analysis-v2/clustering" + +bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, + 100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, + 100_000_000, 200000000, 500000000, 1_000_000_000] +for length in lens: + print(f"Parsing WFA variant max length {length} superclusters") + counts = [0]*len(bins) + bases, bases2 = 0, 0 + with open(f"{data}/sweep_varlens/pav.wfa.len{length}ra.superclusters.tsv", "r") as bedfile: + next(bedfile) + for line in bedfile: + fields = line.split() + ctg, sc, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4]) + + for bin_idx, bin_val in enumerate(bins): + if size < bin_val: + counts[bin_idx] += 1 + bases += size + bases2 += size*size + break + fig, ax = plt.subplots(1,1) + ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1) + ax.set_xlim(0,len(bins)) + ax.set_xticks(range(0, len(bins), 3)) + ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"]) + ax.set_yscale('log') + ax.set_ylim(0.5, 20000000) + ax.set_xlabel("Region Size") + ax.set_ylabel("Counts") + ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes) + ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes) + ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes) + plt.savefig(f"img/lens-{length}.png") diff --git a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh new file mode 100755 index 0000000..3e3442b --- /dev/null +++ b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +source globals.sh + +lens=( + # "10" + # "50" + # "100" + # "500" + "1000" + # "5000" + # "10000" +) + +mkdir -p $out/sweep_varlens + +# # wfa-based superclustering +# for q in "${!query_names[@]}"; do +# for l in "${lens[@]}"; do +# echo "vcfdist: evaluating wfa '${query_names[q]}' len $l" +# $timer -v ../../src/vcfdist \ +# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \ +# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# $data/refs/$ref_name \ +# --bed $data/${truth_name}-${truth_version}/split/bench.bed \ +# -l $l \ +# -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}. \ +# 2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}.log +# done +# done + +# wfa-based superclustering +for q in "${!query_names[@]}"; do + for l in "${lens[@]}"; do + echo "vcfdist: evaluating wfa '${query_names[q]}' len $l" + $timer -v ../../src/vcfdist \ + $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \ + $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ + $data/refs/$ref_name \ + --bed $data/${truth_name}-${truth_version}/split/bench.bed \ + -l $l \ + -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra. \ + 2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra.log + done +done + +# # gap-based superclustering +# for q in "${!query_names[@]}"; do +# for l in "${lens[@]}"; do +# echo "vcfdist: evaluating gap 100 '${query_names[q]}' len $l" +# $timer -v ../../src/vcfdist \ +# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \ +# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# $data/refs/$ref_name \ +# --bed $data/${truth_name}-${truth_version}/split/bench.bed \ +# --simple-cluster \ +# -g 100 \ +# -l $l \ +# -p $out/sweep_varlens/${query_names[q]}.gap100.len${l}. \ +# 2> $out/sweep_varlens/${query_names[q]}.gap100.len${l}.log +# done +# done diff --git a/analysis-v2/clustering/4_plot_accuracy.py b/analysis-v2/clustering/4_plot_accuracy.py new file mode 100644 index 0000000..8f71a7f --- /dev/null +++ b/analysis-v2/clustering/4_plot_accuracy.py @@ -0,0 +1,67 @@ +import os +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +# plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) + +fig, ax = plt.subplots(1, 2, figsize=(10,6)) + +partial_credit = True + +types = ["gap", "gap", "gap", "gap", "gap", "wfa_len"] +vals = [10, 20, 50, 100, 200, 1000] + +for t, v in zip(types, vals): + print(f"plotting {t} {v}") + with open(f"./{t}s/{v}.precision-recall.tsv") as csv: + indel_recall = [] + indel_prec = [] + snp_recall = [] + snp_prec = [] + next(csv) # skip header + + for line in csv: + typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \ + query_tot, query_tp, query_pp, query_fp = line.split('\t') + if partial_credit: # normal vcfdist prec/recall calc, already done + if typ == "INDEL": + indel_recall.append(float(recall)) + indel_prec.append(float(prec)) + elif typ == "SNP": + snp_recall.append(float(recall)) + snp_prec.append(float(prec)) + else: # recalculate using TP/FP/FN + if typ == "INDEL": + indel_recall.append(float(truth_tp) / float(truth_tot)) + if int(truth_tp) + int(query_fp) == 0: + indel_prec.append(1) + else: + indel_prec.append(float(truth_tp) / + (float(truth_tp) + float(query_fp))) + elif typ == "SNP": + snp_recall.append(float(truth_tp) / float(truth_tot)) + if int(truth_tp) + int(query_fp) == 0: + snp_prec.append(1) + else: + snp_prec.append(float(truth_tp) / + (float(truth_tp) + float(query_fp))) + ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.') + ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.') + +# SNP plot +ax[0].set_title("SNPs") +ax[0].set_xlabel("Recall", fontsize=15) +ax[0].set_ylabel("Precision", fontsize=15) +ax[0].set_xlim((0.9,1)) +ax[0].set_ylim((0.9,1)) + +# INDEL plot +ax[1].set_title("INDELs") +ax[1].set_xlabel("Recall", fontsize=15) +ax[1].set_ylabel("Precision", fontsize=15) +ax[1].set_xlim((0.9,1)) +ax[1].set_ylim((0.9,1)) + +ax[0].legend(["Gap 10", "Gap 20", "Gap 50", "Gap 100", "Gap 200", "WFA"]) +plt.tight_layout() +plt.savefig('img/accuracy.png', dpi=200) diff --git a/analysis-v2/clustering/README.md b/analysis-v2/clustering/README.md new file mode 100644 index 0000000..5599f1d --- /dev/null +++ b/analysis-v2/clustering/README.md @@ -0,0 +1,3 @@ +gaps = modify gap length +wfa_lens = modify variant lengths, wfa superclustering +gap_lens = modify variant lengths, gap superclustering diff --git a/analysis-v2/clustering/globals.sh b/analysis-v2/clustering/globals.sh new file mode 100755 index 0000000..5f80b9d --- /dev/null +++ b/analysis-v2/clustering/globals.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +data="/home/timdunn/vcfdist/data" +parallel="/home/timdunn/parallel-20211222/src/parallel" +vcfdist="/home/timdunn/vcfdist/src/vcfdist" +rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg" +tabix="/home/timdunn/software/htslib-1.16/tabix" +bgzip="/home/timdunn/software/htslib-1.16/bgzip" +timer="/usr/bin/time" + +out="/home/timdunn/vcfdist/analysis-v2/clustering" + +# define reference FASTA +ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta" + +# T2T information +truth_name="t2t-q100" +truth_version="v0.9" + +# query VCF information +query_names=( + # "hprc" + "pav" + # "giab-tr" +) +query_versions=( + # "v1" + "v4" + # "v4.20" +) diff --git a/analysis-v2/multi_match/1_eval_vcfs.sh b/analysis-v2/multi_match/1_eval_vcfs.sh new file mode 100755 index 0000000..5be781b --- /dev/null +++ b/analysis-v2/multi_match/1_eval_vcfs.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +source globals.sh + + # --bed $data/cmrg-v1.00/HG002_GRCh38_CMRG_smallvar_v1.00.bed \ + # --max-threads 1 \ +# vcfdist evaluation +mkdir -p $out/evals/vcfdist +for i in "${!query_names[@]}"; do + echo "vcfdist: evaluating '${query_names[i]}'" + $timer -v ../../src/vcfdist \ + $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ + $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ + $data/refs/$ref_name \ + --bed $data/${truth_name}-${truth_version}/split/bench.bed \ + --keep-query --keep-truth \ + -l 1000 \ + -p $out/evals/vcfdist/${query_names[i]}. \ + 2> $out/evals/vcfdist/${query_names[i]}.log +done diff --git a/analysis-v2/multi_match/2_plot_vcf_multi.py b/analysis-v2/multi_match/2_plot_vcf_multi.py new file mode 100644 index 0000000..843cf41 --- /dev/null +++ b/analysis-v2/multi_match/2_plot_vcf_multi.py @@ -0,0 +1,137 @@ +from collections import defaultdict +import json +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np + +query_data, truth_data = {}, {} +BKs = ["TP", "PP", "FP", "FN"] +for BK in BKs: + query_data[BK] = defaultdict(int) + truth_data[BK] = defaultdict(int) +keys = set() + +QUERY = 0 +TRUTH = 1 +SINGLE = 0 +MULTI = 1 + +TP = 0 +PP = 1 +FP = 2 +FN = 3 +results = np.zeros((4,2,2), dtype=float) + +results_fn = f"/home/timdunn/vcfdist/analysis-v2/multi_match/evals/vcfdist/giab-tr.summary.vcf" +with open(results_fn, 'r') as results_file: + line_ct = 0 + for line in results_file: + if line[0] == "#": # header + continue + contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t') + tGT, tBD, tBC, tBK, tQQ, tSC, tSG, tPB, tPS, tPF = truth.split(":") + qGT, qBD, qBC, qBK, qQQ, qSC, qSG, qPB, qPS, qPF = query.split(":") + + # flip qGT if necessary, to compare to correct truth variant + if qGT == "0|1": + if (int(qPS) + int(qPF)) % 2: + qGT = "1|0" + elif qGT == "1|0": + if (int(qPS) + int(qPF)) % 2: + qGT = "0|1" + + # parse truth variant + if tBC == ".": + pass + elif float(tBC) == 0: + truth_data["FN"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1 + keys.add(f"{contig}:{tGT}:{tSC}:{tSG}") + # print(f"Truth FN, {contig}:{tGT}:{tSC}:{tSG}") + elif float(tBC) < 1: + truth_data["PP"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1 + keys.add(f"{contig}:{tGT}:{tSC}:{tSG}") + # print(f"Truth PP, {contig}:{tGT}:{tSC}:{tSG}") + elif float(tBC) == 1: + truth_data["TP"][f"{contig}:{tGT}:{tSC}:{tSG}"] += 1 + keys.add(f"{contig}:{tGT}:{tSC}:{tSG}") + # print(f"Truth TP, {contig}:{tGT}:{tSC}:{tSG}") + else: + print("ERROR: unexpected tBC =", tBC) + + # parse query variant + if qBC == ".": + pass + elif float(qBC) == 0: + query_data["FP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1 + keys.add(f"{contig}:{qGT}:{qSC}:{qSG}") + # print(f"Query FP, {contig}:{qGT}:{qSC}:{qSG}") + elif float(qBC) < 1: + query_data["PP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1 + keys.add(f"{contig}:{qGT}:{qSC}:{qSG}") + # print(f"Query PP, {contig}:{qGT}:{qSC}:{qSG}") + elif float(qBC) == 1: + query_data["TP"][f"{contig}:{qGT}:{qSC}:{qSG}"] += 1 + keys.add(f"{contig}:{qGT}:{qSC}:{qSG}") + # print(f"Query TP, {contig}:{qGT}:{qSC}:{qSG}") + else: + print("ERROR: unexpected qBC =", qBC) + + line_ct += 1 + # if line_ct > 1000: break + +# plot results +for key in keys: + for bk, BK in enumerate(BKs): + # not in either + if truth_data[BK][key] == 0 and query_data[BK][key] == 0: + pass + # PP/TP, should be present in both + elif truth_data[BK][key] > 1 and query_data[BK][key] > 1: + results[bk][MULTI][MULTI] += truth_data[BK][key] + query_data[BK][key] + elif truth_data[BK][key] > 1 and query_data[BK][key] == 1: + results[bk][SINGLE][MULTI] += truth_data[BK][key] + query_data[BK][key] + elif truth_data[BK][key] == 1 and query_data[BK][key] > 1: + results[bk][MULTI][SINGLE] += truth_data[BK][key] + query_data[BK][key] + elif truth_data[BK][key] == 1 and query_data[BK][key] == 1: + results[bk][SINGLE][SINGLE] += truth_data[BK][key] + query_data[BK][key] + else: # present in one + if BK == "FP": + results[bk][SINGLE][SINGLE] += 1 + if truth_data[BK][key] > 0: + print("ERROR: FP key present in truth") + print(BK, key) + + elif BK == "FN": + results[bk][SINGLE][SINGLE] += 1 + if query_data[BK][key] > 0: + print("ERROR: FN key present in query") + print(BK, key) + else: + print("ERROR: PP/TP key not present for truth and query") + print(BK, key) + +fracs = [100*results[x].sum(axis=None) / results.sum(axis=None) for x in range(4)] +props = [100*results[x] / results[x].sum(axis=None) for x in range(4)] +# results = 100 * results / results.sum(axis=None) +# print(results) +cmaps = ["Greens", "Oranges", "Reds", "Blues"] +fig, ax = plt.subplots(1, 5, figsize=(15,4)) +for x in range(4): + ax[x].matshow(results[x], cmap=cmaps[x]) + ax[x].set_title(f"{BKs[x]} ({fracs[x]:.3f}%)") + ax[x].set_yticks([0, 1]) + ax[x].set_xticks([0, 1]) + ax[x].set_yticklabels(["SINGLE", "MULTI"]) + ax[x].set_xticklabels(["SINGLE", "MULTI"]) + ax[x].set_xlabel("Query") + ax[x].set_ylabel("Truth") + + for (i,j), z in np.ndenumerate(props[x]): + ax[x].text(j, i, f"{z:.3f}%", ha='center', va='center', + bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) + +ax[4].set_title("Variant Calls") +ax[4].pie(fracs, labels=BKs, autopct='%1.1f%%', explode=(0, 0.6, 0.3, 0), + colors=['green', 'orange', 'red', 'blue']) +plt.tight_layout() +plt.savefig("results.png") diff --git a/analysis-v2/multi_match/globals.sh b/analysis-v2/multi_match/globals.sh new file mode 100755 index 0000000..4b26904 --- /dev/null +++ b/analysis-v2/multi_match/globals.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +data="/home/timdunn/vcfdist/data" +out="/home/timdunn/vcfdist/analysis-v2/multi_match" +parallel="/home/timdunn/parallel-20211222/src/parallel" +vcfdist="/home/timdunn/vcfdist/src/vcfdist" +rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg" +tabix="/home/timdunn/software/htslib-1.16/tabix" +bgzip="/home/timdunn/software/htslib-1.16/bgzip" +timer="/usr/bin/time" + +# define reference FASTA +ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta" + +# T2T information +truth_name="t2t-q100" +truth_version="v0.9" + +# query VCF information +query_names=( + "hprc" + "pav" + "giab-tr" +) +query_versions=( + "v1" + "v4" + "v4.20" +) diff --git a/analysis-v2/phab_cm/1_eval_vcfs.sh b/analysis-v2/phab_cm/1_eval_vcfs.sh new file mode 100755 index 0000000..b628873 --- /dev/null +++ b/analysis-v2/phab_cm/1_eval_vcfs.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +source globals.sh + +# # truvari phab VCF normalization v4.1.0 (doesn't work) +# mkdir -p $out/phab-mafft +# mkdir -p $out/phab-wfa +# source ~/truvari/venv3.10/bin/activate +# for i in "${!query_names[@]}"; do +# echo "truvari v4.1: phab wfa for '${query_names[i]}'" +# truvari phab \ +# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# --bSamples HG002 \ +# --cSamples HG002 \ +# -f $data/refs/$ref_name \ +# --align wfa \ +# -t 64 \ +# -o $out/phab-wfa/${query_names[i]}.vcf.gz \ +# 2> $out/phab-wfa/${query_names[i]}.log +# echo "truvari v4.1: phab mafft for '${query_names[i]}'" +# truvari phab \ +# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# --bSamples HG002 \ +# --cSamples HG002 \ +# -f $data/refs/$ref_name \ +# --align mafft \ +# -t 64 \ +# -o $out/phab-mafft/${query_names[i]}.vcf.gz \ +# 2> $out/phab-mafft/${query_names[i]}.log +# done +# deactivate + +# # truvari phab VCF normalization v4.0 +# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate +# mkdir -p $out/phab +# for i in "${!query_names[@]}"; do +# echo "truvari v4.0: phab mafft for '${query_names[i]}'" +# truvari phab \ +# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# --bSamples HG002 \ +# --cSamples HG002 \ +# -f $data/refs/$ref_name \ +# -t 64 \ +# -o $out/phab/${query_names[i]}.vcf.gz \ +# 2> $out/phab/${query_names[i]}.log +# done +# deactivate + +# # split phab normalized VCF into truth/query +# in_samples=( +# "HG002" +# "p:HG002" +# ) +# out_samples=( +# "TRUTH" +# "QUERY" +# ) +# for s in "${out_samples[@]}"; do +# echo $s > "${s}.txt" +# done +# for i in "${!query_names[@]}"; do +# in_file="$out/phab/${query_names[i]}.vcf.gz" +# for s in "${!in_samples[@]}"; do +# bcftools view -c1 -s ${in_samples[s]} $in_file | bcftools reheader -s ${out_samples[s]}.txt | +# sed "s/\//|/g" | bcftools view -Oz &> $out/phab/${query_names[i]}.${out_samples[s]}.vcf.gz +# tabix -p vcf $out/phab/${query_names[i]}.${out_samples[s]}.vcf.gz +# done +# done + +# # vcfdist evaluation +# mkdir -p $out/vcfdist +# for i in "${!query_names[@]}"; do +# echo "vcfdist: evaluating '${query_names[i]}'" +# $timer -v /home/timdunn/vcfdist/src/vcfdist \ +# $out/phab/${query_names[i]}.QUERY.vcf.gz \ +# $out/phab/${query_names[i]}.TRUTH.vcf.gz \ +# $data/refs/$ref_name \ +# --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# -l 1000 \ +# -p $out/vcfdist/${query_names[i]}. \ +# 2> $out/vcfdist/${query_names[i]}.log +# done + +# # vcfeval evaluation +# mkdir -p $out/vcfeval +# for i in "${!query_names[@]}"; do +# echo "vcfeval: evaluating '${query_names[i]}'" +# $timer -v $rtg vcfeval \ +# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ +# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ +# -t $data/refs/${ref_name}.sdf \ +# -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# --threads 64 \ +# --all-records \ +# -o $out/vcfeval/${query_names[i]} \ +# 2> $out/vcfeval/${query_names[i]}.log +# done + +# Truvari evaluation +mkdir -p $out/truvari +source ~/software/Truvari-v4.0.0/venv3.10/bin/activate +for i in "${!query_names[@]}"; do + echo "Truvari: evaluating '${query_names[i]}'" + $timer -v truvari bench \ + -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ + -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ + -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ + --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ + --no-ref a \ + --sizemin 1 --sizefilt 1 --sizemax 1000 \ + --pick multi \ + --typeignore \ + --dup-to-ins \ + -o $out/truvari/${query_names[i]} + + laytr tru2ga \ + -i $out/truvari/${query_names[i]}/ \ + -o $out/truvari/${query_names[i]}/result + gunzip $out/truvari/${query_names[i]}/result*.gz +done +deactivate diff --git a/analysis-v2/phab_cm/globals.sh b/analysis-v2/phab_cm/globals.sh new file mode 100755 index 0000000..48a102a --- /dev/null +++ b/analysis-v2/phab_cm/globals.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +data="/home/timdunn/vcfdist/data" +out="/home/timdunn/vcfdist/analysis-v2/phab_cm" +parallel="/home/timdunn/parallel-20211222/src/parallel" +vcfdist="/home/timdunn/vcfdist/src/vcfdist" +rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg" +tabix="/home/timdunn/software/htslib-1.16/tabix" +bgzip="/home/timdunn/software/htslib-1.16/bgzip" +timer="/usr/bin/time" + +# define reference FASTA +ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta" + +# T2T information +truth_name="t2t-q100" +truth_version="v0.9" + +# query VCF information +query_names=( + "hprc" + "pav" + "giab-tr" +) +query_versions=( + "v1" + "v4" + "v4.20" +) diff --git a/analysis/13_truvari_compare.py b/analysis-v2/phab_cm/vcfdist_truvari_compare.py similarity index 97% rename from analysis/13_truvari_compare.py rename to analysis-v2/phab_cm/vcfdist_truvari_compare.py index 1fcae53..7c768d7 100644 --- a/analysis/13_truvari_compare.py +++ b/analysis-v2/phab_cm/vcfdist_truvari_compare.py @@ -2,12 +2,9 @@ import numpy as np import matplotlib.pyplot as plt -# full genome, no phab -# truvari_prefix = "../out/giabtr/truvari-init/result_" -# vcfdist_prefix = "../out/giabtr/vcfdist-keep/" # chr20, with phab -truvari_prefix = "../out/giabtr/truvari-norm-phab-chr20/result_" -vcfdist_prefix = "../out/giabtr/vcfdist-norm-phab-chr20/" +truvari_prefix = "./truvari/pav/result_" +vcfdist_prefix = "./vcfdist/pav." do_print = True SIZE = 0 diff --git a/analysis-v2/phasing/0_create_filtered_vcfs.sh b/analysis-v2/phasing/0_create_filtered_vcfs.sh new file mode 100755 index 0000000..8238a28 --- /dev/null +++ b/analysis-v2/phasing/0_create_filtered_vcfs.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +echo "filtering pav" +bcftools filter \ + ~/vcfdist/data/pav-v4/split/pav.all.vcf.gz \ + -R ~/vcfdist/data/t2t-q100-v0.9/split/bench.bed \ + -o ./vcfs/pav.vcf + +echo "filtering t2t-q100" +bcftools filter \ + ~/vcfdist/data/t2t-q100-v0.9/split/t2t-q100.all.vcf.gz \ + -R ~/vcfdist/data/t2t-q100-v0.9/split/bench.bed \ + -o ./vcfs/t2t-q100.vcf diff --git a/analysis/14_phase_analysis.py b/analysis-v2/phasing/14_phase_analysis.py similarity index 100% rename from analysis/14_phase_analysis.py rename to analysis-v2/phasing/14_phase_analysis.py diff --git a/analysis-v2/phasing/1_whatshap_compare.sh b/analysis-v2/phasing/1_whatshap_compare.sh new file mode 100755 index 0000000..398f260 --- /dev/null +++ b/analysis-v2/phasing/1_whatshap_compare.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +/home/timdunn/truvari/venv3.10/bin/whatshap compare \ + --names pav,t2t-q100 \ + ./vcfs/pav.vcf \ + ./vcfs/t2t-q100.vcf \ + --tsv-pairwise pav.tsv \ + --switch-error-bed switches.bed diff --git a/analysis-v2/phasing/parse_vcfs.py b/analysis-v2/phasing/parse_vcfs.py new file mode 100644 index 0000000..04f584b --- /dev/null +++ b/analysis-v2/phasing/parse_vcfs.py @@ -0,0 +1,64 @@ + +truth_vcf_fn = "./vcfs/t2t-q100.vcf" +query_vcf_fn = "./vcfs/pav.vcf" +out = open("diffs.txt", "w") + +# filter positions with multiple variants (like WhatsHap) +print("Finding query duplicates") +query_pos = set() +query_dups = set() +with open(query_vcf_fn, "r") as query_vcf: + for line in query_vcf: + if line[0] == "#": continue # skip header + fields = line.split() + gt = fields[9].split(":")[0] + pos = f"{fields[0]}:{fields[1]}:{gt}" + if pos in query_pos: + query_dups.add(pos) + query_pos.add(pos) + +print("Finding truth duplicates") +truth_pos = set() +truth_dups = set() +with open(truth_vcf_fn, "r") as truth_vcf: + for line in truth_vcf: + if line[0] == "#": continue # skip header + fields = line.split() + gt = fields[9].split(":")[0] + pos = f"{fields[0]}:{fields[1]}:{gt}" + print(pos) + if pos in truth_pos: + truth_dups.add(pos) + truth_pos.add(pos) + +query_vars = [] +query_gts = {} +truth_gts = {} +count = 0 +print("Parsing query VCF") +with open(query_vcf_fn, "r") as query_vcf: + for line in query_vcf: + if line[0] == "#": continue # skip header + fields = line.split() + gt = fields[9].split(":")[0] + if gt == "1|1": continue + if f"{fields[0]}:{fields[1]}" in query_dups: continue + var_id = f"{fields[0]}:{fields[1]}={fields[3]}>{fields[4]}" + query_vars.append(var_id) + query_gts[var_id] = gt + +print("Parsing truth VCF") +with open(truth_vcf_fn, "r") as truth_vcf: + for line in truth_vcf: + if line[0] == "#": continue # skip header + fields = line.split() + gt = fields[9].split(":")[0] + if gt == "1|1": continue + if f"{fields[0]}:{fields[1]}" in truth_dups: continue + var_id = f"{fields[0]}:{fields[1]}={fields[3]}>{fields[4]}" + truth_gts[var_id] = gt + +print("Printing GT mismatches") +for var in query_vars: + if var in truth_gts.keys() and query_gts[var] != truth_gts[var]: + print(f"variant: {var}\tquery:{query_gts[var]}\ttruth:{truth_gts[var]}", file=out) diff --git a/analysis-v2/small_sv_all/0_split.sh b/analysis-v2/small_sv_all/0_split.sh new file mode 100755 index 0000000..0a6eea0 --- /dev/null +++ b/analysis-v2/small_sv_all/0_split.sh @@ -0,0 +1,47 @@ +#!/bin/bash +data="/home/timdunn/vcfdist/data" +src="pav" +version="v4" +base="${data}/${src}-${version}/split/${src}" + +# split VCF into sub-VCFs of SNPs, INDELs, SVs + +echo "$src snp" +bcftools view \ + -i 'TYPE=="SNP"'\ + -Oz \ + -o ${base}.snv.vcf.gz \ + ${base}.all.vcf.gz +tabix -p vcf ${base}.snv.vcf.gz + +echo "$src indel" +bcftools view \ + -i 'ILEN < 50 && ILEN > -50'\ + -Oz \ + -o ${base}.indel.vcf.gz \ + ${base}.all.vcf.gz +tabix -p vcf ${base}.indel.vcf.gz + +echo "$src sv" +bcftools view \ + -i 'ILEN <= -50 || ILEN >= 50'\ + -Oz \ + -o ${base}.sv.vcf.gz \ + ${base}.all.vcf.gz +tabix -p vcf ${base}.sv.vcf.gz + +echo "$src small" +bcftools view \ + -i 'TYPE=="SNP" || (ILEN < 50 && ILEN > -50)'\ + -Oz \ + -o ${base}.small.vcf.gz \ + ${base}.all.vcf.gz +tabix -p vcf ${base}.small.vcf.gz + +echo "$src large" +bcftools view \ + -i 'ILEN <= -1 || ILEN >= 1'\ + -Oz \ + -o ${base}.large.vcf.gz \ + ${base}.all.vcf.gz +tabix -p vcf ${base}.large.vcf.gz diff --git a/analysis-v2/small_sv_all/1_eval_vcfs.sh b/analysis-v2/small_sv_all/1_eval_vcfs.sh new file mode 100755 index 0000000..5b1ab6c --- /dev/null +++ b/analysis-v2/small_sv_all/1_eval_vcfs.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +source globals.sh + +# vcfdist evaluation +mkdir -p vcfdist +for cat in "${splits[@]}"; do + for i in "${!ds_names[@]}"; do + echo "vcfdist: evaluating '${ds_names[i]}' ${cat}" + $timer -v $vcfdist \ + $data/${ds_names[i]}-${ds_versions[i]}/split/${ds_names[i]}.${cat}.vcf.gz \ + $data/t2t-q100-v0.9/split/t2t-q100.${cat}.vcf.gz \ + $ref_name \ + --bed $data/t2t-q100-v0.9/GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed \ + --keep-query --keep-truth \ + -l 1000 \ + -p $dir/vcfdist/${ds_names[i]}.${cat}. \ + 2> $dir/vcfdist/log_${ds_names[i]}.${cat}.txt + done +done diff --git a/analysis-v2/small_sv_all/2_stratify_vcfs.sh b/analysis-v2/small_sv_all/2_stratify_vcfs.sh new file mode 100755 index 0000000..3a1a2b7 --- /dev/null +++ b/analysis-v2/small_sv_all/2_stratify_vcfs.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +source globals.sh + +$parallel \ + "bgzip ./vcfdist/{1}.{2}.summary.vcf" ::: \ + ${ds_names[@]} ::: ${splits[@]} + +$parallel \ + "tabix -p vcf ./vcfdist/{1}.{2}.summary.vcf.gz" ::: \ + ${ds_names[@]} ::: ${splits[@]} + +$parallel \ + "bcftools view \ + -R $data/genome-stratifications-v3.3/GRCh38_{3}.bed.gz \ + -Ou \ + ./vcfdist/{1}.{2}.summary.vcf.gz \ + -o ./vcfdist/{1}.{2}.{3}.vcf" ::: \ + ${ds_names[@]} ::: ${splits[@]} ::: ${bednames[@]} + +$parallel \ + "gunzip ./vcfdist/{1}.{2}.summary.vcf.gz" ::: \ + ${ds_names[@]} ::: ${splits[@]} diff --git a/analysis-v2/small_sv_all/3_plot_vcf_tp.py b/analysis-v2/small_sv_all/3_plot_vcf_tp.py new file mode 100644 index 0000000..bd04c8b --- /dev/null +++ b/analysis-v2/small_sv_all/3_plot_vcf_tp.py @@ -0,0 +1,122 @@ +from collections import defaultdict +import json +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np + +vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"] +variant_types = ["snv", "indel", "sv"] +vcf_types = ["snv", "indel", "sv", "small", "large", "all"] +regions = [ + "summary", + "alldifficultregions", + "AllHomopolymers_ge7bp_imperfectge11bp_slop5", + "AllTandemRepeats", + "AllTandemRepeatsandHomopolymers_slop5", + "MHC", + "satellites_slop5", + "segdups", + "alllowmapandsegdupregions" +] + +for region in regions: +# initialize counts + print(region) + counts = {} + partial = {} + for vcf in vcfs: + counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + partial[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + colors = ["yellow", "blue", "red", "green", "orange", "purple"] + +# count variants + for vcf in vcfs: + print(f" {vcf}") + for typ in vcf_types: + results_fn = f"vcfdist/{vcf}.{typ}.{region}.vcf" + with open(results_fn, 'r') as results_file: + line_ct = 0 + for line in results_file: + if line[0] == "#": # header + continue + contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t') + gt, decision, credit, ga4gh_cat, qual2, sc, swap = truth.split(":") + if credit == '.': continue + haps = sum([0 if x == '.' else int(x) for x in gt.split('|')]) + if decision == "TP" and float(credit) == 1: + if len(ref) == len(alt) == 1: # snp + counts[vcf]["snv"][typ] += haps + elif len(ref) == 1 and len(alt) > 1: # insertion + if len(alt) > 50: + counts[vcf]["sv"][typ] += haps + else: + counts[vcf]["indel"][typ] += haps + elif len(alt) == 1 and len(ref) > 1: # deletion + if len(ref) > 50: + counts[vcf]["sv"][typ] += haps + else: + counts[vcf]["indel"][typ] += haps + else: + print("ERROR: unexpected variant type") + elif float(credit) > 0: + if len(ref) == len(alt) == 1: # snp + partial[vcf]["snv"][typ] += haps + elif len(ref) == 1 and len(alt) > 1: # insertion + if len(alt) > 50: + partial[vcf]["sv"][typ] += haps + else: + partial[vcf]["indel"][typ] += haps + elif len(alt) == 1 and len(ref) > 1: # deletion + if len(ref) > 50: + partial[vcf]["sv"][typ] += haps + else: + partial[vcf]["indel"][typ] += haps + else: + print("ERROR: unexpected variant type") + line_ct += 1 + # if line_ct > 1000: break + print(json.dumps(counts, indent=4)) + + fig, ax = plt.subplots(1, 3, figsize=(15,6)) + indices = np.arange(len(vcfs)-1) + width = 0.1 + yquals = [0, 3.01, 6.99, 10, 13.01, 16.99, 20, 23.01, 26.99, 30, # 33.01, 36.99, 40, 43.01, 46.99, 50 + ] + ylabels = ["0%", "50%", "80%", "90%", "95%", "98%", "99%", "99.5%", "99.8%", "99.9%", # "99.95%", "99.98%", "99.99%", "99.995%", "99.998%", "99.999%" + ] + for var_type_idx, var_type in enumerate(variant_types): + for vcf_type_idx, vcf_type in enumerate(vcf_types): + + # skip certain plots (few variants due to complex not getting filtered) + if var_type == "snv" and vcf_type in ["indel", "large", "sv"]: continue + if var_type == "indel" and vcf_type in ["snv", "sv"]: continue + if var_type == "sv" and vcf_type in ["snv", "indel", "small"]: continue + + tp_fracs = [counts[vcf][var_type][vcf_type] / + max(1, counts["t2t-q100"][var_type][vcf_type]) for vcf in vcfs[1:]] + tp_qscores = [50 if frac == 1 else -10*np.log10(1-frac) for frac in tp_fracs] + pp_fracs = [partial[vcf][var_type][vcf_type] / + max(1, counts["t2t-q100"][var_type][vcf_type]) for vcf in vcfs[1:]] + pptp_fracs = [tp_frac + pp_frac for (pp_frac, tp_frac) in zip(pp_fracs, tp_fracs)] + pptp_qscores = [50 if frac == 1 else -10*np.log10(1-frac) for frac in pptp_fracs] + + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + tp_qscores, width, color=colors[vcf_type_idx]) + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + [pptpq-tpq for (pptpq, tpq) in zip(pptp_qscores, tp_qscores)], + width, color=colors[vcf_type_idx], + bottom=tp_qscores, alpha=0.5) + for yqual in yquals: + ax[var_type_idx].axhline(y=yqual, color='k', alpha=0.5, linestyle=':', zorder=-1) + ax[var_type_idx].set_title(var_type) + ax[var_type_idx].set_xlabel("VCFs") + ax[var_type_idx].set_xticks(indices) + ax[var_type_idx].set_xticklabels(vcfs[1:]) + ax[var_type_idx].set_yticks(yquals) + ax[var_type_idx].set_yticklabels(ylabels, fontsize=8) + ax[var_type_idx].set_ylim(0,30) + patches = [mpatches.Patch(color=c, label=l)for c,l in zip(colors, vcf_types)] + ax[2].legend(handles=patches, loc=(0.6,0.6)) + ax[0].set_ylabel("True Positive Recall") + plt.suptitle(f"{region}") + plt.savefig(f"counts-{region}.png", dpi=300) diff --git a/analysis-v2/small_sv_all/3_test_plot_tsv.py b/analysis-v2/small_sv_all/3_test_plot_tsv.py new file mode 100644 index 0000000..7880544 --- /dev/null +++ b/analysis-v2/small_sv_all/3_test_plot_tsv.py @@ -0,0 +1,85 @@ +from collections import defaultdict +import json +import matplotlib.pyplot as plt +import numpy as np + +vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"] +variant_types = ["snv", "indel", "sv"] +vcf_types = ["snv", "indel", "sv", "small", "large", "all"] + +# initialize counts +counts = {} +partial = {} +for vcf in vcfs: + counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + partial[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} +colors = ["yellow", "blue", "red", "green", "orange", "purple"] + +# count variants +for vcf in vcfs: + for typ in vcf_types: + print(f"{vcf} {typ}") + results_fn = f"vcfdist/{vcf}.{typ}.query.tsv" + with open(results_fn, 'r') as results_file: + next(results_file) # skip header + line_ct = 0 + for line in results_file: + contig, pos, hap, ref, alt, qual, var_type, err_type, credit, clust, sc, loc = line.strip().split('\t') + if err_type == "TP": + if len(ref) == len(alt) == 1 and var_type == "SNP": # snp + counts[vcf]["snv"][typ] += 1 + elif len(ref) == 0 and var_type == "INS": # insertion + if len(alt) >= 50: + counts[vcf]["sv"][typ] += 1 + else: + counts[vcf]["indel"][typ] += 1 + elif len(alt) == 0 and var_type == "DEL": # deletion + if len(ref) >= 50: + counts[vcf]["sv"][typ] += 1 + else: + counts[vcf]["indel"][typ] += 1 + else: + print("ERROR: unexpected variant type") + if err_type == "PP": + if len(ref) == len(alt) == 1 and var_type == "SNP": # snp + partial[vcf]["snv"][typ] += 1 + elif len(ref) == 0 and var_type == "INS": # insertion + if len(alt) >= 50: + partial[vcf]["sv"][typ] += 1 + else: + partial[vcf]["indel"][typ] += 1 + elif len(alt) == 0 and var_type == "DEL": # deletion + if len(ref) >= 50: + partial[vcf]["sv"][typ] += 1 + else: + partial[vcf]["indel"][typ] += 1 + else: + print("ERROR: unexpected variant type") + line_ct += 1 + # if line_ct > 1000: break +print(json.dumps(counts, indent=4)) + +fig, ax = plt.subplots(1, 3, figsize=(15,5)) +indices = np.arange(len(vcfs)) +width = 0.1 +for var_type_idx, var_type in enumerate(variant_types): + for vcf_type_idx, vcf_type in enumerate(vcf_types): + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + [counts[vcf][var_type][vcf_type] for vcf in vcfs], width, color=colors[vcf_type_idx]) + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + [partial[vcf][var_type][vcf_type] for vcf in vcfs], width, color=colors[vcf_type_idx], + bottom=[counts[vcf][var_type][vcf_type] for vcf in vcfs], alpha=0.5) + ax[var_type_idx].set_title(var_type) + ax[var_type_idx].set_ylabel("Counts") + ax[var_type_idx].set_xlabel("VCFs") + ax[var_type_idx].set_xticks(indices) + ax[var_type_idx].set_xticklabels(vcfs) +ax[0].set_ylim(0, 6_000_000) +ax[1].set_ylim(0, 1_400_000) +ax[2].set_ylim(0, 35_000) +fig.tight_layout() +ax[2].legend(vcf_types, loc=(0.6,0.6)) +leg = ax[2].get_legend() +for i in range(len(colors)): + leg.legendHandles[i].set_color(colors[i]) +plt.savefig("counts.png", dpi=300) diff --git a/analysis-v2/small_sv_all/4_plot_vcf_fp.py b/analysis-v2/small_sv_all/4_plot_vcf_fp.py new file mode 100644 index 0000000..67c60fa --- /dev/null +++ b/analysis-v2/small_sv_all/4_plot_vcf_fp.py @@ -0,0 +1,135 @@ +from collections import defaultdict +import json +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np + +vcfs = ["t2t-q100", "hprc", "pav", "giab-tr"] +variant_types = ["snv", "indel", "sv"] +vcf_types = ["snv", "indel", "sv", "small", "large", "all"] +regions = [ + "summary", + "alldifficultregions", + "AllHomopolymers_ge7bp_imperfectge11bp_slop5", + "AllTandemRepeats", + "AllTandemRepeatsandHomopolymers_slop5", + "MHC", + "satellites_slop5", + "segdups", + "alllowmapandsegdupregions" +] + +for region in regions: +# initialize fp_counts + print(region) + fp_counts = {} + pp_counts = {} + tp_counts = {} + for vcf in vcfs: + fp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + pp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + tp_counts[vcf] = {"snv": defaultdict(int), "indel": defaultdict(int), "sv": defaultdict(int)} + colors = ["yellow", "blue", "red", "green", "orange", "purple"] + +# count variants + for vcf in vcfs: + print(f" {vcf}") + for typ in vcf_types: + results_fn = f"vcfdist/{vcf}.{typ}.{region}.vcf" + with open(results_fn, 'r') as results_file: + line_ct = 0 + for line in results_file: + if line[0] == "#": # header + continue + contig, pos, var_id, ref, alt, qual, filt, info, fmt, truth, query = line.strip().split('\t') + gt, decision, credit, ga4gh_cat, qual2, sc, swap = query.split(":") + if credit == '.': continue + haps = sum([0 if x == '.' else int(x) for x in gt.split('|')]) + if decision == "FP" and float(credit) == 0: + if len(ref) == len(alt) == 1: # snp + fp_counts[vcf]["snv"][typ] += haps + elif len(ref) == 1 and len(alt) > 1: # insertion + if len(alt) > 50: + fp_counts[vcf]["sv"][typ] += haps + else: + fp_counts[vcf]["indel"][typ] += haps + elif len(alt) == 1 and len(ref) > 1: # deletion + if len(ref) > 50: + fp_counts[vcf]["sv"][typ] += haps + else: + fp_counts[vcf]["indel"][typ] += haps + else: + print("ERROR: unexpected variant type") + elif float(credit) < 1: + if len(ref) == len(alt) == 1: # snp + pp_counts[vcf]["snv"][typ] += haps + elif len(ref) == 1 and len(alt) > 1: # insertion + if len(alt) > 50: + pp_counts[vcf]["sv"][typ] += haps + else: + pp_counts[vcf]["indel"][typ] += haps + elif len(alt) == 1 and len(ref) > 1: # deletion + if len(ref) > 50: + pp_counts[vcf]["sv"][typ] += haps + else: + pp_counts[vcf]["indel"][typ] += haps + else: + print("ERROR: unexpected variant type") + else: # credit == 1 + if len(ref) == len(alt) == 1: # snp + tp_counts[vcf]["snv"][typ] += haps + elif len(ref) == 1 and len(alt) > 1: # insertion + if len(alt) > 50: + tp_counts[vcf]["sv"][typ] += haps + else: + tp_counts[vcf]["indel"][typ] += haps + elif len(alt) == 1 and len(ref) > 1: # deletion + if len(ref) > 50: + tp_counts[vcf]["sv"][typ] += haps + else: + tp_counts[vcf]["indel"][typ] += haps + else: + print("ERROR: unexpected variant type") + line_ct += 1 + # if line_ct > 1000: break + print(json.dumps(fp_counts, indent=4)) + + fig, ax = plt.subplots(1, 3, figsize=(15,6)) + indices = np.arange(len(vcfs)-1) + width = 0.1 + thirty_less_yquals = [0, 3.01, 6.99, 10, 13.01, 16.99, 20, 23.01, 26.99, 30] + ylabels = ["0.1%", "0.2%", "0.5%", "1%", "2%", "5%", "10%", "20%", "50%", "100%"] + for var_type_idx, var_type in enumerate(variant_types): + for vcf_type_idx, vcf_type in enumerate(vcf_types): + + # skip certain plots (few variants due to complex not getting filtered) + if var_type == "snv" and vcf_type in ["indel", "large", "sv"]: continue + if var_type == "indel" and vcf_type in ["snv", "sv"]: continue + if var_type == "sv" and vcf_type in ["snv", "indel", "small"]: continue + + fp = [fp_counts[vcf][var_type][vcf_type]/max(1, tp_counts[vcf][var_type][vcf_type]) for vcf in vcfs[1:]] + fpq = [0 if not frac else -10*np.log10(frac) for frac in fp] + pp = [pp_counts[vcf][var_type][vcf_type]/max(1, tp_counts[vcf][var_type][vcf_type]) for vcf in vcfs[1:]] + ppq = [0 if not frac else -10*np.log10(frac) for frac in pp] + ppfp = [fpf + ppf for (fpf, ppf) in zip(fp, pp)] + ppfpq = [0 if not frac else -10*np.log10(frac) for frac in ppfp] + + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + [30-fpqual for fpqual in fpq], width, color=colors[vcf_type_idx]) + ax[var_type_idx].bar(indices-2*width+vcf_type_idx*width, + [-(ppfpqual-fpqual) for (ppfpqual, fpqual) in zip(ppfpq, fpq)], + width, color=colors[vcf_type_idx], + bottom=[30-fpqual for fpqual in fpq], alpha=0.5) + for yqual in thirty_less_yquals: + ax[var_type_idx].axhline(y=yqual, color='k', alpha=0.5, linestyle=':', zorder=-1) + ax[var_type_idx].set_title(var_type) + ax[var_type_idx].set_xlabel("VCFs") + ax[var_type_idx].set_xticks(indices) + ax[var_type_idx].set_xticklabels(vcfs[1:]) + ax[var_type_idx].set_yticks(thirty_less_yquals) + ax[var_type_idx].set_yticklabels(ylabels, fontsize=8) + patches = [mpatches.Patch(color=c, label=l)for c,l in zip(colors, vcf_types)] + ax[2].legend(handles=patches, loc=(0.6,0.6)) + ax[0].set_ylabel("False Positive Rate") + plt.suptitle(f"{region}") + plt.savefig(f"counts-{region}-fp.png", dpi=300) diff --git a/analysis-v2/small_sv_all/5_extract_diffs.py b/analysis-v2/small_sv_all/5_extract_diffs.py new file mode 100644 index 0000000..9ec1dc2 --- /dev/null +++ b/analysis-v2/small_sv_all/5_extract_diffs.py @@ -0,0 +1,175 @@ +import sys + +datasets = ["hprc", "pav", "giab-tr"] +vcf_types = ["small", "sv"] + +def RED(text): + return "\x1b[0;31;49m" + text + "\033[0m" +def YLW(text): + return "\x1b[0;33;49m" + text + "\033[0m" +def GRN(text): + return "\x1b[0;32;49m" + text + "\033[0m" +# def RED(text): +# return "\x1b[7;49;31m" + text + "\033[0m" +# def YLW(text): +# return "\x1b[7;49;33m" + text + "\033[0m" +# def GRN(text): +# return "\x1b[7;49;32m" + text + "\033[0m" + +all_sc_dict = {} +for ds in datasets: + for vcf in vcf_types: + print(f"DATASET: {ds}, SUBSET: {vcf}") + # subset, not substitution mutation + sub_file = open(f"vcfdist/{ds}.{vcf}.summary.vcf", 'r') + all_file = open(f"vcfdist/{ds}.all.summary.vcf", 'r') + sub_vars = [l for l in sub_file.readlines() if l[0] != "#"] + all_vars = [l for l in all_file.readlines() if l[0] != "#"] + out_file = open(f"diffs/{ds}.{vcf}_vs_all.txt", 'w') + + # get all differences in subset (per supercluster) + sub_scs = [] + all_scs = [] + sc_ctgs = [] + svi = 0 + for avi, all_line in enumerate(all_vars): + # extract fields + all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \ + all_info, all_fmt, all_truth, all_query = \ + all_line.strip().split('\t') + if svi >= len(sub_vars): break + sub_line = sub_vars[svi] + sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \ + sub_info, sub_fmt, sub_truth, sub_query = \ + sub_line.strip().split('\t') + + # extract tags + all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":") + all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":") + all_SC = all_tSC if all_qSC == "." else all_qSC + + sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":") + sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":") + sub_SC = sub_tSC if sub_qSC == "." else sub_qSC + + # same variant + if sub_ctg == all_ctg and sub_pos == all_pos and \ + sub_ref == all_ref and sub_alt == all_alt: + + # create many-to-one mapping from sub_scs to all_scs (larger due to more variants) + if sub_SC != "." and all_SC != ".": + all_sc_dict[sub_ctg+":"+sub_SC] = all_SC + + # skip exact matches + if sub_qBD == all_qBD and sub_qBC == all_qBC: + svi += 1 + continue + + # different classification + if not len(all_scs) or int(all_SC) != all_scs[-1]: + sub_scs.append(int(sub_SC)) + all_scs.append(int(all_SC)) + sc_ctgs.append(sub_ctg) + print(sub_ctg, sub_SC, all_SC) + svi += 1 + + else: # different variant + if sub_ctg == all_ctg and int(sub_pos) < int(all_pos): + # this only occurs with variant missing from 'all' file due to overlap + svi += 1 + + # print superclusters + svi = 0 + avi = 0 + for sc_ctg, (sub_sc, all_sc) in zip(sc_ctgs, zip(sub_scs, all_scs)): + print(f"ctg: {sc_ctg}\t{vcf.upper()} supercluster: {sub_sc}\tALL supercluster: {all_sc}\t{RED('FP/FN')} {YLW('PP')} {GRN('TP')}", file=out_file) + print("\tCONTIG\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTRUTH\tQUERY", file=out_file) + + # get info for all vars + all_line = all_vars[avi] + all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \ + all_info, all_fmt, all_truth, all_query = \ + all_line.strip().split('\t') + all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":") + all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":") + all_SC = all_tSC if all_qSC == "." else all_qSC + all_BC = all_tBC if all_qBC == "." else all_qBC + all_BC = max(float(0 if all_tBC == "." else all_tBC), \ + float(0 if all_qBC == "." else all_qBC)) + + # find supercluster start + while int(all_SC) < all_sc or all_ctg != sc_ctg: + avi += 1 + all_line = all_vars[avi] + all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \ + all_info, all_fmt, all_truth, all_query = \ + all_line.strip().split('\t') + all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":") + all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":") + all_SC = all_tSC if all_qSC == "." else all_qSC + all_BC = max(float(0 if all_tBC == "." else all_tBC), \ + float(0 if all_qBC == "." else all_qBC)) + + # get info for sub vars + sub_line = sub_vars[svi] + sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \ + sub_info, sub_fmt, sub_truth, sub_query = \ + sub_line.strip().split('\t') + sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":") + sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":") + sub_SC = sub_tSC if sub_qSC == "." else sub_qSC + sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \ + float(0 if sub_qBC == "." else sub_qBC)) + + # find supercluster start + while int(sub_SC) < sub_sc or sub_ctg != sc_ctg: + svi += 1 + sub_line = sub_vars[svi] + sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \ + sub_info, sub_fmt, sub_truth, sub_query = \ + sub_line.strip().split('\t') + sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":") + sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":") + sub_SC = sub_tSC if sub_qSC == "." else sub_qSC + sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \ + float(0 if sub_qBC == "." else sub_qBC)) + + # print supercluster + while sub_ctg+":"+sub_SC in all_sc_dict and all_sc_dict[sub_ctg+":"+sub_SC] == all_SC: + if float(sub_BC) == 0: + print(f"{vcf.upper().rjust(6)}: {RED(sub_line)}", end="", file=out_file) + elif float(sub_BC) == 1: + print(f"{vcf.upper().rjust(6)}: {GRN(sub_line)}", end="", file=out_file) + else: + print(f"{vcf.upper().rjust(6)}: {YLW(sub_line)}", end="", file=out_file) + svi += 1 + sub_line = sub_vars[svi] + sub_ctg, sub_pos, sub_name, sub_ref, sub_alt, sub_qual, sub_filt, \ + sub_info, sub_fmt, sub_truth, sub_query = \ + sub_line.strip().split('\t') + sub_qGT, sub_qBD, sub_qBC, sub_qBK, sub_qQQ, sub_qSC, sub_qSP = sub_query.split(":") + sub_tGT, sub_tBD, sub_tBC, sub_tBK, sub_tQQ, sub_tSC, sub_tSP = sub_truth.split(":") + sub_SC = sub_tSC if sub_qSC == "." else sub_qSC + sub_BC = max(float(0 if sub_tBC == "." else sub_tBC), \ + float(0 if sub_qBC == "." else sub_qBC)) + + # print supercluster + print(" ", file=out_file) + while int(all_SC) == all_sc and all_ctg == sc_ctg: + if float(all_BC) == 0: + print(f" ALL: {RED(all_line)}", end="", file=out_file) + elif float(all_BC) == 1: + print(f" ALL: {GRN(all_line)}", end="", file=out_file) + else: + print(f" ALL: {YLW(all_line)}", end="", file=out_file) + avi += 1 + all_line = all_vars[avi] + all_ctg, all_pos, all_name, all_ref, all_alt, all_qual, all_filt, \ + all_info, all_fmt, all_truth, all_query = \ + all_line.strip().split('\t') + all_qGT, all_qBD, all_qBC, all_qBK, all_qQQ, all_qSC, all_qSP = all_query.split(":") + all_tGT, all_tBD, all_tBC, all_tBK, all_tQQ, all_tSC, all_tSP = all_truth.split(":") + all_SC = all_tSC if all_qSC == "." else all_qSC + all_BC = max(float(0 if all_tBC == "." else all_tBC), \ + float(0 if all_qBC == "." else all_qBC)) + print("\n", file=out_file) diff --git a/analysis-v2/small_sv_all/globals.sh b/analysis-v2/small_sv_all/globals.sh new file mode 100755 index 0000000..f3388b2 --- /dev/null +++ b/analysis-v2/small_sv_all/globals.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +data="/home/timdunn/vcfdist/data" +dir="/home/timdunn/vcfdist/analysis-v2/small_sv_all" +parallel="/home/timdunn/parallel-20211222/src/parallel" +vcfdist="/home/timdunn/vcfdist/src/vcfdist" +rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg" +tabix="/home/timdunn/software/htslib-1.16/tabix" +bgzip="/home/timdunn/software/htslib-1.16/bgzip" +timer="/usr/bin/time" + +# define reference FASTA +ref_name="${data}/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta" + +# filename information for each dataset +ds_names=( + # "t2t-q100" # must always be included as first VCF + # "hprc" + "pav" + # "giab-tr" +) +ds_versions=( + # "v0.9" + # "v1" + "v4" + # "v4.20" +) +splits=( + "snv" + "indel" + "sv" + "small" + "large" + "all" +) +bednames=( + "alldifficultregions" + "AllHomopolymers_ge7bp_imperfectge11bp_slop5" + "AllTandemRepeats" + "AllTandemRepeatsandHomopolymers_slop5" + "MHC" + "satellites_slop5" + "segdups" + "alllowmapandsegdupregions" +) diff --git a/analysis-v2/variants/1_plot_vars.py b/analysis-v2/variants/1_plot_vars.py new file mode 100644 index 0000000..6c8a954 --- /dev/null +++ b/analysis-v2/variants/1_plot_vars.py @@ -0,0 +1,36 @@ +import matplotlib.pyplot as plt +import numpy as np +from bisect import bisect_right + +vcf_fn = "/home/timdunn/vcfdist/data/t2t-q100-v0.9/split/all.vcf" + +bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000, + 100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000, + 100_000_000, 200000000, 500000000, 1_000_000_000] +labels = [1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"] +all_bins = [-x for x in bins[::-1]] + [0] + bins +all_labels = [f"-{l}" for l in labels[::-1]] + [0] + labels + +counts = [0]*len(all_bins) +with open(vcf_fn, "r") as vcf: + for line in vcf: + if line[0] == "#": continue + fields = line.split() + ctg, pos, uid, ref, alt = fields[:5] + varlen = len(alt) - len(ref) + idx = bisect_right(all_bins, varlen) + counts[idx] += 1 + +fig, ax = plt.subplots(1,1, figsize=(15,5)) +for x in range(56): + plt.axvline(x, color='k', linestyle=':') + +ax.bar(np.arange(-0.5, len(all_bins)-0.5), counts, width=1) +ax.set_xlim(0,len(all_bins)) +ax.set_xticks([0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 28, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56]) +ax.set_xticklabels(all_labels) +ax.set_yscale('log') +ax.set_ylim(0.5, 2000000) +ax.set_xlabel("Variant Size") +ax.set_ylabel("Counts") +plt.savefig(f"img/var-lengths.png") diff --git a/analysis-v2/vs_prior_work/1_eval_vcfs.sh b/analysis-v2/vs_prior_work/1_eval_vcfs.sh new file mode 100755 index 0000000..9580a1c --- /dev/null +++ b/analysis-v2/vs_prior_work/1_eval_vcfs.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +source globals.sh + +# # vcfdist evaluation +# mkdir -p $out/evals/vcfdist +# for i in "${!query_names[@]}"; do +# echo "vcfdist: evaluating '${query_names[i]}'" +# $timer -v ../../src/vcfdist \ +# $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# $data/refs/$ref_name \ +# --bed $data/${truth_name}-${truth_version}/split/bench.bed \ +# --keep-query --keep-truth \ +# -l 1000 \ +# -p $out/evals/vcfdist/${query_names[i]}. \ +# 2> $out/evals/vcfdist/${query_names[i]}.log +# done + +# # vcfeval evaluation +# mkdir -p $out/evals/vcfeval +# for i in "${!query_names[@]}"; do +# echo "vcfeval: evaluating '${query_names[i]}'" +# $timer -v $rtg vcfeval \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# -t $data/refs/${ref_name}.sdf \ +# -e $data/${truth_name}-${truth_version}/split/bench.bed \ +# --threads 64 \ +# --all-records \ +# -o $out/evals/vcfeval/${query_names[i]} \ +# 2> $out/evals/vcfeval/${query_names[i]}.log +# done + +source ~/truvari/venv3.10/bin/activate + +# # truvari evaluation +# mkdir -p $out/evals/truvari +# for i in "${!query_names[@]}"; do +# echo "truvari: evaluating '${query_names[i]}'" +# truvari bench \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# --bSample HG002 \ +# --cSample HG002 \ +# --includebed $data/${truth_name}-${truth_version}/split/bench.bed \ +# -f $data/refs/$ref_name \ +# -s 1 -S 1 \ +# --sizemax 1000 \ +# --no-ref a \ +# --pick multi \ +# -o $out/evals/truvari/${query_names[i]} \ +# 2> $out/evals/truvari/${query_names[i]}.log +# done + +# wfa refine +for i in "${!query_names[@]}"; do + if [[ $i -eq 1 ]]; then + continue + fi + echo "truvari: wfa refine '${query_names[i]}'" + rm -rf $out/evals/truvari/${query_names[i]}_wfa + cp -r $out/evals/truvari/${query_names[i]}_\ + $out/evals/truvari/${query_names[i]}_wfa + truvari refine \ + -f $data/refs/$ref_name \ + -t 64 \ + --use-original-vcfs \ + --align wfa \ + $out/evals/truvari/${query_names[i]}_wfa \ + 2> $out/evals/truvari/${query_names[i]}_wfa.log +done + +# for rep in "${reps[@]}"; do +# for i in "${!ds_names[@]}"; do +# echo "truvari: mafft refine $rep '${ds_names[i]}'" +# rm -rf $out/evals/$rep/truvari/${ds_names[i]}_mafft +# cp -r $out/evals/$rep/truvari/${ds_names[i]} \ +# $out/evals/$rep/truvari/${ds_names[i]}_mafft +# truvari refine \ +# -f $data/refs/$ref_name \ +# -t 64 \ +# --use-original-vcfs \ +# --align mafft \ +# $out/evals/$rep/truvari/${ds_names[i]}_mafft +# done +# done + +deactivate diff --git a/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh b/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh new file mode 100755 index 0000000..d606f60 --- /dev/null +++ b/analysis-v2/vs_prior_work/1_eval_vcfs_tr.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +source globals.sh + +# # vcfdist evaluation +# mkdir -p $out/evals/vcfdist_tr +# for i in "${!query_names[@]}"; do +# echo "vcfdist: evaluating '${query_names[i]}'" +# $timer -v ../../src/vcfdist \ +# $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# $data/refs/$ref_name \ +# --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \ +# -l 1000 \ +# -p $out/evals/vcfdist_tr/${query_names[i]}. \ +# 2> $out/evals/vcfdist_tr/${query_names[i]}.log +# done + +# # vcfeval evaluation +# mkdir -p $out/evals/vcfeval_tr +# for i in "${!query_names[@]}"; do +# echo "vcfeval: evaluating '${query_names[i]}'" +# $timer -v $rtg vcfeval \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# -t $data/refs/${ref_name}.sdf \ +# -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \ +# --threads 64 \ +# --all-records \ +# -o $out/evals/vcfeval_tr/${query_names[i]} \ +# 2> $out/evals/vcfeval_tr/${query_names[i]}.log +# done + +source ~/truvari/venv3.10/bin/activate + +# # truvari evaluation +# mkdir -p $out/evals/truvari_tr +# for i in "${!query_names[@]}"; do +# echo "truvari: evaluating '${query_names[i]}'" +# truvari bench \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# --bSample HG002 \ +# --cSample HG002 \ +# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \ +# -f $data/refs/$ref_name \ +# -s 1 -S 1 \ +# --sizemax 1000 \ +# --no-ref a \ +# --pick multi \ +# -o $out/evals/truvari_tr/${query_names[i]} \ +# 2> $out/evals/truvari_tr/${query_names[i]}.log +# done + +# # wfa refine +# for i in "${!query_names[@]}"; do +# echo "truvari: wfa refine '${query_names[i]}'" +# rm -rf $out/evals/truvari_tr/${query_names[i]}_wfa +# cp -r $out/evals/truvari_tr/${query_names[i]} \ +# $out/evals/truvari_tr/${query_names[i]}_wfa +# truvari refine \ +# -f $data/refs/$ref_name \ +# -t 64 \ +# --use-original-vcfs \ +# --align wfa \ +# $out/evals/truvari_tr/${query_names[i]}_wfa \ +# 2> $out/evals/truvari_tr/${query_names[i]}_wfa.log +# done + +# mafft refine +for i in "${!query_names[@]}"; do + echo "truvari: mafft refine '${query_names[i]}'" + rm -rf $out/evals/truvari_tr/${query_names[i]}_mafft + cp -r $out/evals/truvari_tr/${query_names[i]} \ + $out/evals/truvari_tr/${query_names[i]}_mafft + truvari refine \ + -f $data/refs/$ref_name \ + -t 64 \ + --use-original-vcfs \ + --align mafft \ + $out/evals/truvari_tr/${query_names[i]}_mafft \ + 2> $out/evals/truvari_tr/${query_names[i]}_mafft.log +done + +deactivate diff --git a/analysis-v2/vs_prior_work/README.md b/analysis-v2/vs_prior_work/README.md new file mode 100644 index 0000000..3a7642d --- /dev/null +++ b/analysis-v2/vs_prior_work/README.md @@ -0,0 +1 @@ +Directory for comparing vcfdist, vcfeval, and Truvari diff --git a/analysis-v2/vs_prior_work/globals.sh b/analysis-v2/vs_prior_work/globals.sh new file mode 100755 index 0000000..8062207 --- /dev/null +++ b/analysis-v2/vs_prior_work/globals.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +data="/home/timdunn/vcfdist/data" +out="/home/timdunn/vcfdist/analysis-v2/vs_prior_work" +parallel="/home/timdunn/parallel-20211222/src/parallel" +vcfdist="/home/timdunn/vcfdist/src/vcfdist" +rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg" +tabix="/home/timdunn/software/htslib-1.16/tabix" +bgzip="/home/timdunn/software/htslib-1.16/bgzip" +timer="/usr/bin/time" + +# define reference FASTA +ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta" + +# T2T information +truth_name="t2t-q100" +truth_version="v0.9" + +# query VCF information +query_names=( + "hprc" + "pav" + "giab-tr" +) +query_versions=( + "v1" + "v4" + "v4.20" +) diff --git a/analysis/12_plot_ram.py b/analysis/10_plot_ram.py similarity index 100% rename from analysis/12_plot_ram.py rename to analysis/10_plot_ram.py diff --git a/analysis/10_vcfdist_sv.sh b/analysis/10_vcfdist_sv.sh deleted file mode 100755 index d460c38..0000000 --- a/analysis/10_vcfdist_sv.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -source ../pipeline/includes.sh - -# # Zook GIAB TR dataset (0:56, 3:40, 7:30) -# $timer -v ../src/vcfdist \ -# $data/zook/verkko.vcf.gz \ -# $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \ -# $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.regions.bed \ -# -l 10000 \ -# -t -q \ -# -p ../out/giabtr/vcfdist-keep/ - -# Zook GIAB TR dataset -$timer -v ../src/vcfdist \ - $data/zook/verkko.vcf.gz \ - $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \ - $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ - -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - -l 10000 \ - -p ../out/giabtr/vcfdist-chr20 - -# Benchmark on Truvari phab variants (chr20) -# file="../out/giabtr/phab-chr20/output.vcf.gz" -# for sample in `bcftools query -l $file`; do -# bcftools view -c1 -Oz -s $sample -o ${file/.vcf*/.$sample.vcf.gz} $file -# done -# $timer -v ../src/vcfdist \ -# ../out/giabtr/phab-chr20/norm.QUERY.vcf.gz \ -# ../out/giabtr/phab-chr20/norm.TRUTH.vcf.gz \ -# $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# -l 10000 \ -# -t -q \ -# -p ../out/giabtr/vcfdist-norm-phab-chr20/ diff --git a/analysis/11_truvari_sv.sh b/analysis/11_truvari_sv.sh deleted file mode 100755 index 4e0c929..0000000 --- a/analysis/11_truvari_sv.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash - -source ~/truvari/venv3.10/bin/activate -source ../pipeline/includes.sh - -# bcftools annotate -x FMT/AD GRCh38_HG2-verrkoV1.1-V0.7_dipcall-z2k.vcf.gz > verkko0.vcf.gz -# bcftools norm -m-any --check-ref -w -f GCA... verkko0.vcf.gz -o verkko-norm.vcf.gz -# bcftools sort verkko-norm.vcf.gz -o verkko.vcf.gz -# tabix -p vcf verkko.vcf.gz - -# # TruVari without harmonization -# rm -r ../out/giabtr/truvari-init -# truvari bench \ -# -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \ -# -c $data/zook/verkko.vcf.gz \ -# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.lregions.bed \ -# --no-ref a \ -# --sizemin 1 --sizefilt 1 --sizemax 10000 \ -# --pick multi \ -# --typeignore \ -# --dup-to-ins \ -# -o ../out/giabtr/truvari-init/ - -# laytr tru2ga \ -# -i ../out/giabtr/truvari-init/ \ - # -o ../out/giabtr/truvari-init/result - -# Truvari benchmarking chr20 -giab_dir="../out/giabtr" -phab_dir="$giab_dir/phab-chr20" -truvari_phab_dir="$giab_dir/truvari-norm-phab-chr20" - -mkdir -p $phab_dir -$timer -v truvari phab \ - -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - -b $data/giab-tr-v4.20/GIABTR.HG002.benchmark.vcf.gz \ - -c $data/zook/verkko.vcf.gz \ - --bSamples HG002\ - --cSamples syndip\ - --align wfa \ - -t 56 \ - -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ - -o $phab_dir/output.vcf.gz - -# file="../out/giabtr/phab-chr20/output.vcf.gz" -# for sample in `bcftools query -l $file`; do -# bcftools view -c1 -Oz -s $sample -o ${file/.vcf*/.$sample.vcf.gz} $file -# tabix -p vcf ${file/.vcf*/.$sample.vcf.gz} -# done -samples=( - "QUERY" - "TRUTH" -) -# for sample in ${samples[@]}; do -# bcftools norm \ -# -a \ -# -m-any \ -# --check-ref w \ -# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# $phab_dir/output.${sample}.vcf.gz \ -# -o $phab_dir/norm.${sample}.vcf.gz -# tabix -p vcf $phab_dir/norm.${sample}.vcf.gz -# done - -# $timer -v truvari bench \ -# -b $phab_dir/norm.TRUTH.vcf.gz \ -# -c $phab_dir/norm.QUERY.vcf.gz \ -# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# --no-ref a \ -# --sizemin 1 --sizefilt 1 --sizemax 10000 \ -# --pick multi \ -# --typeignore \ -# --dup-to-ins \ -# -o $truvari_phab_dir - -# laytr tru2ga \ -# -i $truvari_phab_dir/ \ -# -o $truvari_phab_dir/result -# gunzip $truvari_phab_dir/result*.gz - -# deactivate diff --git a/analysis/2_general_sw_space_plot.py b/analysis/2_general_sw_space_plot.py index bf1324f..902cd05 100644 --- a/analysis/2_general_sw_space_plot.py +++ b/analysis/2_general_sw_space_plot.py @@ -6,7 +6,10 @@ mpl.rcParams.update(mpl.rcParamsDefault) mpl.rcParams['text.usetex'] = True -plt.figure(figsize=(12, 5)) +def mm2in(x): + return x / 25.4 + +plt.figure(figsize=(mm2in(180), mm2in(75))) xmin, xmax = 0, 3.2 ymin, ymax = -0.1, 1.5 label_pad = 0.03 @@ -15,11 +18,11 @@ x = list(np.arange(xmin, xmax+1, 0.1)) # G_open > 0 -plt.annotate("$o < 0$", (1.32,1.37), rotation=35, color="white") +plt.annotate("$o < 0$", (1.31,1.36), rotation=35, color="white", fontsize=7) plt.fill_between(x, x, ymax, alpha=0.8, facecolor='k') # vertical -plt.annotate("$2(o+e) < x$", (0.46,1.18), rotation=90, color="white") +plt.annotate("$2(o+e) < x$", (0.45,1.08), rotation=90, color="white", fontsize=7) plt.axvline(0.5, alpha=0.8, color='k', zorder=-100) # plt.annotate("$3x < o+e$", (3.01,1.22), rotation=90, color="black") # plt.axvspan(3, 3.2, alpha=0.1, color='k', zorder=-100) @@ -27,35 +30,35 @@ # horizontal # plt.annotate("$x < 2e$", (label_pad,0.57), ha="left", va="top", color="white") # plt.plot(x, [0.5]*len(x), color='k', alpha=0.8) -plt.annotate("$e < 0$", (label_pad,-0.03), ha="left", va="top", color="white") +plt.annotate("$e < 0$", (label_pad,-0.03), ha="left", va="top", color="white", fontsize=7) plt.fill_between(x, -0.2, 0, facecolor='k', alpha=0.8) # min ED i = 0 -plt.scatter(1,1, marker="X") -plt.annotate("edlib", (1+label_pad,1), ha="left", va="center", color=f"C{i}") +plt.scatter(1,1, marker="X", s=7) +plt.annotate("edlib", (1+label_pad,1), ha="left", va="center", color=f"C{i}", fontsize=7) i += 1 # minimap2 mm2_presets = { r"mm2 \texttt{sr}": { "a": 0, "b": 10, "o1": 13, "e1":3, "o2":25, "e2": 2, "m":"^" }, - r"\noindent mm2 \texttt{map-ont}\\winnowmap":{ "a": 0, "b": 6, "o1": 5, "e1":3, "o2":25, "e2": 2, "m":"s" }, + r"\noindent mm2 \texttt{map-ont}\vspace{-2mm}\\winnowmap":{ "a": 0, "b": 6, "o1": 5, "e1":3, "o2":25, "e2": 2, "m":"s" }, r"mm2 \texttt{map-pb}": { "a": 0, "b": 10, "o1": 13, "e1":5, "o2":53, "e2": 3, "m":"s" }, r"mm2 \texttt{asm5}": { "a": 0, "b": 40, "o1": 79, "e1":7, "o2":163, "e2": 3, "m":"P" }, r"mm2 \texttt{asm10}": { "a": 0, "b": 20, "o1": 33, "e1":5, "o2":83, "e2": 3, "m":"P" }, "pbmm2": { "a": 0, "b": 7, "o1": 6, "e1":5, "o2":57, "e2": 3, "m":"s" }, } for name, x in mm2_presets.items(): - start = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"], color=f"C{i}", marker=x["m"]) - end = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e2"]/x["b"], color=f"C{i}", marker=x["m"]) + start = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"], color=f"C{i}", marker=x["m"], s=7) + end = plt.scatter((x["o1"] + x["e1"])/x["b"], x["e2"]/x["b"], color=f"C{i}", marker=x["m"], s=7) plt.arrow((x["o1"] + x["e1"])/x["b"], x["e1"]/x["b"]-0.02, 0, x["e2"]/x["b"]-x["e1"]/x["b"]+0.04, color='k', length_includes_head=True, head_width=0.02, head_length = 0.016) - plt.annotate(name, ((x["o1"] + x["e1"])/x["b"]+label_pad, x["e1"]/x["b"]), ha="left", va="center", color=f"C{i}") + plt.annotate(name, ((x["o1"] + x["e1"])/x["b"]+label_pad, x["e1"]/x["b"]), ha="left", va="center", color=f"C{i}", fontsize=7) i += 1 # BWA, GRAF, DRAGEN -plt.scatter(1.6, 0.3, color=f"C{i}", marker="^", zorder=100) -plt.annotate(r"\noindent DRAGEN\\ BWA\\GRAF", (1.6+label_pad, 0.275), ha="right", va="center", color=f"C{i}") +plt.scatter(1.6, 0.3, color=f"C{i}", marker="^", zorder=100, s=7) +plt.annotate(r"\noindent DRAGEN\vspace{-2mm}\\BWA\vspace{-2mm}\\GRAF", (1.6+label_pad, 0.275), ha="right", va="center", color=f"C{i}", fontsize=7) i += 1 # ngmlr @@ -65,8 +68,8 @@ } for name, x in ngmlr_presets.items(): for e in np.arange(x["e_max"], x["e_min"], -x["e_dec"]): - plt.scatter(x["o"]/x["b"], e/x["b"], color=f"C{i}") - plt.annotate(name, (x["o"]/x["b"]+label_pad, (x["e_max"]+x["e_min"])/(2*x["b"])), ha="left", va="center", color=f"C{i}") + plt.scatter(x["o"]/x["b"], e/x["b"], color=f"C{i}", s=7) + plt.annotate(name, (x["o"]/x["b"]+label_pad, (x["e_max"]+x["e_min"])/(2*x["b"])), ha="left", va="center", color=f"C{i}", fontsize=7) i += 1 plt.arrow(x["o"]/x["b"], x["e_max"]/x["b"], 0, x["e_min"]/x["b"]-x["e_max"]/x["b"], color='k', length_includes_head=True, head_width=0.02, head_length = 0.016) @@ -74,35 +77,35 @@ # npore points = [[1.7, 0.5], [1.7,0.7], [1.7, 1], [1.7, 0.8], [1.7, 1.1], [2.3, 1.8], [2.8, 1], [3.2, 1.1], [3.2, 2.1], [3.2, 1], [3.7, 1.9], [3.7, 2], [3.7, 1], [3.7, 0.6], [3.8, 2.4], [3.8, 1.7], [3.8, 0.8], [4.6, 2.5], [4.6, 2.3], [4.6, 1.5]] for p in points: - plt.scatter(p[0]/5.72, p[1]/5.72, color=f"purple") -plt.annotate("npore", (0.5, 0.25), ha="left", va="center", color=f"purple") -plt.scatter(6/5.72, 1/5.72, marker="s", color=f"purple") -plt.annotate("npore", (6/5.72+label_pad, 1/5.72), ha="left", va="center", color=f"purple") + plt.scatter(p[0]/5.72, p[1]/5.72, color=f"purple", s=7) +plt.annotate("npore", (0.5, 0.25), ha="left", va="center", color=f"purple", fontsize=7) +plt.scatter(6/5.72, 1/5.72, marker="s", color=f"purple", s=7) +plt.annotate("npore", (6/5.72+label_pad, 1/5.72), ha="left", va="center", color=f"purple", fontsize=7) i += 1 # verkko -plt.scatter(0, 0, color=f"goldenrod", zorder=100) -plt.annotate("verkko", (label_pad, 0), ha="left", va="bottom", color=f"goldenrod") +plt.scatter(0, 0, color=f"goldenrod", zorder=100, s=7) +plt.annotate("verkko", (label_pad, 0), ha="left", va="bottom", color=f"goldenrod", fontsize=7) # points -plt.scatter(0.4, 0.3, marker="*", color=f"k") -plt.annotate("A", (0.4+label_pad, 0.3), ha="left", va="center", color="k") -plt.scatter(1, 0.333, marker="*", color=f"k") -plt.annotate("B", (1+label_pad, 0.333), ha="left", va="center", color="k") -plt.scatter(1.6, 0.4, marker="*", color=f"k") -plt.annotate("C", (1.6+label_pad, 0.4), ha="left", va="center", color="k") -plt.scatter(2, 0.2, marker="*", color=f"k") -plt.annotate("D", (2+label_pad, 0.2), ha="left", va="center", color="k") +plt.scatter(0.4, 0.3, marker="*", color=f"k", s=7) +plt.annotate("A", (0.4+label_pad, 0.3), ha="left", va="center", color="k", fontsize=7) +plt.scatter(1, 0.333, marker="*", color=f"k", s=7) +plt.annotate("B", (1+label_pad, 0.333), ha="left", va="center", color="k", fontsize=7) +plt.scatter(1.6, 0.4, marker="*", color=f"k", s=7) +plt.annotate("C", (1.6+label_pad, 0.4), ha="left", va="center", color="k", fontsize=7) +plt.scatter(2, 0.2, marker="*", color=f"k", s=7) +plt.annotate("D", (2+label_pad, 0.2), ha="left", va="center", color="k", fontsize=7) # arrows plt.arrow(2.1, 1.1, 0.1, 0.05, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # right -plt.annotate("prefer\nsubstitutions", (2.2+label_pad, 1.15), ha="left", va="center") +plt.annotate("prefer\nsubstitutions", (2.2+label_pad, 1.15), ha="left", va="center", fontsize=7) plt.arrow(2.1, 1.1, -0.1, -0.05, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # left -plt.annotate("prefer\nINDELs", (2.0-label_pad, 1.05), ha="right", va="center") +plt.annotate("prefer\nINDELs", (2.0-label_pad, 1.05), ha="right", va="center", fontsize=7) plt.arrow(2.065, 1.175, -0.01, 0.01, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # up -plt.annotate("prefer\nseparate\ngaps", (2.05, 1.2+label_pad), ha="center", va="bottom") +plt.annotate("prefer\nseparate\ngaps", (2.05, 1.2+label_pad), ha="center", va="bottom", fontsize=7) plt.arrow(2.13, 1.05, 0, -0.02, color='k', length_includes_head=True, head_width=0.02, head_length=0.016) # down -plt.annotate("prefer\nmerged\ngaps", (2.15, 1-label_pad), ha="center", va="top") +plt.annotate("prefer\nmerged\ngaps", (2.15, 1-label_pad), ha="center", va="top", fontsize=7) arc_angles = linspace(0 * pi, pi/4, 20) r = 0.2 arc_xs = r * cos(arc_angles) @@ -111,29 +114,31 @@ # legend sra = mlines.Line2D([], [], color='k', marker='^', linestyle='None', - markersize=7, label = 'Short Read Aligners') + markersize=6, label = 'Short Read Aligners') lra = mlines.Line2D([], [], color='k', marker='s', linestyle='None', - markersize=7, label = 'Long Read Aligners') + markersize=6, label = 'Long Read Aligners') aa = mlines.Line2D([], [], color='k', marker='P', linestyle='None', - markersize=7, label = 'Assembly Aligners') + markersize=6, label = 'Assembly Aligners') sva = mlines.Line2D([], [], color='k', marker='o', linestyle='None', - markersize=7, label = 'SV/CNV Aligners') + markersize=6, label = 'SV/CNV Aligners') ed = mlines.Line2D([], [], color='k', marker='X', linestyle='None', - markersize=7, label = 'Edit Distance Align') + markersize=6, label = 'Edit Distance Align') points = mlines.Line2D([], [], color='k', marker='*', linestyle='None', - markersize=9, label = 'Select Design Points') -leg1 = plt.legend(handles = [sra, lra, aa, sva, ed, points], loc = (0.8,0.2)) + markersize=8, label = 'Select Design Points') +leg1 = plt.legend(handles = [sra, lra, aa, sva, ed, points], loc = (0.78,0.15), fontsize=7) m = mlines.Line2D([], [], linestyle='None', label = r'$m$ : match penalty ($0$)') x = mlines.Line2D([], [], linestyle='None', label = r'$x$ : mismatch penalty') o = mlines.Line2D([], [], linestyle='None', label = r'$o$ : gap opening penalty') e = mlines.Line2D([], [], linestyle='None', label = r'$e$ : gap extension penalty') -plt.legend(handles = [m, x, o, e], loc=(0.78, 0.6), frameon=False) +plt.legend(handles = [m, x, o, e], loc=(0.75, 0.6), frameon=False, fontsize=7) plt.gca().add_artist(leg1) # labels -plt.xlabel(r'\LARGE{Starting a Gap: $(o+e) / x$}') -plt.ylabel(r'\LARGE{Extending a Gap: $e / x$}') +plt.xlabel(r'\textbf{Starting a Gap:} $(o+e) / x$', fontsize=7) +plt.ylabel(r'\textbf{Extending a Gap:} $e / x$', fontsize=7) +plt.xticks(fontsize=7) +plt.yticks(fontsize=7) plt.tight_layout() -# plt.savefig("img/2_general_sw_space.pdf", format="pdf") -plt.savefig("img/2_general_sw_space.png", dpi=200) +plt.savefig("img/2_general_sw_space.pdf", format="pdf") +# plt.savefig("img/2_general_sw_space.png", dpi=200) diff --git a/analysis/4_vcfdist_output.py b/analysis/4_vcfdist_output.py index 5f6ffa8..f8d78f4 100644 --- a/analysis/4_vcfdist_output.py +++ b/analysis/4_vcfdist_output.py @@ -2,11 +2,26 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np +import matplotlib as mpl +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams['text.usetex'] = True +plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) + +def mm2in(x): + return x / 25.4 names = ["O"] max_qual = 31 -fig, ax = plt.subplots(3, 2, figsize=(8,9)) +fig, ax = plt.subplots(3, 2, figsize=(mm2in(80),mm2in(90))) + +# source data headers +out_5bi = open("tsv/5bi.tsv", "w") +out_5bii = open("tsv/5bii.tsv", "w") +out_5biii = open("tsv/5biii.tsv", "w") +print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tPRECISION\tRECALL", file=out_5bi) +print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tEDIT_DISTANCE\tDISTINCT_EDITS", file=out_5bii) +print("DATASET\tSUBMISSION\tVARIANT_TYPE\tQUALITY_SCORE\tEDIT_DISTANCE\tDISTINCT_EDITS", file=out_5biii) # EDIT DISTANCE sub_des = [] @@ -33,25 +48,38 @@ ins_eds.append(int(ins_ed)) del_eds.append(int(del_ed)) eds.append(int(ed)) + print(f"NIST\tK4GT3\tSUB\t{qual}\t{sub_ed}\t{sub_de}", file=out_5bii) + print(f"NIST\tK4GT3\tINS\t{qual}\t{ins_ed}\t{ins_de}", file=out_5biii) + print(f"NIST\tK4GT3\tDEL\t{qual}\t{del_ed}\t{del_de}", file=out_5biii) -ax[1][0].plot(quals[:max_qual], sub_des[:max_qual], marker='.', linestyle='', color=f"C0", label=label) -ax[2][0].plot(quals[:max_qual], ins_des[:max_qual], marker='.', linestyle='', color=f"C1", label=label) -ax[2][0].plot(quals[:max_qual], del_des[:max_qual], marker='.', linestyle='', color=f"C2", label=label) +ax[1][0].plot(quals[:max_qual], sub_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C0", label=label) +ax[2][0].plot(quals[:max_qual], ins_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C1", label=label) +ax[2][0].plot(quals[:max_qual], del_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C2", label=label) -ax[1][1].plot(quals[:max_qual], sub_des[:max_qual], marker='.', linestyle='', color=f"C0", label=label) -ax[2][1].plot(quals[:max_qual], ins_eds[:max_qual], marker='.', linestyle='', color=f"C1", label=label) -ax[2][1].plot(quals[:max_qual], del_eds[:max_qual], marker='.', linestyle='', color=f"C2", label=label) +ax[1][1].plot(quals[:max_qual], sub_des[:max_qual], marker='.', markersize=1, linestyle='', color=f"C0", label=label) +ax[2][1].plot(quals[:max_qual], ins_eds[:max_qual], marker='.', markersize=1, linestyle='', color=f"C1", label=label) +ax[2][1].plot(quals[:max_qual], del_eds[:max_qual], marker='.', markersize=1, linestyle='', color=f"C2", label=label) for i in range(2): - ax[1][i].set_xlabel("Quality Threshold") - ax[1][i].set_ylabel("Distinct Edits (DE)") + ax[1][i].set_xlabel(r"\textbf{Quality Threshold}", fontsize=7) + if i % 2: + ax[1][i].set_ylabel(r"\textbf{Edit Distance}", fontsize=7) + else: + ax[1][i].set_ylabel(r"\textbf{Distinct Edits}", fontsize=7) ax[1][i].set_yscale('log') - ax[1][i].legend(["SNP"], loc="upper left") + ax[1][i].yaxis.set_minor_formatter(mpl.ticker.ScalarFormatter()) + ax[1][i].tick_params(axis='both', which='both', labelsize=5) + ax[1][i].legend(["substitution"], markerscale=3, loc="upper left", fontsize=5) for i in range(2): - ax[2][i].set_xlabel("Quality Threshold") - ax[2][i].set_ylabel("Edit Distance (ED)") + ax[2][i].set_xlabel(r"\textbf{Quality Threshold}", fontsize=7) + if i % 2: + ax[2][i].set_ylabel(r"\textbf{Edit Distance}", fontsize=7) + else: + ax[2][i].set_ylabel(r"\textbf{Distinct Edits}", fontsize=7) ax[2][i].set_yscale('log') - ax[2][i].legend(["INS", "DEL" ], loc="upper left") + ax[2][i].yaxis.set_minor_formatter(mpl.ticker.ScalarFormatter()) + ax[2][i].tick_params(axis='both', which='both', labelsize=5) + ax[2][i].legend(["insertion", "deletion" ], markerscale=3, loc="upper left", fontsize=5) # PRECISION RECALL with open(f"/x/vcfdist/data/pfda-v2/nist_vcfdist/K4GT3_HG002_O.precision-recall.tsv") as csv: @@ -72,6 +100,7 @@ else: indel_prec.append(float(truth_tp) / (float(truth_tp) + float(query_pp) + float(query_fp))) + print(f"NIST\tK4GT3\tINDEL\t{qual}\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out_5bi) elif typ == "SNP": snp_recall.append(float(truth_tp) / float(truth_tot)) if int(truth_tp) + int(query_pp) + int(query_fp) == 0: @@ -79,26 +108,32 @@ else: snp_prec.append(float(truth_tp) / (float(truth_tp) + float(query_pp) + float(query_fp))) + print(f"NIST\tK4GT3\tSNP\t{qual}\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out_5bi) -ax[0][0].set_title("SNPs") -ax[0][0].set_xlabel("Recall") -ax[0][0].set_ylabel("Precision") +ax[0][0].set_title(r"\textbf{SNPs}", fontsize=7) +ax[0][0].set_xlabel(r"\textbf{Recall}", fontsize=7) +ax[0][0].set_ylabel(r"\textbf{Precision}", fontsize=7) ax[0][0].set_xlim(0.995, 1) -ax[0][0].set_xticks(np.arange(0.995, 1.0001, 0.001)) +ax[0][0].set_xticks(np.arange(0.995, 1.0001, 0.0025)) ax[0][0].set_ylim(0.995, 1) ax[0][0].set_yticks(np.arange(0.995, 1.0001, 0.001)) +ax[0][0].tick_params(axis='both', which='both', labelsize=5) ax[0][0].plot(snp_recall, snp_prec, linestyle='', - marker='.', color=f"C0", label=label) + marker='.', color=f"C0", label=label, markersize=1) -ax[0][1].set_title("INDELs") -ax[0][1].set_xlabel("Recall") -ax[0][1].set_ylabel("Precision") +ax[0][1].set_title(r"\textbf{INDELs}", fontsize=7) +ax[0][1].set_xlabel(r"\textbf{Recall}", fontsize=7) +ax[0][1].set_ylabel(r"\textbf{Precision}", fontsize=7) ax[0][1].set_xlim(0.98, 1) -ax[0][1].set_xticks(np.arange(0.98, 1.001, 0.005)) +ax[0][1].set_xticks(np.arange(0.98, 1.001, 0.01)) ax[0][1].set_ylim(0.98, 1) ax[0][1].set_yticks(np.arange(0.98, 1.001, 0.005)) +ax[0][1].tick_params(axis='both', which='both', labelsize=5) ax[0][1].plot(indel_recall, indel_prec, linestyle='', - marker='.', color=f"C0", label=label) + marker='.', color=f"C0", label=label, markersize=1) plt.tight_layout() +out_5bi.close() +out_5bii.close() +out_5biii.close() plt.savefig('img/4_vcfdist_plot.pdf', format="pdf") diff --git a/analysis/7_vcfeval_pr_plot.py b/analysis/7_vcfeval_pr_plot.py index 13735f0..9cc03fe 100644 --- a/analysis/7_vcfeval_pr_plot.py +++ b/analysis/7_vcfeval_pr_plot.py @@ -2,8 +2,14 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np +import matplotlib as mpl +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams['text.usetex'] = True plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) +def mm2in(x): + return x / 25.4 + datasets = ["nist", "cmrg"] names = [ ["original", "bwa", "mm2-ont", "mm2-pb", "pbmm2"], @@ -17,10 +23,12 @@ ] sub_id = "K4GT3" -fig, ax = plt.subplots(3, 4, figsize=(20,12)) +fig, ax = plt.subplots(3, 4, figsize=(mm2in(180), mm2in(108))) for di, dataset in enumerate(datasets): for ei, eval_list in enumerate(names): + out3 = open(f"tsv/3{chr(ord('a')+ei)}.tsv", "w") + print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tPRECISION\tRECALL", file=out3) for ni, name in enumerate(eval_list): filename = name.replace("original", "O") try: @@ -35,6 +43,7 @@ indel_recall.append(float(line.split(',')[7])) indel_prec.append(float(line.split(',')[8])) indel_f1 = line.split(',')[10] + print(f"{dataset.upper()}\t{sub_id}\t{label.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out3) elif line[:19] == "SNP,*,*,PASS,*,QUAL": recall = float(line.split(',')[7]) snp_recall.append(recall) @@ -42,47 +51,62 @@ if prec == 0: prec = 1 snp_prec.append(prec) snp_f1 = line.split(',')[10] + print(f"{dataset.upper()}\t{sub_id}\t{label.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out3) ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', - marker='.', color=colors[ei][ni], label=label) + marker='.', markersize=1, color=colors[ei][ni], label=label) ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', - marker='.', color=colors[ei][ni], label=label) + marker='.', markersize=1, color=colors[ei][ni], label=label) except FileNotFoundError: print(f"'/x/vcfdist/data/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}.roc.all.csv' not found.") continue # SNP plot - ax[ei][di*2].set_xlabel("Recall", fontsize=15) - ax[ei][di*2].set_ylabel("Precision", fontsize=15) + ax[ei][0].set_ylabel(r"\textbf{Precision}", fontsize=7) + ax[2][di*2].set_xlabel(r"\textbf{Recall}", fontsize=7) + ax[2][di*2+1].set_xlabel(r"\textbf{Recall}", fontsize=7) if dataset == "nist": ax[ei][di*2].set_xlim(0.99, 1) - ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.002)) - ax[ei][di*2].set_ylim(0.999, 1.00001) - ax[ei][di*2].set_yticks(np.arange(0.999, 1.0001, 0.0002)) + ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.005)) + ax[ei][di*2].set_xticklabels([f"{x:.3f}" for x in + np.arange(0.99, 1.001, 0.005)], fontsize=5) + ax[ei][di*2].set_ylim(0.999, 1.00002) + ax[ei][di*2].set_yticks(np.arange(0.999, 1.00002, 0.0002)) ax[ei][di*2].set_yticklabels([f"{x:.4f}" for x in - np.arange(0.999, 1.00001, 0.0002)]) + np.arange(0.999, 1.00002, 0.0002)], fontsize=5) if dataset == "cmrg": ax[ei][di*2].set_xlim(0.9, 1) ax[ei][di*2].set_xticks(np.arange(0.9, 1.01, 0.02)) - ax[ei][di*2].set_ylim(0.95, 1.0001) + ax[ei][di*2].set_xticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.01, 0.02)], fontsize=5) + ax[ei][di*2].set_ylim(0.95, 1.001) ax[ei][di*2].set_yticks(np.arange(0.95, 1.001, 0.01)) ax[ei][di*2].set_yticklabels([f"{x:.3f}" for x in - np.arange(0.95, 1.001, 0.01)]) + np.arange(0.95, 1.001, 0.01)], fontsize=5) if di == 0: - ax[ei][di*2].legend(loc="lower left", fontsize=15, markerscale=3) + ax[ei][di*2].legend(loc="lower left", fontsize=5, markerscale=4) # INDEL plot - ax[ei][di*2+1].set_xlabel("Recall", fontsize=15) - ax[ei][di*2+1].set_ylabel("Precision", fontsize=15) + # ax[ei][0].set_xlabel("Recall", fontsize=7) + # ax[ei][di*2+1].set_ylabel("Precision", fontsize=7) if dataset == "nist": ax[ei][di*2+1].set_xlim(0.99, 1) - ax[ei][di*2+1].set_xticks(np.arange(0.99, 1.001, 0.002)) + ax[ei][di*2+1].set_xticks(np.arange(0.99, 1.001, 0.005)) + ax[ei][di*2+1].set_xticklabels([f"{x:.3f}" for x in + np.arange(0.99, 1.001, 0.005)], fontsize=5) ax[ei][di*2+1].set_ylim(0.995, 1.0001) - ax[ei][di*2+1].set_yticks(np.arange(0.995, 1.001, 0.001)) + ax[ei][di*2+1].set_yticks(np.arange(0.995, 1.0001, 0.001)) + ax[ei][di*2+1].set_yticklabels([f"{x:.3f}" for x in + np.arange(0.995, 1.0001, 0.001)], fontsize=5) if dataset == "cmrg": ax[ei][di*2+1].set_xlim(0.9, 1) ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.01, 0.02)) - ax[ei][di*2+1].set_ylim(0.9, 1.001) - ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.01, 0.02)) + ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.01, 0.02)], fontsize=5) + ax[ei][di*2+1].set_ylim(0.9, 1.002) + ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.002, 0.02)) + ax[ei][di*2+1].set_yticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.01, 0.02)], fontsize=5) + out3.close() plt.tight_layout() plt.savefig('img/7_vcfeval_pr_plot.pdf', format="pdf") diff --git a/analysis/8_vcfdist_pr_plot.py b/analysis/8_vcfdist_pr_plot.py index 892e21d..f76bcdd 100644 --- a/analysis/8_vcfdist_pr_plot.py +++ b/analysis/8_vcfdist_pr_plot.py @@ -2,21 +2,30 @@ import pandas as pd import matplotlib.pyplot as plt import numpy as np +import matplotlib as mpl +mpl.rcParams.update(mpl.rcParamsDefault) +mpl.rcParams['text.usetex'] = True plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) +def mm2in(x): + return x / 25.4 + datasets = ["nist", "cmrg"] names = ["original", "A", "B", "C", "D"] -evals = ["vcfeval", "nopc", "vcfdist", "standard"] +evals = ["vcfeval", "vcfdist_no_partial", "vcfdist", "vcfdist_standard"] colors = ["k", "C0", "C1", "C2", "C3"] -fig, ax = plt.subplots(4, 4, figsize=(20,16)) +fig, ax = plt.subplots(4, 4, figsize=(mm2in(180), mm2in(144))) sub_id = "K4GT3" for di, dataset in enumerate(datasets): for ei, evaluation in enumerate(evals): + out4 = open(f"tsv/4{chr(ord('a')+ei)}.tsv", "w") + print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tPRECISION\tRECALL", file=out4) + for ni, name in enumerate(names): filename = "O" if name == "original" else name - evalstring = "std" if evaluation == "standard" else "" + evalstring = "std" if evaluation == "vcfdist_standard" else "" if evaluation == "vcfeval": with open(f"/x/vcfdist/data/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}std.roc.all.csv") as csv: indel_recall = [] @@ -28,17 +37,19 @@ indel_recall.append(float(line.split(',')[7])) indel_prec.append(float(line.split(',')[8])) indel_f1 = line.split(',')[10] + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4) elif line[:19] == "SNP,*,*,PASS,*,QUAL": recall = float(line.split(',')[7]) snp_recall.append(recall) prec = float(line.split(',')[8]) if prec == 0: prec = 1 snp_prec.append(prec) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4) snp_f1 = line.split(',')[10] ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', - marker='.', color=colors[ni], label=name) + marker='.', markersize=1, color=colors[ni], label=name) ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', - marker='.', color=colors[ni], label=name) + marker='.', markersize=1, color=colors[ni], label=name) else: with open(f"/x/vcfdist/data/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}{evalstring}.precision-recall.tsv") as csv: label = f"{name}" @@ -51,7 +62,7 @@ for line in csv: typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \ query_tot, query_tp, query_pp, query_fp = line.split('\t') - if evaluation == "nopc": + if evaluation == "vcfdist_no_partial": if typ == "INDEL": indel_recall.append(float(truth_tp) / float(truth_tot)) if int(truth_tp) + int(query_fp) == 0: @@ -59,6 +70,7 @@ else: indel_prec.append(float(truth_tp) / (float(truth_tp) + float(query_fp))) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4) elif typ == "SNP": snp_recall.append(float(truth_tp) / float(truth_tot)) if int(truth_tp) + int(query_fp) == 0: @@ -66,51 +78,68 @@ else: snp_prec.append(float(truth_tp) / (float(truth_tp) + float(query_fp))) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out4) else: # normal vcfdist prec/recall calc, already done if typ == "INDEL": indel_recall.append(float(recall)) indel_prec.append(float(prec)) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_prec[-1]}\t{indel_recall[-1]}", file=out4) elif typ == "SNP": snp_recall.append(float(recall)) snp_prec.append(float(prec)) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_prec[-1]}\t{snp_recall[-1]}", file=out4) ax[ei][di*2].plot(snp_recall, snp_prec, linestyle='', - marker='.', color=colors[ni], label=label) + marker='.', markersize=1, color=colors[ni], label=label) ax[ei][di*2+1].plot(indel_recall, indel_prec, linestyle='', - marker='.', color=colors[ni], label=label) + marker='.', markersize=1, color=colors[ni], label=label) # SNP plot - ax[ei][di*2].set_xlabel("Recall", fontsize=15) - ax[ei][di*2].set_ylabel("Precision", fontsize=15) + ax[ei][0].set_ylabel(r"\textbf{Precision}", fontsize=7) + ax[3][di*2].set_xlabel(r"\textbf{Recall}", fontsize=7) + ax[3][di*2+1].set_xlabel(r"\textbf{Recall}", fontsize=7) if dataset == "nist": ax[ei][di*2].set_xlim(0.99, 1) - ax[ei][di*2].set_xticks(np.arange(0.99, 1.001, 0.002)) - ax[ei][di*2].set_ylim(0.998, 1.00001) - ax[ei][di*2].set_yticks(np.arange(0.998, 1.0001, 0.0004)) + ax[ei][di*2].set_xticks(np.arange(0.99, 1.0002, 0.005)) + ax[ei][di*2].set_xticklabels([f"{x:.3f}" for x in + np.arange(0.99, 1.0002, 0.005)], fontsize=5) + ax[ei][di*2].set_ylim(0.998, 1.00004) + ax[ei][di*2].set_yticks(np.arange(0.998, 1.00004, 0.0004)) ax[ei][di*2].set_yticklabels([f"{x:.4f}" for x in - np.arange(0.998, 1.00001, 0.0004)]) + np.arange(0.998, 1.00004, 0.0004)], fontsize=5) if dataset == "cmrg": ax[ei][di*2].set_xlim(0.9, 1) - ax[ei][di*2].set_xticks(np.arange(0.9, 1.01, 0.02)) - ax[ei][di*2].set_ylim(0.95, 1.0001) + ax[ei][di*2].set_xticks(np.arange(0.9, 1.002, 0.02)) + ax[ei][di*2].set_xticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.002, 0.02)], fontsize=5) + ax[ei][di*2].set_ylim(0.95, 1.001) ax[ei][di*2].set_yticks(np.arange(0.95, 1.001, 0.01)) - ax[ei][di*2].set_yticklabels([f"{x:.3f}" for x in - np.arange(0.95, 1.001, 0.01)]) + ax[ei][di*2].set_yticklabels([f"{x:.2f}" for x in + np.arange(0.95, 1.001, 0.01)], fontsize=5) if di == 0: - ax[ei][di*2].legend(loc="lower left", fontsize=15, markerscale=3) + ax[ei][di*2].legend(loc="lower left", fontsize=5, markerscale=4) # INDEL plot - ax[ei][di*2+1].set_xlabel("Recall", fontsize=15) - ax[ei][di*2+1].set_ylabel("Precision", fontsize=15) + # ax[ei][di*2+1].set_xlabel("Recall", fontsize=15) + # ax[ei][di*2+1].set_ylabel("Precision", fontsize=15) if dataset == "nist": ax[ei][di*2+1].set_xlim(0.95, 1) ax[ei][di*2+1].set_xticks(np.arange(0.95, 1.001, 0.01)) - ax[ei][di*2+1].set_ylim(0.98, 1.001) - ax[ei][di*2+1].set_yticks(np.arange(0.98, 1.001, 0.004)) + ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in + np.arange(0.95, 1.001, 0.01)], fontsize=5) + ax[ei][di*2+1].set_ylim(0.98, 1.0004) + ax[ei][di*2+1].set_yticks(np.arange(0.98, 1.0004, 0.004)) + ax[ei][di*2+1].set_yticklabels([f"{x:.3f}" for x in + np.arange(0.98, 1.0004, 0.004)], fontsize=5) if dataset == "cmrg": ax[ei][di*2+1].set_xlim(0.9, 1) - ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.01, 0.02)) - ax[ei][di*2+1].set_ylim(0.9, 1.001) - ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.01, 0.02)) + ax[ei][di*2+1].set_xticks(np.arange(0.9, 1.002, 0.02)) + ax[ei][di*2+1].set_xticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.002, 0.02)], fontsize=5) + ax[ei][di*2+1].set_ylim(0.9, 1.002) + ax[ei][di*2+1].set_yticks(np.arange(0.9, 1.002, 0.02)) + ax[ei][di*2+1].set_yticklabels([f"{x:.2f}" for x in + np.arange(0.9, 1.002, 0.02)], fontsize=5) + out4.close() plt.tight_layout() plt.savefig('img/8_vcfdist_pr_plot.pdf', format="pdf") diff --git a/analysis/9_f1_ed_plot.py b/analysis/9_f1_ed_plot.py index 8102989..1d555bd 100644 --- a/analysis/9_f1_ed_plot.py +++ b/analysis/9_f1_ed_plot.py @@ -11,11 +11,17 @@ plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) mpl.rcParams['text.usetex'] = True +def mm2in(x): + return x / 25.4 + data = "/home/timdunn/vcfdist/data" datasets = ["nist", "cmrg"] names = ["original", "A", "B", "C", "D"] colors = ["k", "C0", "C1", "C2", "C3"] +out6c = open("tsv/S5c.tsv", 'w') +print("DATASET\tSUBMISSION\tREPRESENTATION\tED_QSCORE\tDE_QSCORE\tALN_QSCORE", file=out6c) + sub_ids = [ "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789", "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27", @@ -33,7 +39,7 @@ # ] # markers = ['s', 'o', '^'] -fig, ax = plt.subplots(1, 6, figsize=(20,4)) +fig, ax = plt.subplots(1, 6, figsize=(mm2in(180), mm2in(36))) for di, dataset in enumerate(datasets): @@ -66,6 +72,7 @@ de_f1_list.append(de_f1q) aln_f1_list.append(aln_f1q) print(f"{filename}={ed_f1q:.4f}\t", end="") + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\t{ed_f1q}\t{de_f1q}\t{aln_f1q}", file=out6c) except FileNotFoundError: # print(f"File '{data}/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}.distance-summary.tsv' not found.") break @@ -78,11 +85,11 @@ aln_f1_mean = np.mean(aln_f1_list) for ni, name in enumerate(names): ax[di*3].plot(ed_f1_mean, ed_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) ax[di*3+1].plot(de_f1_mean, de_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) ax[di*3+2].plot(aln_f1_mean, aln_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) all_ed_f1_y.extend(ed_f1_list) all_ed_f1_x.extend([ed_f1_mean]*5) all_ed_f1_meds.append(statistics.median(ed_f1_list)) @@ -95,22 +102,25 @@ print(" ") m, b, r, p, std_err = scipy.stats.linregress(all_ed_f1_x, all_ed_f1_y) - ax[di*3].set_xlabel("Avg ED Q-score", fontsize=15) - ax[di*3].set_ylabel("ED Q-score", fontsize=15) - ax[di*3].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, + ax[di*3].set_xlabel(r"\textbf{Avg ED Q-score}", fontsize=7) + ax[di*3].set_ylabel(r"\textbf{ED Q-score}", fontsize=7) + ax[di*3].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[di*3].transAxes) + ax[di*3].tick_params(axis='both', which='major', labelsize=5) m, b, r, p, std_err = scipy.stats.linregress(all_de_f1_x, all_de_f1_y) - ax[di*3+1].set_xlabel("Avg DE Q-score", fontsize=15) - ax[di*3+1].set_ylabel("DE Q-score", fontsize=15) - ax[di*3+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, + ax[di*3+1].set_xlabel(r"\textbf{Avg DE Q-score}", fontsize=7) + ax[di*3+1].set_ylabel(r"\textbf{DE Q-score}", fontsize=7) + ax[di*3+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[di*3+1].transAxes) + ax[di*3+1].tick_params(axis='both', which='major', labelsize=5) m, b, r, p, std_err = scipy.stats.linregress(all_aln_f1_x, all_aln_f1_y) - ax[di*3+2].set_xlabel("Avg ALN Q-score", fontsize=15) - ax[di*3+2].set_ylabel("ALN Q-score", fontsize=15) - ax[di*3+2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, + ax[di*3+2].set_xlabel(r"\textbf{Avg ALN Q-score}", fontsize=7) + ax[di*3+2].set_ylabel(r"\textbf{ALN Q-score}", fontsize=7) + ax[di*3+2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[di*3+2].transAxes) + ax[di*3+2].tick_params(axis='both', which='major', labelsize=5) # second pass, calculate average max rank change all_ed_f1_meds.sort() @@ -163,9 +173,10 @@ ed_amrc = np.mean(ed_rank_change) de_amrc = np.mean(de_rank_change) aln_amrc = np.mean(aln_rank_change) - ax[di*3].text(0.05,0.8,f"AMRC: {ed_amrc:.2f}", fontsize=15, transform=ax[di*3].transAxes) - ax[di*3+1].text(0.05,0.8,f"AMRC: {de_amrc:.2f}", fontsize=15, transform=ax[di*3+1].transAxes) - ax[di*3+2].text(0.05,0.8,f"AMRC: {aln_amrc:.2f}", fontsize=15, transform=ax[di*3+2].transAxes) + ax[di*3].text(0.05,0.8,f"AMRC: {ed_amrc:.2f}", fontsize=5, transform=ax[di*3].transAxes) + ax[di*3+1].text(0.05,0.8,f"AMRC: {de_amrc:.2f}", fontsize=5, transform=ax[di*3+1].transAxes) + ax[di*3+2].text(0.05,0.8,f"AMRC: {aln_amrc:.2f}", fontsize=5, transform=ax[di*3+2].transAxes) plt.tight_layout() +out6c.close() plt.savefig('img/9_f1_ed_plot.pdf', format="pdf") diff --git a/analysis/9_f1_pr_plot.py b/analysis/9_f1_pr_plot.py index 4beed7d..18fabad 100644 --- a/analysis/9_f1_pr_plot.py +++ b/analysis/9_f1_pr_plot.py @@ -11,30 +11,38 @@ mpl.rcParams['text.usetex'] = True plt.rcParams.update({"figure.facecolor": (0,0,0,0)}) +def mm2in(x): + return x / 25.4 + data = "/home/timdunn/vcfdist/data" datasets = ["nist", "cmrg"] tools = ["vcfeval", "vcfdist"] names = ["original", "A", "B", "C", "D"] colors = ["k", "C0", "C1", "C2", "C3"] -# sub_ids = [ -# "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789", -# "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27", -# "4HL0B", "7NJ5C", "9DGOR", "BARQS", "CN36L", "ES3XW", "HB8P3", "ISKOS", "JIPSI", "KFA0T", -# "PGXA4", "RU88N", "TG5TE", "VES2R", "WGQ43", "XV7ZN", "YGOTK", "13678", "32LOW", "60Z59", -# "JBBK0", "K4GT3", "7RR8Z", "ASJT6", "BSODP", "CZA1Y", "0O7FL", "2OT9Q", "FFFGB", "HF8CT", "Y8QR8", "YJN61", "LR1FD", "MT57N", -# "J04SL", "K33QJ", "KPXT2", "M9KLP", "NFT0L", "QUE7Q", "S7K7S", "TZMTX", "W607K", "WX8VK" -# ] -# markers = ['.']*len(sub_ids) +out6a = open("tsv/6a.tsv", 'w') +out6b = open("tsv/6b.tsv", 'w') +print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tF1_QSCORE", file=out6a) +print("DATASET\tSUBMISSION\tREPRESENTATION\tVARIANT_TYPE\tF1_QSCORE", file=out6b) sub_ids = [ - "K4GT3", # Google Health - "W91C1", # Sentieon - "IA789" # Roche + "0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789", + "KXBR8", "MECML", "NWQ6Y", "R9CXN", "SEX9X", "UYMUW", "W91C1", "XC97E", "YBE9U", "YUI27", + "4HL0B", "7NJ5C", "9DGOR", "BARQS", "CN36L", "ES3XW", "HB8P3", "ISKOS", "JIPSI", "KFA0T", + "PGXA4", "RU88N", "TG5TE", "VES2R", "WGQ43", "XV7ZN", "YGOTK", "13678", "32LOW", "60Z59", + "JBBK0", "K4GT3", "7RR8Z", "ASJT6", "BSODP", "CZA1Y", "0O7FL", "2OT9Q", "FFFGB", "HF8CT", "Y8QR8", "YJN61", "LR1FD", "MT57N", + "J04SL", "K33QJ", "KPXT2", "M9KLP", "NFT0L", "QUE7Q", "S7K7S", "TZMTX", "W607K", "WX8VK" ] -markers = ['s', 'o', '^'] +markers = ['.']*len(sub_ids) + +# sub_ids = [ +# "K4GT3", # Google Health +# "W91C1", # Sentieon +# "IA789" # Roche +# ] +# markers = ['s', 'o', '^'] -fig, ax = plt.subplots(2, 4, figsize=(20,8)) +fig, ax = plt.subplots(2, 4, figsize=(mm2in(180), mm2in(72))) for di, dataset in enumerate(datasets): for ti, tool in enumerate(tools): @@ -72,6 +80,8 @@ snp_f1_list.append(snp_f1q) indel_f1_list.append(indel_f1q) print(f"{filename}={snp_f1q:6.4f},{indel_f1q:6.4f}\t", end="") + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_f1q}", file=out6a) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_f1q}", file=out6a) except FileNotFoundError: print(f"File '{data}/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}.summary.csv' not found.") continue @@ -80,9 +90,9 @@ indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list) for ni, name in enumerate(names): ax[ti][di*2].plot(snp_f1_mean, snp_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) all_snp_f1_y.extend(snp_f1_list) all_snp_f1_x.extend([snp_f1_mean]*len(names)) all_snp_f1_meds.append(statistics.median(snp_f1_list)) @@ -91,14 +101,16 @@ all_indel_f1_meds.append(statistics.median(indel_f1_list)) m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y) - ax[ti][di*2].set_xlabel("Avg SNP F1 Q-score", fontsize=15) - ax[ti][di*2].set_ylabel("SNP F1 Q-score", fontsize=15) - ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2].transAxes) + ax[ti][di*2].set_xlabel(r"\textbf{Avg SNP F1 Q-score}", fontsize=7) + ax[ti][di*2].set_ylabel(r"\textbf{SNP F1 Q-score}", fontsize=7) + ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2].transAxes) + ax[ti][di*2].tick_params(axis='both', which='major', labelsize=5) m, b, r, p, std_err = scipy.stats.linregress(all_indel_f1_x, all_indel_f1_y) - ax[ti][di*2+1].set_xlabel("Avg INDEL F1 Q-score", fontsize=15) - ax[ti][di*2+1].set_ylabel("INDEL F1 Q-score", fontsize=15) - ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2+1].set_xlabel(r"\textbf{Avg INDEL F1 Q-score}", fontsize=7) + ax[ti][di*2+1].set_ylabel(r"\textbf{INDEL F1 Q-score}", fontsize=7) + ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2+1].tick_params(axis='both', which='major', labelsize=5) # second pass, calculate average max rank change all_snp_f1_meds.sort() @@ -146,8 +158,8 @@ indel_rank_change.append(indel_max_rank - indel_min_rank) snp_amrc = np.mean(snp_rank_change) indel_amrc = np.mean(indel_rank_change) - ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=15, transform=ax[ti][di*2].transAxes) - ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=15, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=5, transform=ax[ti][di*2].transAxes) + ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=5, transform=ax[ti][di*2+1].transAxes) # VCFDIST elif tool == "vcfdist": @@ -174,6 +186,8 @@ snp_f1_list.append(snp_f1q) indel_f1_list.append(indel_f1q) print(f"{filename}={snp_f1q:6.4f},{indel_f1q:6.4f}\t", end="") + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tSNP\t{snp_f1q}", file=out6b) + print(f"{dataset.upper()}\t{sub_id}\t{name.upper()}\tINDEL\t{indel_f1q}", file=out6b) except FileNotFoundError: print(f"File '{data}/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}.precision-recall-summary.tsv' not found.") continue @@ -182,9 +196,9 @@ indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list) for ni, name in enumerate(names): ax[ti][di*2].plot(snp_f1_mean, snp_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='', - marker=markers[si], color=colors[ni], label=label) + marker=markers[si], markersize=1, color=colors[ni], label=label) all_snp_f1_y.extend(snp_f1_list) all_snp_f1_x.extend([snp_f1_mean]*len(names)) all_snp_f1_meds.append(statistics.median(snp_f1_list)) @@ -193,14 +207,16 @@ all_indel_f1_meds.append(statistics.median(indel_f1_list)) m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y) - ax[ti][di*2].set_xlabel("Avg SNP F1 Q-score", fontsize=15) - ax[ti][di*2].set_ylabel("SNP F1 Q-score", fontsize=15) - ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2].transAxes) + ax[ti][di*2].set_xlabel(r"\textbf{Avg SNP F1 Q-score}", fontsize=7) + ax[ti][di*2].set_ylabel(r"\textbf{SNP F1 Q-score}", fontsize=7) + ax[ti][di*2].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2].transAxes) + ax[ti][di*2].tick_params(axis='both', which='major', labelsize=5) m, b, r, p, std_err = scipy.stats.linregress(all_indel_f1_x, all_indel_f1_y) - ax[ti][di*2+1].set_xlabel("Avg INDEL F1 Q-score", fontsize=15) - ax[ti][di*2+1].set_ylabel("INDEL F1 Q-score", fontsize=15) - ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=15, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2+1].set_xlabel(r"\textbf{Avg INDEL F1 Q-score}", fontsize=7) + ax[ti][di*2+1].set_ylabel(r"\textbf{INDEL F1 Q-score}", fontsize=7) + ax[ti][di*2+1].text(0.05,0.9,"$R^2$" +f": {r*r:.6f}", fontsize=5, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2+1].tick_params(axis='both', which='major', labelsize=5) # second pass, calculate average max rank change all_snp_f1_meds.sort() @@ -240,32 +256,34 @@ indel_rank_change.append(indel_max_rank - indel_min_rank) snp_amrc = np.mean(snp_rank_change) indel_amrc = np.mean(indel_rank_change) - ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=15, transform=ax[ti][di*2].transAxes) - ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=15, transform=ax[ti][di*2+1].transAxes) + ax[ti][di*2].text(0.05,0.8,f"AMRC: {snp_amrc:.2f}", fontsize=5, transform=ax[ti][di*2].transAxes) + ax[ti][di*2+1].text(0.05,0.8,f"AMRC: {indel_amrc:.2f}", fontsize=5, transform=ax[ti][di*2+1].transAxes) -square = mlines.Line2D([], [], linestyle="", color='k', marker='s', - markersize=12, label='Google Health') -circle = mlines.Line2D([], [], linestyle="", color='k', marker='o', - markersize=12, label='Sentieon') -triangle = mlines.Line2D([], [], linestyle="", color='k', marker='^', - markersize=12, label='Roche') -ax[0][0].legend(handles=[square, circle, triangle], loc="center right", fontsize=15) +# square = mlines.Line2D([], [], linestyle="", color='k', marker='s', +# markersize=12, label='Google Health') +# circle = mlines.Line2D([], [], linestyle="", color='k', marker='o', +# markersize=12, label='Sentieon') +# triangle = mlines.Line2D([], [], linestyle="", color='k', marker='^', +# markersize=12, label='Roche') +# ax[0][0].legend(handles=[square, circle, triangle], loc="center right", fontsize=15) O = mlines.Line2D([], [], linestyle="", color='k', marker='.', - markersize=12, label='original') + markersize=5, label='original') A = mlines.Line2D([], [], linestyle="", color='C0', marker='.', - markersize=12, label='A') + markersize=5, label='A') B = mlines.Line2D([], [], linestyle="", color='C1', marker='.', - markersize=12, label='B') + markersize=5, label='B') C = mlines.Line2D([], [], linestyle="", color='C2', marker='.', - markersize=12, label='C') + markersize=5, label='C') D = mlines.Line2D([], [], linestyle="", color='C3', marker='.', - markersize=12, label='D') -ax2 = ax[0][0].twinx() + markersize=5, label='D') +# ax2 = ax[0][0].twinx() -# ax2.legend(handles=[O,A,B,C,D], loc="center right", fontsize=15) -ax2.legend(handles=[O,A,B,C,D], loc="center left", fontsize=15) +ax[0][0].legend(handles=[O,A,B,C,D], loc="lower right", fontsize=5) +# ax2.legend(handles=[O,A,B,C,D], loc="center left", fontsize=5) plt.tight_layout() +out6a.close() +out6b.close() plt.savefig('img/9_f1_pr_plot.pdf', format="pdf") From 7dd52b58bec48a1244a4a3263843989a0131fc28 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 7 Nov 2023 16:53:26 -0500 Subject: [PATCH 03/16] Added FLIP_ERROR column to 'superclusters.tsv' --- src/print.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/print.cpp b/src/print.cpp index af0d6a7..33da71b 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -742,7 +742,7 @@ void write_results( FILE* out_clusterings = fopen(out_clusterings_fn.data(), "w"); if (g.verbosity >= 1) INFO(" Writing superclustering results to '%s'", out_clusterings_fn.data()); fprintf(out_clusterings, "CONTIG\tSUPERCLUSTER\tSTART\tSTOP\tSIZE\tQUERY1_VARS\tQUERY2_VARS" - "\tTRUTH1_VARS\tTRUTH2_VARS\tORIG_ED\tSWAP_ED\tPHASE\tPHASE_SET\tPHASE_BLOCK\n"); + "\tTRUTH1_VARS\tTRUTH2_VARS\tORIG_ED\tSWAP_ED\tPHASE\tPHASE_SET\tPHASE_BLOCK\tFLIP_ERROR\n"); for (std::string ctg : phasedata_ptr->contigs) { std::shared_ptr ctg_pbs = phasedata_ptr->phase_blocks[ctg]; std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; @@ -754,6 +754,16 @@ void write_results( phase_block_idx++; } + // set supercluster flip/swap based on phaseblock and sc phasing + bool phase_switch = ctg_pbs->block_states[phase_block_idx]; + int phase_sc = ctg_scs->phase[i]; + bool flip_error; + if (phase_switch) { + flip_error = phase_sc == PHASE_ORIG; + } else { // no phase switch + flip_error = phase_sc == PHASE_SWAP; + } + // count query vars, allowing empty haps int query1_vars = ctg_scs->ctg_variants[QUERY][HAP1]->clusters.size() ? ctg_scs->ctg_variants[QUERY][HAP1]->clusters[ @@ -777,12 +787,12 @@ void write_results( ctg_scs->superclusters[TRUTH][HAP2][i]] : 0; // print data - fprintf(out_clusterings, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\n", + fprintf(out_clusterings, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", ctg.data(), i, ctg_scs->begs[i], ctg_scs->ends[i], ctg_scs->ends[i] - ctg_scs->begs[i], query1_vars, query2_vars, truth1_vars, truth2_vars, ctg_scs->orig_phase_dist[i], ctg_scs->swap_phase_dist[i], - phase_strs[ctg_scs->phase[i]].data(), ctg_scs->phase_sets[i], phase_block_idx + phase_strs[phase_sc].data(), ctg_scs->phase_sets[i], phase_block_idx, flip_error ? "TRUE" : "FALSE" ); } } From 5c30ff6afcc1cb5aa0d8d71f1718c90197369490 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 7 Nov 2023 17:26:07 -0500 Subject: [PATCH 04/16] Added phasing threshold - instead of considering superclusters to be phased when the swapped edit distance is less than the original, --phasing-threshold requires that the edit distance is reduced by this fraction, 1.0 indicating that there is an exact match and 0.5 indicating it is halved --- src/dist.cpp | 15 ++++++++++++--- src/globals.cpp | 20 ++++++++++++++++++++ src/globals.h | 3 +++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/dist.cpp b/src/dist.cpp index 22c5d73..7c7a4b1 100644 --- a/src/dist.cpp +++ b/src/dist.cpp @@ -452,9 +452,18 @@ int store_phase( // calculate best phasing int orig_phase_dist = s[QUERY1_TRUTH1] + s[QUERY2_TRUTH2]; int swap_phase_dist = s[QUERY2_TRUTH1] + s[QUERY1_TRUTH2]; - int phase = PHASE_NONE; // default either way if equal dist - if (orig_phase_dist < swap_phase_dist) phase = PHASE_ORIG; - if (swap_phase_dist < orig_phase_dist) phase = PHASE_SWAP; + int phase = PHASE_NONE; // default either way if insufficient evidence + if (orig_phase_dist == swap_phase_dist) { // allow either phasing if equal dist + phase = PHASE_NONE; + } else if (orig_phase_dist == 0) { // protect division by zero + phase = PHASE_ORIG; + } else if (swap_phase_dist == 0) { // protect division by zero + phase = PHASE_SWAP; + } else if (1 - swap_phase_dist / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping + phase = PHASE_SWAP; + } else if (1 - orig_phase_dist / swap_phase_dist > g.phase_threshold) { // significant reduction with orig + phase = PHASE_ORIG; + } // save alignment information clusterdata_ptr->superclusters[ctg]->set_phase(sc_idx, diff --git a/src/globals.cpp b/src/globals.cpp index 2278391..0c8fdfc 100644 --- a/src/globals.cpp +++ b/src/globals.cpp @@ -340,6 +340,21 @@ void Globals::parse_args(int argc, char ** argv) { if (this->max_threads < 1) { ERROR("Max threads must be positive"); } +/*******************************************************************************/ + } else if (std::string(argv[i]) == "-pt" || + std::string(argv[i]) == "--phasing-threshold") { + i++; + if (i == argc) { + ERROR("Option '--phasing-threshold' used without providing value"); + } + try { + this->phase_threshold = std::stod(argv[i++]); + } catch (const std::exception & e) { + ERROR("Invalid phasing threshold provided"); + } + if (this->phase_threshold <= 0 || this->phase_threshold > 1) { + ERROR("Provided phasing threshold must be on the interval (0,1]"); + } /*******************************************************************************/ } else if (std::string(argv[i]) == "-r" || std::string(argv[i]) == "--max-ram") { @@ -498,6 +513,11 @@ void Globals::print_usage() const printf(" minimum gap between independent clusters and superclusters (in bases),\n"); printf(" only applicable if used with '--simple-cluster' option\n"); + printf("\n Phasing:\n"); + printf(" -pt, --phasing-threshold [%.2f]\n", g.phase_threshold); + printf(" minimum fractional reduction in edit distance over other phasing\n"); + printf(" in order to consider this supercluster phased\n"); + printf("\n Distance:\n"); printf(" -ex, --eval-mismatch-penalty [%d]\n", g.eval_sub); printf(" mismatch penalty (distance evaluation)\n"); diff --git a/src/globals.h b/src/globals.h index 349828e..421e540 100644 --- a/src/globals.h +++ b/src/globals.h @@ -36,6 +36,9 @@ class Globals { int reach_min_gap = 10; int max_cluster_itrs = 4; + // phasing options + double phase_threshold = 0.5; + // memory params int max_threads = 64; double max_ram = 64; // GB From 2a9b41fa62d601cda1c4630d55b36b16ca643ac7 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 8 Nov 2023 11:15:38 -0500 Subject: [PATCH 05/16] Added g.phase_threshold to parameters.txt --- src/print.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/print.cpp b/src/print.cpp index 33da71b..8f17c84 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -41,14 +41,14 @@ void write_params() { "program = '%s'\nversion = '%s'\nout_prefix = '%s'\ncommand = '%s'\nreference_fasta = '%s'\n" "query_vcf = '%s'\ntruth_vcf = '%s'\nbed_file = '%s'\nwrite_outputs = %s\nfilters = '%s'\n" "min_var_qual = %d\nmax_var_qual = %d\nmin_var_size = %d\nmax_var_size = %d\n" - "realign_truth = %s\nrealign_query = %s\n" + "phase_threshold=%f\nrealign_truth = %s\nrealign_query = %s\n" "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n" "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n" "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n", g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size, - b2s(g.realign_truth).data(), b2s(g.realign_query).data(), + g.phase_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(), b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap, g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram, g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend); From a5b4ab6c4ac38f95b50b21554369286536945335 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 8 Nov 2023 11:16:04 -0500 Subject: [PATCH 06/16] Output no longer always 'gm' - 'gm' for TP, 'lm' for PP, '.' for FP/FN --- src/phase.cpp | 2 +- src/variant.cpp | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/phase.cpp b/src/phase.cpp index 8b9c71f..4b92f0d 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -25,7 +25,7 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) { fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT== 0.5 else FP/FN (hap.py compatibility)\">\n"); fprintf(out_vcf, "##FORMAT=\n"); - fprintf(out_vcf, "##FORMAT=\n"); + fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); diff --git a/src/variant.cpp b/src/variant.cpp index 98db573..a3119b5 100644 --- a/src/variant.cpp +++ b/src/variant.cpp @@ -259,17 +259,20 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt, // get categorization std::string errtype; + std::string match_type; switch (this->errtypes[swap][idx]) { - case ERRTYPE_TP: errtype = "TP"; break; - case ERRTYPE_FP: errtype = "FP"; break; - case ERRTYPE_FN: errtype = "FN"; break; + case ERRTYPE_TP: errtype = "TP"; match_type = "gm"; break; + case ERRTYPE_FP: errtype = "FP"; match_type = "."; break; + case ERRTYPE_FN: errtype = "FN"; match_type = "."; break; case ERRTYPE_PP: // for hap.py compatibility errtype = this->credit[swap][idx] >= 0.5 ? "TP" : - (query ? "FP" : "FN"); break; + (query ? "FP" : "FN"); + match_type = "lm"; + break; } - fprintf(out_fp, "\t%s:%s:%f:gm:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), - this->credit[swap][idx], int(this->var_quals[idx]), sc_idx, + fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), + this->credit[swap][idx], match_type.data(), int(this->var_quals[idx]), sc_idx, int(this->sync_group[swap][idx]), this->phase_sets[idx], phase_block, query ? (phase_switch ? "1" : "0") : "." , query ? (phase_flip ? "1" : "0") : "." , From 6f5aa100afcf0c5e32276cb1e26d6f7b5fba45fc Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 8 Nov 2023 13:21:41 -0500 Subject: [PATCH 07/16] Replaced partial credit with credit threshold. --- src/defs.h | 9 +++-- src/dist.cpp | 30 +++------------- src/globals.cpp | 19 ++++++++++ src/globals.h | 3 ++ src/main.cpp | 2 +- src/phase.cpp | 4 +-- src/print.cpp | 94 +++++++++++++++++++------------------------------ src/variant.cpp | 17 +++++---- src/variant.h | 6 ++-- 9 files changed, 81 insertions(+), 103 deletions(-) diff --git a/src/defs.h b/src/defs.h index a31995b..16ec893 100644 --- a/src/defs.h +++ b/src/defs.h @@ -60,11 +60,10 @@ class idx2; #define ERRTYPE_TP 0 // true positive #define ERRTYPE_FP 1 // false positive #define ERRTYPE_FN 2 // false negative -#define ERRTYPE_PP 3 // partial positive (reduces ED, but not TP) -#define ERRTYPE_PE 4 // phase error (0|1 -> 1|0) -#define ERRTYPE_GE 5 // genotype error (0|1 -> 1|1) -#define ERRTYPE_UN 6 // unknown -#define ERRTYPES 7 +#define ERRTYPE_PE 3 // phase error (0|1 -> 1|0) +#define ERRTYPE_GE 4 // genotype error (0|1 -> 1|1) +#define ERRTYPE_UN 5 // unknown +#define ERRTYPES 6 // runtime timers #define TIME_READ 0 diff --git a/src/dist.cpp b/src/dist.cpp index 7c7a4b1..c55bf77 100644 --- a/src/dist.cpp +++ b/src/dist.cpp @@ -1269,7 +1269,7 @@ void calc_prec_recall( float credit = 1 - float(new_ed)/old_ed; // don't overwrite FPs if (query_vars->errtypes[swap][query_var_idx] == ERRTYPE_UN) { - if (new_ed == 0) { // TP + if (credit >= g.credit_threshold) { // TP query_vars->errtypes[swap][query_var_idx] = ERRTYPE_TP; query_vars->sync_group[swap][query_var_idx] = sync_group; query_vars->credit[swap][query_var_idx] = credit; @@ -1279,7 +1279,7 @@ void calc_prec_recall( query_vars->refs[query_var_idx].data(), query_vars->alts[query_var_idx].data(), "TP", credit); - } else if (new_ed == old_ed) { // FP + } else { // FP query_vars->errtypes[swap][query_var_idx] = ERRTYPE_FP; query_vars->sync_group[swap][query_var_idx] = sync_group; query_vars->credit[swap][query_var_idx] = 0; @@ -1289,16 +1289,6 @@ void calc_prec_recall( query_vars->refs[query_var_idx].data(), query_vars->alts[query_var_idx].data(), "FP", 0.0f); - } else { // PP - query_vars->errtypes[swap][query_var_idx] = ERRTYPE_PP; - query_vars->sync_group[swap][query_var_idx] = sync_group; - query_vars->credit[swap][query_var_idx] = credit; - query_vars->callq[swap][query_var_idx] = callq; - if (print) printf("%s:%d QUERY REF='%s'\tALT='%s'\t%s\t%f\n", - ctg.data(), query_vars->poss[query_var_idx], - query_vars->refs[query_var_idx].data(), - query_vars->alts[query_var_idx].data(), - "PP", credit); } } } @@ -1307,7 +1297,7 @@ void calc_prec_recall( for (int truth_var_idx = prev_truth_var_ptr; truth_var_idx > truth_var_ptr; truth_var_idx--) { float credit = 1 - float(new_ed)/old_ed; - if (new_ed == 0) { // TP + if (credit >= g.credit_threshold) { // TP truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_TP; truth_vars->sync_group[swap][truth_var_idx] = sync_group; truth_vars->credit[swap][truth_var_idx] = credit; @@ -1317,7 +1307,7 @@ void calc_prec_recall( truth_vars->refs[truth_var_idx].data(), truth_vars->alts[truth_var_idx].data(), "TP", credit); - } else if (new_ed == old_ed) { // FP call, FN truth + } else { // FP call, FN truth truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_FN; truth_vars->sync_group[swap][truth_var_idx] = sync_group; truth_vars->credit[swap][truth_var_idx] = credit; @@ -1327,17 +1317,7 @@ void calc_prec_recall( truth_vars->refs[truth_var_idx].data(), truth_vars->alts[truth_var_idx].data(), "FN", credit); - } else { // PP - truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_PP; - truth_vars->sync_group[swap][truth_var_idx] = sync_group; - truth_vars->credit[swap][truth_var_idx] = credit; - truth_vars->callq[swap][truth_var_idx] = callq; - if (print) printf("%s:%d TRUTH REF='%s'\tALT='%s'\t%s\t%f\n", - ctg.data(), truth_vars->poss[truth_var_idx], - truth_vars->refs[truth_var_idx].data(), - truth_vars->alts[truth_var_idx].data(), - "PP", credit); - } + } } if (print) printf("%s: SYNC @%s(%2d,%2d) = REF(%2d,%2d), ED %d->%d, QUERY %d-%d, TRUTH %d-%d\n\n", ctg.data(), diff --git a/src/globals.cpp b/src/globals.cpp index 0c8fdfc..6244811 100644 --- a/src/globals.cpp +++ b/src/globals.cpp @@ -355,6 +355,21 @@ void Globals::parse_args(int argc, char ** argv) { if (this->phase_threshold <= 0 || this->phase_threshold > 1) { ERROR("Provided phasing threshold must be on the interval (0,1]"); } +/*******************************************************************************/ + } else if (std::string(argv[i]) == "-ct" || + std::string(argv[i]) == "--credit-threshold") { + i++; + if (i == argc) { + ERROR("Option '--credit-threshold' used without providing value"); + } + try { + this->credit_threshold = std::stod(argv[i++]); + } catch (const std::exception & e) { + ERROR("Invalid credit threshold provided"); + } + if (this->credit_threshold <= 0 || this->credit_threshold > 1) { + ERROR("Provided credit threshold must be on the interval (0,1]"); + } /*******************************************************************************/ } else if (std::string(argv[i]) == "-r" || std::string(argv[i]) == "--max-ram") { @@ -485,6 +500,10 @@ void Globals::print_usage() const printf(" -e, --gap-extend-penalty [%d]\n", g.eval_extend); printf(" Smith-Waterman gap extension penalty\n"); + printf("\n Precision-Recall:\n"); + printf(" -ct, --credit-threshold [%.2f]\n", g.credit_threshold); + printf(" minimum partial credit to consider variant a true positive\n"); + printf("\n Utilization:\n"); printf(" -t, --max-threads [%d]\n", g.max_threads); printf(" maximum threads to use for clustering and precision/recall alignment\n"); diff --git a/src/globals.h b/src/globals.h index 421e540..53c30ae 100644 --- a/src/globals.h +++ b/src/globals.h @@ -39,6 +39,9 @@ class Globals { // phasing options double phase_threshold = 0.5; + // precision-recall options + double credit_threshold = 0.7; + // memory params int max_threads = 64; double max_ram = 64; // GB diff --git a/src/main.cpp b/src/main.cpp index d2ec555..c32d09e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,7 +14,7 @@ Globals g; std::vector type_strs = {"REF", "SNP", "INS", "DEL", "CPX"}; std::vector type_strs2 = {"ALL", "SNP", "INS", "DEL", "INDEL"}; std::vector vartype_strs = {"SNP", "INDEL"}; -std::vector error_strs = {"TP", "FP", "FN", "PP", "PE", "GE", "??"}; +std::vector error_strs = {"TP", "FP", "FN", "PE", "GE", "??"}; std::vector gt_strs = { "0", "1", "0|0", "0|1", "1|0", "1|1", "1|2", "2|1", ".|.", "M|N" }; std::vector region_strs = {"OUTSIDE", "INSIDE ", "BORDER ", "OFF CTG"}; diff --git a/src/phase.cpp b/src/phase.cpp index 4b92f0d..67324a1 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -23,9 +23,9 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) { } fprintf(out_vcf, "##FILTER=\n"); fprintf(out_vcf, "##FORMAT=\n"); - fprintf(out_vcf, "##FORMAT== 0.5 else FP/FN (hap.py compatibility)\">\n"); + fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); - fprintf(out_vcf, "##FORMAT=\n"); + fprintf(out_vcf, "##FORMAT= 0, else '.')\">\n"); fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); fprintf(out_vcf, "##FORMAT=\n"); diff --git a/src/print.cpp b/src/print.cpp index 8f17c84..3d36abd 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -41,14 +41,14 @@ void write_params() { "program = '%s'\nversion = '%s'\nout_prefix = '%s'\ncommand = '%s'\nreference_fasta = '%s'\n" "query_vcf = '%s'\ntruth_vcf = '%s'\nbed_file = '%s'\nwrite_outputs = %s\nfilters = '%s'\n" "min_var_qual = %d\nmax_var_qual = %d\nmin_var_size = %d\nmax_var_size = %d\n" - "phase_threshold=%f\nrealign_truth = %s\nrealign_query = %s\n" + "phase_threshold = %f\ncredit_threshold = %f\nrealign_truth = %s\nrealign_query = %s\n" "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n" "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n" "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n", g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size, - g.phase_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(), + g.phase_threshold, g.credit_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(), b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap, g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram, g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend); @@ -321,13 +321,13 @@ void print_ptrs(const std::vector< std::vector > & ptrs, void write_precision_recall(std::unique_ptr & phasedata_ptr) { - // init counters; ax0: SUB/INDEL, ax1: TP,FP,FN,PP,PP_FRAC, ax2: QUAL - int PP_FRAC = 4; + // for each class, store variant counts above each quality threshold + // init counters; ax0: SUB/INDEL, ax1: TP,FP,FN ax2: QUAL std::vector< std::vector< std::vector > > query_counts(VARTYPES, - std::vector< std::vector >(5, + std::vector< std::vector >(3, std::vector(g.max_qual-g.min_qual+1, 0.0))) ; std::vector< std::vector< std::vector > > truth_counts(VARTYPES, - std::vector< std::vector >(5, + std::vector< std::vector >(3, std::vector(g.max_qual-g.min_qual+1, 0.0))) ; // calculate summary statistics @@ -374,9 +374,6 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { } for (int qual = g.min_qual; qual <= q; qual++) { query_counts[t][ ctg_vars[QUERY][h]->errtypes[swap][i] ][qual-g.min_qual]++; - if (ctg_vars[QUERY][h]->errtypes[swap][i] == ERRTYPE_PP) { - query_counts[t][PP_FRAC][qual-g.min_qual] += ctg_vars[QUERY][h]->credit[swap][i]; - } } } } @@ -419,11 +416,13 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { WARN("Unknown error type at TRUTH %s:%d", ctg.data(), ctg_vars[TRUTH][h]->poss[i]); continue; } + // corresponding query call is only correct until its Qscore, after which it falls below + // the quality threshold, is filtered, and becomes a false negative for (int qual = g.min_qual; qual <= q; qual++) { truth_counts[t][ ctg_vars[TRUTH][h]->errtypes[swap][i] ][qual-g.min_qual]++; - if (ctg_vars[TRUTH][h]->errtypes[swap][i] == ERRTYPE_PP) { - truth_counts[t][PP_FRAC][qual-g.min_qual] += ctg_vars[TRUTH][h]->credit[swap][i]; - } + } + for (int qual = q+1; qual < g.max_qual; qual++) { + truth_counts[t][ERRTYPE_FN][qual-g.min_qual]++; } } } @@ -438,7 +437,7 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { if (g.verbosity >= 1) INFO(" Writing precision-recall results to '%s'", out_pr_fn.data()); out_pr = fopen(out_pr_fn.data(), "w"); fprintf(out_pr, "VAR_TYPE\tMIN_QUAL\tPREC\tRECALL\tF1_SCORE\tF1_QSCORE\t" - "TRUTH_TOTAL\tTRUTH_TP\tTRUTH_PP\tTRUTH_FN\tQUERY_TOTAL\tQUERY_TP\tQUERY_PP\tQUERY_FP\n"); + "TRUTH_TOTAL\tTRUTH_TP\tTRUTH_FN\tQUERY_TOTAL\tQUERY_TP\tQUERY_FP\n"); } std::vector max_f1_score = {0, 0}; std::vector max_f1_qual = {0, 0}; @@ -448,36 +447,25 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { for (int qual = g.min_qual; qual <= g.max_qual; qual++) { int qidx = qual - g.min_qual; - int query_tot = query_counts[type][ERRTYPE_TP][qidx] + - query_counts[type][ERRTYPE_FP][qidx] + - query_counts[type][ERRTYPE_PP][qidx]; - float query_tp_f = query_counts[type][ERRTYPE_TP][qidx] + - query_counts[type][PP_FRAC][qidx]; - float query_fp_f = query_tot - query_tp_f; - - int truth_tot = truth_counts[type][ERRTYPE_TP][0] + - truth_counts[type][ERRTYPE_PP][0] + - truth_counts[type][ERRTYPE_FN][0]; - float truth_tp_f = truth_counts[type][ERRTYPE_TP][qidx] + - truth_counts[type][PP_FRAC][qidx]; - if (truth_tot == 0) break; - if (truth_tp_f + query_fp_f == 0) break; - - // ignore PP, this is only for summary output, not calculations - int truth_fn = truth_counts[type][ERRTYPE_FN][0] + - truth_counts[type][ERRTYPE_TP][0] - - truth_counts[type][ERRTYPE_TP][qidx]; - - float precision = truth_tp_f / (truth_tp_f + query_fp_f); - float recall = truth_tp_f / truth_tot; - float f1_score = 2*precision*recall / (precision + recall); + // define helper variables + int query_tp = query_counts[type][ERRTYPE_TP][qidx]; + int query_fp = query_counts[type][ERRTYPE_FP][qidx]; + int query_tot = query_tp + query_fp; + int truth_tp = truth_counts[type][ERRTYPE_TP][qidx]; + int truth_fn = truth_counts[type][ERRTYPE_FN][qidx]; + int truth_tot = truth_tp + truth_fn; + + // calculate summary metrics + float precision = query_tot == 0 ? 1 : float(query_tp) / query_tot; + float recall = truth_tot == 0 ? 1 : float(truth_tp) / truth_tot; + float f1_score = precision+recall ? 2*precision*recall / (precision + recall) : 0; if (f1_score > max_f1_score[type]) { max_f1_score[type] = f1_score; max_f1_qual[type] = qual; } if (g.write) fprintf(out_pr, - "%s\t%d\t%f\t%f\t%f\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", + "%s\t%d\t%f\t%f\t%f\t%f\t%d\t%d\t%d\t%d\t%d\t%d\n", vartype_strs[type].data(), qual, precision, @@ -486,11 +474,9 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { qscore(1-f1_score), truth_tot, int(truth_counts[type][ERRTYPE_TP][qidx]), - int(truth_counts[type][ERRTYPE_PP][qidx]), truth_fn, query_tot, int(query_counts[type][ERRTYPE_TP][qidx]), - int(query_counts[type][ERRTYPE_PP][qidx]), int(query_counts[type][ERRTYPE_FP][qidx]) ); } @@ -518,28 +504,20 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { // redo calculations for these two int qidx = qual - g.min_qual; - int query_tot = query_counts[type][ERRTYPE_TP][qidx] + - query_counts[type][ERRTYPE_FP][qidx] + - query_counts[type][ERRTYPE_PP][qidx]; - float query_tp_f = query_counts[type][ERRTYPE_TP][qidx] + - query_counts[type][PP_FRAC][qidx]; - float query_fp_f = query_tot - query_tp_f; + // define helper variables + int query_tp = query_counts[type][ERRTYPE_TP][qidx]; + int query_fp = query_counts[type][ERRTYPE_FP][qidx]; + int query_tot = query_tp + query_fp; if (query_tot == 0) WARN("No QUERY %s variant calls.", vartype_strs[type].data()); - - int truth_tot = truth_counts[type][ERRTYPE_TP][0] + - truth_counts[type][ERRTYPE_PP][0] + - truth_counts[type][ERRTYPE_FN][0]; - int truth_fn = truth_counts[type][ERRTYPE_FN][0] + - truth_counts[type][ERRTYPE_TP][0] - - truth_counts[type][ERRTYPE_TP][qidx]; - float truth_tp_f = truth_counts[type][ERRTYPE_TP][qidx] + - truth_counts[type][PP_FRAC][qidx]; + int truth_tp = truth_counts[type][ERRTYPE_TP][qidx]; + int truth_fn = truth_counts[type][ERRTYPE_FN][qidx]; + int truth_tot = truth_tp + truth_fn; if (truth_tot == 0) WARN("No TRUTH %s variant calls.", vartype_strs[type].data()); - float precision = truth_tp_f + query_fp_f == 0 ? - 1.0f : truth_tp_f / (truth_tp_f + query_fp_f); - float recall = truth_tot == 0 ? 1.0f : truth_tp_f / truth_tot; - float f1_score = 2*precision*recall / (precision + recall); + // calculate summary metrics + float precision = query_tot == 0 ? 1 : float(query_tp) / query_tot; + float recall = truth_tot == 0 ? 1 : float(truth_tp) / truth_tot; + float f1_score = precision+recall ? 2*precision*recall / (precision + recall) : 0; // print summary INFO("%s%s\tQ >= %d\t\t%-16d%-16d%-16d%-16d%f\t%f\t%f\t%f%s", diff --git a/src/variant.cpp b/src/variant.cpp index a3119b5..999d8e5 100644 --- a/src/variant.cpp +++ b/src/variant.cpp @@ -260,15 +260,14 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt, // get categorization std::string errtype; std::string match_type; - switch (this->errtypes[swap][idx]) { - case ERRTYPE_TP: errtype = "TP"; match_type = "gm"; break; - case ERRTYPE_FP: errtype = "FP"; match_type = "."; break; - case ERRTYPE_FN: errtype = "FN"; match_type = "."; break; - case ERRTYPE_PP: // for hap.py compatibility - errtype = this->credit[swap][idx] >= 0.5 ? "TP" : - (query ? "FP" : "FN"); - match_type = "lm"; - break; + if (this->credit[swap][idx] == 1) { + errtype = "TP"; match_type = "gm"; + } else if (this->credit[swap][idx] == 0) { + errtype = "FP"; match_type = "."; + } else if (this->credit[swap][idx] >= g.credit_threshold) { + errtype = "TP"; match_type = "lm"; + } else { + errtype = "FP"; match_type = "lm"; } fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), diff --git a/src/variant.h b/src/variant.h index 011dc54..635554e 100644 --- a/src/variant.h +++ b/src/variant.h @@ -47,14 +47,14 @@ class ctgVariants { std::vector right_reaches; // cluster rightmost reach // set during prec_recall_aln() - // error type: TP, FP, FN, PP + // error type: TP, FP, FN std::vector< std::vector > errtypes; - // group of variants that participate in TP/PP + // group of variants that participate in credit std::vector< std::vector > sync_group; // call quality (for truth, of associated call) // or for call, min quality in sync group std::vector< std::vector > callq; - // fraction of TP for partial positive (PP) + // percentage reduction in edit dist of sync group with variants std::vector< std::vector > credit; }; From adc796755868589a153a6e429eb850c5800b512d Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 8 Nov 2023 17:52:10 -0500 Subject: [PATCH 08/16] Fixed definitions of NG50 and NGC50 - phase blocks are now based on phase sets, not backtracking - calculate full list of switch and flip positions - removed per-phaseblock block_state, added per-supercluster pb_phase --- src/cluster.cpp | 17 ++-- src/cluster.h | 3 +- src/defs.h | 8 ++ src/dist.cpp | 2 +- src/globals.h | 1 + src/main.cpp | 1 + src/phase.cpp | 205 +++++++++++++++++++++++++----------------------- src/phase.h | 10 +-- src/print.cpp | 87 ++++++++++---------- 9 files changed, 180 insertions(+), 154 deletions(-) diff --git a/src/cluster.cpp b/src/cluster.cpp index 7554df4..53b8152 100644 --- a/src/cluster.cpp +++ b/src/cluster.cpp @@ -19,7 +19,8 @@ void ctgSuperclusters::add_supercluster( this->superclusters[i>>1][i&1].push_back(brks[i]); this->begs.push_back(beg); this->ends.push_back(end); - this->phase.push_back(PHASE_NONE); + this->sc_phase.push_back(PHASE_NONE); + this->pb_phase.push_back(PHASE_NONE); this->orig_phase_dist.push_back(-1); this->swap_phase_dist.push_back(-1); this->n++; @@ -29,7 +30,7 @@ void ctgSuperclusters::set_phase( int sc_idx, int phase, int orig_phase_dist, int swap_phase_dist) { - this->phase[sc_idx] = phase; + this->sc_phase[sc_idx] = phase; this->orig_phase_dist[sc_idx] = orig_phase_dist; this->swap_phase_dist[sc_idx] = swap_phase_dist; } @@ -379,8 +380,8 @@ void superclusterData::transfer_phase_sets() { total_bases += lengths[i]; } - int query_ps_ng50 = calc_ng50(query_phase_set_sizes, total_bases); - int truth_ps_ng50 = calc_ng50(truth_phase_set_sizes, total_bases); + int query_pb_ng50 = calc_ng50(query_phase_set_sizes, total_bases); + int truth_pb_ng50 = calc_ng50(truth_phase_set_sizes, total_bases); // print if (total_multiple_phase_sets) @@ -391,10 +392,10 @@ void superclusterData::transfer_phase_sets() { WARN("%d total phase sets (PS) are in non-increasing order", total_non_increasing) - if (g.verbosity >= 1) INFO(" Input QUERY phase sets: %d", total_query_phase_sets); - if (g.verbosity >= 1) INFO(" Input QUERY phase set NG50: %d", query_ps_ng50); - if (g.verbosity >= 1) INFO(" Input TRUTH phase sets: %d", total_truth_phase_sets); - if (g.verbosity >= 1) INFO(" Input TRUTH phase set NG50: %d", truth_ps_ng50); + if (g.verbosity >= 1) INFO(" QUERY phase sets: %d", total_query_phase_sets); + if (g.verbosity >= 1) INFO(" QUERY phase block NG50: %d", query_pb_ng50); + if (g.verbosity >= 1) INFO(" TRUTH phase sets: %d", total_truth_phase_sets); + if (g.verbosity >= 1) INFO(" TRUTH phase block NG50: %d", truth_pb_ng50); } diff --git a/src/cluster.h b/src/cluster.h index d157780..ee04a98 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -36,7 +36,8 @@ class ctgSuperclusters { // data (length n) int n = 0; // number of superclusters std::vector begs, ends; // ref beg/end pos across haps and truth/query - std::vector phase; // keep/swap/unknown, from alignment + std::vector sc_phase; // keep/swap/unknown, from alignment + std::vector pb_phase; // keep/swap, from phasing algorithm std::vector phase_sets; // input, from first variant in sc std::vector orig_phase_dist, swap_phase_dist; // alignment distances }; diff --git a/src/defs.h b/src/defs.h index 16ec893..707a5ab 100644 --- a/src/defs.h +++ b/src/defs.h @@ -65,6 +65,14 @@ class idx2; #define ERRTYPE_UN 5 // unknown #define ERRTYPES 6 +#define SWITCHTYPE_FLIP 0 +#define SWITCHTYPE_SWITCH 1 +#define SWITCHTYPE_SWITCH_ERR 2 +#define SWITCHTYPE_FLIP_BEG 3 +#define SWITCHTYPE_FLIP_END 4 +#define SWITCHTYPE_NONE 5 +#define SWITCHTYPES 6 + // runtime timers #define TIME_READ 0 #define TIME_CLUST 1 diff --git a/src/dist.cpp b/src/dist.cpp index c55bf77..4422b26 100644 --- a/src/dist.cpp +++ b/src/dist.cpp @@ -1944,7 +1944,7 @@ editData edits_wrapper(std::shared_ptr clusterdata_ptr) { sc->begs[sc_idx], sc->ends[sc_idx], clusterdata_ptr->ref, ctg ); - int phase = sc->phase[sc_idx]; + int phase = sc->sc_phase[sc_idx]; ///////////////////////////////////////////////////////////////////// // SMITH-WATERMAN DISTANCE: don't allow skipping called variants diff --git a/src/globals.h b/src/globals.h index 53c30ae..e609859 100644 --- a/src/globals.h +++ b/src/globals.h @@ -92,5 +92,6 @@ extern std::vector gt_strs; extern std::vector region_strs; extern std::vector aln_strs; extern std::vector phase_strs; +extern std::vector switch_strs; #endif diff --git a/src/main.cpp b/src/main.cpp index c32d09e..1bd5438 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,6 +23,7 @@ std::vector callset_strs = {"QUERY", "TRUTH"}; std::vector phase_strs = {"=", "X", "?"}; std::vector timer_strs = {"reading", "clustering", "realigning", "reclustering", "superclustering", "precision/recall", "edit distance", "phasing", "writing", "total"}; +std::vector switch_strs = {"FLIP", "SWITCH", "SWITCH_ERR", "FLIP_BEG", "FLIP_END", "NONE"}; int main(int argc, char **argv) { diff --git a/src/phase.cpp b/src/phase.cpp index 67324a1..fce761e 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -51,8 +51,8 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) { // set supercluster flip/swap based on phaseblock and sc phasing int phase_block = 0; - bool phase_switch = ctg_pbs->block_states[phase_block]; - int phase_sc = ctg_scs->phase[sc_idx]; + bool phase_switch = ctg_scs->pb_phase[sc_idx]; + int phase_sc = ctg_scs->sc_phase[sc_idx]; bool phase_flip, swap; if (phase_switch) { if (phase_sc == PHASE_ORIG) { // flip error @@ -103,12 +103,12 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) { if (pos >= ctg_scs->ends[sc_idx]) { // update supercluster sc_idx++; - phase_sc = ctg_scs->phase[sc_idx]; + phase_sc = ctg_scs->sc_phase[sc_idx]; // update phase block if (sc_idx >= ctg_pbs->phase_blocks[phase_block+1]) phase_block++; - phase_switch = ctg_pbs->block_states[phase_block]; + phase_switch = ctg_scs->pb_phase[sc_idx]; // update switch/flip status if (phase_switch) { @@ -240,9 +240,25 @@ phaseblockData::phaseblockData(std::shared_ptr clusterdata_ptr this->ref = clusterdata_ptr->ref; // add pointers to clusters - for (std::string ctg : clusterdata_ptr->contigs) { + for (std::string ctg : this->contigs) { this->phase_blocks[ctg]->ctg_superclusters = clusterdata_ptr->superclusters[ctg]; } + + // add phase blocks based on phase sets for each contig + for (std::string ctg : this->contigs) { + std::shared_ptr ctg_pbs = this->phase_blocks[ctg]; + std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; + int curr_phase_set = -1; + for (int sc_idx = 0; sc_idx < ctg_scs->n; sc_idx++) { + if (ctg_scs->phase_sets[sc_idx] != curr_phase_set) { + ctg_pbs->phase_blocks.push_back(sc_idx); + ctg_pbs->n++; + curr_phase_set = ctg_scs->phase_sets[sc_idx]; + } + } + ctg_pbs->phase_blocks.push_back(ctg_scs->n); + } + this->phase(); } @@ -268,7 +284,7 @@ void phaseblockData::phase() // determine costs (penalized if this phasing deemed incorrect) std::vector costs = {0, 0}; - switch (ctg_scs->phase[i]) { + switch (ctg_scs->sc_phase[i]) { case PHASE_NONE: costs[PHASE_PTR_KEEP] = 0; costs[PHASE_PTR_SWAP] = 0; @@ -282,7 +298,7 @@ void phaseblockData::phase() costs[PHASE_PTR_SWAP] = 0; break; default: - ERROR("Unexpected phase (%d)", ctg_scs->phase[i]); + ERROR("Unexpected phase (%d)", ctg_scs->sc_phase[i]); break; } @@ -313,28 +329,27 @@ void phaseblockData::phase() phase = 1; int i = ctg_scs->n-1; - ctg_pbs->phase_blocks.push_back(i+1); while (i > 0) { - if (ptrs[phase][i] == PHASE_PTR_SWAP) { // new phase block - ctg_pbs->n++; + ctg_scs->pb_phase[i] = phase; + if (ptrs[phase][i] == PHASE_PTR_SWAP) { // not a switch error if between phase sets - if (i+1 < ctg_scs->n && ctg_scs->phase_sets[i] == ctg_scs->phase_sets[i+1]) + if (i+1 < ctg_scs->n && ctg_scs->phase_sets[i] == ctg_scs->phase_sets[i+1]) { + ctg_pbs->switches.push_back(i+1); ctg_pbs->nswitches++; + } - ctg_pbs->phase_blocks.push_back(i+1); - ctg_pbs->block_states.push_back(phase); phase ^= 1; } else if (ptrs[phase][i] == PHASE_PTR_KEEP) { // within phase block - if (ctg_scs->phase[i] != PHASE_NONE && ctg_scs->phase[i] != phase) + if (ctg_scs->sc_phase[i] != PHASE_NONE && ctg_scs->sc_phase[i] != phase) { + ctg_pbs->flips.push_back(i); ctg_pbs->nflips++; + } } i--; } - ctg_pbs->phase_blocks.push_back(0); - std::reverse(ctg_pbs->phase_blocks.begin(), ctg_pbs->phase_blocks.end()); - ctg_pbs->block_states.push_back(phase); - std::reverse(ctg_pbs->block_states.begin(), ctg_pbs->block_states.end()); + std::reverse(ctg_pbs->flips.begin(), ctg_pbs->flips.end()); + std::reverse(ctg_pbs->switches.begin(), ctg_pbs->switches.end()); } } @@ -346,28 +361,10 @@ void phaseblockData::phase() int superclusters = 0; if (g.verbosity >= 1) INFO(" Contigs:"); int id = 0; - bool print = false; for (std::string ctg : this->contigs) { std::shared_ptr ctg_pbs = this->phase_blocks[ctg]; std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; - // verbose, print phase of each supercluster - if (print) { - int s = 0; - std::vector colors {"\033[34m", "\033[32m"}; - int color_idx = 0; - for (int i = 0; i < ctg_scs->n; i++) { - if (i == ctg_pbs->phase_blocks[s]) { - printf("%s", colors[color_idx].data()); - color_idx ^= 1; - s++; - } - int phase = ctg_scs->phase[i]; - printf("%s", phase_strs[phase].data()); - } - printf("\033[0m\n"); - } - // print errors per contig if (g.verbosity >= 1) { INFO(" [%2d] %s: %d switch errors, %d flip errors, %d phase blocks", id, ctg.data(), @@ -379,26 +376,28 @@ void phaseblockData::phase() phase_blocks += ctg_pbs->n; id++; } - int ng50 = this->calculate_ng50(); - int ngc50 = this->calculate_ngc50(); + int ng50 = this->calculate_ng50(false, false); + int s_ngc50 = this->calculate_ng50(true, false); + int sf_ngc50 = this->calculate_ng50(true, true); if (g.verbosity >= 1) INFO(" "); - if (g.verbosity >= 1) INFO(" Total switch errors: %d", switch_errors); - if (g.verbosity >= 1) INFO(" Total flip errors: %d", flip_errors); - if (g.verbosity >= 1) INFO(" Total phase blocks: %d", phase_blocks); - if (g.verbosity >= 1) INFO(" Phase block NG50: %d", ng50); - if (g.verbosity >= 1) INFO(" Phase block NGC50: %d", ngc50); + if (g.verbosity >= 1) INFO(" Total phase blocks: %d", phase_blocks); + if (g.verbosity >= 1) INFO(" Total switch errors: %d", switch_errors); + if (g.verbosity >= 1) INFO(" Total flip errors: %d", flip_errors); if (g.verbosity >= 1 && superclusters) - INFO(" SC Switch error rate: %.6f%%", 100*switch_errors/float(superclusters)); + INFO(" Supercluster switch error rate: %.6f%%", 100*switch_errors/float(superclusters)); if (g.verbosity >= 1 && superclusters) - INFO(" SC Flip error rate: %.6f%%", 100*flip_errors/float(superclusters)); + INFO(" Supercluster flip error rate: %.6f%%", 100*flip_errors/float(superclusters)); + if (g.verbosity >= 1) INFO(" Phase block NG50: %d", ng50); + if (g.verbosity >= 1) INFO(" (switch) Phase block NGC50: %d", s_ngc50); + if (g.verbosity >= 1) INFO(" (switchflip) Phase block NGC50: %d", sf_ngc50); } /*******************************************************************************/ -int phaseblockData::calculate_ngc50() { +int phaseblockData::calculate_ng50(bool break_on_switch, bool break_on_flip) { // get total bases in genome size_t total_bases = 0; @@ -412,23 +411,69 @@ int phaseblockData::calculate_ngc50() { std::shared_ptr ctg_pbs = this->phase_blocks[ctg]; std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; - for (int pb = 0; pb < ctg_pbs->n && ctg_scs->n > 0; pb++) { // each phase block - - // init start of this correct block - int sc = ctg_pbs->phase_blocks[pb]; - int beg = ctg_scs->begs[sc]; - int end = 0; - for (; sc < ctg_pbs->phase_blocks[pb+1]; sc++) { // each superclsuter - if (ctg_scs->phase[sc] != PHASE_NONE && // flip error - ctg_scs->phase[sc] != ctg_pbs->block_states[pb]) { - end = ctg_scs->ends[sc-1]; - correct_blocks.push_back(end-beg); - beg = ctg_scs->begs[sc]; // reset - } + + int switch_idx = 0; + int flip_idx = 0; + int pb_idx = 1; + if (ctg_scs->n == 0) continue; + + // init start of this correct block + int sc = 0; + int next_sc = ctg_scs->n; // default to last + int beg = ctg_scs->begs[0]; + int end = 0; + int type = SWITCHTYPE_NONE; + + while (true) { + + // get type and supercluster of next switch/flip/phaseset + // check flip first because it will cause a second switch + if (break_on_flip && flip_idx < ctg_pbs->nflips && ctg_pbs->flips[flip_idx] < next_sc) { + type = SWITCHTYPE_FLIP; + next_sc = ctg_pbs->flips[flip_idx]; + } + if (pb_idx < ctg_pbs->n && ctg_pbs->phase_blocks[pb_idx] < next_sc) { + type = SWITCHTYPE_SWITCH; + next_sc = ctg_pbs->phase_blocks[pb_idx]; + } + if (break_on_switch && switch_idx < ctg_pbs->nswitches && ctg_pbs->switches[switch_idx] < next_sc) { + type = SWITCHTYPE_SWITCH_ERR; + next_sc = ctg_pbs->switches[switch_idx]; + } + if (type == SWITCHTYPE_NONE) { // all out-of-bounds + break; } - end = ctg_scs->ends[sc-1]; - correct_blocks.push_back(end-beg); + if (next_sc <= sc) ERROR("Next SC is not after current SC"); + + + // get block(s) + if (type == SWITCHTYPE_FLIP) { + end = ctg_scs->ends[next_sc-1]; + correct_blocks.push_back(end-beg); + beg = ctg_scs->begs[next_sc]; + + end = ctg_scs->ends[next_sc]; + correct_blocks.push_back(end-beg); + beg = ctg_scs->begs[next_sc+1]; + flip_idx++; + } else if (type == SWITCHTYPE_SWITCH) { + end = ctg_scs->ends[next_sc-1]; + correct_blocks.push_back(end-beg); + beg = ctg_scs->begs[next_sc]; + pb_idx++; + } else if (type == SWITCHTYPE_SWITCH_ERR) { + end = ctg_scs->ends[next_sc-1]; + correct_blocks.push_back(end-beg); + beg = ctg_scs->begs[next_sc]; + switch_idx++; + } + + sc = next_sc; + next_sc = ctg_scs->n; // reset to end + type = SWITCHTYPE_NONE; } + end = ctg_scs->ends[ctg_scs->n-1]; + correct_blocks.push_back(end-beg); } // return NGC50 @@ -441,41 +486,3 @@ int phaseblockData::calculate_ngc50() { } return 0; } - - -/*******************************************************************************/ - - -int phaseblockData::calculate_ng50() { - - // get total bases in genome - size_t total_bases = 0; - for (size_t i = 0; i < this->contigs.size(); i++) { - total_bases += lengths[i]; - } - - // get sizes of each phase block - std::vector phase_blocks; - for (std::string ctg: this->contigs) { - - std::shared_ptr ctg_pbs = this->phase_blocks[ctg]; - std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; - for (int i = 0; i < ctg_pbs->n && ctg_scs->n > 0; i++) { - int beg_idx = ctg_pbs->phase_blocks[i]; - int end_idx = ctg_pbs->phase_blocks[i+1]-1; - int beg = ctg_scs->begs[beg_idx]; - int end = ctg_scs->ends[end_idx]; - phase_blocks.push_back(end-beg); - } - } - - // return NG50 - size_t total_phased = 0; - std::sort(phase_blocks.begin(), phase_blocks.end(), std::greater<>()); - for (size_t i = 0; i < phase_blocks.size(); i++) { - total_phased += phase_blocks[i]; - if (total_phased >= total_bases / 2) - return phase_blocks[i]; - } - return 0; -} diff --git a/src/phase.h b/src/phase.h index 1b346b8..962b037 100644 --- a/src/phase.h +++ b/src/phase.h @@ -13,11 +13,12 @@ class ctgPhaseblocks { std::shared_ptr ctg_superclusters = nullptr; - int n = 1; // number of phase blocks + int n = 0; // number of phase blocks // data, filled out during phase() - std::vector phase_blocks; // n+1, indices of superclusters where phasing switches - std::vector block_states; // n, phasing for each phase block (keep/swap) + std::vector phase_blocks; // n+1, supercluster indices of new phase sets + std::vector switches; // switch occurs before this supercluster + std::vector flips; // indices of flipped superclusters int nswitches = 0; // number of phase switch errors int nflips = 0; // number of supercluster phase flip errors }; @@ -28,8 +29,7 @@ class phaseblockData { void write_summary_vcf(std::string vcf_fn); void phase(); - int calculate_ng50(); - int calculate_ngc50(); + int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false); std::shared_ptr ref; std::vector contigs; diff --git a/src/print.cpp b/src/print.cpp index 3d36abd..52b6f59 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -346,10 +346,10 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { // get swap info bool swap; - switch (ctg_scs->phase[sci]) { + switch (ctg_scs->sc_phase[sci]) { case PHASE_ORIG: swap = false; break; case PHASE_SWAP: swap = true; break; - default: swap = ctg_pbs->block_states[pbi]; break; + default: swap = ctg_scs->pb_phase[sci]; break; } for (int i = ctg_vars[QUERY][h]->clusters[ @@ -390,10 +390,10 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { // get swap info bool swap; - switch (ctg_scs->phase[sci]) { + switch (ctg_scs->sc_phase[sci]) { case PHASE_ORIG: swap = false; break; case PHASE_SWAP: swap = true; break; - default: swap = ctg_pbs->block_states[pbi]; break; + default: swap = ctg_scs->pb_phase[sci]; break; } for (int i = ctg_vars[TRUTH][h]->clusters[ @@ -699,7 +699,7 @@ void write_results( std::string out_phaseblocks_fn = g.out_prefix + "phase-blocks.tsv"; FILE* out_phaseblocks = fopen(out_phaseblocks_fn.data(), "w"); if (g.verbosity >= 1) INFO(" Writing phasing results to '%s'", out_phaseblocks_fn.data()); - fprintf(out_phaseblocks, "CONTIG\tPHASE_BLOCK\tSTART\tSTOP\tSIZE\tSUPERCLUSTERS\tBLOCK_STATE\n"); + fprintf(out_phaseblocks, "CONTIG\tPHASE_BLOCK\tSTART\tSTOP\tSIZE\tSUPERCLUSTERS\tFLIP_ERRORS\tSWITCH_ERRORS\n"); for (std::string ctg : phasedata_ptr->contigs) { std::shared_ptr ctg_pbs = phasedata_ptr->phase_blocks[ctg]; std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; @@ -708,9 +708,16 @@ void write_results( int end_idx = ctg_pbs->phase_blocks[i+1]-1; int beg = ctg_scs->begs[beg_idx]; int end = ctg_scs->ends[end_idx]; - int phase = ctg_pbs->block_states[i]; - fprintf(out_phaseblocks, "%s\t%d\t%d\t%d\t%d\t%d\t%d\n", - ctg.data(), i, beg, end, end-beg, end_idx-beg_idx+1, phase); + int nswitches = 0; + for (int si = 0; si < ctg_pbs->nswitches; si++) { + if (ctg_pbs->switches[si] > beg_idx && ctg_pbs->switches[si] <= end_idx) nswitches++; + } + int nflips = 0; + for (int fi = 0; fi < ctg_pbs->nflips; fi++) { + if (ctg_pbs->flips[fi] > beg_idx && ctg_pbs->flips[fi] <= end_idx) nflips++; + } + fprintf(out_phaseblocks, "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", + ctg.data(), i, beg, end, end-beg, end_idx-beg_idx+1, nflips, nswitches); } } fclose(out_phaseblocks); @@ -733,8 +740,8 @@ void write_results( } // set supercluster flip/swap based on phaseblock and sc phasing - bool phase_switch = ctg_pbs->block_states[phase_block_idx]; - int phase_sc = ctg_scs->phase[i]; + bool phase_switch = ctg_scs->pb_phase[i]; + int phase_sc = ctg_scs->sc_phase[i]; bool flip_error; if (phase_switch) { flip_error = phase_sc == PHASE_ORIG; @@ -790,7 +797,7 @@ void write_results( std::shared_ptr query1_vars = ctg_scs->ctg_variants[QUERY][HAP1]; std::shared_ptr query2_vars = ctg_scs->ctg_variants[QUERY][HAP2]; - int supercluster_idx = 0; + int sci = 0; int var1_idx = 0; int var2_idx = 0; int cluster1_idx = 0; @@ -804,16 +811,16 @@ void write_results( if (cluster1_idx+1 >= int(query1_vars->clusters.size())) ERROR("Out of bounds cluster during write_results(): query1") if (query1_vars->clusters[cluster1_idx+1] <= var1_idx) cluster1_idx++; - if (supercluster_idx >= int(ctg_scs->begs.size())) + if (sci >= int(ctg_scs->begs.size())) ERROR("Out of bounds supercluster during write_results(): query1") - while (query1_vars->poss[var1_idx] >= ctg_scs->ends[supercluster_idx]) - supercluster_idx++; + while (query1_vars->poss[var1_idx] >= ctg_scs->ends[sci]) + sci++; // update phasing info - if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1]) + if (sci >= ctg_pbs->phase_blocks[phase_block+1]) phase_block++; - bool phase_switch = ctg_pbs->block_states[phase_block]; - int phase_sc = ctg_scs->phase[supercluster_idx]; + bool phase_switch = ctg_scs->pb_phase[sci]; + int phase_sc = ctg_scs->sc_phase[sci]; bool swap; switch (phase_sc) { case PHASE_ORIG: swap = false; break; @@ -832,7 +839,7 @@ void write_results( error_strs[query1_vars->errtypes[swap][var1_idx]].data(), query1_vars->credit[swap][var1_idx], cluster1_idx, - supercluster_idx, + sci, query1_vars->sync_group[swap][var1_idx], region_strs[query1_vars->locs[var1_idx]].data() ); @@ -841,16 +848,16 @@ void write_results( if (cluster2_idx+1 >= int(query2_vars->clusters.size())) ERROR("Out of bounds cluster during write_results(): query2") if (query2_vars->clusters[cluster2_idx+1] <= var2_idx) cluster2_idx++; - if (supercluster_idx >= int(ctg_scs->begs.size())) + if (sci >= int(ctg_scs->begs.size())) ERROR("Out of bounds supercluster during write_results(): query2") - while (query2_vars->poss[var2_idx] >= ctg_scs->ends[supercluster_idx]) - supercluster_idx++; + while (query2_vars->poss[var2_idx] >= ctg_scs->ends[sci]) + sci++; // update phasing info - if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1]) + if (sci >= ctg_pbs->phase_blocks[phase_block+1]) phase_block++; - bool phase_switch = ctg_pbs->block_states[phase_block]; - int phase_sc = ctg_scs->phase[supercluster_idx]; + bool phase_switch = ctg_scs->pb_phase[sci]; + int phase_sc = ctg_scs->sc_phase[sci]; bool swap; switch (phase_sc) { case PHASE_ORIG: swap = false; break; @@ -869,7 +876,7 @@ void write_results( error_strs[query2_vars->errtypes[swap][var2_idx]].data(), query2_vars->credit[swap][var2_idx], cluster2_idx, - supercluster_idx, + sci, query2_vars->sync_group[swap][var2_idx], region_strs[query2_vars->locs[var2_idx]].data() ); @@ -896,7 +903,7 @@ void write_results( int var2_idx = 0; int cluster1_idx = 0; int cluster2_idx = 0; - int supercluster_idx = 0; + int sci = 0; int phase_block = 0; while (var1_idx < truth1_vars->n || var2_idx < truth2_vars->n) { if (var2_idx >= truth2_vars->n || // only truth1 has remaining vars @@ -904,16 +911,16 @@ void write_results( if (cluster1_idx+1 >= int(truth1_vars->clusters.size())) ERROR("Out of bounds cluster during write_results(): truth1") if (truth1_vars->clusters[cluster1_idx+1] <= var1_idx) cluster1_idx++; - if (supercluster_idx >= int(ctg_scs->begs.size())) + if (sci >= int(ctg_scs->begs.size())) ERROR("Out of bounds supercluster during write_results(): truth1") - while (truth1_vars->poss[var1_idx] >= ctg_scs->ends[supercluster_idx]) - supercluster_idx++; + while (truth1_vars->poss[var1_idx] >= ctg_scs->ends[sci]) + sci++; // update phasing info - if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1]) + if (sci >= ctg_pbs->phase_blocks[phase_block+1]) phase_block++; - bool phase_switch = ctg_pbs->block_states[phase_block]; - int phase_sc = ctg_scs->phase[supercluster_idx]; + bool phase_switch = ctg_scs->pb_phase[sci]; + int phase_sc = ctg_scs->sc_phase[sci]; bool swap; switch (phase_sc) { case PHASE_ORIG: swap = false; break; @@ -932,7 +939,7 @@ void write_results( error_strs[truth1_vars->errtypes[swap][var1_idx]].data(), truth1_vars->credit[swap][var1_idx], cluster1_idx, - supercluster_idx, + sci, truth1_vars->sync_group[swap][var1_idx], region_strs[truth1_vars->locs[var1_idx]].data() ); @@ -942,16 +949,16 @@ void write_results( ERROR("Out of bounds cluster during write_results(): truth2") } if (truth2_vars->clusters[cluster2_idx+1] <= var2_idx) cluster2_idx++; - if (supercluster_idx >= int(ctg_scs->begs.size())) + if (sci >= int(ctg_scs->begs.size())) ERROR("Out of bounds supercluster during write_results(): truth2") - while (truth2_vars->poss[var2_idx] >= ctg_scs->ends[supercluster_idx]) - supercluster_idx++; + while (truth2_vars->poss[var2_idx] >= ctg_scs->ends[sci]) + sci++; // update phasing info - if (supercluster_idx >= ctg_pbs->phase_blocks[phase_block+1]) + if (sci >= ctg_pbs->phase_blocks[phase_block+1]) phase_block++; - bool phase_switch = ctg_pbs->block_states[phase_block]; - int phase_sc = ctg_scs->phase[supercluster_idx]; + bool phase_switch = ctg_scs->pb_phase[sci]; + int phase_sc = ctg_scs->sc_phase[sci]; bool swap; switch (phase_sc) { case PHASE_ORIG: swap = false; break; @@ -970,7 +977,7 @@ void write_results( error_strs[truth2_vars->errtypes[swap][var2_idx]].data(), truth2_vars->credit[swap][var2_idx], cluster2_idx, - supercluster_idx, + sci, truth2_vars->sync_group[swap][var2_idx], region_strs[truth2_vars->locs[var2_idx]].data() ); From 4fce6a220dac5a722bfb84b2a6e9324f8b530822 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Wed, 8 Nov 2023 18:11:32 -0500 Subject: [PATCH 09/16] Only write truth/query VCF if realign selected. --- src/main.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1bd5438..e0f2a67 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,15 +49,17 @@ int main(int argc, char **argv) { // write results if (g.write) { - g.timers[TIME_WRITE].start(); - if (g.verbosity >= 1) INFO(" "); - if (g.verbosity >= 1) INFO(" Writing original query VCF to '%s'", - std::string(g.out_prefix + "orig-query.vcf").data()); + if (g.realign_query) { + g.timers[TIME_WRITE].start(); + if (g.verbosity >= 1) INFO(" Writing original query VCF to '%s'", + std::string(g.out_prefix + "orig-query.vcf").data()); query_ptr->write_vcf(g.out_prefix + "orig-query.vcf"); - if (g.verbosity >= 1) INFO(" Writing original truth VCF to '%s'", - std::string(g.out_prefix + "orig-truth.vcf").data()); + } else if (g.realign_truth) { + if (g.verbosity >= 1) INFO(" Writing original truth VCF to '%s'", + std::string(g.out_prefix + "orig-truth.vcf").data()); truth_ptr->write_vcf(g.out_prefix + "orig-truth.vcf"); - g.timers[TIME_WRITE].stop(); + g.timers[TIME_WRITE].stop(); + } } // ensure each input contains all contigs in BED @@ -94,7 +96,7 @@ int main(int argc, char **argv) { g.timers[TIME_REALN].stop(); } - if (g.realign_only && g.write) { // realign only, exit early + if (g.realign_only && g.realign_query && g.write) { // realign only, exit early g.timers[TIME_WRITE].start(); query_ptr->write_vcf(g.out_prefix + "query.vcf"); g.timers[TIME_WRITE].stop(); @@ -155,7 +157,7 @@ int main(int argc, char **argv) { } if (g.realign_only) { // realign only, exit early - if (g.write) { + if (g.realign_truth && g.write) { g.timers[TIME_WRITE].start(); truth_ptr->write_vcf(g.out_prefix + "truth.vcf"); g.timers[TIME_WRITE].stop(); From a20c152ee4d223529ef889c69e17e00571210738 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 09:57:40 -0500 Subject: [PATCH 10/16] Added phasing summary TSV. --- src/phase.cpp | 18 ++++++++++++++++++ src/phase.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/src/phase.cpp b/src/phase.cpp index fce761e..6cbd1d4 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -391,6 +391,24 @@ void phaseblockData::phase() if (g.verbosity >= 1) INFO(" Phase block NG50: %d", ng50); if (g.verbosity >= 1) INFO(" (switch) Phase block NGC50: %d", s_ngc50); if (g.verbosity >= 1) INFO(" (switchflip) Phase block NGC50: %d", sf_ngc50); + this->write_phasing_summary(phase_blocks, switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50); +} + + +/*******************************************************************************/ + + +void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors, + int flip_errors, int ng50, int s_ngc50, int sf_ngc50) { + std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv"; + if (g.verbosity >= 1) INFO(" Writing phasing summary to '%s'", + out_phasing_summary_fn.data()); + FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w"); + fprintf(out_phasing_summary, + "PHASE_BLOCKS\tFLIP_ERRORS\tSWITCH_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n"); + fprintf(out_phasing_summary, "%d\t%d\t%d\t%d\t%d\t%d", phase_blocks, + switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50); + fclose(out_phasing_summary); } diff --git a/src/phase.h b/src/phase.h index 962b037..b91f359 100644 --- a/src/phase.h +++ b/src/phase.h @@ -28,6 +28,8 @@ class phaseblockData { phaseblockData(std::shared_ptr clusterdata_ptr); void write_summary_vcf(std::string vcf_fn); + void write_phasing_summary(int phase_blocks, int switch_errors, + int flip_errors, int ng50, int s_ngc50, int sf_ngc50); void phase(); int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false); From 9cf032ac1ddb40a44d1b1b95327ad3e427bd4790 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 10:45:06 -0500 Subject: [PATCH 11/16] Distance calculations are now optional, skipped by default. --- src/edit.cpp | 131 ++++++++++++++++++++++++++++++++++++++++++++++ src/edit.h | 2 + src/globals.cpp | 32 ++++++++---- src/globals.h | 42 ++++++++------- src/main.cpp | 26 +++++++--- src/phase.cpp | 1 + src/print.cpp | 135 ++---------------------------------------------- src/print.h | 2 +- 8 files changed, 203 insertions(+), 168 deletions(-) diff --git a/src/edit.cpp b/src/edit.cpp index 18667e0..2e56ced 100644 --- a/src/edit.cpp +++ b/src/edit.cpp @@ -142,3 +142,134 @@ int editData::get_score(int qual) const { } return score; } + + +/*******************************************************************************/ + + +void editData::write_distance() { + + // log all distance results + std::string dist_fn = g.out_prefix + "distance.tsv"; + FILE* out_dists = 0; + if (g.write) { + out_dists = fopen(dist_fn.data(), "w"); + if (g.verbosity >= 1) INFO(" "); + if (g.verbosity >= 1) INFO(" Writing distance results to '%s'", dist_fn.data()); + fprintf(out_dists, "MIN_QUAL\tSUB_DE\tINS_DE\tDEL_DE\tSUB_ED\tINS_ED\tDEL_ED\t" + "DISTINCT_EDITS\tEDIT_DIST\tALN_SCORE\tALN_QSCORE\n"); + } + + // get original scores / distance (above g.max_qual, no vars applied) + std::vector orig_edit_dists(TYPES, 0); + std::vector orig_distinct_edits(TYPES, 0); + int orig_score = this->get_score(g.max_qual+1); + for (int type = 0; type < TYPES; type++) { + orig_edit_dists[type] = this->get_ed(g.max_qual+1, type); + orig_distinct_edits[type] = this->get_de(g.max_qual+1, type); + } + + std::vector best_score(TYPES, std::numeric_limits::max()); + std::vector best_qual(TYPES, 0); + for (int q = g.min_qual; q <= g.max_qual+1; q++) { // all qualities + + // get ED/DE for each Q threshold, for each type + std::vector edit_dists(TYPES, 0); + std::vector distinct_edits(TYPES, 0); + for (int type = 0; type < TYPES; type++) { + edit_dists[type] = this->get_ed(q, type); + distinct_edits[type] = this->get_de(q, type); + + // save best Q threshold so far + double score = double(edit_dists[type]) * distinct_edits[type]; + if (score < best_score[type]) { + best_score[type] = score; + best_qual[type] = q; + } + } + + if (g.write) fprintf(out_dists, + "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n", + q, distinct_edits[TYPE_SUB], + distinct_edits[TYPE_INS], distinct_edits[TYPE_DEL], + edit_dists[TYPE_SUB], edit_dists[TYPE_INS], edit_dists[TYPE_DEL], + distinct_edits[TYPE_ALL], edit_dists[TYPE_ALL], + this->get_score(q), qscore(double(this->get_score(q))/orig_score)); + } + if (g.write) fclose(out_dists); + + // summarize distance results + std::string dist_summ_fn = g.out_prefix + "distance-summary.tsv"; + FILE* dists_summ = 0; + if (g.write) { + dists_summ = fopen(dist_summ_fn.data(), "w"); + if (g.verbosity >= 1) + INFO(" Writing distance summary to '%s'", dist_summ_fn.data()); + fprintf(dists_summ, "VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE\n"); + } + INFO(" "); + INFO("%sALIGNMENT DISTANCE SUMMARY%s", COLOR_BLUE, COLOR_WHITE); + for (int type = 0; type < TYPES; type++) { + + // skip INS/DEL individually unless higher print verbosity + if (g.verbosity == 0 && type != TYPE_ALL) + continue; + if (g.verbosity == 1 && (type == TYPE_INS || type == TYPE_DEL)) + continue; + + INFO(" "); + if (type == TYPE_ALL) { + INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE%s", + COLOR_BLUE, COLOR_WHITE); + } else { + INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE%s", + COLOR_BLUE, COLOR_WHITE); + } + std::vector quals = {g.min_qual, best_qual[type], g.max_qual+1}; + for (int q : quals) { + + // fill out ED/DE for selected quals + std::vector edit_dists(TYPES, 0); + std::vector distinct_edits(TYPES, 0); + for (int type = 0; type < TYPES; type++) { + edit_dists[type] =this->get_ed(q, type); + distinct_edits[type] = this->get_de(q, type); + } + + float ed_qscore = qscore(double(edit_dists[type]) / orig_edit_dists[type]); + float de_qscore = qscore(double(distinct_edits[type]) / orig_distinct_edits[type]); + float all_qscore = type == TYPE_ALL ? qscore(double(this->get_score(q)) / orig_score) : 0; + + // print summary + if (g.write) fprintf(dists_summ, "%s\t%d\t%d\t%d\t%f\t%f\t%f\n", type_strs2[type].data(), + q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore); + if (type == TYPE_ALL) { + INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f\t%f%s", COLOR_BLUE, type_strs2[type].data(), + q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore, COLOR_WHITE); + } else { + INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f%s", COLOR_BLUE, type_strs2[type].data(), + q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, COLOR_WHITE); + } + } + } + INFO(" "); + if (g.write) fclose(dists_summ); +} + + +/*******************************************************************************/ + + +void editData::write_edits() { + std::string edit_fn = g.out_prefix + "edits.tsv"; + if (g.verbosity >= 1) INFO(" Writing edit results to '%s'", edit_fn.data()); + FILE* out_edits = fopen(edit_fn.data(), "w"); + fprintf(out_edits, "CONTIG\tSTART\tHAP\tTYPE\tSIZE\tSUPERCLUSTER\tMIN_QUAL\tMAX_QUAL\n"); + for (int i = 0; i < this->n; i++) { + fprintf(out_edits, "%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n", + this->ctgs[i].data(), this->poss[i], this->haps[i], + type_strs[this->types[i]].data(), this->lens[i], + this->superclusters[i], this->min_quals[i], this->max_quals[i]); + } + fclose(out_edits); +} diff --git a/src/edit.h b/src/edit.h index 49fb0b8..3e56965 100644 --- a/src/edit.h +++ b/src/edit.h @@ -24,6 +24,8 @@ class editData { int get_ed(int qual, int type=TYPE_ALL) const; // edit distance int get_de(int qual, int type=TYPE_ALL) const; // distinct edits int get_score(int qual) const; // smith-waterman distance + void write_distance(); + void write_edits(); // data (all of size n) int n = 0; diff --git a/src/globals.cpp b/src/globals.cpp index 6244811..dad8bdd 100644 --- a/src/globals.cpp +++ b/src/globals.cpp @@ -178,7 +178,8 @@ void Globals::parse_args(int argc, char ** argv) { ERROR("Invalid maximum variant size provided"); } /*******************************************************************************/ - } else if (std::string(argv[i]) == "--min-qual") { + } else if (std::string(argv[i]) == "-mn" || + std::string(argv[i]) == "--min-qual") { i++; if (i == argc) { ERROR("Option '--min-qual' used without providing minimum variant quality"); @@ -192,7 +193,8 @@ void Globals::parse_args(int argc, char ** argv) { ERROR("Must provide non-negative minimum variant quality"); } /*******************************************************************************/ - } else if (std::string(argv[i]) == "--max-qual") { + } else if (std::string(argv[i]) == "-mx" || + std::string(argv[i]) == "--max-qual") { i++; if (i == argc) { ERROR("Option '--max-qual' used without providing maximum variant quality"); @@ -385,6 +387,11 @@ void Globals::parse_args(int argc, char ** argv) { if (this->max_ram < 0) { ERROR("Max RAM must be positive"); } +/*******************************************************************************/ + } else if (std::string(argv[i]) == "-d" || + std::string(argv[i]) == "--distance") { + i++; + g.distance = true; /*******************************************************************************/ } else if (std::string(argv[i]) == "-ro" || std::string(argv[i]) == "--realign-only") { @@ -406,7 +413,8 @@ void Globals::parse_args(int argc, char ** argv) { i++; g.realign_query = true; /*******************************************************************************/ - } else if (std::string(argv[i]) == "--simple-cluster") { + } else if (std::string(argv[i]) == "-sc" || + std::string(argv[i]) == "--simple-cluster") { i++; g.simple_cluster = true; /*******************************************************************************/ @@ -481,12 +489,12 @@ void Globals::print_usage() const printf(" minimum variant size, smaller variants ignored (SNPs are size 1)\n"); printf(" -l, --largest-variant [%d]\n", g.max_size); printf(" maximum variant size, larger variants ignored\n"); - printf(" --min-qual [%d]\n", g.min_qual); + printf(" -mn, --min-qual [%d]\n", g.min_qual); printf(" minimum variant quality, lower qualities ignored\n"); - printf(" --max-qual [%d]\n", g.max_qual); + printf(" -mx, --max-qual [%d]\n", g.max_qual); printf(" maximum variant quality, higher qualities kept but thresholded\n"); - printf("\n ReAlignment:\n"); + printf("\n Re-Alignment:\n"); printf(" -rq, --realign-query\n"); printf(" realign query variants using Smith-Waterman parameters\n"); printf(" -rt, --realign-truth\n"); @@ -504,6 +512,10 @@ void Globals::print_usage() const printf(" -ct, --credit-threshold [%.2f]\n", g.credit_threshold); printf(" minimum partial credit to consider variant a true positive\n"); + printf("\n Distance:\n"); + printf(" -d, --distance\n"); + printf(" flag to include alignment distance calculations, skipped by default\n"); + printf("\n Utilization:\n"); printf(" -t, --max-threads [%d]\n", g.max_threads); printf(" maximum threads to use for clustering and precision/recall alignment\n"); @@ -526,7 +538,7 @@ void Globals::print_usage() const printf("\n Clustering:\n"); printf(" -i, --max-iterations [%d]\n", g.max_cluster_itrs); printf(" maximum iterations for expanding/merging clusters\n"); - printf(" --simple-cluster\n"); + printf(" -sc, --simple-cluster\n"); printf(" instead of biWFA-based clustering, use gap-based clustering \n"); printf(" -g, --cluster-gap [%d]\n", g.cluster_min_gap); printf(" minimum gap between independent clusters and superclusters (in bases),\n"); @@ -539,11 +551,11 @@ void Globals::print_usage() const printf("\n Distance:\n"); printf(" -ex, --eval-mismatch-penalty [%d]\n", g.eval_sub); - printf(" mismatch penalty (distance evaluation)\n"); + printf(" mismatch penalty (distance evaluation only)\n"); printf(" -eo, --eval-gap-open-penalty [%d]\n", g.eval_open); - printf(" gap opening penalty (distance evaluation)\n"); + printf(" gap opening penalty (distance evaluation only)\n"); printf(" -ee, --eval-gap-extend-penalty [%d]\n", g.eval_extend); - printf(" gap extension penalty (distance evaluation)\n"); + printf(" gap extension penalty (distance evaluation only)\n"); } diff --git a/src/globals.h b/src/globals.h index e609859..e2e630e 100644 --- a/src/globals.h +++ b/src/globals.h @@ -9,6 +9,9 @@ class Globals { public: + // constructors + Globals() {;} + // input files std::string ref_fasta_fn; FILE* ref_fasta_fp; @@ -19,7 +22,7 @@ class Globals { bool bed_exists = false; bool write = true; - // variant params + // variant filtering std::vector filters; std::vector filter_ids; int min_qual = 0; @@ -27,21 +30,32 @@ class Globals { int min_size = 1; int max_size = 5000; - // clustering params - bool realign_truth = false; - bool realign_query = false; - bool realign_only = false; + // clustering bool simple_cluster = false; int cluster_min_gap = 50; int reach_min_gap = 10; int max_cluster_itrs = 4; - // phasing options + // re-alignment + bool realign_truth = false; + bool realign_query = false; + bool realign_only = false; + int sub = 5; + int open = 6; + int extend = 2; + + // phasing double phase_threshold = 0.5; - // precision-recall options + // precision-recall double credit_threshold = 0.7; + // edit distance + bool distance = false; + int eval_sub = 3; + int eval_open = 2; + int eval_extend = 1; + // memory params int max_threads = 64; double max_ram = 64; // GB @@ -55,19 +69,6 @@ class Globals { std::string out_prefix; std::string cmd; - // alignment parameters - int sub = 5; - int open = 6; - int extend = 2; - int eval_sub = 3; - int eval_open = 2; - int eval_extend = 1; - - std::vector timers; - - // constructors - Globals() {;} - // member functions void parse_args(int argc, char ** argv); void print_version() const; @@ -78,6 +79,7 @@ class Globals { // program data const std::string VERSION = "2.2.1"; const std::string PROGRAM = "vcfdist"; + std::vector timers; }; extern Globals g; diff --git a/src/main.cpp b/src/main.cpp index e0f2a67..0d022e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -210,9 +210,21 @@ int main(int argc, char **argv) { g.timers[TIME_PR_ALN].stop(); // calculate edit distance - g.timers[TIME_EDITS].start(); - editData edits = edits_wrapper(clusterdata_ptr); - g.timers[TIME_EDITS].stop(); + editData edits; + if (g.distance) { + g.timers[TIME_EDITS].start(); + editData edits = edits_wrapper(clusterdata_ptr); + g.timers[TIME_EDITS].stop(); + + g.timers[TIME_WRITE].start(); + edits.write_distance(); + if (g.write) edits.write_edits(); + g.timers[TIME_WRITE].stop(); + } else { + if (g.verbosity >= 1) INFO(" "); + if (g.verbosity >= 1) INFO("%s[6/8] Skipping distance metrics%s", + COLOR_PURPLE, COLOR_WHITE); + } // calculate global phasings g.timers[TIME_PHASE].start(); @@ -221,12 +233,14 @@ int main(int argc, char **argv) { // write supercluster/phaseblock results in CSV format g.timers[TIME_WRITE].start(); - write_results(phasedata_ptr, edits); + write_results(phasedata_ptr); // save new VCF if (g.write) { - query_ptr->write_vcf(g.out_prefix + "query.vcf"); - truth_ptr->write_vcf(g.out_prefix + "truth.vcf"); + if (g.realign_query) + query_ptr->write_vcf(g.out_prefix + "query.vcf"); + if (g.realign_truth) + truth_ptr->write_vcf(g.out_prefix + "truth.vcf"); phasedata_ptr->write_summary_vcf(g.out_prefix + "summary.vcf"); } g.timers[TIME_WRITE].stop(); diff --git a/src/phase.cpp b/src/phase.cpp index 6cbd1d4..25fc511 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -401,6 +401,7 @@ void phaseblockData::phase() void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors, int flip_errors, int ng50, int s_ngc50, int sf_ngc50) { std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv"; + if (g.verbosity >= 1) INFO(" "); if (g.verbosity >= 1) INFO(" Writing phasing summary to '%s'", out_phasing_summary_fn.data()); FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w"); diff --git a/src/print.cpp b/src/print.cpp index 52b6f59..2963412 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -44,14 +44,14 @@ void write_params() { "phase_threshold = %f\ncredit_threshold = %f\nrealign_truth = %s\nrealign_query = %s\n" "realign_only = %s\nsimple_cluster = %s\ncluster_min_gap = %d\n" "reach_min_gap = %d\nmax_cluster_itrs = %d\nmax_threads = %d\nmax_ram = %f\n" - "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\n", + "sub = %d\nopen = %d\nextend = %d\neval_sub = %d\neval_open = %d\neval_extend = %d\ndistance = %s", g.PROGRAM.data(), g.VERSION.data(), g.out_prefix.data(), g.cmd.data(), g.ref_fasta_fn.data(), g.query_vcf_fn.data(), g.truth_vcf_fn.data(), g.bed_fn.data(), b2s(g.write).data(), filters_str.data(), g.min_qual, g.max_qual, g.min_size, g.max_size, g.phase_threshold, g.credit_threshold, b2s(g.realign_truth).data(), b2s(g.realign_query).data(), b2s(g.realign_only).data(), b2s(g.simple_cluster).data(), g.cluster_min_gap, g.reach_min_gap, g.max_cluster_itrs, g.max_threads, g.max_ram, - g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend); + g.sub, g.open, g.extend, g.eval_sub, g.eval_open, g.eval_extend, b2s(g.distance).data()); fclose(out_params); } @@ -434,6 +434,7 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { std::string out_pr_fn = g.out_prefix + "precision-recall.tsv"; FILE* out_pr = 0; if (g.write) { + if (g.verbosity >= 1) INFO(" "); if (g.verbosity >= 1) INFO(" Writing precision-recall results to '%s'", out_pr_fn.data()); out_pr = fopen(out_pr_fn.data(), "w"); fprintf(out_pr, "VAR_TYPE\tMIN_QUAL\tPREC\tRECALL\tF1_SCORE\tF1_QSCORE\t" @@ -557,143 +558,15 @@ void write_precision_recall(std::unique_ptr & phasedata_ptr) { /*******************************************************************************/ -void write_distance(const editData & edits) { - - // log all distance results - std::string dist_fn = g.out_prefix + "distance.tsv"; - FILE* out_dists = 0; - if (g.write) { - out_dists = fopen(dist_fn.data(), "w"); - fprintf(out_dists, "MIN_QUAL\tSUB_DE\tINS_DE\tDEL_DE\tSUB_ED\tINS_ED\tDEL_ED\t" - "DISTINCT_EDITS\tEDIT_DIST\tALN_SCORE\tALN_QSCORE\n"); - if (g.verbosity >= 1) INFO(" Writing distance results to '%s'", dist_fn.data()); - } - - // get original scores / distance (above g.max_qual, no vars applied) - std::vector orig_edit_dists(TYPES, 0); - std::vector orig_distinct_edits(TYPES, 0); - int orig_score = edits.get_score(g.max_qual+1); - for (int type = 0; type < TYPES; type++) { - orig_edit_dists[type] = edits.get_ed(g.max_qual+1, type); - orig_distinct_edits[type] = edits.get_de(g.max_qual+1, type); - } - - std::vector best_score(TYPES, std::numeric_limits::max()); - std::vector best_qual(TYPES, 0); - for (int q = g.min_qual; q <= g.max_qual+1; q++) { // all qualities - - // get ED/DE for each Q threshold, for each type - std::vector edit_dists(TYPES, 0); - std::vector distinct_edits(TYPES, 0); - for (int type = 0; type < TYPES; type++) { - edit_dists[type] = edits.get_ed(q, type); - distinct_edits[type] = edits.get_de(q, type); - - // save best Q threshold so far - double score = double(edit_dists[type]) * distinct_edits[type]; - if (score < best_score[type]) { - best_score[type] = score; - best_qual[type] = q; - } - } - - if (g.write) fprintf(out_dists, - "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%f\n", - q, distinct_edits[TYPE_SUB], - distinct_edits[TYPE_INS], distinct_edits[TYPE_DEL], - edit_dists[TYPE_SUB], edit_dists[TYPE_INS], edit_dists[TYPE_DEL], - distinct_edits[TYPE_ALL], edit_dists[TYPE_ALL], - edits.get_score(q), qscore(double(edits.get_score(q))/orig_score)); - } - if (g.write) fclose(out_dists); - - // summarize distance results - std::string dist_summ_fn = g.out_prefix + "distance-summary.tsv"; - FILE* dists_summ = 0; - if (g.write) { - dists_summ = fopen(dist_summ_fn.data(), "w"); - if (g.verbosity >= 1) - INFO(" Writing distance summary to '%s'", dist_summ_fn.data()); - fprintf(dists_summ, "VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE\n"); - } - INFO(" "); - INFO("%sALIGNMENT DISTANCE SUMMARY%s", COLOR_BLUE, COLOR_WHITE); - for (int type = 0; type < TYPES; type++) { - - // skip INS/DEL individually unless higher print verbosity - if (g.verbosity == 0 && type != TYPE_ALL) - continue; - if (g.verbosity == 1 && (type == TYPE_INS || type == TYPE_DEL)) - continue; - - INFO(" "); - if (type == TYPE_ALL) { - INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE%s", - COLOR_BLUE, COLOR_WHITE); - } else { - INFO("%sTYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE%s", - COLOR_BLUE, COLOR_WHITE); - } - std::vector quals = {g.min_qual, best_qual[type], g.max_qual+1}; - for (int q : quals) { - - // fill out ED/DE for selected quals - std::vector edit_dists(TYPES, 0); - std::vector distinct_edits(TYPES, 0); - for (int type = 0; type < TYPES; type++) { - edit_dists[type] = edits.get_ed(q, type); - distinct_edits[type] = edits.get_de(q, type); - } - - float ed_qscore = qscore(double(edit_dists[type]) / orig_edit_dists[type]); - float de_qscore = qscore(double(distinct_edits[type]) / orig_distinct_edits[type]); - float all_qscore = type == TYPE_ALL ? qscore(double(edits.get_score(q)) / orig_score) : 0; - - // print summary - if (g.write) fprintf(dists_summ, "%s\t%d\t%d\t%d\t%f\t%f\t%f\n", type_strs2[type].data(), - q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore); - if (type == TYPE_ALL) { - INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f\t%f%s", COLOR_BLUE, type_strs2[type].data(), - q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, all_qscore, COLOR_WHITE); - } else { - INFO("%s%s\tQ >= %d\t\t%-16d%-16d%f\t%f%s", COLOR_BLUE, type_strs2[type].data(), - q, edit_dists[type], distinct_edits[type], ed_qscore, de_qscore, COLOR_WHITE); - } - } - } - INFO(" "); - if (g.write) fclose(dists_summ); -} - - -/*******************************************************************************/ - - -void write_results( - std::unique_ptr & phasedata_ptr, - const editData & edits) { +void write_results(std::unique_ptr & phasedata_ptr) { if (g.verbosity >= 1) INFO(" "); if (g.verbosity >= 1) INFO("%s[8/8] Writing results%s", COLOR_PURPLE, COLOR_WHITE); // print summary (precision/recall) information write_precision_recall(phasedata_ptr); - // print distance information - write_distance(edits); - // print edit information if (g.write) { - std::string edit_fn = g.out_prefix + "edits.tsv"; - FILE* out_edits = fopen(edit_fn.data(), "w"); - fprintf(out_edits, "CONTIG\tSTART\tHAP\tTYPE\tSIZE\tSUPERCLUSTER\tMIN_QUAL\tMAX_QUAL\n"); - if (g.verbosity >= 1) INFO(" Writing edit results to '%s'", edit_fn.data()); - for (int i = 0; i < edits.n; i++) { - fprintf(out_edits, "%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\n", - edits.ctgs[i].data(), edits.poss[i], edits.haps[i], - type_strs[edits.types[i]].data(), edits.lens[i], - edits.superclusters[i], edits.min_quals[i], edits.max_quals[i]); - } - fclose(out_edits); // print phasing information std::string out_phaseblocks_fn = g.out_prefix + "phase-blocks.tsv"; diff --git a/src/print.h b/src/print.h index 2fbb051..0467a48 100644 --- a/src/print.h +++ b/src/print.h @@ -37,7 +37,7 @@ void print_wfa_ptrs( const std::vector< std::vector< std::vector > > & ptrs, const std::vector< std::vector< std::vector > > & offs); -void write_results(std::unique_ptr & phasings, const editData & edits); +void write_results(std::unique_ptr & phasings); void write_params(); #endif From f169dda9fbb3b6deda9990402aac7e4adbf386da Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 11:26:05 -0500 Subject: [PATCH 12/16] Fixed integer division on phase threshold comparison. --- src/dist.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dist.cpp b/src/dist.cpp index 4422b26..398f06f 100644 --- a/src/dist.cpp +++ b/src/dist.cpp @@ -459,9 +459,9 @@ int store_phase( phase = PHASE_ORIG; } else if (swap_phase_dist == 0) { // protect division by zero phase = PHASE_SWAP; - } else if (1 - swap_phase_dist / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping + } else if (1 - float(swap_phase_dist) / orig_phase_dist > g.phase_threshold) { // significant reduction by swapping phase = PHASE_SWAP; - } else if (1 - orig_phase_dist / swap_phase_dist > g.phase_threshold) { // significant reduction with orig + } else if (1 - float(orig_phase_dist) / swap_phase_dist > g.phase_threshold) { // significant reduction with orig phase = PHASE_ORIG; } From 6cfc50955c0104c89dde688596ef87631464b8c5 Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 11:52:59 -0500 Subject: [PATCH 13/16] Added detailed switchflip output. --- src/globals.h | 2 +- src/main.cpp | 1 + src/phase.cpp | 105 +++++++++++++++++++++++++++++++++++++++++++++++++- src/phase.h | 1 + 4 files changed, 107 insertions(+), 2 deletions(-) diff --git a/src/globals.h b/src/globals.h index e2e630e..aac31fb 100644 --- a/src/globals.h +++ b/src/globals.h @@ -45,7 +45,7 @@ class Globals { int extend = 2; // phasing - double phase_threshold = 0.5; + double phase_threshold = 0.6; // precision-recall double credit_threshold = 0.7; diff --git a/src/main.cpp b/src/main.cpp index 0d022e7..74b3634 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -233,6 +233,7 @@ int main(int argc, char **argv) { // write supercluster/phaseblock results in CSV format g.timers[TIME_WRITE].start(); + if (g.write) phasedata_ptr->write_switchflips(); write_results(phasedata_ptr); // save new VCF diff --git a/src/phase.cpp b/src/phase.cpp index 25fc511..c946f19 100644 --- a/src/phase.cpp +++ b/src/phase.cpp @@ -398,6 +398,109 @@ void phaseblockData::phase() /*******************************************************************************/ +void phaseblockData::write_switchflips() { + + std::string out_sf_fn = g.out_prefix + "switchflips.tsv"; + FILE* out_sf = 0; + if (g.verbosity >= 1) INFO(" Writing switchflip results to '%s'", out_sf_fn.data()); + out_sf = fopen(out_sf_fn.data(), "w"); + fprintf(out_sf, "CONTIG\tSTART\tSTOP\tSWITCH_TYPE\tSUPERCLUSTER\tPHASE_BLOCK\n"); + + // get sizes of each correct phase block (split on flips, not just switch) + for (std::string ctg: this->contigs) { + + std::shared_ptr ctg_pbs = this->phase_blocks[ctg]; + std::shared_ptr ctg_scs = ctg_pbs->ctg_superclusters; + + int switch_idx = 0; + int flip_idx = 0; + int pb_idx = 1; + if (ctg_scs->n == 0) continue; + + // init start of this correct block + int sc = 0; + int next_sc = ctg_scs->n; // default to last + int beg = 0; int end = 0; + int type = SWITCHTYPE_NONE; + + while (true) { + + // get type and supercluster of next switch/flip/phaseset + // check flip first because it will cause a second switch + if (flip_idx < ctg_pbs->nflips && ctg_pbs->flips[flip_idx] < next_sc) { + type = SWITCHTYPE_FLIP; + next_sc = ctg_pbs->flips[flip_idx]; + } + if (pb_idx < ctg_pbs->n && ctg_pbs->phase_blocks[pb_idx] < next_sc) { + type = SWITCHTYPE_SWITCH; + next_sc = ctg_pbs->phase_blocks[pb_idx]; + } + if (switch_idx < ctg_pbs->nswitches && ctg_pbs->switches[switch_idx] < next_sc) { + type = SWITCHTYPE_SWITCH_ERR; + next_sc = ctg_pbs->switches[switch_idx]; + } + if (type == SWITCHTYPE_NONE) { // all out-of-bounds + break; + } + if (next_sc <= sc) ERROR("Next SC is not after current SC"); + + + // get block(s) + if (type == SWITCHTYPE_FLIP) { + // switch could have occurred anywhere after last phased supercluster + int left = next_sc-1; + while (left > 0 && ctg_scs->sc_phase[left] == PHASE_NONE) + left--; + if (left >= 0) { + beg = ctg_scs->ends[left]; + end = ctg_scs->begs[next_sc]; + fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, + switch_strs[SWITCHTYPE_FLIP_BEG].data(), next_sc, pb_idx-1); + } + + // switch could have occurred anywhere before next phased supercluster + int right = next_sc+1; + while (right < ctg_scs->n-1 && ctg_scs->sc_phase[right] == PHASE_NONE) + right++; + if (right < ctg_scs->n) { + beg = ctg_scs->ends[next_sc]; + end = ctg_scs->begs[right]; + fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, + switch_strs[SWITCHTYPE_FLIP_END].data(), next_sc, pb_idx-1); + } + flip_idx++; + } else if (type == SWITCHTYPE_SWITCH) { + // end of phase block, don't print anything since not an error + pb_idx++; + } else if (type == SWITCHTYPE_SWITCH_ERR) { + // expand left/right from in between these clusters + int left = next_sc-1; + while (left > 0 && ctg_scs->sc_phase[left] == PHASE_NONE) + left--; + int right = next_sc; + while (right < ctg_scs->n-1 && ctg_scs->sc_phase[right] == PHASE_NONE) + right++; + if (left >= 0 && right < ctg_scs->n) { + beg = ctg_scs->ends[left]; + end = ctg_scs->begs[right]; + fprintf(out_sf, "%s\t%d\t%d\t%s\t%d\t%d\n", ctg.data(), beg, end, + switch_strs[SWITCHTYPE_SWITCH_ERR].data(), next_sc, pb_idx-1); + } + switch_idx++; + } + + sc = next_sc; + next_sc = ctg_scs->n; // reset to end + type = SWITCHTYPE_NONE; + } + } + fclose(out_sf); +} + + +/*******************************************************************************/ + + void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors, int flip_errors, int ng50, int s_ngc50, int sf_ngc50) { std::string out_phasing_summary_fn = g.out_prefix + "phasing-summary.tsv"; @@ -406,7 +509,7 @@ void phaseblockData::write_phasing_summary(int phase_blocks, int switch_errors, out_phasing_summary_fn.data()); FILE* out_phasing_summary = fopen(out_phasing_summary_fn.data(), "w"); fprintf(out_phasing_summary, - "PHASE_BLOCKS\tFLIP_ERRORS\tSWITCH_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n"); + "PHASE_BLOCKS\tSWITCH_ERRORS\tFLIP_ERRORS\tNG_50\tSWITCH_NGC50\tSWITCHFLIP_NGC50\n"); fprintf(out_phasing_summary, "%d\t%d\t%d\t%d\t%d\t%d", phase_blocks, switch_errors, flip_errors, ng50, s_ngc50, sf_ngc50); fclose(out_phasing_summary); diff --git a/src/phase.h b/src/phase.h index b91f359..784eabb 100644 --- a/src/phase.h +++ b/src/phase.h @@ -30,6 +30,7 @@ class phaseblockData { void write_summary_vcf(std::string vcf_fn); void write_phasing_summary(int phase_blocks, int switch_errors, int flip_errors, int ng50, int s_ngc50, int sf_ngc50); + void write_switchflips(); void phase(); int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false); From 7f361237519ed7f992453b8d784e6cc42bc0e0ae Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 13:08:33 -0500 Subject: [PATCH 14/16] Fixed bug labelling truth FNs FPs --- src/variant.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/variant.cpp b/src/variant.cpp index 999d8e5..2057c21 100644 --- a/src/variant.cpp +++ b/src/variant.cpp @@ -263,11 +263,11 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt, if (this->credit[swap][idx] == 1) { errtype = "TP"; match_type = "gm"; } else if (this->credit[swap][idx] == 0) { - errtype = "FP"; match_type = "."; + errtype = query ? "FP" : "FN"; match_type = "."; } else if (this->credit[swap][idx] >= g.credit_threshold) { errtype = "TP"; match_type = "lm"; } else { - errtype = "FP"; match_type = "lm"; + errtype = query ? "FP" : "FN"; match_type = "lm"; } fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(), @@ -625,7 +625,8 @@ variantData::variantData(std::string vcf_fn, if (ngt == -1) { // GT not defined in header if (!gt_warn) { gt_warn = true; - WARN("'GT' tag not defined in header, assuming monoploid"); + WARN("'GT' tag not defined in %s VCF header, assuming monoploid", + callset_strs[callset].data()); } } else if (ngt <= 0) { // other error ERROR("Failed to read %s GT at %s:%lld", @@ -699,7 +700,8 @@ variantData::variantData(std::string vcf_fn, if (nPS == -1) { // PS not defined in header if (!PS_warn) { PS_warn = true; - WARN("'PS' tag not defined in header, assuming one phase set per contig"); + WARN("'PS' tag not defined in %s VCF header, assuming one phase set per contig", + callset_strs[callset].data()); } phase_set = 0; @@ -885,11 +887,11 @@ variantData::variantData(std::string vcf_fn, /* WARN("%d total missing GQ tags in %s VCF, all considered GQ=0", */ /* gq_missing_total, callset_strs[callset].data()); */ - if (failed_filter_total) + if (failed_filter_total && print) INFO("%d variants failed FILTER in %s VCF, skipped", failed_filter_total, callset_strs[callset].data()); - if (pass_min_qual[FAIL]) + if (pass_min_qual[FAIL] && print) INFO("%d variants of low quality (<%d) in %s VCF, skipped", pass_min_qual[FAIL], g.min_qual, callset_strs[callset].data()); @@ -912,7 +914,7 @@ variantData::variantData(std::string vcf_fn, multi_total = GT_counts[GT_ALT1_ALT1] + GT_counts[GT_ALT1_ALT2] + GT_counts[GT_ALT2_ALT1] + GT_counts[GT_OTHER]; - if (multi_total) + if (multi_total && print) INFO("%d homozygous and multi-allelic variants in %s VCF, split for evaluation", multi_total, callset_strs[callset].data()); @@ -932,12 +934,12 @@ variantData::variantData(std::string vcf_fn, WARN("%d reference variants in %s VCF, skipped", ref_call_total, callset_strs[callset].data()); - if (nregions[BED_OFFCTG] + nregions[BED_OUTSIDE]) + if (nregions[BED_OFFCTG] + nregions[BED_OUTSIDE] && print) INFO("%d variants outside selected regions in %s VCF, skipped", nregions[BED_OFFCTG] + nregions[BED_OUTSIDE], callset_strs[callset].data()); - if (nregions[BED_BORDER]) + if (nregions[BED_BORDER] && print) INFO("%d variants on border of selected regions in %s VCF, kept", nregions[BED_BORDER], callset_strs[callset].data()); From c2b8c9047dd7c24b23249f61647f8e6ee55958fe Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 13:13:00 -0500 Subject: [PATCH 15/16] Version bump (v2.3.0) --- README.md | 2 +- demo/output.txt | 36 ++++++++---------------------------- demo/pr_plot.py | 34 ++++++++-------------------------- docs/release.md | 19 ++++++++++++------- src/Makefile | 6 +++--- src/README.md | 16 ++++++++++++---- src/globals.h | 2 +- src/print.cpp | 2 +- src/run | 2 +- 9 files changed, 47 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 0e1a533..a5ca2e5 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ If you do already have HTSlib installed elsewhere, make sure you've added it to > cd vcfdist/src > make > ./vcfdist --version -vcfdist v2.2.1 +vcfdist v2.3.0 ``` ### Option 2: Docker Image diff --git a/demo/output.txt b/demo/output.txt index 4b46f2f..893553e 100644 --- a/demo/output.txt +++ b/demo/output.txt @@ -1,29 +1,9 @@ - PRECISION-RECALL SUMMARY - - TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE - SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 - SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 - - TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE - INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 - INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 - - - ALIGNMENT DISTANCE SUMMARY - - TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE ALN_QSCORE - ALL Q >= 0 26 16 26.565773 27.579651 27.154125 - ALL Q >= 0 26 16 26.565773 27.579651 27.154125 - ALL Q >= 61 11791 9164 0.000000 0.000000 0.000000 - - TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE - SNP Q >= 0 10 10 29.130186 29.130186 - SNP Q >= 0 10 10 29.130186 29.130186 - SNP Q >= 61 8185 8185 0.000000 0.000000 - - TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE - INDEL Q >= 0 16 6 23.529057 22.126314 - INDEL Q >= 0 16 6 23.529057 22.126314 - INDEL Q >= 61 3606 979 0.000000 0.000000 - + +TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE +SNP Q >= 0 8222 8222 1 2 0.999757 0.999878 0.999818 37.388565 +SNP Q >= 0 8222 8222 1 2 0.999757 0.999878 0.999818 37.388565 + +TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE +INDEL Q >= 0 876 876 51 12 0.986486 0.944984 0.965289 14.595358 +INDEL Q >= 0 876 876 51 12 0.986486 0.944984 0.965289 14.595358 diff --git a/demo/pr_plot.py b/demo/pr_plot.py index f0ff0bb..96b7723 100644 --- a/demo/pr_plot.py +++ b/demo/pr_plot.py @@ -6,8 +6,6 @@ fig, ax = plt.subplots(1, 2, figsize=(10,8)) -partial_credit = True - with open(f"results/precision-recall.tsv") as csv: indel_recall = [] indel_prec = [] @@ -16,30 +14,14 @@ next(csv) # skip header for line in csv: - typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \ - query_tot, query_tp, query_pp, query_fp = line.split('\t') - if partial_credit: # normal vcfdist prec/recall calc, already done - if typ == "INDEL": - indel_recall.append(float(recall)) - indel_prec.append(float(prec)) - elif typ == "SNP": - snp_recall.append(float(recall)) - snp_prec.append(float(prec)) - else: # recalculate using TP/FP/FN - if typ == "INDEL": - indel_recall.append(float(truth_tp) / float(truth_tot)) - if int(truth_tp) + int(query_fp) == 0: - indel_prec.append(1) - else: - indel_prec.append(float(truth_tp) / - (float(truth_tp) + float(query_fp))) - elif typ == "SNP": - snp_recall.append(float(truth_tp) / float(truth_tot)) - if int(truth_tp) + int(query_fp) == 0: - snp_prec.append(1) - else: - snp_prec.append(float(truth_tp) / - (float(truth_tp) + float(query_fp))) + typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_fn, \ + query_tot, query_tp, query_fp = line.split('\t') + if typ == "INDEL": + indel_recall.append(float(recall)) + indel_prec.append(float(prec)) + elif typ == "SNP": + snp_recall.append(float(recall)) + snp_prec.append(float(prec)) ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.') ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.') diff --git a/docs/release.md b/docs/release.md index 7557f31..7c16437 100644 --- a/docs/release.md +++ b/docs/release.md @@ -1,14 +1,19 @@ -1. Update `vcfdist` version in `globals.h` +1. Benchmark against `run` and `demo` scripts, updating version +- update `output.txt` for `demo` if necessary -2. Update `vcfdist` help text in `src/README.md` +2. Update `vcfdist` version in `globals.h` + +3. Update `vcfdist` help text in `src/README.md` ```bash -./vcfdist >> src/README.md +cd src +make +./vcfdist >> README.md ``` Manually delete previous text -3. Update `vcfdist` version in `README.md` +4. Update `vcfdist` version in `README.md` -4. Make final Git commit of changes +5. Make final Git commit of changes ```bash git commit -m "version bump (vX.X.X)" git checkout master @@ -17,9 +22,9 @@ git push origin master git checkout dev ``` -5. Make new release and tag on Github +6. Make new release and tag on Github -6. Build and deploy new Docker image +7. Build and deploy new Docker image ```bash cd ~/vcfdist/src sudo docker login -u timd1 diff --git a/src/Makefile b/src/Makefile index 819b6c9..cb9d1f5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,7 +12,7 @@ $(TARGET): $(OBJS) main.cpp globals.o: globals.cpp globals.h bed.h print.h defs.h timer.h $(CXX) -c $(CXXFLAGS) globals.cpp -print.o: print.cpp print.h globals.h phase.h dist.h edit.h defs.h +print.o: print.cpp print.h globals.h phase.h dist.h edit.h defs.h variant.h $(CXX) -c $(CXXFLAGS) print.cpp timer.o: timer.cpp timer.h globals.h defs.h @@ -30,10 +30,10 @@ bed.o: bed.cpp bed.h print.h defs.h globals.h edit.o: edit.cpp edit.h defs.h globals.h $(CXX) -c $(CXXFLAGS) edit.cpp -cluster.o: cluster.cpp cluster.h variant.h globals.h dist.h defs.h +cluster.o: cluster.cpp cluster.h variant.h fasta.h globals.h dist.h defs.h $(CXX) -c $(CXXFLAGS) cluster.cpp -phase.o: phase.cpp phase.h cluster.h print.h globals.h defs.h +phase.o: phase.cpp phase.h cluster.h print.h globals.h defs.h variant.h $(CXX) -c $(CXXFLAGS) phase.cpp clean: diff --git a/src/README.md b/src/README.md index ea884b2..57a35ab 100644 --- a/src/README.md +++ b/src/README.md @@ -26,12 +26,12 @@ Options: minimum variant size, smaller variants ignored (SNPs are size 1) -l, --largest-variant [5000] maximum variant size, larger variants ignored - --min-qual [0] + -mn, --min-qual [0] minimum variant quality, lower qualities ignored - --max-qual [60] + -mx, --max-qual [60] maximum variant quality, higher qualities kept but thresholded - ReAlignment: + Re-Alignment: -rq, --realign-query realign query variants using Smith-Waterman parameters -rt, --realign-truth @@ -45,6 +45,14 @@ Options: -e, --gap-extend-penalty [1] Smith-Waterman gap extension penalty + Precision-Recall: + -ct, --credit-threshold [0.70] + minimum partial credit to consider variant a true positive + + Distance: + -d, --distance + flag to include alignment distance calculations, skipped by default + Utilization: -t, --max-threads [64] maximum threads to use for clustering and precision/recall alignment @@ -59,5 +67,5 @@ Options: -c, --citation please cite vcfdist if used in your analyses; thanks :) -v, --version - print vcfdist version (v2.2.1) + print vcfdist version (v2.3.0) ``` diff --git a/src/globals.h b/src/globals.h index aac31fb..1fd9fb6 100644 --- a/src/globals.h +++ b/src/globals.h @@ -77,7 +77,7 @@ class Globals { void init_timers(std::vector timer_strs); // program data - const std::string VERSION = "2.2.1"; + const std::string VERSION = "2.3.0"; const std::string PROGRAM = "vcfdist"; std::vector timers; }; diff --git a/src/print.cpp b/src/print.cpp index 2963412..8733730 100644 --- a/src/print.cpp +++ b/src/print.cpp @@ -658,7 +658,7 @@ void write_results(std::unique_ptr & phasedata_ptr) { // print query variant information std::string out_query_fn = g.out_prefix + "query.tsv"; - if (g.verbosity >= 1) INFO(" Writing call variant results to '%s'", out_query_fn.data()); + if (g.verbosity >= 1) INFO(" Writing query variant results to '%s'", out_query_fn.data()); FILE* out_query = fopen(out_query_fn.data(), "w"); fprintf(out_query, "CONTIG\tPOS\tHAP\tREF\tALT\tQUAL\tTYPE\tERR_TYPE" "\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tLOCATION\n"); diff --git a/src/run b/src/run index 1ab9bbb..c803706 100755 --- a/src/run +++ b/src/run @@ -14,4 +14,4 @@ source ../pipeline/includes.sh /x/gm24385/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -b ../test/bench20.bed \ -l 1000 \ - -p ../out/ + -p ../out/dev. From fcb41be9aee60f12deee9150dfa26185fb8790fe Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Thu, 9 Nov 2023 13:14:12 -0500 Subject: [PATCH 16/16] Updated analyses --- .../clustering/3_vcfdist_sweep_varlens.sh | 4 +- analysis-v2/phab_cm/1_eval_vcfs.sh | 89 ++++++------------- .../phab_cm/vcfdist_truvari_compare.py | 48 +++++----- analysis-v2/small_sv_all/3_plot_vcf_tp.py | 18 ++-- analysis-v2/small_sv_all/4_plot_vcf_fp.py | 18 ++-- 5 files changed, 76 insertions(+), 101 deletions(-) diff --git a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh index 3e3442b..fdc4242 100755 --- a/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh +++ b/analysis-v2/clustering/3_vcfdist_sweep_varlens.sh @@ -4,10 +4,10 @@ source globals.sh lens=( # "10" - # "50" + "50" # "100" # "500" - "1000" + # "1000" # "5000" # "10000" ) diff --git a/analysis-v2/phab_cm/1_eval_vcfs.sh b/analysis-v2/phab_cm/1_eval_vcfs.sh index b628873..622626d 100755 --- a/analysis-v2/phab_cm/1_eval_vcfs.sh +++ b/analysis-v2/phab_cm/1_eval_vcfs.sh @@ -2,38 +2,6 @@ source globals.sh -# # truvari phab VCF normalization v4.1.0 (doesn't work) -# mkdir -p $out/phab-mafft -# mkdir -p $out/phab-wfa -# source ~/truvari/venv3.10/bin/activate -# for i in "${!query_names[@]}"; do -# echo "truvari v4.1: phab wfa for '${query_names[i]}'" -# truvari phab \ -# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ -# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ -# --bSamples HG002 \ -# --cSamples HG002 \ -# -f $data/refs/$ref_name \ -# --align wfa \ -# -t 64 \ -# -o $out/phab-wfa/${query_names[i]}.vcf.gz \ -# 2> $out/phab-wfa/${query_names[i]}.log -# echo "truvari v4.1: phab mafft for '${query_names[i]}'" -# truvari phab \ -# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ -# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ -# --bSamples HG002 \ -# --cSamples HG002 \ -# -f $data/refs/$ref_name \ -# --align mafft \ -# -t 64 \ -# -o $out/phab-mafft/${query_names[i]}.vcf.gz \ -# 2> $out/phab-mafft/${query_names[i]}.log -# done -# deactivate - # # truvari phab VCF normalization v4.0 # source ~/software/Truvari-v4.0.0/venv3.10/bin/activate # mkdir -p $out/phab @@ -87,37 +55,38 @@ source globals.sh # 2> $out/vcfdist/${query_names[i]}.log # done -# # vcfeval evaluation -# mkdir -p $out/vcfeval -# for i in "${!query_names[@]}"; do -# echo "vcfeval: evaluating '${query_names[i]}'" -# $timer -v $rtg vcfeval \ -# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ -# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ -# -t $data/refs/${ref_name}.sdf \ -# -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# --threads 64 \ -# --all-records \ -# -o $out/vcfeval/${query_names[i]} \ -# 2> $out/vcfeval/${query_names[i]}.log -# done - -# Truvari evaluation -mkdir -p $out/truvari -source ~/software/Truvari-v4.0.0/venv3.10/bin/activate +# vcfeval evaluation +mkdir -p $out/vcfeval for i in "${!query_names[@]}"; do - echo "Truvari: evaluating '${query_names[i]}'" - $timer -v truvari bench \ + echo "vcfeval: evaluating '${query_names[i]}'" + $timer -v $rtg vcfeval \ -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ - -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ - --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - --no-ref a \ - --sizemin 1 --sizefilt 1 --sizemax 1000 \ - --pick multi \ - --typeignore \ - --dup-to-ins \ - -o $out/truvari/${query_names[i]} + -t $data/refs/${ref_name}.sdf \ + -m ga4gh \ + -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ + --threads 64 \ + --all-records \ + -o $out/vcfeval/${query_names[i]} \ + 2> $out/vcfeval/${query_names[i]}.log +done + +# # Truvari evaluation +# mkdir -p $out/truvari +# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate +# for i in "${!query_names[@]}"; do +# echo "Truvari: evaluating '${query_names[i]}'" +# $timer -v truvari bench \ +# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ +# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ +# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ +# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ +# --no-ref a \ +# --sizemin 1 --sizefilt 1 --sizemax 1000 \ +# --pick multi \ +# --typeignore \ +# --dup-to-ins \ +# -o $out/truvari/${query_names[i]} laytr tru2ga \ -i $out/truvari/${query_names[i]}/ \ diff --git a/analysis-v2/phab_cm/vcfdist_truvari_compare.py b/analysis-v2/phab_cm/vcfdist_truvari_compare.py index 7c768d7..4badc48 100644 --- a/analysis-v2/phab_cm/vcfdist_truvari_compare.py +++ b/analysis-v2/phab_cm/vcfdist_truvari_compare.py @@ -3,18 +3,17 @@ import matplotlib.pyplot as plt # chr20, with phab -truvari_prefix = "./truvari/pav/result_" -vcfdist_prefix = "./vcfdist/pav." -do_print = True +truvari_prefix = "./truvari/giab-tr/result_" +vcfdist_prefix = "./vcfdist/giab-tr." +do_print = False SIZE = 0 SZ_SNP = 0 -SZ_INDEL_1_10 = 1 -SZ_INDEL_10_50 = 2 -SZ_INDEL_50_500 = 3 -SZ_INDEL_500PLUS = 4 -SZ_DIMS = 5 -sizes = ["SNP", "INDEL 1-10", "INDEL 10-50", "INDEL 50-500", "INDEL 500+"] +SZ_INDEL_1_50 = 1 +SZ_INDEL_50_500 = 2 +SZ_INDEL_500PLUS = 3 +SZ_DIMS = 4 +sizes = ["SNP", "INDEL 1-50", "INDEL 50-500", "INDEL 500+"] VCF_DIST = 1 VD_NONE = 0 @@ -51,10 +50,8 @@ def get_size(ref : str, alt : str): if size_diff == 0: print("ERROR: size 0 INDEL") exit(1) - if size_diff <= 10: - return SZ_INDEL_1_10 elif size_diff <= 50: - return SZ_INDEL_10_50 + return SZ_INDEL_1_50 elif size_diff <= 500: return SZ_INDEL_50_500 else: @@ -93,7 +90,7 @@ def get_tv_type(tv_type : str): truvari_vcf = vcf.Reader(open(f"{truvari_prefix}{callset}.vcf", "r")) name = callset.upper() print(name) - tv_used, vd_used = True, True + tv_used, vd_used, vd_2used = True, True, False this_ctg = "chr1" print(this_ctg) @@ -102,11 +99,12 @@ def get_tv_type(tv_type : str): # get next valid records for each if tv_used: tv_rec = next(truvari_vcf, None) if vd_used: vd_rec = next(vcfdist_vcf, None) + if vd_2used: vd_rec = next(vcfdist_vcf, None) while vd_rec != None and vd_rec.genotype(name)['BD'] == None: # skip other callset vd_rec = next(vcfdist_vcf, None) while tv_rec != None and tv_rec.ALT[0] == "*": # skip nested var tv_rec = next(truvari_vcf, None) - tv_used, vd_used = False, False + tv_used, vd_used, vd_2used = False, False, False if do_print: print("============================================================") @@ -141,6 +139,8 @@ def get_tv_type(tv_type : str): if do_print: print(f"TV VD {name} {tv_rec.genotype(name)['GT']} {vd_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}") if tv_rec.REF == vd_rec.REF and tv_rec.ALT[0] == vd_rec.ALT[0]: # full match tv_used, vd_used = True, True + if tv_rec.genotype(name)['GT'] == '1|1' or tv_rec.genotype(name)['GT'] == '1/1': + vd_2used = True # skip second split of this GT size = get_size(tv_rec.REF, tv_rec.ALT[0]) vd_type = get_vd_type(float(vd_rec.genotype(name)['BC'])) tv_type = get_tv_type(tv_rec.genotype(name)['BD']) @@ -200,7 +200,7 @@ def get_tv_type(tv_type : str): counts[size2][vd_type2][tv_type2] += 1 tv_used, vd_used = True, True else: - if do_print: print("ERROR: failed to match") + print("ERROR: failed to match") # discard current two, pretend we haven't looked at next two counts[size][VD_NONE][TV_NONE] += 2 @@ -213,17 +213,23 @@ def get_tv_type(tv_type : str): size = get_size(vd_rec.REF, vd_rec.ALT[0]) if do_print: print(f" VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t") - if size != SZ_SNP: - print("WARN: vcfdist only, not SNP") + if size != SZ_SNP: + print(f" VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t") + print("WARN: vcfdist only, not SNP") vd_type = get_vd_type(float(vd_rec.genotype(name)['BC'])) tv_type = TV_NONE counts[size][vd_type][tv_type] += 1 elif tv_pos < vd_pos: # truvari only - if do_print: - print(f"TV {name} {tv_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}") - print("WARN: Truvari only") + print(f"TV {name} {tv_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}") + print("WARN: Truvari only") tv_used = True + + # skip unphased truvari variants + gt = tv_rec.genotype(name)['GT'] + if gt[1] == '/' and gt[0] != gt[2]: + continue + size = get_size(tv_rec.REF, tv_rec.ALT[0]) vd_type = VD_NONE tv_type = get_tv_type(tv_rec.genotype(name)['BD']) @@ -257,4 +263,4 @@ def get_tv_type(tv_type : str): for (i,j), z in np.ndenumerate(counts[size_idx]): ax.text(j, i, f"{int(z)}", ha='center', va='center', bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) - plt.savefig(f"img/13_{callset}_{size_idx}_cm.png", dpi=200) + plt.savefig(f"img/tv_vd_{callset}_{size_idx}_cm.png", dpi=200) diff --git a/analysis-v2/small_sv_all/3_plot_vcf_tp.py b/analysis-v2/small_sv_all/3_plot_vcf_tp.py index bd04c8b..10c4644 100644 --- a/analysis-v2/small_sv_all/3_plot_vcf_tp.py +++ b/analysis-v2/small_sv_all/3_plot_vcf_tp.py @@ -9,14 +9,14 @@ vcf_types = ["snv", "indel", "sv", "small", "large", "all"] regions = [ "summary", - "alldifficultregions", - "AllHomopolymers_ge7bp_imperfectge11bp_slop5", - "AllTandemRepeats", - "AllTandemRepeatsandHomopolymers_slop5", - "MHC", - "satellites_slop5", - "segdups", - "alllowmapandsegdupregions" + # "alldifficultregions", + # "AllHomopolymers_ge7bp_imperfectge11bp_slop5", + # "AllTandemRepeats", + # "AllTandemRepeatsandHomopolymers_slop5", + # "MHC", + # "satellites_slop5", + # "segdups", + # "alllowmapandsegdupregions" ] for region in regions: @@ -119,4 +119,4 @@ ax[2].legend(handles=patches, loc=(0.6,0.6)) ax[0].set_ylabel("True Positive Recall") plt.suptitle(f"{region}") - plt.savefig(f"counts-{region}.png", dpi=300) + plt.savefig(f"./img/counts-{region}.pdf", format='pdf') diff --git a/analysis-v2/small_sv_all/4_plot_vcf_fp.py b/analysis-v2/small_sv_all/4_plot_vcf_fp.py index 67c60fa..0d04a07 100644 --- a/analysis-v2/small_sv_all/4_plot_vcf_fp.py +++ b/analysis-v2/small_sv_all/4_plot_vcf_fp.py @@ -9,14 +9,14 @@ vcf_types = ["snv", "indel", "sv", "small", "large", "all"] regions = [ "summary", - "alldifficultregions", - "AllHomopolymers_ge7bp_imperfectge11bp_slop5", - "AllTandemRepeats", - "AllTandemRepeatsandHomopolymers_slop5", - "MHC", - "satellites_slop5", - "segdups", - "alllowmapandsegdupregions" + # "alldifficultregions", + # "AllHomopolymers_ge7bp_imperfectge11bp_slop5", + # "AllTandemRepeats", + # "AllTandemRepeatsandHomopolymers_slop5", + # "MHC", + # "satellites_slop5", + # "segdups", + # "alllowmapandsegdupregions" ] for region in regions: @@ -132,4 +132,4 @@ ax[2].legend(handles=patches, loc=(0.6,0.6)) ax[0].set_ylabel("False Positive Rate") plt.suptitle(f"{region}") - plt.savefig(f"counts-{region}-fp.png", dpi=300) + plt.savefig(f"img/counts-{region}-fp.pdf", format='pdf')