Merge branch 'dev'
TimD1 committed Nov 9, 2023
2 parents 82aee34 + fcb41be commit 62704ee
Showing 63 changed files with 2,630 additions and 827 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,11 +11,12 @@
*.vcf
*.gz
*.gprof
*.png
src/vcfdist

# ignore analysis
analysis/*/
analysis/*.png
analysis-v2/*/*/

# ignore input
data/*/*
2 changes: 1 addition & 1 deletion README.md
@@ -70,7 +70,7 @@ If you do already have HTSlib installed elsewhere, make sure you've added it to
> cd vcfdist/src
> make
> ./vcfdist --version
vcfdist v2.2.1
vcfdist v2.3.0
```

### Option 2: Docker Image
42 changes: 42 additions & 0 deletions analysis-v2/clustering/1_plot_beds.py
@@ -0,0 +1,42 @@
import matplotlib.pyplot as plt
import numpy as np
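
# For each benchmark BED, plot a log-scale histogram of region sizes (bins from
# 1 bp to 1 Gbp), annotated with the region count, total bases covered, and sum
# of squared region sizes; figures are saved to img/bed-<name>.png.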

ds_names = ["t2t-q100", "nist", "giab-tr", "cmrg"]
ds_versions = ["v0.9", "v4.2.1","v4.20", "v1.00"]
ds_beds = ["GRCh38_HG2-T2TQ100-V0.9_dipcall-z2k.benchmark.bed",
"HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed",
"GIABTR.HG002.benchmark.regions.bed",
"HG002_GRCh38_CMRG_smallvar_v1.00.bed"]
data = "/home/timdunn/vcfdist/data"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2000,5000, 10_000,20000, 50000,
100_000, 200000, 500000, 1_000_000, 2000000, 5000000, 10_000_000, 20000000, 50000000,
100_000_000, 200000000, 500000000, 1_000_000_000]
for name, version, bed in zip(ds_names, ds_versions, ds_beds):
print(f"parsing {name} BED")
counts = [0]*len(bins)
bases, bases2 = 0, 0
with open(f"{data}/{name}-{version}/{bed}", "r") as bedfile:
for line in bedfile:
fields = line.split()
ctg, start, stop = fields[0], int(fields[1]), int(fields[2])

for bin_idx, bin_val in enumerate(bins):
if stop - start < bin_val:
counts[bin_idx] += 1
bases += stop-start
bases2 += (stop-start)*(stop-start)
break
fig, ax = plt.subplots(1,1)
ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
ax.set_xlim(0,len(bins))
ax.set_xticks(range(0, len(bins), 3))
ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
ax.set_yscale('log')
ax.set_ylim(0.5, 2000000)
ax.set_xlabel("Region Size")
ax.set_ylabel("Counts")
ax.text(0.5, 0.8, f"$\sum$bases: {bases:,}", transform=ax.transAxes)
ax.text(0.5, 0.75, f"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
plt.savefig(f"img/bed-{name}.png")
38 changes: 38 additions & 0 deletions analysis-v2/clustering/2_plot_gap_clusters.py
@@ -0,0 +1,38 @@
import matplotlib.pyplot as plt
import numpy as np
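
# For each swept gap length, plot a log-scale histogram of vcfdist supercluster
# sizes (read from gaps/<gap>.superclusters.tsv), with the same bins and
# annotations as 1_plot_beds.py; figures are saved to img/gaps-<gap>.png.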

gaps = [10, 20, 50, 100, 200] # 500, 1000 didn't finish
data = "/home/timdunn/vcfdist/analysis-v2/clustering"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 20_000, 50_000,
        100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000,
        50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000]
for gap in gaps:
    print(f"parsing gap {gap} superclusters")
    counts = [0]*len(bins)
    bases, bases2 = 0, 0
    with open(f"{data}/gaps/{gap}.superclusters.tsv", "r") as bedfile:
        next(bedfile)  # skip header
        for line in bedfile:
            fields = line.split()
            ctg, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3])

            # bin each supercluster by size, tracking total bases and sum of squared sizes
            for bin_idx, bin_val in enumerate(bins):
                if size < bin_val:
                    counts[bin_idx] += 1
                    bases += size
                    bases2 += size*size
                    break
    fig, ax = plt.subplots(1, 1)
    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
    ax.set_xlim(0, len(bins))
    ax.set_xticks(range(0, len(bins), 3))
    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
    ax.set_yscale('log')
    ax.set_ylim(0.5, 20000000)
    ax.set_xlabel("Region Size")
    ax.set_ylabel("Counts")
    ax.text(0.5, 0.8, rf"$\sum$bases: {bases:,}", transform=ax.transAxes)
    ax.text(0.5, 0.75, rf"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
    plt.savefig(f"img/gaps-{gap}.png")
29 changes: 29 additions & 0 deletions analysis-v2/clustering/2_vcfdist_sweep_gaplens.sh
@@ -0,0 +1,29 @@
#!/bin/bash

source globals.sh

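# supercluster gap lengths to sweep (only gap 10 is currently enabled)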
dists=(
"10"
# "20"
# "50"
# "100"
# "200"
)

# vcfdist evaluation
mkdir -p $out/sweep_gaplens
for q in "${!query_names[@]}"; do
for d in "${dists[@]}"; do
echo "vcfdist: evaluating len 1000 '${query_names[q]}' gap $d"
$timer -v ../../src/vcfdist \
$data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
--simple-cluster \
-g $d \
-l 1000 \
-p $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000. \
2> $out/sweep_gaplens/${query_names[q]}.gap${d}.len1000.log
done
done
39 changes: 39 additions & 0 deletions analysis-v2/clustering/3_plot_len_clusters.py
@@ -0,0 +1,39 @@
import matplotlib.pyplot as plt
import numpy as np
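
# For each swept maximum variant length, plot a log-scale histogram of vcfdist
# supercluster sizes from WFA-based superclustering (read from
# sweep_varlens/pav.wfa.len<len>ra.superclusters.tsv); figures are saved to img/lens-<len>.png.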

# lens = [10, 50, 100, 500, 1000] # 5000, 10000 didn't finish
lens = [1000]
data = "/home/timdunn/vcfdist/analysis-v2/clustering"

bins = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1_000, 2_000, 5_000, 10_000, 20_000, 50_000,
        100_000, 200_000, 500_000, 1_000_000, 2_000_000, 5_000_000, 10_000_000, 20_000_000,
        50_000_000, 100_000_000, 200_000_000, 500_000_000, 1_000_000_000]
for length in lens:
    print(f"Parsing WFA variant max length {length} superclusters")
    counts = [0]*len(bins)
    bases, bases2 = 0, 0
    with open(f"{data}/sweep_varlens/pav.wfa.len{length}ra.superclusters.tsv", "r") as bedfile:
        next(bedfile)  # skip header
        for line in bedfile:
            fields = line.split()
            ctg, sc, start, stop, size = fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4])

            # bin each supercluster by size, tracking total bases and sum of squared sizes
            for bin_idx, bin_val in enumerate(bins):
                if size < bin_val:
                    counts[bin_idx] += 1
                    bases += size
                    bases2 += size*size
                    break
    fig, ax = plt.subplots(1, 1)
    ax.bar(np.arange(-0.5, len(bins)-0.5), counts, width=1)
    ax.set_xlim(0, len(bins))
    ax.set_xticks(range(0, len(bins), 3))
    ax.set_xticklabels([1, 10, 100, "1K", "10K", "100K", "1M", "10M", "100M", "1G"])
    ax.set_yscale('log')
    ax.set_ylim(0.5, 20000000)
    ax.set_xlabel("Region Size")
    ax.set_ylabel("Counts")
    ax.text(0.5, 0.8, rf"$\sum$bases: {bases:,}", transform=ax.transAxes)
    ax.text(0.5, 0.75, rf"$\sum$bases$^2$: {bases2:,}", transform=ax.transAxes)
    ax.text(0.5, 0.7, f"regions: {sum(counts):,}", transform=ax.transAxes)
    plt.savefig(f"img/lens-{length}.png")
62 changes: 62 additions & 0 deletions analysis-v2/clustering/3_vcfdist_sweep_varlens.sh
@@ -0,0 +1,62 @@
#!/bin/bash

source globals.sh

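# maximum variant lengths (-l) to sweep (only length 50 is currently enabled)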
lens=(
# "10"
"50"
# "100"
# "500"
# "1000"
# "5000"
# "10000"
)

mkdir -p $out/sweep_varlens

# # wfa-based superclustering
# for q in "${!query_names[@]}"; do
# for l in "${lens[@]}"; do
# echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
# $timer -v ../../src/vcfdist \
# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
# $data/refs/$ref_name \
# --bed $data/${truth_name}-${truth_version}/split/bench.bed \
# -l $l \
# -p $out/sweep_varlens/${query_names[q]}.wfa.len${l}. \
# 2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}.log
# done
# done

# wfa-based superclustering
for q in "${!query_names[@]}"; do
for l in "${lens[@]}"; do
echo "vcfdist: evaluating wfa '${query_names[q]}' len $l"
$timer -v ../../src/vcfdist \
$data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
-l $l \
-p $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra. \
2> $out/sweep_varlens/${query_names[q]}.wfa.len${l}ra.log
done
done

# # gap-based superclustering
# for q in "${!query_names[@]}"; do
# for l in "${lens[@]}"; do
# echo "vcfdist: evaluating gap 100 '${query_names[q]}' len $l"
# $timer -v ../../src/vcfdist \
# $data/${query_names[q]}-${query_versions[q]}/split/${query_names[q]}.all.vcf.gz \
# $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
# $data/refs/$ref_name \
# --bed $data/${truth_name}-${truth_version}/split/bench.bed \
# --simple-cluster \
# -g 100 \
# -l $l \
# -p $out/sweep_varlens/${query_names[q]}.gap100.len${l}. \
# 2> $out/sweep_varlens/${query_names[q]}.gap100.len${l}.log
# done
# done
67 changes: 67 additions & 0 deletions analysis-v2/clustering/4_plot_accuracy.py
@@ -0,0 +1,67 @@
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# plt.rcParams.update({"figure.facecolor": (0,0,0,0)})

fig, ax = plt.subplots(1, 2, figsize=(10,6))

partial_credit = True
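# if True, plot the precision/recall values reported by vcfdist (which award
# partial credit); otherwise recompute both metrics strictly from TP/FP/FN counts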

types = ["gap", "gap", "gap", "gap", "gap", "wfa_len"]
vals = [10, 20, 50, 100, 200, 1000]
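# the (type, value) pairs above select which clustering runs to plot: gap-based
# superclustering at gap lengths 10-200, plus WFA-based superclustering with a
# maximum variant length of 1000, read from ./<type>s/<value>.precision-recall.tsv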

for t, v in zip(types, vals):
print(f"plotting {t} {v}")
with open(f"./{t}s/{v}.precision-recall.tsv") as csv:
indel_recall = []
indel_prec = []
snp_recall = []
snp_prec = []
next(csv) # skip header

for line in csv:
typ, qual, prec, recall, f1, f1q, truth_tot, truth_tp, truth_pp, truth_fn, \
query_tot, query_tp, query_pp, query_fp = line.split('\t')
if partial_credit: # normal vcfdist prec/recall calc, already done
if typ == "INDEL":
indel_recall.append(float(recall))
indel_prec.append(float(prec))
elif typ == "SNP":
snp_recall.append(float(recall))
snp_prec.append(float(prec))
else: # recalculate using TP/FP/FN
if typ == "INDEL":
indel_recall.append(float(truth_tp) / float(truth_tot))
if int(truth_tp) + int(query_fp) == 0:
indel_prec.append(1)
else:
indel_prec.append(float(truth_tp) /
(float(truth_tp) + float(query_fp)))
elif typ == "SNP":
snp_recall.append(float(truth_tp) / float(truth_tot))
if int(truth_tp) + int(query_fp) == 0:
snp_prec.append(1)
else:
snp_prec.append(float(truth_tp) /
(float(truth_tp) + float(query_fp)))
ax[0].plot(snp_recall, snp_prec, linestyle='', marker='.')
ax[1].plot(indel_recall, indel_prec, linestyle='', marker='.')

# SNP plot
ax[0].set_title("SNPs")
ax[0].set_xlabel("Recall", fontsize=15)
ax[0].set_ylabel("Precision", fontsize=15)
ax[0].set_xlim((0.9,1))
ax[0].set_ylim((0.9,1))

# INDEL plot
ax[1].set_title("INDELs")
ax[1].set_xlabel("Recall", fontsize=15)
ax[1].set_ylabel("Precision", fontsize=15)
ax[1].set_xlim((0.9,1))
ax[1].set_ylim((0.9,1))

ax[0].legend(["Gap 10", "Gap 20", "Gap 50", "Gap 100", "Gap 200", "WFA"])
plt.tight_layout()
plt.savefig('img/accuracy.png', dpi=200)
3 changes: 3 additions & 0 deletions analysis-v2/clustering/README.md
@@ -0,0 +1,3 @@
gaps = sweep the supercluster gap length (gap-based superclustering)
wfa_lens = sweep the maximum variant length, WFA-based superclustering
gap_lens = sweep the maximum variant length, gap-based superclustering
30 changes: 30 additions & 0 deletions analysis-v2/clustering/globals.sh
@@ -0,0 +1,30 @@
#!/bin/bash

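# shared paths, tool locations, and dataset settings for the clustering analysis scripts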
data="/home/timdunn/vcfdist/data"
parallel="/home/timdunn/parallel-20211222/src/parallel"
vcfdist="/home/timdunn/vcfdist/src/vcfdist"
rtg="/home/timdunn/software/rtg-tools-3.12.1/rtg"
tabix="/home/timdunn/software/htslib-1.16/tabix"
bgzip="/home/timdunn/software/htslib-1.16/bgzip"
timer="/usr/bin/time"

out="/home/timdunn/vcfdist/analysis-v2/clustering"

# define reference FASTA
ref_name="GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta"

# T2T information
truth_name="t2t-q100"
truth_version="v0.9"

# query VCF information
query_names=(
# "hprc"
"pav"
# "giab-tr"
)
query_versions=(
# "v1"
"v4"
# "v4.20"
)
20 changes: 20 additions & 0 deletions analysis-v2/multi_match/1_eval_vcfs.sh
@@ -0,0 +1,20 @@
#!/bin/bash

source globals.sh

# --bed $data/cmrg-v1.00/HG002_GRCh38_CMRG_smallvar_v1.00.bed \
# --max-threads 1 \
# vcfdist evaluation
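# outputs and logs for each query are written to $out/evals/vcfdist/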
mkdir -p $out/evals/vcfdist
for i in "${!query_names[@]}"; do
echo "vcfdist: evaluating '${query_names[i]}'"
$timer -v ../../src/vcfdist \
$data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
$data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
$data/refs/$ref_name \
--bed $data/${truth_name}-${truth_version}/split/bench.bed \
--keep-query --keep-truth \
-l 1000 \
-p $out/evals/vcfdist/${query_names[i]}. \
2> $out/evals/vcfdist/${query_names[i]}.log
done