Skip to content

Commit

Permalink
Added old ED and new ED to summary VCF and var TSV
Browse files Browse the repository at this point in the history
 - allows stratifying variants by resulting edit distance
 - fixed bug where query credit was 0 for FPs below threshold
  • Loading branch information
TimD1 committed Feb 8, 2024
1 parent 6f461d5 commit 8d72e6c
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 13 deletions.
12 changes: 11 additions & 1 deletion src/dist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,8 @@ void calc_prec_recall(
query_vars->errtypes[swap][query_var_ptr] = ERRTYPE_FP;
query_vars->sync_group[swap][query_var_ptr] = sync_group++;
query_vars->credit[swap][query_var_ptr] = 0;
query_vars->old_ed[swap][query_var_ptr] = 0;
query_vars->new_ed[swap][query_var_ptr] = 0;
query_vars->callq[swap][query_var_ptr] =
query_vars->var_quals[query_var_ptr];
if (print) printf("%s: QUERY REF='%s'\tALT='%s'\t%s\t%f\n", ctg.data(),
Expand Down Expand Up @@ -1278,6 +1280,8 @@ void calc_prec_recall(
query_vars->errtypes[swap][query_var_idx] = ERRTYPE_TP;
query_vars->sync_group[swap][query_var_idx] = sync_group;
query_vars->credit[swap][query_var_idx] = credit;
query_vars->old_ed[swap][query_var_idx] = old_ed;
query_vars->new_ed[swap][query_var_idx] = new_ed;
query_vars->callq[swap][query_var_idx] = callq;
if (print) printf("%s:%d QUERY REF='%s'\tALT='%s'\t%s\t%f\n",
ctg.data(), query_vars->poss[query_var_idx],
Expand All @@ -1287,7 +1291,9 @@ void calc_prec_recall(
} else { // FP
query_vars->errtypes[swap][query_var_idx] = ERRTYPE_FP;
query_vars->sync_group[swap][query_var_idx] = sync_group;
query_vars->credit[swap][query_var_idx] = 0;
query_vars->credit[swap][query_var_idx] = credit;
query_vars->old_ed[swap][query_var_idx] = old_ed;
query_vars->new_ed[swap][query_var_idx] = new_ed;
query_vars->callq[swap][query_var_idx] = callq;
if (print) printf("%s:%d QUERY REF='%s'\tALT='%s'\t%s\t%f\n",
ctg.data(), query_vars->poss[query_var_idx],
Expand All @@ -1306,6 +1312,8 @@ void calc_prec_recall(
truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_TP;
truth_vars->sync_group[swap][truth_var_idx] = sync_group;
truth_vars->credit[swap][truth_var_idx] = credit;
truth_vars->old_ed[swap][truth_var_idx] = old_ed;
truth_vars->new_ed[swap][truth_var_idx] = new_ed;
truth_vars->callq[swap][truth_var_idx] = callq;
if (print) printf("%s:%d TRUTH REF='%s'\tALT='%s'\t%s\t%f\n",
ctg.data(), truth_vars->poss[truth_var_idx],
Expand All @@ -1316,6 +1324,8 @@ void calc_prec_recall(
truth_vars->errtypes[swap][truth_var_idx] = ERRTYPE_FN;
truth_vars->sync_group[swap][truth_var_idx] = sync_group;
truth_vars->credit[swap][truth_var_idx] = credit;
truth_vars->old_ed[swap][truth_var_idx] = old_ed;
truth_vars->new_ed[swap][truth_var_idx] = new_ed;
truth_vars->callq[swap][truth_var_idx] = g.max_qual;
if (print) printf("%s:%d TRUTH REF='%s'\tALT='%s'\t%s\t%f\n",
ctg.data(), truth_vars->poss[truth_var_idx],
Expand Down
2 changes: 2 additions & 0 deletions src/phase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
fprintf(out_vcf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"GenoType\">\n");
fprintf(out_vcf, "##FORMAT=<ID=BD,Number=1,Type=String,Description=\"Benchmark Decision for call (TP/FP/FN).\">\n");
fprintf(out_vcf, "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Benchmark Credit (on the interval [0,1], based on sync group edit distance)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=OD,Number=1,Type=Integer,Description=\"Original sync group edit Distance\">\n");
fprintf(out_vcf, "##FORMAT=<ID=ND,Number=1,Type=Integer,Description=\"New sync group edit Distance\">\n");
fprintf(out_vcf, "##FORMAT=<ID=BK,Number=1,Type=String,Description=\"BenchmarK category ('gm' if credit == 1, 'lm' if credit > 0, else '.')\">\n");
fprintf(out_vcf, "##FORMAT=<ID=QQ,Number=1,Type=Float,Description=\"variant Quality\">\n");
fprintf(out_vcf, "##FORMAT=<ID=SC,Number=1,Type=Integer,Description=\"SuperCluster (index in contig)\">\n");
Expand Down
20 changes: 14 additions & 6 deletions src/print.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
if (g.verbosity >= 1) INFO(" Writing query variant results to '%s'", out_query_fn.data());
FILE* out_query = fopen(out_query_fn.data(), "w");
fprintf(out_query, "CONTIG\tPOS\tHAP\tREF\tALT\tQUAL\tTYPE\tERR_TYPE"
"\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tLOCATION\n");
"\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tOLD_DIST\tNEW_DIST\tLOCATION\n");
for (std::string ctg : phasedata_ptr->contigs) {

// set pointers to variants and superclusters
Expand Down Expand Up @@ -710,7 +710,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
default: swap = phase_switch; break;
}

fprintf(out_query, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%s\n",
fprintf(out_query, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%d\t%d\t%s\n",
ctg.data(),
query1_vars->poss[var1_idx],
query1_vars->haps[var1_idx],
Expand All @@ -723,6 +723,8 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
cluster1_idx,
sci,
query1_vars->sync_group[swap][var1_idx],
query1_vars->old_ed[swap][var1_idx],
query1_vars->new_ed[swap][var1_idx],
region_strs[query1_vars->locs[var1_idx]].data()
);
var1_idx++;
Expand All @@ -745,7 +747,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
default: swap = phase_switch; break;
}

fprintf(out_query, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%s\n",
fprintf(out_query, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%d\t%d\t%s\n",
ctg.data(),
query2_vars->poss[var2_idx],
query2_vars->haps[var2_idx],
Expand All @@ -758,6 +760,8 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
cluster2_idx,
sci,
query2_vars->sync_group[swap][var2_idx],
query2_vars->old_ed[swap][var2_idx],
query2_vars->new_ed[swap][var2_idx],
region_strs[query2_vars->locs[var2_idx]].data()
);
var2_idx++;
Expand All @@ -770,7 +774,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
std::string out_truth_fn = g.out_prefix + "truth.tsv";
if (g.verbosity >= 1) INFO(" Writing truth variant results to '%s'", out_truth_fn.data());
FILE* out_truth = fopen(out_truth_fn.data(), "w");
fprintf(out_truth, "CONTIG\tPOS\tHAP\tREF\tALT\tQUAL\tTYPE\tERRTYPE\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tLOCATION\n");
fprintf(out_truth, "CONTIG\tPOS\tHAP\tREF\tALT\tQUAL\tTYPE\tERRTYPE\tCREDIT\tCLUSTER\tSUPERCLUSTER\tSYNC_GROUP\tOLD_DIST\tNEW_DIST\tLOCATION\n");
for (std::string ctg : phasedata_ptr->contigs) {

// set pointers to variants and superclusters
Expand Down Expand Up @@ -805,7 +809,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
default: swap = phase_switch; break;
}

fprintf(out_truth, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%s\n",
fprintf(out_truth, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%d\t%d\t%s\n",
ctg.data(),
truth1_vars->poss[var1_idx],
truth1_vars->haps[var1_idx],
Expand All @@ -818,6 +822,8 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
cluster1_idx,
sci,
truth1_vars->sync_group[swap][var1_idx],
truth1_vars->old_ed[swap][var1_idx],
truth1_vars->new_ed[swap][var1_idx],
region_strs[truth1_vars->locs[var1_idx]].data()
);
var1_idx++;
Expand All @@ -841,7 +847,7 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
default: swap = phase_switch; break;
}

fprintf(out_truth, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%s\n",
fprintf(out_truth, "%s\t%d\t%d\t%s\t%s\t%.2f\t%s\t%s\t%f\t%d\t%d\t%d\t%d\t%d\t%s\n",
ctg.data(),
truth2_vars->poss[var2_idx],
truth2_vars->haps[var2_idx],
Expand All @@ -854,6 +860,8 @@ void write_results(std::unique_ptr<phaseblockData> & phasedata_ptr) {
cluster2_idx,
sci,
truth2_vars->sync_group[swap][var2_idx],
truth2_vars->old_ed[swap][var2_idx],
truth2_vars->new_ed[swap][var2_idx],
region_strs[truth2_vars->locs[var2_idx]].data()
);
var2_idx++;
Expand Down
16 changes: 10 additions & 6 deletions src/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ ctgVariants::ctgVariants() {
this->sync_group.push_back(std::vector<int>());
this->callq.push_back(std::vector<float>());
this->credit.push_back(std::vector<float>());

this->old_ed.push_back(std::vector<int>());
this->new_ed.push_back(std::vector<int>());
}
}

Expand All @@ -44,6 +45,8 @@ void ctgVariants::add_var(int pos, int rlen, uint8_t hap, uint8_t type, uint8_t
this->errtypes[i].push_back(ERRTYPE_UN);
this->sync_group[i].push_back(0);
this->credit[i].push_back(0);
this->old_ed[i].push_back(0);
this->new_ed[i].push_back(0);
this->callq[i].push_back(0);
}
}
Expand Down Expand Up @@ -227,14 +230,14 @@ void ctgVariants::print_var_info(FILE* out_fp, std::shared_ptr<fastaData> ref,
char ref_base;
switch (this->types[idx]) {
case TYPE_SUB:
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:BK:QQ:SC:SG:PS:PB:BS:FE",
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:OD:ND:BK:QQ:SC:SG:PS:PB:BS:FE",
ctg.data(), this->poss[idx]+1, this->refs[idx].data(),
this->alts[idx].data());
break;
case TYPE_INS:
case TYPE_DEL:
ref_base = ref->fasta.at(ctg)[this->poss[idx]-1];
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:BK:QQ:SC:SG:PS:PB:BS:FE", ctg.data(),
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:OD:ND:BK:QQ:SC:SG:PS:PB:BS:FE", ctg.data(),
this->poss[idx], (ref_base + this->refs[idx]).data(),
(ref_base + this->alts[idx]).data());
break;
Expand All @@ -246,7 +249,7 @@ void ctgVariants::print_var_info(FILE* out_fp, std::shared_ptr<fastaData> ref,

void ctgVariants::print_var_empty(FILE* out_fp, int sc_idx,
int phase_block, bool query /* = false */) {
fprintf(out_fp, "\t.:.:.:.:.:%d:.:.:%d:.:.%s", sc_idx, phase_block, query ? "\n" : "");
fprintf(out_fp, "\t.:.:.:.:.:.:.:%d:.:.:%d:.:.%s", sc_idx, phase_block, query ? "\n" : "");
}


Expand All @@ -270,8 +273,9 @@ void ctgVariants::print_var_sample(FILE* out_fp, int idx, std::string gt,
errtype = query ? "FP" : "FN"; match_type = "lm";
}

fprintf(out_fp, "\t%s:%s:%f:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(),
this->credit[swap][idx], match_type.data(), int(this->var_quals[idx]), sc_idx,
fprintf(out_fp, "\t%s:%s:%f:%d:%d:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(),
this->credit[swap][idx], this->old_ed[swap][idx], this->new_ed[swap][idx],
match_type.data(), int(this->var_quals[idx]), sc_idx,
int(this->sync_group[swap][idx]), this->phase_sets[idx], phase_block,
query ? (phase_switch ? "1" : "0") : "." ,
query ? (phase_flip ? "1" : "0") : "." ,
Expand Down
2 changes: 2 additions & 0 deletions src/variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class ctgVariants {
// or for call, min quality in sync group
std::vector< std::vector<float> > callq;
// percentage reduction in edit dist of sync group with variants
std::vector< std::vector<int> > old_ed;
std::vector< std::vector<int> > new_ed;
std::vector< std::vector<float> > credit;
};

Expand Down

0 comments on commit 8d72e6c

Please sign in to comment.