From 604edcaf238e185d397bb7da7669d3d05d646870 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 27 Nov 2023 15:14:40 +0100 Subject: [PATCH] With --details, add X0:i tag (no. of identically scored best alignments) --- CHANGES.md | 5 +++++ CONTRIBUTING.md | 5 ++++- src/aln.cpp | 60 ++++++++++++++++++++++++++++--------------------- src/sam.cpp | 9 ++++---- src/sam.hpp | 2 ++ 5 files changed, 51 insertions(+), 30 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3e0dcade..924c6644 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,10 @@ # Strobealign Changelog +## development version + +* If `--details` is used, output `X0:i` SAM tag with the number of + identically-scored best alignments + ## v0.12.0 (2023-11-23) * #293: Fix: When mapping single-end reads, many multimappers were previously diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1ec3963..6feb8284 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -68,7 +68,7 @@ style guide, but new code should follow it. ## Detailed SAM output (`--details`) -When `--details` or `-d` is provided, the following additional SAM tags are output for +When `--details` is provided, the following additional SAM tags are output for mapped reads. `na`: Number of NAMs (seeds) found @@ -77,3 +77,6 @@ mapped reads. the expected region given its mate) `al`: Number of times an attempt was made to extend a seed (by gapped or ungapped alignment) `ga`: Number of times an attempt was made to extend a seed by gapped alignment +`X0`: Number of equally scored best alignments (greater 1 for multimappers). + For paired-end reads, the tag is output for both reads, but the value is + identical and is the number of equally scored best alignment *pairs*. diff --git a/src/aln.cpp b/src/aln.cpp index c7bf6954..082e4285 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -162,6 +162,7 @@ inline void align_single( } tries++; } + details.best_alignments = alignments_with_best_score; uint8_t mapq = (60.0 * (best_score - second_best_score) + best_score - 1) / best_score; bool is_primary = true; sam.add(best_alignment, record, read.rc, mapq, is_primary, details); @@ -491,11 +492,12 @@ inline Alignment rescue_align( } /* - * Turn a vector of scored alignment pairs into one that - * is sorted by score (highest first) and contains only unique entries. + * Remove consecutive identical alignment pairs and leave only the first. */ void deduplicate_scored_pairs(std::vector& pairs) { - + if (pairs.size() < 2) { + return; + } int prev_ref_start1 = pairs[0].alignment1.ref_start; int prev_ref_start2 = pairs[0].alignment2.ref_start; int prev_ref_id1 = pairs[0].alignment1.ref_id; @@ -523,28 +525,24 @@ void deduplicate_scored_pairs(std::vector& pairs) { pairs.resize(j); } + /* - * If there are multiple top-scoring alignments (all with the same score), - * pick one randomly and move it to the front. + * Count how many best alignments there are that all have the same score */ -void pick_random_top_pair( - std::vector& high_scores, - std::minstd_rand& random_engine -) { +size_t count_best_alignment_pairs(const std::vector& pairs) { + if (pairs.empty()) { + return 0; + } size_t i = 1; - for ( ; i < high_scores.size(); ++i) { - if (high_scores[i].score != high_scores[0].score) { + for ( ; i < pairs.size(); ++i) { + if (pairs[i].score != pairs[0].score) { break; } } - if (i > 1) { - size_t random_index = std::uniform_int_distribution<>(0, i - 1)(random_engine); - if (random_index != 0) { - std::swap(high_scores[0], high_scores[random_index]); - } - } + return i; } + /* * Align a pair of reads for which only one has NAMs. For the other, rescue * is attempted by aligning it locally. @@ -598,7 +596,7 @@ std::vector rescue_read( } void output_aligned_pairs( - std::vector& high_scores, + const std::vector& high_scores, Sam& sam, size_t max_secondary, double secondary_dropoff, @@ -608,17 +606,13 @@ void output_aligned_pairs( const Read& read2, float mu, float sigma, - const std::array& details, - std::minstd_rand& random_engine + const std::array& details ) { if (high_scores.empty()) { sam.add_unmapped_pair(record1, record2); return; } - std::sort(high_scores.begin(), high_scores.end(), by_score); - deduplicate_scored_pairs(high_scores); - pick_random_top_pair(high_scores, random_engine); auto [mapq1, mapq2] = joint_mapq_from_high_scores(high_scores); auto best_aln_pair = high_scores[0]; @@ -1036,9 +1030,26 @@ void align_or_map_paired( uint8_t mapq1 = proper_pair_mapq(nams_pair[0]); uint8_t mapq2 = proper_pair_mapq(nams_pair[1]); + details[0].best_alignments = 1; + details[1].best_alignments = 1; bool is_primary = true; sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, is_primary, details); } else { + std::sort(alignment_pairs.begin(), alignment_pairs.end(), by_score); + deduplicate_scored_pairs(alignment_pairs); + + // If there are multiple top-scoring alignments (all with the same score), + // pick one randomly and move it to the front. + size_t i = count_best_alignment_pairs(alignment_pairs); + details[0].best_alignments = i; + details[1].best_alignments = i; + if (i > 1) { + size_t random_index = std::uniform_int_distribution<>(0, i - 1)(random_engine); + if (random_index != 0) { + std::swap(alignment_pairs[0], alignment_pairs[random_index]); + } + } + double secondary_dropoff = 2 * aligner.parameters.mismatch + aligner.parameters.gap_open; output_aligned_pairs( alignment_pairs, @@ -1051,8 +1062,7 @@ void align_or_map_paired( read2, isize_est.mu, isize_est.sigma, - details, - random_engine + details ); } } diff --git a/src/sam.cpp b/src/sam.cpp index 320c833b..c5ab3a70 100644 --- a/src/sam.cpp +++ b/src/sam.cpp @@ -46,10 +46,11 @@ void Sam::append_rg_and_newline() { void Sam::append_details(const Details& details) { std::stringstream s; - s << "\tna:i:" << details.nams; - s << "\tnr:i:" << details.nam_rescue; - s << "\tal:i:" << details.tried_alignment; - s << "\tga:i:" << details.gapped; + s << "\tna:i:" << details.nams + << "\tnr:i:" << details.nam_rescue + << "\tal:i:" << details.tried_alignment + << "\tga:i:" << details.gapped + << "\tX0:i:" << details.best_alignments; sam_string.append(s.str()); } diff --git a/src/sam.hpp b/src/sam.hpp index 3345ba9f..4bbb97c2 100644 --- a/src/sam.hpp +++ b/src/sam.hpp @@ -51,6 +51,7 @@ struct Details { uint64_t mate_rescue{0}; // No. of times rescue by local alignment was attempted uint64_t tried_alignment{0}; // No. of computed alignments (get_alignment or rescue_mate) uint64_t gapped{0}; // No. of gapped alignments computed (in get_alignment) + uint64_t best_alignments{0}; // No. of best alignments with same score Details& operator+=(const Details& other) { nam_rescue = nam_rescue || other.nam_rescue; @@ -59,6 +60,7 @@ struct Details { mate_rescue += other.mate_rescue; tried_alignment += other.tried_alignment; gapped += other.gapped; + best_alignments += other.best_alignments; return *this; } };