From 03765721cf95dac3bbf5b3de93026990565b88b0 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 26 Jul 2023 00:17:25 +0200 Subject: [PATCH 1/4] Rework single-end MAPQ computation First, there was a bug in MAPQ computation due to initializing best_score to -1000. In the first iteration, min_mapq_diff is updated thus: min_mapq_diff = std::max(0, alignment.score - best_score); This is just alignment.score + 1000, so most of the time, min_mapq_diff ends up being something like 1600 or so for typical read lengths. (This is clamped to 60 in the output, but still.) Here, we keep track of the highest and second-highest alignment score in best_score and second_best_score. Then, mapq is computed such that it is * 60 for second_best_score=0 and * 0 for second_best_score=best_score, where anything in between is interpolated. See #25 --- src/aln.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/aln.cpp b/src/aln.cpp index aa392309..d3edcc0e 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -106,12 +106,11 @@ static inline void align_SE( Nam n_max = nams[0]; int best_edit_distance = std::numeric_limits::max(); - int best_score = -1000; + int best_score = 0; + int second_best_score = 0; Alignment best_alignment; - best_alignment.score = -100000; best_alignment.is_unaligned = true; - int min_mapq_diff = best_edit_distance; for (auto &nam : nams) { float score_dropoff = (float) nam.n_hits / n_max.n_hits; @@ -124,24 +123,25 @@ static inline void align_SE( details.tried_alignment++; details.gapped += alignment.gapped; - int diff_to_best = std::abs(best_score - alignment.score); - min_mapq_diff = std::min(min_mapq_diff, diff_to_best); - if (max_secondary > 0) { alignments.emplace_back(alignment); } if (alignment.score > best_score) { - min_mapq_diff = std::max(0, alignment.score - best_score); // new distance to next best match + second_best_score = best_score; best_score = alignment.score; best_alignment = std::move(alignment); if (max_secondary == 0) { best_edit_distance = best_alignment.global_ed; } + } else if (alignment.score > second_best_score) { + second_best_score = alignment.score; } tries++; } + uint8_t mapq = 60.0 * (best_score - second_best_score) / best_score; + if (max_secondary == 0) { - best_alignment.mapq = std::min(min_mapq_diff, 60); + best_alignment.mapq = mapq; sam.add(best_alignment, record, read.rc, true, details); return; } @@ -160,7 +160,7 @@ static inline void align_SE( } bool is_primary = i == 0; if (is_primary) { - alignment.mapq = std::min(min_mapq_diff, 60); + alignment.mapq = mapq; } else { alignment.mapq = 255; } From aeb730826c8c8fb6cb2d49c79fb10a3cafc7c76b Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 15 Aug 2023 22:46:29 +0200 Subject: [PATCH 2/4] Remove Alignment::mapq field Keep mapping quality separate from the alignment itself appears to make more sense: - Mapping quality for an alignment is not a property of an alignment per se, but is derived from its relationship to other alignments. - get_alignment returns an Alignment, but leaves mapq uninitialized, that is, we must remember to fill it in afterwards, which can easily be forgotten. - Sam::add_pair is passed two mapq values and also two Alignment instances, which also contain mapq values. It is unclear for the caller which of the values is chosen. - Adding a mapq parameter to Sam::add makes it mirror Sam::add_pair. --- src/aln.cpp | 10 ++-------- src/sam.cpp | 4 +++- src/sam.hpp | 3 +-- tests/test_sam.cpp | 5 ++--- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/aln.cpp b/src/aln.cpp index d3edcc0e..b1ff6655 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -141,8 +141,7 @@ static inline void align_SE( uint8_t mapq = 60.0 * (best_score - second_best_score) / best_score; if (max_secondary == 0) { - best_alignment.mapq = mapq; - sam.add(best_alignment, record, read.rc, true, details); + sam.add(best_alignment, record, read.rc, mapq, true, details); return; } // Sort alignments by score, highest first @@ -159,12 +158,7 @@ static inline void align_SE( break; } bool is_primary = i == 0; - if (is_primary) { - alignment.mapq = mapq; - } else { - alignment.mapq = 255; - } - sam.add(alignment, record, read.rc, is_primary, details); + sam.add(alignment, record, read.rc, mapq, is_primary, details); } } diff --git a/src/sam.cpp b/src/sam.cpp index 962ff9ce..0fb5a272 100644 --- a/src/sam.cpp +++ b/src/sam.cpp @@ -120,6 +120,7 @@ void Sam::add( const Alignment& alignment, const KSeq& record, const std::string& sequence_rc, + uint8_t mapq, bool is_primary, const Details& details ) { @@ -131,8 +132,9 @@ void Sam::add( } if (!is_primary) { flags |= SECONDARY; + mapq = 255; } - add_record(record.name, flags, references.names[alignment.ref_id], alignment.ref_start, alignment.mapq, alignment.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, alignment.edit_distance, alignment.score, details); + add_record(record.name, flags, references.names[alignment.ref_id], alignment.ref_start, mapq, alignment.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, alignment.edit_distance, alignment.score, details); } // Add one individual record diff --git a/src/sam.hpp b/src/sam.hpp index c214dac4..ae9fb62a 100644 --- a/src/sam.hpp +++ b/src/sam.hpp @@ -15,7 +15,6 @@ struct Alignment { int edit_distance; int global_ed; int score; - uint8_t mapq; int length; bool is_rc; bool is_unaligned{false}; @@ -79,7 +78,7 @@ class Sam { } /* Add an alignment */ - void add(const Alignment& alignment, const klibpp::KSeq& record, const std::string& sequence_rc, bool is_primary, const Details& details); + void add(const Alignment& alignment, const klibpp::KSeq& record, const std::string& sequence_rc, uint8_t mapq, bool is_primary, const Details& details); void add_pair(const Alignment& alignment1, const Alignment& alignment2, const klibpp::KSeq& record1, const klibpp::KSeq& record2, const std::string& read1_rc, const std::string& read2_rc, uint8_t mapq1, uint8_t mapq2, bool is_proper, bool is_primary, const std::array& details); void add_unmapped(const klibpp::KSeq& record, uint16_t flags = UNMAP); void add_unmapped_pair(const klibpp::KSeq& r1, const klibpp::KSeq& r2); diff --git a/tests/test_sam.cpp b/tests/test_sam.cpp index b54a195d..d023d699 100644 --- a/tests/test_sam.cpp +++ b/tests/test_sam.cpp @@ -52,7 +52,6 @@ TEST_CASE("Sam::add") { aln.ref_start = 2; aln.edit_distance = 3; aln.score = 9; - aln.mapq = 55; aln.cigar = Cigar("2S2=1X3=3S"); std::string read_rc = reverse_complement(record.seq); @@ -61,7 +60,7 @@ TEST_CASE("Sam::add") { SUBCASE("Cigar =/X") { std::string sam_string; Sam sam(sam_string, references); - sam.add(aln, record, read_rc, is_primary, details); + sam.add(aln, record, read_rc, 55, is_primary, details); CHECK(sam_string == "readname\t16\tcontig1\t3\t55\t2S2=1X3=3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" ); @@ -69,7 +68,7 @@ TEST_CASE("Sam::add") { SUBCASE("Cigar M") { std::string sam_string; Sam sam(sam_string, references, CigarOps::M); - sam.add(aln, record, read_rc, is_primary, details); + sam.add(aln, record, read_rc, 55, is_primary, details); CHECK(sam_string == "readname\t16\tcontig1\t3\t55\t2S6M3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" ); From c9c685b8b63ba0a9a359359126512ba6198ab6d8 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 13 Nov 2023 21:25:31 +0100 Subject: [PATCH 3/4] Changelog entry --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 74a2e869..2efbe8e1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,7 @@ * #321: Fix: For paired-end reads that cannot be placed as proper pairs, we now prefer placing them onto the same chrosome instead of on different ones if there is a choice. +* #328: Adjust MAPQ computation for single-end reads. * #318: Added a `--details` option mainly intended for debugging. When used, some strobealign-specific tags are added to the SAM output that inform about things like no. of seeds found, whether mate rescue was performed etc. From 9120362a0121e70b77b886e91467c25ec4d6f1c8 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 15 Nov 2023 16:35:40 +0100 Subject: [PATCH 4/4] Ensure we never report MAPQ=0 if best and 2nd best score differs --- src/aln.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aln.cpp b/src/aln.cpp index b1ff6655..76350004 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -138,7 +138,7 @@ static inline void align_SE( } tries++; } - uint8_t mapq = 60.0 * (best_score - second_best_score) / best_score; + uint8_t mapq = (60.0 * (best_score - second_best_score) + best_score - 1) / best_score; if (max_secondary == 0) { sam.add(best_alignment, record, read.rc, mapq, true, details);