From 51eab56e07dbbb88c67772e7cafa4b777646946f Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 11 Apr 2023 14:44:02 +0200 Subject: [PATCH 1/5] Let alignment::cigar be of type Cigar --- src/aln.cpp | 33 +++++++++++++++------------------ src/cigar.cpp | 3 +++ src/cigar.hpp | 4 +++- src/sam.cpp | 4 ++-- src/sam.hpp | 10 ++++++---- tests/test_aligner.cpp | 2 +- tests/test_cigar.cpp | 9 ++++----- tests/test_sam.cpp | 6 +++--- 8 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/aln.cpp b/src/aln.cpp index 5c638cc7..72ac2adb 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -118,14 +118,14 @@ static inline void align_SE_secondary_hits( if (sam_aln.sw_score > best_align_sw_score) { min_mapq_diff = std::max(0, sam_aln.sw_score - best_align_sw_score); // new distance to next best match best_align_sw_score = sam_aln.sw_score; - best_sam_aln = sam_aln; + best_sam_aln = std::move(sam_aln); if (max_secondary == 1) { - best_align_dist = sam_aln.global_ed; + best_align_dist = best_sam_aln.global_ed; } } // sam_aln.ed = 10000; // init if (max_secondary > 1) { - alignments.push_back(sam_aln); + alignments.emplace_back(sam_aln); } cnt++; } @@ -181,7 +181,7 @@ static inline alignment align_segment( if (hamming_dist >= 0 && (((float) hamming_dist / read_segm_len) < 0.05) ) { //Hamming distance worked fine, no need to ksw align auto info = hamming_align(read_segm, ref_segm_ham, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); - sam_aln_segm.cigar = info.cigar.to_string(); + sam_aln_segm.cigar = std::move(info.cigar); sam_aln_segm.ed = info.edit_distance; sam_aln_segm.sw_score = info.sw_score; sam_aln_segm.ref_start = ref_start + ext_left + info.query_start; @@ -193,7 +193,7 @@ static inline alignment align_segment( } } auto info = aligner.align(read_segm, ref_segm); - sam_aln_segm.cigar = info.cigar.to_string(); + sam_aln_segm.cigar = std::move(info.cigar); sam_aln_segm.ed = info.edit_distance; sam_aln_segm.sw_score = info.sw_score; sam_aln_segm.ref_start = ref_start + info.ref_start; @@ -262,7 +262,7 @@ static inline alignment get_alignment( } int softclipped = info.query_start + (query.size() - info.query_end); alignment sam_aln; - sam_aln.cigar = info.cigar.to_string(); + sam_aln.cigar = std::move(info.cigar); sam_aln.ed = info.edit_distance; sam_aln.global_ed = info.edit_distance + softclipped; sam_aln.sw_score = info.sw_score; @@ -310,8 +310,8 @@ static inline alignment get_alignment_unused( int hamming_dist = hamming_distance(r_tmp, ref_segm); if (hamming_dist >= 0 && (((float) hamming_dist / ref_segm_size) < 0.05) ) { //Hamming distance worked fine, no need to ksw align - const auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); - sam_aln.cigar = info.cigar.to_string(); + auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); + sam_aln.cigar = std::move(info.cigar); sam_aln.ed = info.edit_distance; sam_aln.sw_score = info.sw_score; // aln_params.match*(read_len-hamming_dist) - aln_params.mismatch*hamming_dist; sam_aln.ref_start = ref_start + ext_left + info.query_start; @@ -383,7 +383,7 @@ static inline alignment get_alignment_unused( // Stitch together sam_aln.ref_id = n.ref_id; - sam_aln.cigar = sam_aln_segm_left.cigar + sam_aln_segm_right.cigar; + sam_aln.cigar = Cigar(sam_aln_segm_left.cigar.to_string() + sam_aln_segm_right.cigar.to_string()); sam_aln.ed = sam_aln_segm_left.ed + sam_aln_segm_right.ed; sam_aln.sw_score = sam_aln_segm_left.sw_score + sam_aln_segm_right.sw_score; sam_aln.ref_start = sam_aln_segm_left.ref_start; @@ -568,9 +568,6 @@ static inline bool sort_scores(const std::tuple &a return std::get<0>(a) > std::get<0>(b); } - - - static inline void get_best_scoring_pair( const std::vector &aln_scores1, const std::vector &aln_scores2, std::vector> &high_scores, float mu, float sigma ) { @@ -585,7 +582,7 @@ static inline void get_best_scoring_pair( // 10 corresponds to a value of log(normal_pdf(dist, mu, sigma)) of more than 4 stddevs away score -= 10; } - high_scores.push_back(std::make_tuple(score, a1, a2)); + high_scores.emplace_back(std::make_tuple(score, a1, a2)); } } std::sort(high_scores.begin(), high_scores.end(), sort_scores); // Sorting by highest score first @@ -742,7 +739,7 @@ static inline void rescue_mate( ref_end = std::min(ref_len, std::max(0, b)); if (ref_end < ref_start + k){ - sam_aln.cigar = "*"; + sam_aln.cigar = Cigar(); sam_aln.ed = read_len; sam_aln.sw_score = 0; sam_aln.aln_score = 0; @@ -756,7 +753,7 @@ static inline void rescue_mate( std::string ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_end - ref_start); if (!has_shared_substring(r_tmp, ref_segm, k)){ - sam_aln.cigar = "*"; + sam_aln.cigar = Cigar(); sam_aln.ed = read_len; sam_aln.sw_score = 0; sam_aln.aln_score = 0; @@ -786,7 +783,7 @@ static inline void rescue_mate( // info = ksw_align(ref_ptr, ref_segm.size(), read_ptr, r_tmp.size(), 1, 4, 6, 1, ez); // std::cerr << "Cigar: " << info.cigar << std::endl; - sam_aln.cigar = info.cigar.to_string(); + sam_aln.cigar = info.cigar; sam_aln.ed = info.edit_distance; sam_aln.sw_score = info.sw_score; sam_aln.aln_score = sam_aln.sw_score; @@ -837,13 +834,13 @@ void rescue_read( statistics.did_not_fit++; } alignment a1 = get_alignment(aligner, n, references, read1, fits); - aln_scores1.push_back(a1); + aln_scores1.emplace_back(a1); //////// Force SW alignment to rescue mate ///////// alignment a2; // std::cerr << query_acc2 << " force rescue" << std::endl; rescue_mate(aligner, n, references, read1, read2, a2, mu, sigma, statistics.tot_rescued, k); - aln_scores2.push_back(a2); + aln_scores2.emplace_back(a2); cnt1 ++; statistics.tot_all_tried ++; diff --git a/src/cigar.cpp b/src/cigar.cpp index 18354112..955ef4bd 100644 --- a/src/cigar.cpp +++ b/src/cigar.cpp @@ -31,6 +31,9 @@ Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const { } std::string Cigar::to_string() const { + if (m_ops.empty()) { + return "*"; + } std::stringstream s; for (auto op_len : m_ops) { s << (op_len >> 4) << "MIDNSHP=X"[op_len & 0xf]; diff --git a/src/cigar.hpp b/src/cigar.hpp index adfc49aa..89299d04 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -26,7 +26,7 @@ class Cigar { explicit Cigar(std::vector ops) : m_ops(std::move(ops)) { } - Cigar(Cigar& other) : m_ops(other.m_ops) { } + Cigar(const Cigar& other) : m_ops(other.m_ops) { } explicit Cigar(uint32_t* ops, size_t n) { m_ops.assign(ops, ops + n); @@ -38,6 +38,8 @@ class Cigar { *this = std::move(other); } + Cigar& operator=(const Cigar&) = default; + Cigar& operator=(Cigar&& other) { if (this != &other) { m_ops = std::move(other.m_ops); diff --git a/src/sam.cpp b/src/sam.cpp index da6496e0..d29aecd0 100644 --- a/src/sam.cpp +++ b/src/sam.cpp @@ -104,7 +104,7 @@ void Sam::add_record( const std::string& reference_name, int pos, int mapq, - const std::string& cigar, + const Cigar& cigar, const std::string& mate_name, int mate_ref_start, int template_len, @@ -124,7 +124,7 @@ void Sam::add_record( sam_string.append("\t"); sam_string.append(std::to_string(mapq)); sam_string.append("\t"); - sam_string.append(cigar); + sam_string.append(cigar.to_string()); sam_string.append("\t"); sam_string.append(mate_name); diff --git a/src/sam.hpp b/src/sam.hpp index 72af8707..254bbdb6 100644 --- a/src/sam.hpp +++ b/src/sam.hpp @@ -1,12 +1,14 @@ -#ifndef SAM_HPP -#define SAM_HPP +#ifndef STROBEALIGN_SAM_HPP +#define STROBEALIGN_SAM_HPP #include #include "kseq++.hpp" #include "refs.hpp" +#include "cigar.hpp" + struct alignment { - std::string cigar; + Cigar cigar; int ref_start; int ed; int global_ed; @@ -55,7 +57,7 @@ class Sam { void add_unmapped_pair(const klibpp::KSeq& r1, const klibpp::KSeq& r2); void add_unmapped_mate(const klibpp::KSeq& record, int flags, const std::string& mate_rname, int mate_pos); - void add_record(const std::string& query_name, int flags, const std::string& reference_name, int pos, int mapq, const std::string& cigar, const std::string& mate_name, int mate_ref_start, int template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score); + void add_record(const std::string& query_name, int flags, const std::string& reference_name, int pos, int mapq, const Cigar& cigar, const std::string& mate_name, int mate_ref_start, int template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score); private: void append_tail(); diff --git a/tests/test_aligner.cpp b/tests/test_aligner.cpp index 340cd9d0..a72d54b0 100644 --- a/tests/test_aligner.cpp +++ b/tests/test_aligner.cpp @@ -7,7 +7,7 @@ TEST_CASE("hamming_align") { "", "", 7, 5, 0 ); - CHECK(info.cigar.to_string() == ""); + CHECK(info.cigar.empty()); CHECK(info.edit_distance == 0); CHECK(info.sw_score == 0); CHECK(info.ref_start == 0); diff --git a/tests/test_cigar.cpp b/tests/test_cigar.cpp index 48e3612e..733e1c19 100644 --- a/tests/test_cigar.cpp +++ b/tests/test_cigar.cpp @@ -12,10 +12,9 @@ TEST_CASE("compress_cigar") { } TEST_CASE("Parse CIGAR") { - std::vector cigars = {"", "1M", "10M2I1D99=1X4P5S"}; - for (auto& s : cigars) { - CHECK(Cigar(s).to_string() == s); - } + CHECK(Cigar("").empty()); + CHECK(Cigar("1M").to_string() == "1M"); + CHECK(Cigar("10M2I1D99=1X4P5S").to_string() == "10M2I1D99=1X4P5S"); // Not standard, only for convenience CHECK(Cigar("M").to_string() == "1M"); CHECK(Cigar("M M").to_string() == "2M"); @@ -26,7 +25,7 @@ TEST_CASE("Parse CIGAR") { TEST_CASE("Cigar construction and push") { Cigar c1; - CHECK(c1.to_string() == ""); + CHECK(c1.to_string() == "*"); c1.push(CIGAR_MATCH, 1); CHECK(c1.to_string() == "1M"); diff --git a/tests/test_sam.cpp b/tests/test_sam.cpp index 5b52bb16..89bb37ef 100644 --- a/tests/test_sam.cpp +++ b/tests/test_sam.cpp @@ -37,7 +37,7 @@ TEST_CASE("Pair with one unmapped SAM record") { aln1.is_rc = true; aln1.ed = 17; aln1.aln_score = 9; - aln1.cigar = "2M"; + aln1.cigar = Cigar("2M"); alignment aln2; aln2.is_unaligned = true; @@ -91,7 +91,7 @@ TEST_CASE("TLEN zero when reads map to different contigs") { aln1.is_rc = false; aln1.ed = 17; aln1.aln_score = 9; - aln1.cigar = "2M"; + aln1.cigar = Cigar("2M"); alignment aln2; aln2.is_unaligned = false; @@ -100,7 +100,7 @@ TEST_CASE("TLEN zero when reads map to different contigs") { aln2.is_rc = false; aln2.ed = 2; aln2.aln_score = 4; - aln2.cigar = "3M"; + aln2.cigar = Cigar("3M"); klibpp::KSeq record1; klibpp::KSeq record2; From 84e92d82f415dea2d6d4553889c715cdf64fd215 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 11 Apr 2023 15:04:23 +0200 Subject: [PATCH 2/5] Test Sam::add --- tests/test_sam.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_sam.cpp b/tests/test_sam.cpp index 89bb37ef..490aeb61 100644 --- a/tests/test_sam.cpp +++ b/tests/test_sam.cpp @@ -24,6 +24,35 @@ TEST_CASE("Formatting unmapped SAM record") { } } +TEST_CASE("Sam::add") { + References references; + references.add("contig1", "AACCGGTT"); + std::string sam_string; + Sam sam(sam_string, references); + + klibpp::KSeq record; + record.name = "readname"; + record.seq = "AACGT"; + record.qual = ">#BB"; + + alignment aln; + aln.ref_id = 0; + aln.is_unaligned = false; + aln.is_rc = true; + aln.ref_start = 2; + aln.ed = 3; + aln.aln_score = 9; + aln.mapq = 55; + aln.cigar = Cigar("2S2=1X3=3S"); + + std::string read_rc = reverse_complement(record.seq); + bool is_secondary = false; + sam.add(aln, record, read_rc, is_secondary); + CHECK(sam_string == + "readname\t16\tcontig1\t3\t55\t2S2=1X3=3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" + ); +} + TEST_CASE("Pair with one unmapped SAM record") { References references; references.add("contig1", "ACGT"); From 96f14e70ba3f71d29e7e39737e0c360a02d60671 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 11 Apr 2023 15:34:19 +0200 Subject: [PATCH 3/5] Let Sam be able to emit M instead of =/X --- src/cigar.cpp | 17 ++++++++++++++--- src/cigar.hpp | 3 +++ src/pc.cpp | 2 +- src/sam.cpp | 16 +++++++++++++++- src/sam.hpp | 16 +++++++++++++++- tests/test_cigar.cpp | 10 ++++++++-- tests/test_sam.cpp | 27 +++++++++++++++++++-------- 7 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/cigar.cpp b/src/cigar.cpp index 955ef4bd..88676d53 100644 --- a/src/cigar.cpp +++ b/src/cigar.cpp @@ -2,6 +2,20 @@ #include #include "cigar.hpp" +/* Return a new Cigar that uses M operations instead of =/X */ +Cigar Cigar::to_m() const { + Cigar cigar; + for (auto op_len : m_ops) { + auto op = op_len & 0xf; + auto len = op_len >> 4; + if (op == CIGAR_EQ || op == CIGAR_X) { + cigar.push(CIGAR_MATCH, len); + } else { + cigar.push(op, len); + } + } + return cigar; +} Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const { size_t i = 0, j = 0; @@ -31,9 +45,6 @@ Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const { } std::string Cigar::to_string() const { - if (m_ops.empty()) { - return "*"; - } std::stringstream s; for (auto op_len : m_ops) { s << (op_len >> 4) << "MIDNSHP=X"[op_len & 0xf]; diff --git a/src/cigar.hpp b/src/cigar.hpp index 89299d04..ced193b1 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -81,6 +81,9 @@ class Cigar { std::reverse(m_ops.begin(), m_ops.end()); } + /* Return a new Cigar that uses M instead of =/X */ + Cigar to_m() const; + /* Return a new Cigar that uses =/X instead of M */ Cigar to_eqx(const std::string& query, const std::string& ref) const; diff --git a/src/pc.cpp b/src/pc.cpp index 3f88c1cc..6f3c256b 100644 --- a/src/pc.cpp +++ b/src/pc.cpp @@ -146,7 +146,7 @@ void perform_task( std::string sam_out; sam_out.reserve(7*map_param.r * (records1.size() + records3.size())); - Sam sam{sam_out, references, read_group_id, map_param.output_unmapped}; + Sam sam{sam_out, references, CigarOps::EQX, read_group_id, map_param.output_unmapped}; for (size_t i = 0; i < records1.size(); ++i) { auto record1 = records1[i]; auto record2 = records2[i]; diff --git a/src/sam.cpp b/src/sam.cpp index d29aecd0..ba9bcffa 100644 --- a/src/sam.cpp +++ b/src/sam.cpp @@ -42,6 +42,20 @@ void Sam::append_tail() { sam_string.append(tail); } +std::string Sam::cigar_string(const Cigar& cigar) const { + if (cigar.empty()) { + // This case should normally not occur because + // unmapped reads would be added with add_unmapped, + // which hardcodes the "*" + return "*"; + } + if (cigar_ops == CigarOps::EQX) { + return cigar.to_string(); + } else { + return cigar.to_m().to_string(); + } +} + void Sam::add_unmapped(const KSeq& record, int flags) { if (!output_unmapped) { return; @@ -124,7 +138,7 @@ void Sam::add_record( sam_string.append("\t"); sam_string.append(std::to_string(mapq)); sam_string.append("\t"); - sam_string.append(cigar.to_string()); + sam_string.append(cigar_string(cigar)); sam_string.append("\t"); sam_string.append(mate_name); diff --git a/src/sam.hpp b/src/sam.hpp index 254bbdb6..b86ed5a1 100644 --- a/src/sam.hpp +++ b/src/sam.hpp @@ -36,12 +36,24 @@ enum SamFlags { SUPPLEMENTARY = 0x800, }; +enum struct CigarOps { + EQX = 0, // use = and X CIGAR operations + M = 1, // use M CIGAR operations +}; + class Sam { public: - Sam(std::string& sam_string, const References& references, const std::string& read_group_id = "", bool output_unmapped = true) + Sam( + std::string& sam_string, + const References& references, + CigarOps cigar_ops = CigarOps::EQX, + const std::string& read_group_id = "", + bool output_unmapped = true + ) : sam_string(sam_string) , references(references) + , cigar_ops(cigar_ops) , output_unmapped(output_unmapped) { if (read_group_id.empty()) { tail = "\n"; @@ -61,8 +73,10 @@ class Sam { private: void append_tail(); + std::string cigar_string(const Cigar& cigar) const; std::string& sam_string; const References& references; + const CigarOps cigar_ops; std::string tail; bool output_unmapped; }; diff --git a/tests/test_cigar.cpp b/tests/test_cigar.cpp index 733e1c19..675b22d6 100644 --- a/tests/test_cigar.cpp +++ b/tests/test_cigar.cpp @@ -25,7 +25,7 @@ TEST_CASE("Parse CIGAR") { TEST_CASE("Cigar construction and push") { Cigar c1; - CHECK(c1.to_string() == "*"); + CHECK(c1.to_string() == ""); c1.push(CIGAR_MATCH, 1); CHECK(c1.to_string() == "1M"); @@ -47,7 +47,7 @@ TEST_CASE("Cigar construction and push") { CHECK(c2.to_string() == "3M5X7I13D"); } -TEST_CASE("Cigar =/X conversion") { +TEST_CASE("Cigar conversion from M to =/X") { Cigar c1{"1M"}; CHECK(c1.to_eqx("A", "A").to_string() == "1="); CHECK(c1.to_eqx("A", "G").to_string() == "1X"); @@ -61,6 +61,12 @@ TEST_CASE("Cigar =/X conversion") { CHECK(c.to_eqx("ACTTTGCATT", "ACGTATGAAA").to_string() == "2=1D1=1X2=1I1=2X"); } +TEST_CASE("Cigar conversion from =/X to M") { + CHECK(Cigar("").to_m().empty()); + CHECK(Cigar("5=").to_m().to_string() == "5M"); + CHECK(Cigar("5S3=1X2=4S").to_m().to_string() == "5S6M4S"); +} + TEST_CASE("concatenate Cigar") { Cigar c{"3M"}; c += Cigar{"2M1X"}; diff --git a/tests/test_sam.cpp b/tests/test_sam.cpp index 490aeb61..e1e1f16c 100644 --- a/tests/test_sam.cpp +++ b/tests/test_sam.cpp @@ -8,16 +8,17 @@ TEST_CASE("Formatting unmapped SAM record") { kseq.seq = "ACGT"; kseq.qual = ">#BB"; std::string sam_string; + References references; SUBCASE("without RG") { - Sam sam(sam_string, References()); + Sam sam(sam_string, references); sam.add_unmapped(kseq); CHECK(sam_string == "read1\t4\t*\t0\t0\t*\t*\t0\t0\tACGT\t>#BB\n"); } SUBCASE("with RG") { - Sam sam(sam_string, References(), "rg1"); + Sam sam(sam_string, references, CigarOps::EQX, "rg1"); sam.add_unmapped(kseq); CHECK(sam_string == "read1\t4\t*\t0\t0\t*\t*\t0\t0\tACGT\t>#BB\tRG:Z:rg1\n"); @@ -27,8 +28,6 @@ TEST_CASE("Formatting unmapped SAM record") { TEST_CASE("Sam::add") { References references; references.add("contig1", "AACCGGTT"); - std::string sam_string; - Sam sam(sam_string, references); klibpp::KSeq record; record.name = "readname"; @@ -47,10 +46,22 @@ TEST_CASE("Sam::add") { std::string read_rc = reverse_complement(record.seq); bool is_secondary = false; - sam.add(aln, record, read_rc, is_secondary); - CHECK(sam_string == - "readname\t16\tcontig1\t3\t55\t2S2=1X3=3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" - ); + SUBCASE("Cigar =/X") { + std::string sam_string; + Sam sam(sam_string, references); + sam.add(aln, record, read_rc, is_secondary); + CHECK(sam_string == + "readname\t16\tcontig1\t3\t55\t2S2=1X3=3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" + ); + } + SUBCASE("Cigar M") { + std::string sam_string; + Sam sam(sam_string, references, CigarOps::M); + sam.add(aln, record, read_rc, is_secondary); + CHECK(sam_string == + "readname\t16\tcontig1\t3\t55\t2S6M3S\t*\t0\t0\tACGTT\tBB#>\tNM:i:3\tAS:i:9\n" + ); + } } TEST_CASE("Pair with one unmapped SAM record") { From 7f26bbe2176dc8b1c4b94799a0252ab5a79aa83c Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 11 Apr 2023 15:47:26 +0200 Subject: [PATCH 4/5] Add --m-op option for emitting M instead of =/X Closes #20 --- CHANGES.md | 2 ++ src/aln.hpp | 1 + src/cmdline.cpp | 2 ++ src/cmdline.hpp | 1 + src/main.cpp | 1 + src/pc.cpp | 3 ++- tests/run.sh | 6 ++++++ 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 98c9f348..19b6b943 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,8 @@ * #258: Fix compilation on MinGW. Thanks @teepean. * #260: Include full command line in the SAM PG header. +* #20: Add command-line option `--m-op`. If used, emit `M` CIGAR operations + instead of `=` and `X`. ## v0.9.0 (2023-03-16) diff --git a/src/aln.hpp b/src/aln.hpp index c87b0da6..75526973 100644 --- a/src/aln.hpp +++ b/src/aln.hpp @@ -53,6 +53,7 @@ struct mapping_params { int maxTries { 20 }; int rescue_cutoff; bool is_sam_out { true }; + bool cigar_eqx { true }; bool output_unmapped { true }; }; diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 8e464439..65f0bc10 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -26,6 +26,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { args::ValueFlag o(parser, "PATH", "redirect output to file [stdout]", {'o'}); args::Flag v(parser, "v", "Verbose output", {'v'}); args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"}); + args::Flag cigar_m(parser, "cigar-m", "Use CIGAR M instead of =/X", {"m-op"}); args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'}); args::Flag U(parser, "U", "Suppress output of unmapped reads", {'U'}); args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"}); @@ -88,6 +89,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { if (o) { opt.output_file_name = args::get(o); opt.write_to_stdout = false; } if (v) { opt.verbose = true; } if (no_progress) { opt.show_progress = false; } + if (cigar_m) { opt.cigar_eqx = false; } if (x) { opt.is_sam_out = false; } if (U) { opt.output_unmapped = false; } if (rgid) { opt.read_group_id = args::get(rgid); } diff --git a/src/cmdline.hpp b/src/cmdline.hpp index 11ab2440..bd53839d 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -14,6 +14,7 @@ struct CommandLineOptions { bool write_to_stdout { true }; bool verbose { false }; bool show_progress { true }; + bool cigar_eqx { true }; std::string read_group_id { "" }; std::vector read_group_fields; std::string logfile_name { "" }; diff --git a/src/main.cpp b/src/main.cpp index d23a67e4..d6b19591 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,6 +172,7 @@ int run_strobealign(int argc, char **argv) { map_param.R = opt.R; map_param.maxTries = opt.maxTries; map_param.is_sam_out = opt.is_sam_out; + map_param.cigar_eqx = opt.cigar_eqx; map_param.output_unmapped = opt.output_unmapped; log_parameters(index_parameters, map_param, aln_params); diff --git a/src/pc.cpp b/src/pc.cpp index 6f3c256b..ac138fde 100644 --- a/src/pc.cpp +++ b/src/pc.cpp @@ -146,7 +146,8 @@ void perform_task( std::string sam_out; sam_out.reserve(7*map_param.r * (records1.size() + records3.size())); - Sam sam{sam_out, references, CigarOps::EQX, read_group_id, map_param.output_unmapped}; + CigarOps cigar_ops = map_param.cigar_eqx ? CigarOps::EQX : CigarOps::M; + Sam sam{sam_out, references, cigar_ops, read_group_id, map_param.output_unmapped}; for (size_t i = 0; i < records1.size(); ++i) { auto record1 = records1[i]; auto record2 = records2[i]; diff --git a/tests/run.sh b/tests/run.sh index 599488ea..3dc5bfd0 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -25,6 +25,12 @@ strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library -v tests/phi diff tests/phix.se.sam phix.se.sam rm phix.se.sam +# Single-end SAM, M CIGAR operators +strobealign --m-op tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.m.sam +if samtools view phix.se.m.sam | cut -f6 | grep -q '[X=]'; then false; fi + +rm phix.se.m.sam + # Paired-end SAM strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | grep -v '^@PG' > phix.pe.sam diff tests/phix.pe.sam phix.pe.sam From 56742b9767faa79b142d47e337d5bf9ac14ad6a9 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 11 Apr 2023 21:23:22 +0200 Subject: [PATCH 5/5] Emit M CIGAR operations by default instead of =/X and add --eqx option --- CHANGES.md | 18 +++++++++--------- README.md | 1 + src/cmdline.cpp | 4 ++-- src/cmdline.hpp | 2 +- tests/run.sh | 6 +++--- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 19b6b943..3e4dc067 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,14 +2,14 @@ ## development version -* #258: Fix compilation on MinGW. Thanks @teepean. -* #260: Include full command line in the SAM PG header. -* #20: Add command-line option `--m-op`. If used, emit `M` CIGAR operations - instead of `=` and `X`. +* #258: Fixed compilation on MinGW. Thanks @teepean. +* #260: Include full command line in the SAM PG header. Thanks @telmin. +* #20: By default, emit `M` CIGAR operations instead of `=` and `X`. + Added option `--eqx` to use `=` and `X` as before. ## v0.9.0 (2023-03-16) -* Add progress report (only shown if output is not a terminal; can be +* Added progress report (only shown if output is not a terminal; can be disabled with `--no-progress`) * PR #250: Avoid overeager soft clipping by adding an “end bonus” to the alignment score if the alignment reaches the 5' or 3' end of the read. @@ -17,12 +17,12 @@ accuracy, in particular for short reads, as candidate mapping sites with and without soft clipping are compared more fairly. Use `-L` to change the end bonus. (This emulates a feature found in BWA-MEM.) -* Issue #238: Fix occasionally incorrect soft clipping. -* PR #239: Fix an uninitialized variable that could lead to nondeterministic +* Issue #238: Fixed occasionally incorrect soft clipping. +* PR #239: Fixed an uninitialized variable that could lead to nondeterministic results. * Issue #137: Compute TLEN (in SAM output) correctly -* PR #255: Add support for reading gzip-compressed reference FASTA files. -* Issue #222: Make it possible again to build strobealign from the release +* PR #255: Added support for reading gzip-compressed reference FASTA files. +* Issue #222: Made it possible again to build strobealign from the release tarball (not only from the Git repository). ## v0.8.0 (2023-02-01) diff --git a/README.md b/README.md index 1438ca4a..1d63ec7c 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,7 @@ options. Some important ones are: `--create-index`, see [index files](#index-files). * `-t N`, `--threads=N`: Use N threads. This mainly applies to the mapping step as the indexing step is only partially parallelized. +* `--eqx`: Emit `=` and `X` CIGAR operations instead of `M`. * `-x`: Only map reads, do not do no base-level alignment. This switches the output format from SAM to [PAF](https://github.com/lh3/miniasm/blob/master/PAF.md). * `--rg-id=ID`: Add RG tag to each SAM record. diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 65f0bc10..4674ab3f 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -26,7 +26,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { args::ValueFlag o(parser, "PATH", "redirect output to file [stdout]", {'o'}); args::Flag v(parser, "v", "Verbose output", {'v'}); args::Flag no_progress(parser, "no-progress", "Disable progress report (enabled by default if output is a terminal)", {"no-progress"}); - args::Flag cigar_m(parser, "cigar-m", "Use CIGAR M instead of =/X", {"m-op"}); + args::Flag eqx(parser, "eqx", "Emit =/X instead of M CIGAR operations", {"eqx"}); args::Flag x(parser, "x", "Only map reads, no base level alignment (produces PAF file)", {'x'}); args::Flag U(parser, "U", "Suppress output of unmapped reads", {'U'}); args::Flag interleaved(parser, "interleaved", "Interleaved input", {"interleaved"}); @@ -89,7 +89,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { if (o) { opt.output_file_name = args::get(o); opt.write_to_stdout = false; } if (v) { opt.verbose = true; } if (no_progress) { opt.show_progress = false; } - if (cigar_m) { opt.cigar_eqx = false; } + if (eqx) { opt.cigar_eqx = true; } if (x) { opt.is_sam_out = false; } if (U) { opt.output_unmapped = false; } if (rgid) { opt.read_group_id = args::get(rgid); } diff --git a/src/cmdline.hpp b/src/cmdline.hpp index bd53839d..0239d075 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -14,7 +14,7 @@ struct CommandLineOptions { bool write_to_stdout { true }; bool verbose { false }; bool show_progress { true }; - bool cigar_eqx { true }; + bool cigar_eqx { false }; std::string read_group_id { "" }; std::vector read_group_fields; std::string logfile_name { "" }; diff --git a/tests/run.sh b/tests/run.sh index 3dc5bfd0..8e5bc910 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -21,18 +21,18 @@ if strobealign -G > /dev/null 2> /dev/null; then false; fi strobealign -h > /dev/null # Single-end SAM -strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library -v tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.sam +strobealign --eqx --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library -v tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.sam diff tests/phix.se.sam phix.se.sam rm phix.se.sam # Single-end SAM, M CIGAR operators -strobealign --m-op tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.m.sam +strobealign tests/phix.fasta tests/phix.1.fastq | grep -v '^@PG' > phix.se.m.sam if samtools view phix.se.m.sam | cut -f6 | grep -q '[X=]'; then false; fi rm phix.se.m.sam # Paired-end SAM -strobealign --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | grep -v '^@PG' > phix.pe.sam +strobealign --eqx --chunk-size 3 --rg-id 1 --rg SM:sample --rg LB:library tests/phix.fasta tests/phix.1.fastq tests/phix.2.fastq | grep -v '^@PG' > phix.pe.sam diff tests/phix.pe.sam phix.pe.sam rm phix.pe.sam