diff --git a/src/aln.cpp b/src/aln.cpp index 0497ceb8..de134dec 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -14,7 +14,7 @@ using namespace klibpp; static inline Alignment get_alignment( const Aligner& aligner, - const Nam &n, + const Nam &nam, const References& references, const Read& read, bool consistent_nam @@ -34,36 +34,36 @@ static inline bool score(const Nam &a, const Nam &b) { * in place and return true. * - If first and last strobe do not match consistently, return false. */ -bool reverse_nam_if_needed(Nam& n, const Read& read, const References& references, int k) { +bool reverse_nam_if_needed(Nam& nam, const Read& read, const References& references, int k) { auto read_len = read.size(); - std::string ref_start_kmer = references.sequences[n.ref_id].substr(n.ref_s, k); - std::string ref_end_kmer = references.sequences[n.ref_id].substr(n.ref_e-k, k); + std::string ref_start_kmer = references.sequences[nam.ref_id].substr(nam.ref_start, k); + std::string ref_end_kmer = references.sequences[nam.ref_id].substr(nam.ref_end-k, k); std::string seq, seq_rc; - if (n.is_rc) { + if (nam.is_rc) { seq = read.rc; seq_rc = read.seq; } else { seq = read.seq; seq_rc = read.rc; } - std::string read_start_kmer = seq.substr(n.query_s, k); - std::string read_end_kmer = seq.substr(n.query_e-k, k); + std::string read_start_kmer = seq.substr(nam.query_start, k); + std::string read_end_kmer = seq.substr(nam.query_end-k, k); if (ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer) { return true; } // False forward or false reverse (possible due to symmetrical hash values) // we need two extra checks for this - hopefully this will remove all the false hits we see (true hash collisions should be very few) - int q_start_tmp = read_len - n.query_e; - int q_end_tmp = read_len - n.query_s; + int q_start_tmp = read_len - nam.query_end; + int q_end_tmp = read_len - nam.query_start; // false reverse hit, change coordinates in nam to forward read_start_kmer = seq_rc.substr(q_start_tmp, k); read_end_kmer = seq_rc.substr(q_end_tmp - k, k); if (ref_start_kmer == read_start_kmer && ref_end_kmer == read_end_kmer) { - n.is_rc = !n.is_rc; - n.query_s = q_start_tmp; - n.query_e = q_end_tmp; + nam.is_rc = !nam.is_rc; + nam.query_start = q_start_tmp; + nam.query_end = q_end_tmp; return true; } return false; @@ -72,16 +72,16 @@ bool reverse_nam_if_needed(Nam& n, const Read& read, const References& reference static inline void align_SE( const Aligner& aligner, Sam& sam, - std::vector& all_nams, + std::vector& nams, const KSeq& record, int k, const References& references, Details& details, - float dropoff, + float dropoff_threshold, int max_tries, unsigned max_secondary ) { - if (all_nams.empty()) { + if (nams.empty()) { sam.add_unmapped(record); return; } @@ -89,71 +89,68 @@ static inline void align_SE( Read read(record.seq); std::vector alignments; int tries = 0; - float score_dropoff; - Nam n_max = all_nams[0]; - - int best_align_dist = ~0U >> 1; - int best_align_sw_score = -1000; - - Alignment best_sam_aln; - best_sam_aln.sw_score = -100000; - best_sam_aln.is_unaligned = true; - int min_mapq_diff = best_align_dist; - for (auto &n : all_nams) { - score_dropoff = (float) n.n_hits / n_max.n_hits; - - if (tries >= max_tries || best_align_dist == 0 || score_dropoff < dropoff) { // only consider top 20 hits as minimap2 and break if alignment is exact match to reference or the match below droppoff cutoff. + Nam n_max = nams[0]; + + int best_edit_distance = std::numeric_limits::max(); + int best_score = -1000; + + Alignment best_alignment; + best_alignment.score = -100000; + best_alignment.is_unaligned = true; + int min_mapq_diff = best_edit_distance; + for (auto &nam : nams) { + float score_dropoff = (float) nam.n_hits / n_max.n_hits; + if (tries >= max_tries || best_edit_distance == 0 || score_dropoff < dropoff_threshold) { break; } - bool consistent_nam = reverse_nam_if_needed(n, read, references, k); + bool consistent_nam = reverse_nam_if_needed(nam, read, references, k); details.nam_inconsistent += !consistent_nam; - auto sam_aln = get_alignment(aligner, n, references, read, consistent_nam); + auto alignment = get_alignment(aligner, nam, references, read, consistent_nam); details.tried_alignment++; - details.gapped += sam_aln.gapped; + details.gapped += alignment.gapped; - int diff_to_best = std::abs(best_align_sw_score - sam_aln.sw_score); + int diff_to_best = std::abs(best_score - alignment.score); min_mapq_diff = std::min(min_mapq_diff, diff_to_best); if (max_secondary > 0) { - alignments.emplace_back(sam_aln); + alignments.emplace_back(alignment); } - if (sam_aln.sw_score > best_align_sw_score) { - min_mapq_diff = std::max(0, sam_aln.sw_score - best_align_sw_score); // new distance to next best match - best_align_sw_score = sam_aln.sw_score; - best_sam_aln = std::move(sam_aln); + if (alignment.score > best_score) { + min_mapq_diff = std::max(0, alignment.score - best_score); // new distance to next best match + best_score = alignment.score; + best_alignment = std::move(alignment); if (max_secondary == 0) { - best_align_dist = best_sam_aln.global_ed; + best_edit_distance = best_alignment.global_ed; } } tries++; } if (max_secondary == 0) { - best_sam_aln.mapq = std::min(min_mapq_diff, 60); - sam.add(best_sam_aln, record, read.rc, true, details); + best_alignment.mapq = std::min(min_mapq_diff, 60); + sam.add(best_alignment, record, read.rc, true, details); return; } // Sort alignments by score, highest first std::sort(alignments.begin(), alignments.end(), [](const Alignment& a, const Alignment& b) -> bool { - return a.sw_score > b.sw_score; + return a.score > b.score; } ); auto max_out = std::min(alignments.size(), static_cast(max_secondary) + 1); - bool is_primary{true}; for (size_t i = 0; i < max_out; ++i) { - auto sam_aln = alignments[i]; - if ((sam_aln.sw_score - best_align_sw_score) > (2*aligner.parameters.mismatch + aligner.parameters.gap_open) ){ + auto alignment = alignments[i]; + if (alignment.score - best_score > 2*aligner.parameters.mismatch + aligner.parameters.gap_open) { break; } - if (!is_primary) { - sam_aln.mapq = 255; + bool is_primary = i == 0; + if (is_primary) { + alignment.mapq = std::min(min_mapq_diff, 60); } else { - sam_aln.mapq = std::min(min_mapq_diff, 60); + alignment.mapq = 255; } - sam.add(sam_aln, record, read.rc, is_primary, details); - is_primary = false; + sam.add(alignment, record, read.rc, is_primary, details); } } @@ -167,11 +164,10 @@ static inline Alignment align_segment( bool consistent_nam, bool is_rc ) { - Alignment sam_aln_segm; - auto ref_segm_len = ref_segm.size(); + Alignment alignment; auto read_segm_len = read_segm.size(); // The ref_segm includes an extension of ext_left bases upstream and ext_right bases downstream - int ref_segm_len_ham = ref_segm_len - ext_left - ext_right; // we send in the already extended ref segment to save time. This is not true in center alignment if merged match have diff length + auto ref_segm_len_ham = ref_segm.size() - ext_left - ext_right; // we send in the already extended ref segment to save time. This is not true in center alignment if merged match have diff length if (ref_segm_len_ham == read_segm_len && consistent_nam) { std::string ref_segm_ham = ref_segm.substr(ext_left, read_segm_len); @@ -179,27 +175,25 @@ static inline Alignment align_segment( if (hamming_dist >= 0 && (((float) hamming_dist / read_segm_len) < 0.05) ) { //Hamming distance worked fine, no need to ksw align auto info = hamming_align(read_segm, ref_segm_ham, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); - sam_aln_segm.cigar = std::move(info.cigar); - sam_aln_segm.ed = info.edit_distance; - sam_aln_segm.sw_score = info.sw_score; - sam_aln_segm.ref_start = ref_start + ext_left + info.query_start; - sam_aln_segm.is_rc = is_rc; - sam_aln_segm.is_unaligned = false; - sam_aln_segm.aln_score = info.sw_score; - sam_aln_segm.aln_length = read_segm_len; - return sam_aln_segm; + alignment.cigar = std::move(info.cigar); + alignment.edit_distance = info.edit_distance; + alignment.score = info.sw_score; + alignment.ref_start = ref_start + ext_left + info.query_start; + alignment.is_rc = is_rc; + alignment.is_unaligned = false; + alignment.length = read_segm_len; + return alignment; } } auto info = aligner.align(read_segm, ref_segm); - sam_aln_segm.cigar = std::move(info.cigar); - sam_aln_segm.ed = info.edit_distance; - sam_aln_segm.sw_score = info.sw_score; - sam_aln_segm.ref_start = ref_start + info.ref_start; - sam_aln_segm.is_rc = is_rc; - sam_aln_segm.is_unaligned = false; - sam_aln_segm.aln_score = info.sw_score; - sam_aln_segm.aln_length = info.ref_span(); - return sam_aln_segm; + alignment.cigar = std::move(info.cigar); + alignment.edit_distance = info.edit_distance; + alignment.score = info.sw_score; + alignment.ref_start = ref_start + info.ref_start; + alignment.is_rc = is_rc; + alignment.is_unaligned = false; + alignment.length = info.ref_span(); + return alignment; } @@ -224,16 +218,16 @@ static inline Alignment align_segment( static inline Alignment get_alignment( const Aligner& aligner, - const Nam &n, + const Nam &nam, const References& references, const Read& read, bool consistent_nam ) { - const std::string query = n.is_rc ? read.rc : read.seq; - const std::string& ref = references.sequences[n.ref_id]; + const std::string query = nam.is_rc ? read.rc : read.seq; + const std::string& ref = references.sequences[nam.ref_id]; - const int projected_ref_start = std::max(0, n.ref_s - n.query_s); - const int projected_ref_end = std::min(n.ref_e + query.size() - n.query_e, ref.size()); + const auto projected_ref_start = std::max(0, nam.ref_start - nam.query_start); + const auto projected_ref_end = std::min(nam.ref_end + query.size() - nam.query_end, ref.size()); aln_info info; int result_ref_start; @@ -249,296 +243,54 @@ static inline Alignment get_alignment( } } if (gapped) { - const int diff = std::abs(n.ref_span() - n.query_span()); + const int diff = std::abs(nam.ref_span() - nam.query_span()); const int ext_left = std::min(50, projected_ref_start); const int ref_start = projected_ref_start - ext_left; - const int ext_right = std::min(std::size_t(50), ref.size() - n.ref_e); + const int ext_right = std::min(std::size_t(50), ref.size() - nam.ref_end); const auto ref_segm_size = read.size() + diff + ext_left + ext_right; const auto ref_segm = ref.substr(ref_start, ref_segm_size); info = aligner.align(query, ref_segm); result_ref_start = ref_start + info.ref_start; } int softclipped = info.query_start + (query.size() - info.query_end); - Alignment sam_aln; - sam_aln.cigar = std::move(info.cigar); - sam_aln.ed = info.edit_distance; - sam_aln.global_ed = info.edit_distance + softclipped; - sam_aln.sw_score = info.sw_score; - sam_aln.aln_score = info.sw_score; - sam_aln.ref_start = result_ref_start; - sam_aln.aln_length = info.ref_span(); - sam_aln.is_rc = n.is_rc; - sam_aln.is_unaligned = false; - sam_aln.ref_id = n.ref_id; - sam_aln.gapped = gapped; - - return sam_aln; + Alignment alignment; + alignment.cigar = std::move(info.cigar); + alignment.edit_distance = info.edit_distance; + alignment.global_ed = info.edit_distance + softclipped; + alignment.score = info.sw_score; + alignment.ref_start = result_ref_start; + alignment.length = info.ref_span(); + alignment.is_rc = nam.is_rc; + alignment.is_unaligned = false; + alignment.ref_id = nam.ref_id; + alignment.gapped = gapped; + + return alignment; } -static inline Alignment get_alignment_unused( - const Aligner& aligner, - const Nam &n, - const References& references, - const Read& read, - int k, - bool consistent_nam -) { - Alignment sam_aln; - - const auto read_len = read.size(); - const int diff = std::abs(n.ref_span() - n.query_span()); - - const std::string r_tmp = n.is_rc ? read.rc : read.seq; - const bool is_rc = n.is_rc; - - int ext_left; - int ext_right; - int ref_tmp_segm_size; - const int ref_len = references.lengths[n.ref_id]; - int ref_tmp_start; - std::string read_segm; - - // test full hamming based alignment first - ref_tmp_start = std::max(0, n.ref_s - n.query_s); - int ref_start = std::max(0, ref_tmp_start); - ref_tmp_segm_size = read_len + diff; - auto ref_segm_size = std::min(ref_tmp_segm_size, ref_len - ref_start + 1); - - std::string ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_segm_size); - if (ref_segm_size == read_len && consistent_nam) { - int hamming_dist = hamming_distance(r_tmp, ref_segm); - - if (hamming_dist >= 0 && (((float) hamming_dist / ref_segm_size) < 0.05) ) { //Hamming distance worked fine, no need to ksw align - auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); - sam_aln.cigar = std::move(info.cigar); - sam_aln.ed = info.edit_distance; - sam_aln.sw_score = info.sw_score; // aln_params.match*(read_len-hamming_dist) - aln_params.mismatch*hamming_dist; - sam_aln.ref_start = ref_start + ext_left + info.query_start; - sam_aln.is_rc = is_rc; - sam_aln.is_unaligned = false; - sam_aln.aln_score = info.sw_score; - return sam_aln; - } - } - - //// Didn't work with global Hamming - split into parts - - // identify one or two split points within the read if the segment is are larger than T - int T = 20; - // find the most central split point Use convex function result sum of squares - int left_outer = pow (n.query_s, 2) + pow(read_len - n.query_s, 2); - int right_inner = pow (n.query_e - k, 2) + pow (read_len - (n.query_e - k), 2); - - - int global_max_bp = left_outer < right_inner ? n.query_s : n.query_e - k; - int break_point = (global_max_bp >= T) && (global_max_bp <= (read_len - T)) ? global_max_bp : -1; - if (break_point > 0 ){ -// std::cerr << "MAX BREAKPOINT " << break_point << " candidates: " << n.query_s << " " << n.query_e - k << std::endl; - int left_region_bp = break_point + k; - int right_region_bp = break_point; -// std::cerr << "left_region_bp " << left_region_bp << " right_region_bp: " << right_region_bp << std::endl; - int right_ref_start_bp = -1; - if (break_point == n.query_s){ - right_ref_start_bp = n.ref_s; - } else if (break_point == (n.query_e - k)) { - right_ref_start_bp = n.ref_e-k; - } else { - std::cerr << "BUUUUUUG " << std::endl; - } - - // Left region align - read_segm = r_tmp.substr(0, left_region_bp); - ref_tmp_start = std::max(0, n.ref_s - n.query_s); - ext_left = std::min(50, ref_tmp_start); - ext_right = 0; - ref_start = ref_tmp_start - ext_left; - ref_segm_size = left_region_bp + ext_left + diff; - ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_segm_size); -// std::cerr << " " << std::endl; -// std::cerr << "GOING IN LEFT: " << " read segm len " << read_segm.length() << " ref segm len " << ref_segm_size << " ext_left: " << ext_left << std::endl; -// std::cerr << diff << std::endl; -// std::cerr << read_segm << std::endl; -// std::cerr << ref_segm << std::endl; - - auto sam_aln_segm_left = align_segment(aligner, read_segm, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); -// std::cerr << "LEFT CIGAR: " << sam_aln_segm_left.cigar << std::endl; - - //Right region align - read_segm = r_tmp.substr(right_region_bp, read_len - right_region_bp ); - ref_tmp_segm_size = right_ref_start_bp + (read_len + diff - right_region_bp) < ref_len ? (read_len + diff - right_region_bp) : ref_len - right_ref_start_bp; - ext_left = 0; - ext_right = std::min(50, ref_len - (right_ref_start_bp + ref_tmp_segm_size)); - ref_segm_size = ref_tmp_segm_size + ext_right; - ref_segm = references.sequences[n.ref_id].substr(right_ref_start_bp, ref_segm_size); -// std::cerr << " " << std::endl; -// std::cerr << "GOING IN RIGHT: " << " read segm len " << read_segm.length() << " ref segm len " << ref_segm_size << " ext_right: " << ext_right << std::endl; -// std::cerr << diff << std::endl; -// std::cerr << read_segm << std::endl; -// std::cerr << ref_segm << std::endl; -// std::cerr << "read_segm.length(): " << read_segm.length() << " ref_segm_size " << ref_segm_size << std::endl; - auto sam_aln_segm_right = align_segment(aligner, read_segm, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); -// std::cerr << "RIGHT CIGAR: " << sam_aln_segm_right.cigar << std::endl; - - - // Stitch together - sam_aln.ref_id = n.ref_id; - sam_aln.cigar = Cigar(sam_aln_segm_left.cigar.to_string() + sam_aln_segm_right.cigar.to_string()); - sam_aln.ed = sam_aln_segm_left.ed + sam_aln_segm_right.ed; - sam_aln.sw_score = sam_aln_segm_left.sw_score + sam_aln_segm_right.sw_score; - sam_aln.ref_start = sam_aln_segm_left.ref_start; - sam_aln.is_rc = n.is_rc; - sam_aln.is_unaligned = false; - sam_aln.aln_score = sam_aln.sw_score; - } else { -// std::cerr << "NOOOO MAX BREAKPOINT " << break_point << " candidates: " << n.query_s << " " << n.query_e - k << std::endl; - // full align - ref_tmp_start = std::max(0, n.ref_s - n.query_s); - ext_left = std::min(50, ref_tmp_start); - ref_start = ref_tmp_start - ext_left; - - ref_tmp_segm_size = read_len + diff; - ext_right = std::min(50, ref_len - (n.ref_e +1)); - - ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; - ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_segm_size); -// std::cerr << " ref_tmp_start " << ref_tmp_start << " ext left " << ext_left << " ext right " << ext_right << " ref_tmp_segm_size " << ref_tmp_segm_size << " ref_segm_size " << ref_segm_size << " ref_segm " << ref_segm << std::endl; - sam_aln = align_segment(aligner, r_tmp, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); - sam_aln.ref_id = n.ref_id; - } - - - // TODO: Several breakpoints. To complicated for probably little gain if short reads, maybe implement later.. -// // Left and right breakpoint -// int left_bp_read = n.query_s + k; -// int right_bp_read = n.query_e - k; -// left_bp_read = left_bp_read > T ? left_bp_read : -1; -// right_bp_read = read_len - right_bp_read > T ? read_len - right_bp_read : -1; -// -// // Decide where to break -// bool break1, break2; -// if ( ((read_len - right_bp_read - left_bp_read) > T) && (left_bp_read >= T) && (right_bp_read >= T) ){ // Use two breakpoints ---------------X|------------------X|--------------- -// break1 = true; -// break2 = true; -// std::cerr << "CASE1 " << std::endl; -// } else if ( (left_bp_read >= T) && (right_bp_read >= T) ) { // Use one breakpoint ---------------------------X--------X-------------------------- -// break1 = left_bp_read >= right_bp_read ? true : false; // ------------------------------------X|--------X-------------------- -// break2 = left_bp_read < right_bp_read ? true : false; // ---------------------X--------|X----------------------------------- -// std::cerr << "CASE2 " << std::endl; -// } else if ( (left_bp_read >= T) && ( (read_len - left_bp_read) >= T) ) { // Use one breakpoint -----------------------X|-------------------X------- -// break1 = true; -// break2 = false; -// std::cerr << "CASE3 " << std::endl; -// } else if ( ((right_bp_read) > T) && (read_len - right_bp_read >= T) ) { // Use one breakpoint ----------X-------------------X|--------------------------- -// break1 = false; -// break2 = true; -// std::cerr << "CASE4 " << std::endl; -// } else { // No breakpoints ------X--------------------------------------X-------- -// break1 = false; -// break2 = false; -// std::cerr << "CASE5 " << std::endl; -// } -// -// std::cerr << "BREAKPOINTS: LEFT: " << left_bp_read << " RIGHT: " << right_bp_read << " BREAKING LEFT: " << break1 << " BREAKING RIGHT: " << break2 << std::endl; -// std::cerr << "MAPPING POS n.ref_s: " << n.ref_s << " n.ref_e: " << n.ref_e << " n.query_s " << n.query_s << " n.query_e: " << n.query_e << std::endl; -// std::cerr << " " << std::endl; - - -// // left alignment -// ref_end = n.ref_s + k; -// ref_start = ref_end - ext_left; -// ext_left = ref_end < 50 ? ref_end : 50; -// -// ref_tmp_segm_size = n.query_s + k; -// ext_right = 0; -// -// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; -// ref_segm = ref_seqs[n.ref_id].substr(ref_end - ext_left, ref_segm_size); -// alignment sam_aln_segm_left; -// sam_aln_segm_left.ref_id = n.ref_id; -// read_segm = r_tmp.substr(0, n.query_s+k); -// -// std::cerr << "LEFT: ref_start: " << ref_start << " LEFT: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; -// -// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, sam_aln_segm_left, tot_ksw_aligned); -// -// -// // center alignment -// ref_tmp_start = n.ref_s - n.query_s; -// ref_start = ref_tmp_start > 0 ? ref_tmp_start : 0; -// ext_left = 0; -// -// ref_tmp_segm_size = n.ref_e - n.ref_s; -// ext_right = 0; -// -// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; -// ref_segm = ref_seqs[n.ref_id].substr(ref_start, ref_segm_size); -// alignment sam_aln_segm_center; -// sam_aln_segm_center.ref_id = n.ref_id; -// read_segm = r_tmp.substr(n.query_s, n.query_e - n.query_s); -// std::cerr << "CENTER: ref_start: " << ref_start << " CENTER: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; -// -// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, sam_aln_segm_center, tot_ksw_aligned); -// -// -// // right alignment -// ext_left = 0; -// ref_start = n.ref_e - k; -// -// ext_right = ref_len - (n.ref_e +1) < 50 ? ref_len - (n.ref_e +1) : 50; -// ref_tmp_segm_size = n.ref_e + ext_right - ref_start; -// -// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; -// ref_segm = ref_seqs[n.ref_id].substr(n.ref_e - ext_left, ref_segm_size); -// alignment sam_aln_segm_right; -// sam_aln_segm_left.ref_id = n.ref_id; -// read_segm = r_tmp.substr(0, n.query_s+k); -// std::cerr << "RIGHT: ref_start: " << ref_start << " RIGHT: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; -// -// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, sam_aln_segm_right, tot_ksw_aligned); -// -// std::cout << sam_aln_segm_left.cigar << " " << sam_aln_segm_center.cigar << " " << sam_aln_segm_right.cigar << std::endl; -// -// sam_aln.ref_id = n.ref_id; -// sam_aln.cigar = sam_aln_segm_left.cigar + sam_aln_segm_center.cigar + sam_aln_segm_right.cigar; -// sam_aln.ed = sam_aln_segm_left.ed + sam_aln_segm_center.ed + sam_aln_segm_right.ed; -// sam_aln.sw_score = sam_aln_segm_left.sw_score + sam_aln_segm_center.sw_score + sam_aln_segm_right.sw_score; -// sam_aln.ref_start = sam_aln_segm_left.ref_start; -// sam_aln.is_rc = sam_aln_segm_left.is_rc; -// sam_aln.is_unaligned = false; -// sam_aln.aln_score = sam_aln.sw_score; -// std::cout << "Joint: " << sam_aln.cigar << std::endl; - - return sam_aln; -} - - - - -static inline int get_MAPQ(const std::vector &all_nams, const Nam &n_max) { - const float s1 = n_max.score; - if (all_nams.size() <= 1) { +static inline uint8_t get_mapq(const std::vector &nams, const Nam &n_max) { + if (nams.size() <= 1) { return 60; } - const Nam n_second = all_nams[1]; - const float s2 = n_second.score; + const float s1 = n_max.score; + const float s2 = nams[1].score; // from minimap2: MAPQ = 40(1−s2/s1) ·min{1,|M|/10} · log s1 const float min_matches = std::min(n_max.n_hits / 10.0, 1.0); const int uncapped_mapq = 40 * (1 - s2 / s1) * min_matches * log(s1); return std::min(uncapped_mapq, 60); } - -static inline std::pair joint_mapq_from_alignment_scores(float S1, float S2) { +static inline std::pair joint_mapq_from_alignment_scores(float score1, float score2) { int mapq; - if (S1 == S2) { // At least two identical placements + if (score1 == score2) { // At least two identical placements mapq = 0; } else { - const int diff = S1 - S2; // (1.0 - (S1 - S2) / S1); + const int diff = score1 - score2; // (1.0 - (S1 - S2) / S1); // float log10_p = diff > 6 ? -6.0 : -diff; // Corresponds to: p_error= 0.1^diff // change in sw score times rough illumina error rate. This is highly heauristic, but so seem most computations of mapq scores - if (S1 > 0 && S2 > 0) { + if (score1 > 0 && score2 > 0) { mapq = std::min(60, diff); // mapq1 = -10 * log10_p < 60 ? -10 * log10_p : 60; - } else if (S1 > 0 && S2 <= 0) { + } else if (score1 > 0 && score2 <= 0) { mapq = 60; } else { // both negative SW one is better mapq = 1; @@ -547,7 +299,6 @@ static inline std::pair joint_mapq_from_alignment_scores(float S1, flo return std::make_pair(mapq, mapq); } - static inline float normal_pdf(float x, float m, float s) { static const float inv_sqrt_2pi = 0.3989422804014327; @@ -558,7 +309,7 @@ static inline float normal_pdf(float x, float m, float s) static inline bool score_sw(const Alignment &a, const Alignment &b) { - return a.sw_score > b.sw_score; + return a.score > b.score; } static inline bool sort_scores(const std::tuple &a, @@ -577,7 +328,7 @@ static inline std::vector> get_best_scori for (auto &a1 : alignments1) { for (auto &a2 : alignments2) { float dist = std::abs(a1.ref_start - a2.ref_start); - double score = a1.sw_score + a2.sw_score; + double score = a1.score + a2.score; if ((a1.is_rc ^ a2.is_rc) && (dist < mu + 4 * sigma)) { score += log(normal_pdf(dist, mu, sigma)); } @@ -597,8 +348,8 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) { if (nam1.ref_id != nam2.ref_id || nam1.is_rc == nam2.is_rc) { return false; } - int a = std::max(0, nam1.ref_s - nam2.query_s); - int b = std::max(0, nam2.ref_s - nam2.query_s); + int a = std::max(0, nam1.ref_start - nam2.query_start); + int b = std::max(0, nam2.ref_start - nam2.query_start); // r1 ---> <---- r2 bool r1_r2 = nam2.is_rc && (a <= b) && (b - a < mu + 10*sigma); @@ -610,13 +361,13 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) { } static inline std::vector> get_best_scoring_NAM_locations( - const std::vector &all_nams1, - const std::vector &all_nams2, + const std::vector &nams1, + const std::vector &nams2, float mu, float sigma ) { std::vector> joint_NAM_scores; - if (all_nams1.empty() && all_nams2.empty()) { + if (nams1.empty() && nams2.empty()) { return joint_NAM_scores; } @@ -624,8 +375,8 @@ static inline std::vector> get_best_scoring_NAM_location robin_hood::unordered_set added_n2; int joint_hits; int hjss = 0; // highest joint score seen - for (auto &n1 : all_nams1) { - for (auto &n2 : all_nams2) { + for (auto &n1 : nams1) { + for (auto &n2 : nams2) { if (n1.n_hits + n2.n_hits < hjss/2) { break; } @@ -642,10 +393,10 @@ static inline std::vector> get_best_scoring_NAM_location } Nam dummy_nan; - dummy_nan.ref_s = -1; - if (!all_nams1.empty()) { - int hjss1 = hjss > 0 ? hjss : all_nams1[0].n_hits; - for (auto &n1 : all_nams1) { + dummy_nan.ref_start = -1; + if (!nams1.empty()) { + int hjss1 = hjss > 0 ? hjss : nams1[0].n_hits; + for (auto &n1 : nams1) { if (n1.n_hits < hjss1/2){ break; } @@ -660,10 +411,10 @@ static inline std::vector> get_best_scoring_NAM_location } } - if ( !all_nams2.empty() ){ - int hjss2 = hjss > 0 ? hjss : all_nams2[0].n_hits; + if ( !nams2.empty() ){ + int hjss2 = hjss > 0 ? hjss : nams2[0].n_hits; // int hjss2 = all_nams2[0].n_hits; - for (auto &n2 : all_nams2) { + for (auto &n2 : nams2) { if (n2.n_hits < hjss2/2){ break; } @@ -713,11 +464,11 @@ bool has_shared_substring(const std::string& read_seq, const std::string& ref_se /* Return true iff rescue by alignment was actually attempted */ static inline bool rescue_mate( const Aligner& aligner, - Nam &n, + Nam &nam, const References& references, const Read& guide, const Read& read, - Alignment &sam_aln, + Alignment &alignment, float mu, float sigma, int k @@ -727,46 +478,44 @@ static inline bool rescue_mate( bool a_is_rc; auto read_len = read.size(); - reverse_nam_if_needed(n, guide, references, k); - if (n.is_rc){ + reverse_nam_if_needed(nam, guide, references, k); + if (nam.is_rc){ r_tmp = read.seq; - a = n.ref_s - n.query_s - (mu+5*sigma); - b = n.ref_s - n.query_s + read_len/2; // at most half read overlap + a = nam.ref_start - nam.query_start - (mu+5*sigma); + b = nam.ref_start - nam.query_start + read_len/2; // at most half read overlap a_is_rc = false; } else { r_tmp = read.rc; // mate is rc since fr orientation - a = n.ref_e + (read_len - n.query_e) - read_len/2; // at most half read overlap - b = n.ref_e + (read_len - n.query_e) + (mu+5*sigma); + a = nam.ref_end + (read_len - nam.query_end) - read_len/2; // at most half read overlap + b = nam.ref_end + (read_len - nam.query_end) + (mu+5*sigma); a_is_rc = true; } - auto ref_len = static_cast(references.lengths[n.ref_id]); + auto ref_len = static_cast(references.lengths[nam.ref_id]); auto ref_start = std::max(0, std::min(a, ref_len)); auto ref_end = std::min(ref_len, std::max(0, b)); if (ref_end < ref_start + k){ - sam_aln.cigar = Cigar(); - sam_aln.ed = read_len; - sam_aln.sw_score = 0; - sam_aln.aln_score = 0; - sam_aln.ref_start = 0; - sam_aln.is_rc = n.is_rc; - sam_aln.ref_id = n.ref_id; - sam_aln.is_unaligned = true; + alignment.cigar = Cigar(); + alignment.edit_distance = read_len; + alignment.score = 0; + alignment.ref_start = 0; + alignment.is_rc = nam.is_rc; + alignment.ref_id = nam.ref_id; + alignment.is_unaligned = true; // std::cerr << "RESCUE: Caught Bug3! ref start: " << ref_start << " ref end: " << ref_end << " ref len: " << ref_len << std::endl; return false; } - std::string ref_segm = references.sequences[n.ref_id].substr(ref_start, ref_end - ref_start); + std::string ref_segm = references.sequences[nam.ref_id].substr(ref_start, ref_end - ref_start); if (!has_shared_substring(r_tmp, ref_segm, k)){ - sam_aln.cigar = Cigar(); - sam_aln.ed = read_len; - sam_aln.sw_score = 0; - sam_aln.aln_score = 0; - sam_aln.ref_start = 0; - sam_aln.is_rc = n.is_rc; - sam_aln.ref_id = n.ref_id; - sam_aln.is_unaligned = true; + alignment.cigar = Cigar(); + alignment.edit_distance = read_len; + alignment.score = 0; + alignment.ref_start = 0; + alignment.is_rc = nam.is_rc; + alignment.ref_id = nam.ref_id; + alignment.is_unaligned = true; // std::cerr << "Avoided!" << std::endl; return false; // std::cerr << "Aligning anyway at: " << ref_start << " to " << ref_end << "ref len:" << ref_len << " ref_id:" << n.ref_id << std::endl; @@ -777,7 +526,7 @@ static inline bool rescue_mate( // std::cerr<< "________________________________________" << std::endl; // std::cerr<< "RESCUE MODE: " << mu << " " << sigma << std::endl; // std::cerr<< read << " " << read_rc << std::endl; -// std::cerr << r_tmp << " " << n.n_hits << " " << n.score << " " << " " << sam_aln.ed << " " << n.query_s << " " << n.query_e << " "<< n.ref_s << " " << n.ref_e << " " << n.is_rc << " " << " " << sam_aln.cigar << " " << info.sw_score << std::endl; +// std::cerr << r_tmp << " " << n.n_hits << " " << n.score << " " << " " << alignment.ed << " " << n.query_s << " " << n.query_e << " "<< n.ref_s << " " << n.ref_e << " " << n.is_rc << " " << " " << alignment.cigar << " " << info.sw_score << std::endl; // std::cerr << "a " << a << " b " << b << " ref_start " << ref_start << " ref_end " << ref_end << " ref_end - ref_start " << ref_end - ref_start << " n.is_flipped " << n.is_flipped << std::endl; // std::cerr<< "________________________________________" << std::endl; // } @@ -789,15 +538,14 @@ static inline bool rescue_mate( // info = ksw_align(ref_ptr, ref_segm.size(), read_ptr, r_tmp.size(), 1, 4, 6, 1, ez); // std::cerr << "Cigar: " << info.cigar << std::endl; - sam_aln.cigar = info.cigar; - sam_aln.ed = info.edit_distance; - sam_aln.sw_score = info.sw_score; - sam_aln.aln_score = sam_aln.sw_score; - sam_aln.ref_start = ref_start + info.ref_start; - sam_aln.is_rc = a_is_rc; - sam_aln.ref_id = n.ref_id; - sam_aln.is_unaligned = info.cigar.empty(); - sam_aln.aln_length = info.ref_span(); + alignment.cigar = info.cigar; + alignment.edit_distance = info.edit_distance; + alignment.score = info.sw_score; + alignment.ref_start = ref_start + info.ref_start; + alignment.is_rc = a_is_rc; + alignment.ref_id = nam.ref_id; + alignment.is_unaligned = info.cigar.empty(); + alignment.length = info.ref_span(); return true; } @@ -807,7 +555,7 @@ void rescue_read( const Read& read1, // read that has NAMs const Aligner& aligner, const References& references, - std::vector &all_nams1, + std::vector &nams1, int max_tries, float dropoff, std::array& details, @@ -821,29 +569,28 @@ void rescue_read( const klibpp::KSeq& record2, bool swap_r1r2 // TODO get rid of this ) { - float score_dropoff1; - Nam n_max1 = all_nams1[0]; + Nam n_max1 = nams1[0]; int tries = 0; std::vector alignments1; std::vector alignments2; - for (auto& n : all_nams1) { - score_dropoff1 = (float) n.n_hits / n_max1.n_hits; - if (tries >= max_tries || score_dropoff1 < dropoff) { // only consider top 20 hits as minimap2 and break if alignment is exact match to reference or the match below droppoff cutoff. + for (auto& nam : nams1) { + float score_dropoff1 = (float) nam.n_hits / n_max1.n_hits; + // only consider top hits (as minimap2 does) and break if below dropoff cutoff. + if (tries >= max_tries || score_dropoff1 < dropoff) { break; } - //////// the actual testing of base pair alignment part start - const bool consistent_nam = reverse_nam_if_needed(n, read1, references, k); + const bool consistent_nam = reverse_nam_if_needed(nam, read1, references, k); details[0].nam_inconsistent += !consistent_nam; - auto alignment = get_alignment(aligner, n, references, read1, consistent_nam); + auto alignment = get_alignment(aligner, nam, references, read1, consistent_nam); details[0].gapped += alignment.gapped; alignments1.emplace_back(alignment); details[0].tried_alignment++; // Force SW alignment to rescue mate Alignment a2; - details[1].mate_rescue += rescue_mate(aligner, n, references, read1, read2, a2, mu, sigma, k); + details[1].mate_rescue += rescue_mate(aligner, nam, references, read1, read2, a2, mu, sigma, k); alignments2.emplace_back(a2); tries++; @@ -870,12 +617,12 @@ void rescue_read( // append both alignments to string here if (max_secondary == 0) { auto best_aln_pair = high_scores[0]; - Alignment sam_aln1 = std::get<1>(best_aln_pair); - Alignment sam_aln2 = std::get<2>(best_aln_pair); + Alignment alignment1 = std::get<1>(best_aln_pair); + Alignment alignment2 = std::get<2>(best_aln_pair); if (swap_r1r2) { - sam.add_pair(sam_aln2, sam_aln1, record2, record1, read2.rc, read1.rc, mapq2, mapq1, is_proper_pair(sam_aln2, sam_aln1, mu, sigma), true, details); + sam.add_pair(alignment2, alignment1, record2, record1, read2.rc, read1.rc, mapq2, mapq1, is_proper_pair(alignment2, alignment1, mu, sigma), true, details); } else { - sam.add_pair(sam_aln1, sam_aln2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper_pair(sam_aln1, sam_aln2, mu, sigma), true, details); + sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper_pair(alignment1, alignment2, mu, sigma), true, details); } } else { auto max_out = std::min(high_scores.size(), max_secondary); @@ -890,16 +637,16 @@ void rescue_read( } auto aln_pair = high_scores[i]; auto s_score = std::get<0>(aln_pair); - Alignment sam_aln1 = std::get<1>(aln_pair); - Alignment sam_aln2 = std::get<2>(aln_pair); + Alignment alignment1 = std::get<1>(aln_pair); + Alignment alignment2 = std::get<2>(aln_pair); if (s_max - s_score < secondary_dropoff) { if (swap_r1r2) { - bool is_proper = is_proper_pair(sam_aln2, sam_aln1, mu, sigma); + bool is_proper = is_proper_pair(alignment2, alignment1, mu, sigma); std::array swapped_details{details[1], details[0]}; - sam.add_pair(sam_aln2, sam_aln1, record2, record1, read2.rc, read1.rc, mapq2, mapq1, is_proper, is_primary, swapped_details); + sam.add_pair(alignment2, alignment1, record2, record1, read2.rc, read1.rc, mapq2, mapq1, is_proper, is_primary, swapped_details); } else { - bool is_proper = is_proper_pair(sam_aln1, sam_aln2, mu, sigma); - sam.add_pair(sam_aln1, sam_aln2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, is_primary, details); + bool is_proper = is_proper_pair(alignment1, alignment2, mu, sigma); + sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, is_primary, details); } } else { break; @@ -950,8 +697,8 @@ static std::pair joint_mapq_from_high_scores(const std::vector &all_nams1, - std::vector &all_nams2, + std::vector &nams1, + std::vector &nams2, const KSeq &record1, const KSeq &record2, int k, @@ -968,20 +715,20 @@ inline void align_PE( Read read2(record2.seq); double secondary_dropoff = 2 * aligner.parameters.mismatch + aligner.parameters.gap_open; - if (all_nams1.empty() && all_nams2.empty()) { + if (nams1.empty() && nams2.empty()) { // None of the reads have any NAMs sam.add_unmapped_pair(record1, record2); return; } - if (!all_nams1.empty() && all_nams2.empty()) { + if (!nams1.empty() && nams2.empty()) { // Only read 1 has NAMS: attempt to rescue read 2 rescue_read( read2, read1, aligner, references, - all_nams1, + nams1, max_tries, dropoff, details, @@ -998,14 +745,14 @@ inline void align_PE( return; } - if (all_nams1.empty() && !all_nams2.empty()) { + if (nams1.empty() && !nams2.empty()) { // Only read 2 has NAMS: attempt to rescue read 1 rescue_read( read1, read2, aligner, references, - all_nams2, + nams2, max_tries, dropoff, details, @@ -1023,15 +770,15 @@ inline void align_PE( } // If we get here, both reads have NAMs - assert(!all_nams1.empty() && !all_nams2.empty()); + assert(!nams1.empty() && !nams2.empty()); int tries = 0; double S = 0.0; - Nam n_max1 = all_nams1[0]; - Nam n_max2 = all_nams2[0]; + Nam n_max1 = nams1[0]; + Nam n_max2 = nams2[0]; - float score_dropoff1 = all_nams1.size() > 1 ? (float) all_nams1[1].n_hits / n_max1.n_hits : 0.0; - float score_dropoff2 = all_nams2.size() > 1 ? (float) all_nams2[1].n_hits / n_max2.n_hits : 0.0; + float score_dropoff1 = nams1.size() > 1 ? (float) nams1[1].n_hits / n_max1.n_hits : 0.0; + float score_dropoff2 = nams2.size() > 1 ? (float) nams2[1].n_hits / n_max2.n_hits : 0.0; score_dropoff1 = n_max1.n_hits > 2 ? score_dropoff1 : 1.0; score_dropoff2 = n_max2.n_hits > 2 ? score_dropoff2 : 1.0; @@ -1043,19 +790,19 @@ inline void align_PE( bool consistent_nam2 = reverse_nam_if_needed(n_max2, read2, references, k); details[1].nam_inconsistent += !consistent_nam2; - auto sam_aln1 = get_alignment(aligner, n_max1, references, read1, consistent_nam1); + auto alignment1 = get_alignment(aligner, n_max1, references, read1, consistent_nam1); details[0].tried_alignment++; - details[0].gapped += sam_aln1.gapped; - auto sam_aln2 = get_alignment(aligner, n_max2, references, read2, consistent_nam2); + details[0].gapped += alignment1.gapped; + auto alignment2 = get_alignment(aligner, n_max2, references, read2, consistent_nam2); details[1].tried_alignment++; - details[1].gapped += sam_aln2.gapped; - int mapq1 = get_MAPQ(all_nams1, n_max1); - int mapq2 = get_MAPQ(all_nams2, n_max2); - bool is_proper = is_proper_pair(sam_aln1, sam_aln2, mu, sigma); - sam.add_pair(sam_aln1, sam_aln2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, true, details); - - if ((isize_est.sample_size < 400) && ((sam_aln1.ed + sam_aln2.ed) < 3) && is_proper) { - isize_est.update(std::abs(sam_aln1.ref_start - sam_aln2.ref_start)); + details[1].gapped += alignment2.gapped; + int mapq1 = get_mapq(nams1, n_max1); + int mapq2 = get_mapq(nams2, n_max2); + bool is_proper = is_proper_pair(alignment1, alignment2, mu, sigma); + sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, true, details); + + if ((isize_est.sample_size < 400) && ((alignment1.edit_distance + alignment2.edit_distance) < 3) && is_proper) { + isize_est.update(std::abs(alignment1.ref_start - alignment2.ref_start)); } return; } @@ -1066,13 +813,13 @@ inline void align_PE( // Get top hit counts for all locations. The joint hit count is the sum of hits of the two mates. Then align as long as score dropoff or cnt < 20 // (score, aln1, aln2) - std::vector> joint_NAM_scores = get_best_scoring_NAM_locations(all_nams1, all_nams2, mu, sigma); + std::vector> joint_NAM_scores = get_best_scoring_NAM_locations(nams1, nams2, mu, sigma); auto nam_max = joint_NAM_scores[0]; auto max_score = std::get<0>(nam_max); robin_hood::unordered_map is_aligned1; robin_hood::unordered_map is_aligned2; - auto n1_max = all_nams1[0]; + auto n1_max = nams1[0]; bool consistent_nam1 = reverse_nam_if_needed(n1_max, read1, references, k); details[0].nam_inconsistent += !consistent_nam1; @@ -1082,7 +829,7 @@ inline void align_PE( is_aligned1[n1_max.nam_id] = a1_indv_max; details[0].tried_alignment++; details[0].gapped += a1_indv_max.gapped; - auto n2_max = all_nams2[0]; + auto n2_max = nams2[0]; bool consistent_nam2 = reverse_nam_if_needed(n2_max, read2, references, k); details[1].nam_inconsistent += !consistent_nam2; auto a2_indv_max = get_alignment(aligner, n2_max, references, read2, @@ -1109,7 +856,7 @@ inline void align_PE( //////// the actual testing of base pair alignment part start //////// ////////////////////////////////////////////////////////////////////// Alignment a1; - if (n1.ref_s >= 0) { + if (n1.ref_start >= 0) { if (is_aligned1.find(n1.nam_id) != is_aligned1.end() ){ a1 = is_aligned1[n1.nam_id]; } else { @@ -1129,13 +876,13 @@ inline void align_PE( // a1_indv_max = a1.sw_score > a1_indv_max.sw_score ? a1 : a1_indv_max; // min_ed = a1.ed < min_ed ? a1.ed : min_ed; - if (a1.sw_score > a1_indv_max.sw_score){ + if (a1.score > a1_indv_max.score){ a1_indv_max = a1; // cnt = 0; } Alignment a2; - if (n2.ref_s >= 0) { + if (n2.ref_start >= 0) { if (is_aligned2.find(n2.nam_id) != is_aligned2.end() ){ a2 = is_aligned2[n2.nam_id]; } else { @@ -1156,7 +903,7 @@ inline void align_PE( // a2_indv_max = a2.sw_score > a2_indv_max.sw_score ? a2 : a2_indv_max; // min_ed = a2.ed < min_ed ? a2.ed : min_ed; - if (a2.sw_score > a2_indv_max.sw_score){ + if (a2.score > a2_indv_max.score){ a2_indv_max = a2; // cnt = 0; } @@ -1166,10 +913,10 @@ inline void align_PE( if (r1_r2 || r2_r1) { float x = std::abs(a1.ref_start - a2.ref_start); - S = (double)a1.sw_score + (double)a2.sw_score + log(normal_pdf(x, mu, sigma)); //* (1 - s2 / s1) * min_matches * log(s1); + S = (double)a1.score + (double)a2.score + log(normal_pdf(x, mu, sigma)); //* (1 - s2 / s1) * min_matches * log(s1); // std::cerr << " CASE1: " << S << " " << log( normal_pdf(x, mu, sigma ) ) << " " << (double)a1.sw_score << " " << (double)a2.sw_score << std::endl; } else{ // individual score - S = (double)a1.sw_score + (double)a2.sw_score - 20; // 20 corresponds to a value of log( normal_pdf(x, mu, sigma ) ) of more than 5 stddevs away (for most reasonable values of stddev) + S = (double)a1.score + (double)a2.score - 20; // 20 corresponds to a value of log( normal_pdf(x, mu, sigma ) ) of more than 5 stddevs away (for most reasonable values of stddev) // std::cerr << " CASE2: " << S << " " << (double)a1.sw_score << " " << (double)a2.sw_score << std::endl; } @@ -1180,7 +927,7 @@ inline void align_PE( } // Finally, add highest scores of both mates as individually mapped - S = (double)a1_indv_max.sw_score + (double)a2_indv_max.sw_score - 20; // 20 corresponds to a value of log( normal_pdf(x, mu, sigma ) ) of more than 5 stddevs away (for most reasonable values of stddev) + S = (double)a1_indv_max.score + (double)a2_indv_max.score - 20; // 20 corresponds to a value of log( normal_pdf(x, mu, sigma ) ) of more than 5 stddevs away (for most reasonable values of stddev) std::tuple aln_tuple (S, a1_indv_max, a2_indv_max); high_scores.push_back(aln_tuple); std::sort(high_scores.begin(), high_scores.end(), sort_scores); // Sorting by highest score first @@ -1205,49 +952,49 @@ inline void align_PE( std::tie(mapq1, mapq2) = joint_mapq_from_high_scores(high_scores); auto best_aln_pair = high_scores[0]; - auto sam_aln1 = std::get<1>(best_aln_pair); - auto sam_aln2 = std::get<2>(best_aln_pair); + auto alignment1 = std::get<1>(best_aln_pair); + auto alignment2 = std::get<2>(best_aln_pair); if (max_secondary == 0) { - bool is_proper = is_proper_pair(sam_aln1, sam_aln2, mu, sigma); - sam.add_pair(sam_aln1, sam_aln2, record1, record2, read1.rc, read2.rc, + bool is_proper = is_proper_pair(alignment1, alignment2, mu, sigma); + sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, true, details); } else { int max_out = std::min(high_scores.size(), max_secondary); // remove eventual duplicates - comes from, e.g., adding individual best alignments above (if identical to joint best alignment) float s_max = std::get<0>(best_aln_pair); - int prev_start_m1 = sam_aln1.ref_start; - int prev_start_m2 = sam_aln2.ref_start; - int prev_ref_id_m1 = sam_aln1.ref_id; - int prev_ref_id_m2 = sam_aln2.ref_id; + int prev_start_m1 = alignment1.ref_start; + int prev_start_m2 = alignment2.ref_start; + int prev_ref_id_m1 = alignment1.ref_id; + int prev_ref_id_m2 = alignment2.ref_id; bool is_primary = true; for (int i = 0; i < max_out; ++i) { auto aln_pair = high_scores[i]; - sam_aln1 = std::get<1>(aln_pair); - sam_aln2 = std::get<2>(aln_pair); + alignment1 = std::get<1>(aln_pair); + alignment2 = std::get<2>(aln_pair); float s_score = std::get<0>(aln_pair); if (i > 0) { is_primary = false; mapq1 = 255; mapq2 = 255; - bool same_pos = (prev_start_m1 == sam_aln1.ref_start) && (prev_start_m2 == sam_aln2.ref_start); - bool same_ref = (prev_ref_id_m1 == sam_aln1.ref_id) && (prev_ref_id_m2 == sam_aln2.ref_id); + bool same_pos = (prev_start_m1 == alignment1.ref_start) && (prev_start_m2 == alignment2.ref_start); + bool same_ref = (prev_ref_id_m1 == alignment1.ref_id) && (prev_ref_id_m2 == alignment2.ref_id); if ( same_pos && same_ref ){ continue; } } if (s_max - s_score < secondary_dropoff) { - bool is_proper = is_proper_pair(sam_aln1, sam_aln2, mu, sigma); - sam.add_pair(sam_aln1, sam_aln2, record1, record2, read1.rc, read2.rc, + bool is_proper = is_proper_pair(alignment1, alignment2, mu, sigma); + sam.add_pair(alignment1, alignment2, record1, record2, read1.rc, read2.rc, mapq1, mapq2, is_proper, is_primary, details); } else { break; } - prev_start_m1 = sam_aln1.ref_start; - prev_start_m2 = sam_aln2.ref_start; - prev_ref_id_m1 = sam_aln1.ref_id; - prev_ref_id_m2 = sam_aln2.ref_id; + prev_start_m1 = alignment1.ref_start; + prev_start_m2 = alignment2.ref_start; + prev_ref_id_m1 = alignment1.ref_id; + prev_ref_id_m2 = alignment2.ref_id; } } } @@ -1257,8 +1004,8 @@ inline void get_best_map_location(std::vector &nams1, std::vector &nam Nam n1_joint_max, n2_joint_max, n1_indiv_max, n2_indiv_max; float score_joint = 0; float score_indiv = 0; - best_nam1.ref_s = -1; //Unmapped until proven mapped - best_nam2.ref_s = -1; //Unmapped until proven mapped + best_nam1.ref_start = -1; //Unmapped until proven mapped + best_nam2.ref_start = -1; //Unmapped until proven mapped if (joint_NAM_scores.empty()) { return; @@ -1267,7 +1014,7 @@ inline void get_best_map_location(std::vector &nams1, std::vector &nam for (auto &t : joint_NAM_scores) { // already sorted by descending score auto n1 = std::get<1>(t); auto n2 = std::get<2>(t); - if ((n1.ref_s >=0) && (n2.ref_s >=0) ){ // Valid pair + if ((n1.ref_start >=0) && (n2.ref_start >=0) ){ // Valid pair score_joint = n1.score + n2.score; n1_joint_max = n1; n2_joint_max = n2; @@ -1292,7 +1039,7 @@ inline void get_best_map_location(std::vector &nams1, std::vector &nam } if (isize_est.sample_size < 400 && score_joint > score_indiv) { - isize_est.update(std::abs(n1_joint_max.ref_s - n2_joint_max.ref_s)); + isize_est.update(std::abs(n1_joint_max.ref_start - n2_joint_max.ref_start)); } } @@ -1330,7 +1077,7 @@ void align_PE_read( AlignmentStatistics &statistics, i_dist_est &isize_est, const Aligner &aligner, - const mapping_params &map_param, + const MappingParameters &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index @@ -1347,7 +1094,7 @@ void align_PE_read( auto [nonrepetitive_fraction1, nams1] = find_nams(query_randstrobes1, index); auto [nonrepetitive_fraction2, nams2] = find_nams(query_randstrobes2, index); statistics.tot_find_nams += nam_timer.duration(); - if (map_param.R > 1) { + if (map_param.rescue_level > 1) { Timer rescue_timer; if (nams1.empty() || nonrepetitive_fraction1 < 0.7) { nams1 = find_nams_rescue(query_randstrobes1, index, map_param.rescue_cutoff); @@ -1388,7 +1135,7 @@ void align_PE_read( record2, index_parameters.syncmer.k, references, details, - map_param.dropoff_threshold, isize_est, map_param.maxTries, map_param.max_secondary); + map_param.dropoff_threshold, isize_est, map_param.max_tries, map_param.max_secondary); } statistics.tot_extend += extend_timer.duration(); statistics += details[0]; @@ -1402,7 +1149,7 @@ void align_SE_read( std::string &outstring, AlignmentStatistics &statistics, const Aligner &aligner, - const mapping_params &map_param, + const MappingParameters &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index @@ -1417,7 +1164,7 @@ void align_SE_read( auto [nonrepetitive_fraction, nams] = find_nams(query_randstrobes, index); statistics.tot_find_nams += nam_timer.duration(); - if (map_param.R > 1) { + if (map_param.rescue_level > 1) { Timer rescue_timer; if (nams.empty() || nonrepetitive_fraction < 0.7) { details.nam_rescue = true; @@ -1438,7 +1185,7 @@ void align_SE_read( } else { align_SE( aligner, sam, nams, record, index_parameters.syncmer.k, - references, details, map_param.dropoff_threshold, map_param.maxTries, + references, details, map_param.dropoff_threshold, map_param.max_tries, map_param.max_secondary ); } diff --git a/src/aln.hpp b/src/aln.hpp index e068d0d4..b9896b20 100644 --- a/src/aln.hpp +++ b/src/aln.hpp @@ -55,17 +55,23 @@ struct AlignmentStatistics { } }; -struct mapping_params { +struct MappingParameters { int r { 150 }; int max_secondary { 0 }; float dropoff_threshold { 0.5 }; - int R { 2 }; - int maxTries { 20 }; + int rescue_level { 2 }; + int max_tries { 20 }; int rescue_cutoff; bool is_sam_out { true }; CigarOps cigar_ops{CigarOps::M}; bool output_unmapped { true }; bool details{false}; + + void verify() const { + if (max_tries < 1) { + throw BadParameter("max_tries must be greater than zero"); + } + } }; class i_dist_est { @@ -88,7 +94,7 @@ void align_PE_read( AlignmentStatistics& statistics, i_dist_est& isize_est, const Aligner& aligner, - const mapping_params& map_param, + const MappingParameters& map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index @@ -100,7 +106,7 @@ void align_SE_read( std::string& outstring, AlignmentStatistics& statistics, const Aligner& aligner, - const mapping_params& map_param, + const MappingParameters& map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 43fd500d..8328ac36 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -122,8 +122,8 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { // Search parameters if (f) { opt.f = args::get(f); } if (S) { opt.dropoff_threshold = args::get(S); } - if (M) { opt.maxTries = args::get(M); } - if (R) { opt.R = args::get(R); } + if (M) { opt.max_tries = args::get(M); } + if (R) { opt.rescue_level = args::get(R); } // Reference and read files opt.ref_filename = args::get(ref_filename); diff --git a/src/cmdline.hpp b/src/cmdline.hpp index 33aa25e0..acb9e0b0 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -52,8 +52,8 @@ struct CommandLineOptions { // Search parameters float f { 0.0002 }; float dropoff_threshold { 0.5 }; - int maxTries { 20 }; - int R { 2 }; + int max_tries { 20 }; + int rescue_level { 2 }; // Reference and read files std::string ref_filename; // This is either a fasta file or an index file - if fasta, indexing will be run diff --git a/src/main.cpp b/src/main.cpp index 4f17c265..c5cf4a81 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -56,7 +56,7 @@ void warn_if_no_optimizations() { } } -void log_parameters(const IndexParameters& index_parameters, const mapping_params& map_param, const alignment_params& aln_params) { +void log_parameters(const IndexParameters& index_parameters, const MappingParameters& map_param, const alignment_params& aln_params) { logger.debug() << "Using" << std::endl << "k: " << index_parameters.syncmer.k << std::endl << "s: " << index_parameters.syncmer.s << std::endl @@ -64,7 +64,7 @@ void log_parameters(const IndexParameters& index_parameters, const mapping_param << "w_max: " << index_parameters.randstrobe.w_max << std::endl << "Read length (r): " << map_param.r << std::endl << "Maximum seed length: " << index_parameters.randstrobe.max_dist + index_parameters.syncmer.k << std::endl - << "R: " << map_param.R << std::endl + << "R: " << map_param.rescue_level << std::endl << "Expected [w_min, w_max] in #syncmers: [" << index_parameters.randstrobe.w_min << ", " << index_parameters.randstrobe.w_max << "]" << std::endl << "Expected [w_min, w_max] in #nucleotides: [" << (index_parameters.syncmer.k - index_parameters.syncmer.s + 1) * index_parameters.randstrobe.w_min << ", " << (index_parameters.syncmer.k - index_parameters.syncmer.s + 1) * index_parameters.randstrobe.w_max << "]" << std::endl << "A: " << aln_params.match << std::endl @@ -168,16 +168,17 @@ int run_strobealign(int argc, char **argv) { aln_params.gap_extend = opt.E; aln_params.end_bonus = opt.end_bonus; - mapping_params map_param; + MappingParameters map_param; map_param.r = opt.r; map_param.max_secondary = opt.max_secondary; map_param.dropoff_threshold = opt.dropoff_threshold; - map_param.R = opt.R; - map_param.maxTries = opt.maxTries; + map_param.rescue_level = opt.rescue_level; + map_param.max_tries = opt.max_tries; map_param.is_sam_out = opt.is_sam_out; map_param.cigar_ops = opt.cigar_eqx ? CigarOps::EQX : CigarOps::M; map_param.output_unmapped = opt.output_unmapped; map_param.details = opt.details; + map_param.verify(); log_parameters(index_parameters, map_param, aln_params); logger.debug() << "Threads: " << opt.n_threads << std::endl; @@ -257,7 +258,7 @@ int run_strobealign(int argc, char **argv) { // Map/align reads Timer map_align_timer; - map_param.rescue_cutoff = map_param.R < 100 ? map_param.R * index.filter_cutoff : 1000; + map_param.rescue_cutoff = map_param.rescue_level < 100 ? map_param.rescue_level * index.filter_cutoff : 1000; logger.debug() << "Using rescue cutoff: " << map_param.rescue_cutoff << std::endl; std::streambuf* buf; @@ -331,7 +332,7 @@ int main(int argc, char **argv) { try { return run_strobealign(argc, argv); } catch (BadParameter& e) { - logger.error() << "A mapping or seeding parameter is invalid: " << e.what() << std::endl; + logger.error() << "A parameter is invalid: " << e.what() << std::endl; } catch (const std::runtime_error& e) { logger.error() << "strobealign: " << e.what() << std::endl; } diff --git a/src/nam.cpp b/src/nam.cpp index c47dbb66..b0f84886 100644 --- a/src/nam.cpp +++ b/src/nam.cpp @@ -3,28 +3,28 @@ namespace { struct Hit { - int query_s; - int query_e; - int ref_s; - int ref_e; + int query_start; + int query_end; + int ref_start; + int ref_end; bool is_rc = false; }; void add_to_hits_per_ref( robin_hood::unordered_map>& hits_per_ref, - int query_s, - int query_e, + int query_start, + int query_end, bool is_rc, const StrobemerIndex& index, size_t position, int min_diff ) { for (const auto hash = index.get_hash(position); index.get_hash(position) == hash; ++position) { - int ref_s = index.get_strobe1_position(position); - int ref_e = ref_s + index.strobe2_offset(position) + index.k(); - int diff = std::abs((query_e - query_s) - (ref_e - ref_s)); + int ref_start = index.get_strobe1_position(position); + int ref_end = ref_start + index.strobe2_offset(position) + index.k(); + int diff = std::abs((query_end - query_start) - (ref_end - ref_start)); if (diff <= min_diff) { - hits_per_ref[index.reference_index(position)].push_back(Hit{query_s, query_e, ref_s, ref_e, is_rc}); + hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end, is_rc}); min_diff = diff; } } @@ -41,7 +41,7 @@ std::vector merge_hits_into_nams( if (sort) { std::sort(hits.begin(), hits.end(), [](const Hit& a, const Hit& b) -> bool { // first sort on query starts, then on reference starts - return (a.query_s < b.query_s) || ( (a.query_s == b.query_s) && (a.ref_s < b.ref_s) ); + return (a.query_start < b.query_start) || ( (a.query_start == b.query_start) && (a.ref_start < b.ref_start) ); } ); } @@ -53,24 +53,24 @@ std::vector merge_hits_into_nams( for (auto & o : open_nams) { // Extend NAM - if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_s) && (h.query_s <= o.query_e ) && (o.ref_prev_hit_startpos < h.ref_s) && (h.ref_s <= o.ref_e) ){ - if ( (h.query_e > o.query_e) && (h.ref_e > o.ref_e) ) { - o.query_e = h.query_e; - o.ref_e = h.ref_e; + if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_start) && (h.query_start <= o.query_end ) && (o.ref_prev_hit_startpos < h.ref_start) && (h.ref_start <= o.ref_end) ){ + if ( (h.query_end > o.query_end) && (h.ref_end > o.ref_end) ) { + o.query_end = h.query_end; + o.ref_end = h.ref_end; // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. - o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf - o.ref_prev_hit_startpos = h.ref_s; // log the last strobemer hit in case of outputting paf + o.query_prev_hit_startpos = h.query_start; // log the last strobemer hit in case of outputting paf + o.ref_prev_hit_startpos = h.ref_start; // log the last strobemer hit in case of outputting paf o.n_hits ++; // o.score += (float)1/ (float)h.count; is_added = true; break; } - else if ((h.query_e <= o.query_e) && (h.ref_e <= o.ref_e)) { + else if ((h.query_end <= o.query_end) && (h.ref_end <= o.ref_end)) { // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. - o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf - o.ref_prev_hit_startpos = h.ref_s; // log the last strobemer hit in case of outputting paf + o.query_prev_hit_startpos = h.query_start; // log the last strobemer hit in case of outputting paf + o.ref_prev_hit_startpos = h.ref_start; // log the last strobemer hit in case of outputting paf o.n_hits ++; // o.score += (float)1/ (float)h.count; is_added = true; @@ -84,15 +84,15 @@ std::vector merge_hits_into_nams( Nam n; n.nam_id = nam_id_cnt; nam_id_cnt ++; - n.query_s = h.query_s; - n.query_e = h.query_e; - n.ref_s = h.ref_s; - n.ref_e = h.ref_e; + n.query_start = h.query_start; + n.query_end = h.query_end; + n.ref_start = h.ref_start; + n.ref_end = h.ref_end; n.ref_id = ref_id; // n.previous_query_start = h.query_s; // n.previous_ref_start = h.ref_s; - n.query_prev_hit_startpos = h.query_s; - n.ref_prev_hit_startpos = h.ref_s; + n.query_prev_hit_startpos = h.query_start; + n.ref_prev_hit_startpos = h.ref_start; n.n_hits = 1; n.is_rc = h.is_rc; // n.score += (float)1 / (float)h.count; @@ -100,11 +100,11 @@ std::vector merge_hits_into_nams( } // Only filter if we have advanced at least k nucleotides - if (h.query_s > prev_q_start + k) { + if (h.query_start > prev_q_start + k) { // Output all NAMs from open_matches to final_nams that the current hit have passed for (auto &n : open_nams) { - if (n.query_e < h.query_s) { + if (n.query_end < h.query_start) { int n_max_span = std::max(n.query_span(), n.ref_span()); int n_min_span = std::min(n.query_span(), n.ref_span()); float n_score; @@ -116,10 +116,10 @@ std::vector merge_hits_into_nams( } // Remove all NAMs from open_matches that the current hit have passed - auto c = h.query_s; - auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_e < c; }; + auto c = h.query_start; + auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_end < c; }; open_nams.erase(std::remove_if(open_nams.begin(), open_nams.end(), predicate), open_nams.end()); - prev_q_start = h.query_s; + prev_q_start = h.query_start; } } @@ -180,13 +180,13 @@ std::vector find_nams_rescue( struct RescueHit { unsigned int count; size_t position; - unsigned int query_s; - unsigned int query_e; + unsigned int query_start; + unsigned int query_end; bool is_rc; bool operator< (const RescueHit& rhs) const { - return std::tie(count, query_s, query_e, is_rc) - < std::tie(rhs.count, rhs.query_s, rhs.query_e, rhs.is_rc); + return std::tie(count, query_start, query_end, is_rc) + < std::tie(rhs.count, rhs.query_start, rhs.query_end, rhs.is_rc); } }; @@ -218,7 +218,7 @@ std::vector find_nams_rescue( if ((rh.count > filter_cutoff && cnt >= 5) || rh.count > 1000) { break; } - add_to_hits_per_ref(hits_per_ref, rh.query_s, rh.query_e, rh.is_rc, index, rh.position, 1000); + add_to_hits_per_ref(hits_per_ref, rh.query_start, rh.query_end, rh.is_rc, index, rh.position, 1000); cnt++; } } @@ -227,6 +227,6 @@ std::vector find_nams_rescue( } std::ostream& operator<<(std::ostream& os, const Nam& n) { - os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_s << ".." << n.query_e << ", ref: " << n.ref_s << ".." << n.ref_e << ", score=" << n.score << ")"; + os << "Nam(ref_id=" << n.ref_id << ", query: " << n.query_start << ".." << n.query_end << ", ref: " << n.ref_start << ".." << n.ref_end << ", score=" << n.score << ")"; return os; } diff --git a/src/nam.hpp b/src/nam.hpp index d2d549e6..24a8fcd6 100644 --- a/src/nam.hpp +++ b/src/nam.hpp @@ -8,11 +8,11 @@ // Non-overlapping approximate match struct Nam { int nam_id; - int query_s; - int query_e; + int query_start; + int query_end; int query_prev_hit_startpos; - int ref_s; - int ref_e; + int ref_start; + int ref_end; int ref_prev_hit_startpos; int n_hits = 0; int ref_id; @@ -22,11 +22,11 @@ struct Nam { bool is_rc = false; int ref_span() const { - return ref_e - ref_s; + return ref_end - ref_start; } int query_span() const { - return query_e - query_s; + return query_end - query_start; } }; diff --git a/src/paf.cpp b/src/paf.cpp index 4bb069aa..112dea3a 100644 --- a/src/paf.cpp +++ b/src/paf.cpp @@ -15,14 +15,14 @@ * 12 mapping quality (0-255; 255 for missing) */ void output_hits_paf_PE(std::string &paf_output, const Nam &n, const std::string &query_name, const References& references, int k, int read_len) { - if (n.ref_s < 0 ) { + if (n.ref_start < 0 ) { return; } paf_output.append(query_name); paf_output.append("\t"); paf_output.append(std::to_string(read_len)); paf_output.append("\t"); - paf_output.append(std::to_string(n.query_s)); + paf_output.append(std::to_string(n.query_start)); paf_output.append("\t"); paf_output.append(std::to_string(n.query_prev_hit_startpos + k)); paf_output.append("\t"); @@ -32,13 +32,13 @@ void output_hits_paf_PE(std::string &paf_output, const Nam &n, const std::string paf_output.append("\t"); paf_output.append(std::to_string(references.lengths[n.ref_id])); paf_output.append("\t"); - paf_output.append(std::to_string(n.ref_s)); + paf_output.append(std::to_string(n.ref_start)); paf_output.append("\t"); paf_output.append(std::to_string(n.ref_prev_hit_startpos + k)); paf_output.append("\t"); paf_output.append(std::to_string(n.n_hits)); paf_output.append("\t"); - paf_output.append(std::to_string(n.ref_prev_hit_startpos + k - n.ref_s)); + paf_output.append(std::to_string(n.ref_prev_hit_startpos + k - n.ref_start)); paf_output.append("\t255\n"); } diff --git a/src/pc.cpp b/src/pc.cpp index 2e0ff9ed..9988caf6 100644 --- a/src/pc.cpp +++ b/src/pc.cpp @@ -135,7 +135,7 @@ void perform_task( AlignmentStatistics& statistics, int& done, const alignment_params &aln_params, - const mapping_params &map_param, + const MappingParameters &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, diff --git a/src/pc.hpp b/src/pc.hpp index f163368e..8fbabd58 100644 --- a/src/pc.hpp +++ b/src/pc.hpp @@ -64,7 +64,7 @@ class OutputBuffer { void perform_task(InputBuffer &input_buffer, OutputBuffer &output_buffer, AlignmentStatistics& statistics, int& done, const alignment_params &aln_params, - const mapping_params &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, const std::string& read_group_id); + const MappingParameters &map_param, const IndexParameters& index_parameters, const References& references, const StrobemerIndex& index, const std::string& read_group_id); bool same_name(const std::string& n1, const std::string& n2); diff --git a/src/python/strobealign.cpp b/src/python/strobealign.cpp index 27febe0e..c3316c38 100644 --- a/src/python/strobealign.cpp +++ b/src/python/strobealign.cpp @@ -144,10 +144,10 @@ NB_MODULE(strobealign_extension, m_) { nb::bind_vector(m, "QueryRandstrobeVector"); nb::class_(m, "Nam") - .def_ro("query_start", &Nam::query_s) - .def_ro("query_end", &Nam::query_e) - .def_ro("ref_start", &Nam::ref_s) - .def_ro("ref_end", &Nam::ref_e) + .def_ro("query_start", &Nam::query_start) + .def_ro("query_end", &Nam::query_end) + .def_ro("ref_start", &Nam::ref_start) + .def_ro("ref_end", &Nam::ref_end) .def_ro("score", &Nam::score) .def_ro("n_hits", &Nam::n_hits) .def_ro("reference_index", &Nam::ref_id) diff --git a/src/sam.cpp b/src/sam.cpp index b54e6394..962ff9ce 100644 --- a/src/sam.cpp +++ b/src/sam.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #define SAM_UNMAPPED_MAPQ 0 #define SAM_UNMAPPED_MAPQ_STRING "0" @@ -72,7 +73,7 @@ std::string Sam::cigar_string(const Cigar& cigar) const { } } -void Sam::add_unmapped(const KSeq& record, int flags) { +void Sam::add_unmapped(const KSeq& record, uint16_t flags) { if (!output_unmapped) { return; } @@ -87,7 +88,7 @@ void Sam::add_unmapped(const KSeq& record, int flags) { append_rg_and_newline(); } -void Sam::add_unmapped_mate(const KSeq& record, int flags, const std::string& mate_reference_name, int mate_pos) { +void Sam::add_unmapped_mate(const KSeq& record, uint16_t flags, const std::string& mate_reference_name, uint32_t mate_pos) { assert(flags & (UNMAP|PAIRED)); sam_string.append(strip_suffix(record.name)); sam_string.append("\t"); @@ -116,34 +117,35 @@ void Sam::add_unmapped_pair(const KSeq& r1, const KSeq& r2) { // Add single-end alignment void Sam::add( - const Alignment& sam_aln, + const Alignment& alignment, const KSeq& record, const std::string& sequence_rc, bool is_primary, const Details& details ) { - assert(!sam_aln.is_unaligned); + assert(!alignment.is_unaligned); + int flags = 0; - if (!sam_aln.is_unaligned && sam_aln.is_rc) { + if (!alignment.is_unaligned && alignment.is_rc) { flags |= REVERSE; } if (!is_primary) { flags |= SECONDARY; } - add_record(record.name, flags, references.names[sam_aln.ref_id], sam_aln.ref_start, sam_aln.mapq, sam_aln.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, sam_aln.ed, sam_aln.aln_score, details); + add_record(record.name, flags, references.names[alignment.ref_id], alignment.ref_start, alignment.mapq, alignment.cigar, "*", -1, 0, record.seq, sequence_rc, record.qual, alignment.edit_distance, alignment.score, details); } // Add one individual record void Sam::add_record( const std::string& query_name, - int flags, + uint16_t flags, const std::string& reference_name, - int pos, - int mapq, + uint32_t pos, + uint8_t mapq, const Cigar& cigar, const std::string& mate_reference_name, - int mate_pos, - int template_len, + uint32_t mate_pos, + int32_t template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, @@ -209,14 +211,14 @@ void Sam::add_record( } void Sam::add_pair( - const Alignment &sam_aln1, - const Alignment &sam_aln2, + const Alignment &alignment1, + const Alignment &alignment2, const KSeq& record1, const KSeq& record2, const std::string &read1_rc, const std::string &read2_rc, - int mapq1, - int mapq2, + uint8_t mapq1, + uint8_t mapq2, bool is_proper, bool is_primary, const std::array& details @@ -229,14 +231,14 @@ void Sam::add_pair( } int template_len1 = 0; - bool both_aligned = !sam_aln1.is_unaligned && !sam_aln2.is_unaligned; - if (both_aligned && sam_aln1.ref_id == sam_aln2.ref_id) { - const int dist = sam_aln2.ref_start - sam_aln1.ref_start; + bool both_aligned = !alignment1.is_unaligned && !alignment2.is_unaligned; + if (both_aligned && alignment1.ref_id == alignment2.ref_id) { + const int dist = alignment2.ref_start - alignment1.ref_start; if (dist > 0) { - template_len1 = dist + sam_aln2.aln_length; + template_len1 = dist + alignment2.length; } else { - template_len1 = dist - sam_aln1.aln_length; + template_len1 = dist - alignment1.length; } } if (is_proper) { @@ -245,35 +247,35 @@ void Sam::add_pair( } std::string reference_name1; - int pos1 = sam_aln1.ref_start; - int edit_distance1 = sam_aln1.ed; - if (sam_aln1.is_unaligned) { + int pos1 = alignment1.ref_start; + int edit_distance1 = alignment1.edit_distance; + if (alignment1.is_unaligned) { f1 |= UNMAP; f2 |= MUNMAP; pos1 = -1; reference_name1 = "*"; } else { - if (sam_aln1.is_rc) { + if (alignment1.is_rc) { f1 |= REVERSE; f2 |= MREVERSE; } - reference_name1 = references.names[sam_aln1.ref_id]; + reference_name1 = references.names[alignment1.ref_id]; } std::string reference_name2; - int pos2 = sam_aln2.ref_start; - int edit_distance2 = sam_aln2.ed; - if (sam_aln2.is_unaligned) { + int pos2 = alignment2.ref_start; + int edit_distance2 = alignment2.edit_distance; + if (alignment2.is_unaligned) { f2 |= UNMAP; f1 |= MUNMAP; pos2 = -1; reference_name2 = "*"; } else { - if (sam_aln2.is_rc) { + if (alignment2.is_rc) { f1 |= MREVERSE; f2 |= REVERSE; } - reference_name2 = references.names[sam_aln2.ref_id]; + reference_name2 = references.names[alignment2.ref_id]; } // Reference name as used in the RNEXT field; @@ -281,39 +283,39 @@ void Sam::add_pair( std::string mate_reference_name1 = reference_name1; std::string mate_reference_name2 = reference_name2; if ( - (!sam_aln1.is_unaligned && !sam_aln2.is_unaligned && sam_aln1.ref_id == sam_aln2.ref_id) - || (sam_aln1.is_unaligned != sam_aln2.is_unaligned) + (!alignment1.is_unaligned && !alignment2.is_unaligned && alignment1.ref_id == alignment2.ref_id) + || (alignment1.is_unaligned != alignment2.is_unaligned) ) { mate_reference_name1 = "="; mate_reference_name2 = "="; } - if (sam_aln1.is_unaligned != sam_aln2.is_unaligned) { - if (sam_aln1.is_unaligned) { + if (alignment1.is_unaligned != alignment2.is_unaligned) { + if (alignment1.is_unaligned) { pos1 = pos2; } else { pos2 = pos1; } } - if (sam_aln1.is_unaligned) { + if (alignment1.is_unaligned) { add_unmapped_mate(record1, f1, reference_name2, pos2); } else { - add_record(record1.name, f1, reference_name1, sam_aln1.ref_start, mapq1, sam_aln1.cigar, mate_reference_name2, pos2, template_len1, record1.seq, read1_rc, record1.qual, edit_distance1, sam_aln1.aln_score, details[0]); + add_record(record1.name, f1, reference_name1, alignment1.ref_start, mapq1, alignment1.cigar, mate_reference_name2, pos2, template_len1, record1.seq, read1_rc, record1.qual, edit_distance1, alignment1.score, details[0]); } - if (sam_aln2.is_unaligned) { + if (alignment2.is_unaligned) { add_unmapped_mate(record2, f2, reference_name1, pos1); } else { - add_record(record2.name, f2, reference_name2, sam_aln2.ref_start, mapq2, sam_aln2.cigar, mate_reference_name1, pos1, -template_len1, record2.seq, read2_rc, record2.qual, edit_distance2, sam_aln2.aln_score, details[1]); + add_record(record2.name, f2, reference_name2, alignment2.ref_start, mapq2, alignment2.cigar, mate_reference_name1, pos1, -template_len1, record2.seq, read2_rc, record2.qual, edit_distance2, alignment2.score, details[1]); } } -bool is_proper_pair(const Alignment& sam_aln1, const Alignment& sam_aln2, float mu, float sigma) { - const int dist = sam_aln2.ref_start - sam_aln1.ref_start; - const bool same_reference = sam_aln1.ref_id == sam_aln2.ref_id; - const bool both_aligned = same_reference && !sam_aln1.is_unaligned && !sam_aln2.is_unaligned; - const bool r1_r2 = !sam_aln1.is_rc && sam_aln2.is_rc && dist >= 0; // r1 ---> <---- r2 - const bool r2_r1 = !sam_aln2.is_rc && sam_aln1.is_rc && dist <= 0; // r2 ---> <---- r1 +bool is_proper_pair(const Alignment& alignment1, const Alignment& alignment2, float mu, float sigma) { + const int dist = alignment2.ref_start - alignment1.ref_start; + const bool same_reference = alignment1.ref_id == alignment2.ref_id; + const bool both_aligned = same_reference && !alignment1.is_unaligned && !alignment2.is_unaligned; + const bool r1_r2 = !alignment1.is_rc && alignment2.is_rc && dist >= 0; // r1 ---> <---- r2 + const bool r2_r1 = !alignment2.is_rc && alignment1.is_rc && dist <= 0; // r2 ---> <---- r1 const bool rel_orientation_good = r1_r2 || r2_r1; const bool insert_good = std::abs(dist) <= mu + 6 * sigma; diff --git a/src/sam.hpp b/src/sam.hpp index 85a905ff..c214dac4 100644 --- a/src/sam.hpp +++ b/src/sam.hpp @@ -9,15 +9,14 @@ struct Alignment { - Cigar cigar; + int ref_id; int ref_start; - int ed; + Cigar cigar; + int edit_distance; int global_ed; - int sw_score; - int aln_score; - int ref_id; - int mapq; - int aln_length; + int score; + uint8_t mapq; + int length; bool is_rc; bool is_unaligned{false}; // Whether a gapped alignment function was used to obtain this alignment @@ -80,14 +79,14 @@ class Sam { } /* Add an alignment */ - void add(const Alignment& sam_aln, const klibpp::KSeq& record, const std::string& sequence_rc, bool is_primary, const Details& details); - void add_pair(const Alignment& sam_aln1, const Alignment& sam_aln2, const klibpp::KSeq& record1, const klibpp::KSeq& record2, const std::string& read1_rc, const std::string& read2_rc, int mapq1, int mapq2, bool is_proper, bool is_primary, const std::array& details); - void add_unmapped(const klibpp::KSeq& record, int flags = UNMAP); + void add(const Alignment& alignment, const klibpp::KSeq& record, const std::string& sequence_rc, bool is_primary, const Details& details); + void add_pair(const Alignment& alignment1, const Alignment& alignment2, const klibpp::KSeq& record1, const klibpp::KSeq& record2, const std::string& read1_rc, const std::string& read2_rc, uint8_t mapq1, uint8_t mapq2, bool is_proper, bool is_primary, const std::array& details); + void add_unmapped(const klibpp::KSeq& record, uint16_t flags = UNMAP); void add_unmapped_pair(const klibpp::KSeq& r1, const klibpp::KSeq& r2); - void add_unmapped_mate(const klibpp::KSeq& record, int flags, const std::string& mate_reference_name, int mate_pos); + void add_unmapped_mate(const klibpp::KSeq& record, uint16_t flags, const std::string& mate_reference_name, uint32_t mate_pos); private: - void add_record(const std::string& query_name, int flags, const std::string& reference_name, int pos, int mapq, const Cigar& cigar, const std::string& mate_reference_name, int mate_pos, int template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score, const Details& details); + void add_record(const std::string& query_name, uint16_t flags, const std::string& reference_name, uint32_t pos, uint8_t mapq, const Cigar& cigar, const std::string& mate_reference_name, uint32_t mate_pos, int32_t template_len, const std::string& query_sequence, const std::string& query_sequence_rc, const std::string& qual, int ed, int aln_score, const Details& details); void append_qual(const std::string& qual) { sam_string.append("\t"); sam_string.append(qual.empty() ? "*" : qual); @@ -105,6 +104,6 @@ class Sam { bool show_details; }; -bool is_proper_pair(const Alignment& sam_aln1, const Alignment& sam_aln2, float mu, float sigma); +bool is_proper_pair(const Alignment& alignment1, const Alignment& alignment2, float mu, float sigma); #endif diff --git a/src/unused.cpp b/src/unused.cpp index 27550d93..3282f5f0 100644 --- a/src/unused.cpp +++ b/src/unused.cpp @@ -9,10 +9,23 @@ #include "sam.hpp" #include "aln.hpp" #include "nam.hpp" +#include "revcomp.hpp" + int main() { } +static inline Alignment align_segment( + const Aligner& aligner, + const std::string &read_segm, + const std::string &ref_segm, + int ref_start, + int ext_left, + int ext_right, + bool consistent_nam, + bool is_rc +); + class RefRandstrobeWithoutHash { public: RefRandstrobeWithoutHash() { } // TODO should not be needed @@ -1273,13 +1286,13 @@ static inline bool sort_lowest_ed_scores_single(const std::tuple struct Hit { unsigned int count; unsigned int offset; - unsigned int query_s; - unsigned int query_e; + unsigned int query_start; + unsigned int query_end; bool is_rc; bool operator< (const Hit& rhs) const { - return std::tie(count, offset, query_s, query_e, is_rc) - < std::tie(rhs.count, rhs.offset, rhs.query_s, rhs.query_e, rhs.is_rc); + return std::tie(count, offset, query_start, query_end, is_rc) + < std::tie(rhs.count, rhs.offset, rhs.query_start, rhs.query_end, rhs.is_rc); } }; @@ -1386,8 +1399,8 @@ static inline void find_nams_rescue( // std::cerr << "Q " << h.query_s << " " << h.query_e << " read length:" << read_length << std::endl; auto count = q.count; auto offset = q.offset; - h.query_s = q.query_s; - h.query_e = q.query_e; // h.query_s + read_length/2; + h.query_s = q.query_start; + h.query_e = q.query_end; // h.query_s + read_length/2; h.is_rc = q.is_rc; if ( ((count <= filter_cutoff) || (cnt < 5)) && (count <= 1000) ){ @@ -1430,8 +1443,8 @@ static inline void find_nams_rescue( { auto count = q.count; auto offset = q.offset; - h.query_s = q.query_s; - h.query_e = q.query_e; // h.query_s + read_length/2; + h.query_s = q.query_start; + h.query_e = q.query_end; // h.query_s + read_length/2; h.is_rc = q.is_rc; if ( ((count <= filter_cutoff) || (cnt < 5)) && (count <= 1000) ){ @@ -1489,10 +1502,10 @@ static inline void find_nams_rescue( for (auto & o : open_nams) { // Extend NAM - if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_s) && (h.query_s <= o.query_e ) && (o.ref_prev_hit_startpos < h.ref_s) && (h.ref_s <= o.ref_e) ){ - if ( (h.query_e > o.query_e) && (h.ref_e > o.ref_e) ) { - o.query_e = h.query_e; - o.ref_e = h.ref_e; + if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_s) && (h.query_s <= o.query_end ) && (o.ref_prev_hit_startpos < h.ref_s) && (h.ref_s <= o.ref_end) ){ + if ( (h.query_e > o.query_end) && (h.ref_e > o.ref_end) ) { + o.query_end = h.query_e; + o.ref_end = h.ref_e; // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf @@ -1502,7 +1515,7 @@ static inline void find_nams_rescue( is_added = true; break; } - else if ((h.query_e <= o.query_e) && (h.ref_e <= o.ref_e)) { + else if ((h.query_e <= o.query_end) && (h.ref_e <= o.ref_end)) { // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf @@ -1526,10 +1539,10 @@ static inline void find_nams_rescue( Nam n; n.nam_id = nam_id_cnt; nam_id_cnt ++; - n.query_s = h.query_s; - n.query_e = h.query_e; - n.ref_s = h.ref_s; - n.ref_e = h.ref_e; + n.query_start = h.query_s; + n.query_end = h.query_e; + n.ref_start = h.ref_s; + n.ref_end = h.ref_e; n.ref_id = ref_id; // n.previous_query_start = h.query_s; // n.previous_ref_start = h.ref_s; @@ -1546,9 +1559,9 @@ static inline void find_nams_rescue( // Output all NAMs from open_matches to final_nams that the current hit have passed for (auto &n : open_nams) { - if (n.query_e < h.query_s) { - int n_max_span = std::max(n.query_e - n.query_s, n.ref_e - n.ref_s); - int n_min_span = std::max(n.query_e - n.query_s, n.ref_e - n.ref_s); + if (n.query_end < h.query_s) { + int n_max_span = std::max(n.query_end - n.query_start, n.ref_end - n.ref_start); + int n_min_span = std::max(n.query_end - n.query_start, n.ref_end - n.ref_start); float n_score; n_score = ( 2*n_min_span - n_max_span) > 0 ? (float) (n.n_hits * ( 2*n_min_span - n_max_span) ) : 1; // this is really just n_hits * ( min_span - (offset_in_span) ) ); // n_score = n.n_hits * (n.query_e - n.query_s); @@ -1560,7 +1573,7 @@ static inline void find_nams_rescue( // Remove all NAMs from open_matches that the current hit have passed unsigned int c = h.query_s; - auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_e < c; }; + auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_end < c; }; open_nams.erase(std::remove_if(open_nams.begin(), open_nams.end(), predicate), open_nams.end()); prev_q_start = h.query_s; } @@ -1568,8 +1581,8 @@ static inline void find_nams_rescue( // Add all current open_matches to final NAMs for (auto &n : open_nams){ - int n_max_span = std::max(n.query_e - n.query_s, n.ref_e - n.ref_s); - int n_min_span = std::min(n.query_e - n.query_s, n.ref_e - n.ref_s); + int n_max_span = std::max(n.query_end - n.query_start, n.ref_end - n.ref_start); + int n_min_span = std::min(n.query_end - n.query_start, n.ref_end - n.ref_start); float n_score; n_score = ( 2*n_min_span - n_max_span) > 0 ? (float) (n.n_hits * ( 2*n_min_span - n_max_span) ) : 1; // this is really just n_hits * ( min_span - (offset_in_span) ) ); // n_score = n.n_hits * (n.query_e - n.query_s); @@ -1715,10 +1728,10 @@ static inline std::pair find_nams( for (auto & o : open_nams) { // Extend NAM - if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_s) && (h.query_s <= o.query_e ) && (o.ref_prev_hit_startpos < h.ref_s) && (h.ref_s <= o.ref_e) ){ - if ( (h.query_e > o.query_e) && (h.ref_e > o.ref_e) ) { - o.query_e = h.query_e; - o.ref_e = h.ref_e; + if (( o.is_rc == h.is_rc) && (o.query_prev_hit_startpos < h.query_s) && (h.query_s <= o.query_end ) && (o.ref_prev_hit_startpos < h.ref_s) && (h.ref_s <= o.ref_end) ){ + if ( (h.query_e > o.query_end) && (h.ref_e > o.ref_end) ) { + o.query_end = h.query_e; + o.ref_end = h.ref_e; // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf @@ -1728,7 +1741,7 @@ static inline std::pair find_nams( is_added = true; break; } - else if ((h.query_e <= o.query_e) && (h.ref_e <= o.ref_e)) { + else if ((h.query_e <= o.query_end) && (h.ref_e <= o.ref_end)) { // o.previous_query_start = h.query_s; // o.previous_ref_start = h.ref_s; // keeping track so that we don't . Can be caused by interleaved repeats. o.query_prev_hit_startpos = h.query_s; // log the last strobemer hit in case of outputting paf @@ -1776,10 +1789,10 @@ static inline std::pair find_nams( Nam n; n.nam_id = nam_id_cnt; nam_id_cnt ++; - n.query_s = h.query_s; - n.query_e = h.query_e; - n.ref_s = h.ref_s; - n.ref_e = h.ref_e; + n.query_start = h.query_s; + n.query_end = h.query_e; + n.ref_start = h.ref_s; + n.ref_end = h.ref_e; n.ref_id = ref_id; // n.previous_query_start = h.query_s; // n.previous_ref_start = h.ref_s; @@ -1796,9 +1809,9 @@ static inline std::pair find_nams( // Output all NAMs from open_matches to final_nams that the current hit have passed for (auto &n : open_nams) { - if (n.query_e < h.query_s) { - int n_max_span = std::max(n.query_e - n.query_s, n.ref_e - n.ref_s); - int n_min_span = std::min(n.query_e - n.query_s, n.ref_e - n.ref_s); + if (n.query_end < h.query_s) { + int n_max_span = std::max(n.query_end - n.query_start, n.ref_end - n.ref_start); + int n_min_span = std::min(n.query_end - n.query_start, n.ref_end - n.ref_start); float n_score; n_score = ( 2*n_min_span - n_max_span) > 0 ? (float) (n.n_hits * ( 2*n_min_span - n_max_span) ) : 1; // this is really just n_hits * ( min_span - (offset_in_span) ) ); // n_score = n.n_hits * (n.query_e - n.query_s); @@ -1810,7 +1823,7 @@ static inline std::pair find_nams( // Remove all NAMs from open_matches that the current hit have passed unsigned int c = h.query_s; - auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_e < c; }; + auto predicate = [c](decltype(open_nams)::value_type const &nam) { return nam.query_end < c; }; open_nams.erase(std::remove_if(open_nams.begin(), open_nams.end(), predicate), open_nams.end()); prev_q_start = h.query_s; } @@ -1818,8 +1831,8 @@ static inline std::pair find_nams( // Add all current open_matches to final NAMs for (auto &n : open_nams){ - int n_max_span = std::max(n.query_e - n.query_s, n.ref_e - n.ref_s); - int n_min_span = std::min(n.query_e - n.query_s, n.ref_e - n.ref_s); + int n_max_span = std::max(n.query_end - n.query_start, n.ref_end - n.ref_start); + int n_min_span = std::min(n.query_end - n.query_start, n.ref_end - n.ref_start); float n_score; n_score = ( 2*n_min_span - n_max_span) > 0 ? (float) (n.n_hits * ( 2*n_min_span - n_max_span) ) : 1; // this is really just n_hits * ( min_span - (offset_in_span) ) ); // n_score = n.n_hits * (n.query_e - n.query_s); @@ -1835,3 +1848,238 @@ static inline std::pair find_nams( // } return info; } + + +static inline Alignment get_alignment_unused( + const Aligner& aligner, + const Nam &nam, + const References& references, + const Read& read, + int k, + bool consistent_nam +) { + Alignment alignment; + + const auto read_len = read.size(); + const int diff = std::abs(nam.ref_span() - nam.query_span()); + + const std::string r_tmp = nam.is_rc ? read.rc : read.seq; + const bool is_rc = nam.is_rc; + + int ext_left; + int ext_right; + int ref_tmp_segm_size; + const int ref_len = references.lengths[nam.ref_id]; + int ref_tmp_start; + std::string read_segm; + + // test full hamming based alignment first + ref_tmp_start = std::max(0, nam.ref_start - nam.query_start); + int ref_start = std::max(0, ref_tmp_start); + ref_tmp_segm_size = read_len + diff; + auto ref_segm_size = std::min(ref_tmp_segm_size, ref_len - ref_start + 1); + + std::string ref_segm = references.sequences[nam.ref_id].substr(ref_start, ref_segm_size); + if (ref_segm_size == read_len && consistent_nam) { + int hamming_dist = hamming_distance(r_tmp, ref_segm); + + if (hamming_dist >= 0 && (((float) hamming_dist / ref_segm_size) < 0.05) ) { //Hamming distance worked fine, no need to ksw align + auto info = hamming_align(r_tmp, ref_segm, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); + alignment.cigar = std::move(info.cigar); + alignment.edit_distance = info.edit_distance; + alignment.score = info.sw_score; // aln_params.match*(read_len-hamming_dist) - aln_params.mismatch*hamming_dist; + alignment.ref_start = ref_start + ext_left + info.query_start; + alignment.is_rc = is_rc; + alignment.is_unaligned = false; + return alignment; + } + } + + //// Didn't work with global Hamming - split into parts + + // identify one or two split points within the read if the segment is are larger than T + int T = 20; + // find the most central split point Use convex function result sum of squares + int left_outer = pow (nam.query_start, 2) + pow(read_len - nam.query_start, 2); + int right_inner = pow (nam.query_end - k, 2) + pow (read_len - (nam.query_end - k), 2); + + + int global_max_bp = left_outer < right_inner ? nam.query_start : nam.query_end - k; + int break_point = (global_max_bp >= T) && (global_max_bp <= (read_len - T)) ? global_max_bp : -1; + if (break_point > 0 ){ +// std::cerr << "MAX BREAKPOINT " << break_point << " candidates: " << n.query_s << " " << n.query_e - k << std::endl; + int left_region_bp = break_point + k; + int right_region_bp = break_point; +// std::cerr << "left_region_bp " << left_region_bp << " right_region_bp: " << right_region_bp << std::endl; + int right_ref_start_bp = -1; + if (break_point == nam.query_start){ + right_ref_start_bp = nam.ref_start; + } else if (break_point == (nam.query_end - k)) { + right_ref_start_bp = nam.ref_end-k; + } else { + std::cerr << "BUUUUUUG " << std::endl; + } + + // Left region align + read_segm = r_tmp.substr(0, left_region_bp); + ref_tmp_start = std::max(0, nam.ref_start - nam.query_start); + ext_left = std::min(50, ref_tmp_start); + ext_right = 0; + ref_start = ref_tmp_start - ext_left; + ref_segm_size = left_region_bp + ext_left + diff; + ref_segm = references.sequences[nam.ref_id].substr(ref_start, ref_segm_size); +// std::cerr << " " << std::endl; +// std::cerr << "GOING IN LEFT: " << " read segm len " << read_segm.length() << " ref segm len " << ref_segm_size << " ext_left: " << ext_left << std::endl; +// std::cerr << diff << std::endl; +// std::cerr << read_segm << std::endl; +// std::cerr << ref_segm << std::endl; + + auto alignment_segm_left = align_segment(aligner, read_segm, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); +// std::cerr << "LEFT CIGAR: " << alignment_segm_left.cigar << std::endl; + + //Right region align + read_segm = r_tmp.substr(right_region_bp, read_len - right_region_bp ); + ref_tmp_segm_size = right_ref_start_bp + (read_len + diff - right_region_bp) < ref_len ? (read_len + diff - right_region_bp) : ref_len - right_ref_start_bp; + ext_left = 0; + ext_right = std::min(50, ref_len - (right_ref_start_bp + ref_tmp_segm_size)); + ref_segm_size = ref_tmp_segm_size + ext_right; + ref_segm = references.sequences[nam.ref_id].substr(right_ref_start_bp, ref_segm_size); +// std::cerr << " " << std::endl; +// std::cerr << "GOING IN RIGHT: " << " read segm len " << read_segm.length() << " ref segm len " << ref_segm_size << " ext_right: " << ext_right << std::endl; +// std::cerr << diff << std::endl; +// std::cerr << read_segm << std::endl; +// std::cerr << ref_segm << std::endl; +// std::cerr << "read_segm.length(): " << read_segm.length() << " ref_segm_size " << ref_segm_size << std::endl; + auto alignment_segm_right = align_segment(aligner, read_segm, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); +// std::cerr << "RIGHT CIGAR: " << alignment_segm_right.cigar << std::endl; + + + // Stitch together + alignment.ref_id = nam.ref_id; + alignment.cigar = Cigar(alignment_segm_left.cigar.to_string() + alignment_segm_right.cigar.to_string()); + alignment.edit_distance = alignment_segm_left.edit_distance + alignment_segm_right.edit_distance; + alignment.score = alignment_segm_left.score + alignment_segm_right.score; + alignment.ref_start = alignment_segm_left.ref_start; + alignment.is_rc = nam.is_rc; + alignment.is_unaligned = false; + } else { +// std::cerr << "NOOOO MAX BREAKPOINT " << break_point << " candidates: " << n.query_s << " " << n.query_e - k << std::endl; + // full align + ref_tmp_start = std::max(0, nam.ref_start - nam.query_start); + ext_left = std::min(50, ref_tmp_start); + ref_start = ref_tmp_start - ext_left; + + ref_tmp_segm_size = read_len + diff; + ext_right = std::min(50, ref_len - (nam.ref_end +1)); + + ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; + ref_segm = references.sequences[nam.ref_id].substr(ref_start, ref_segm_size); +// std::cerr << " ref_tmp_start " << ref_tmp_start << " ext left " << ext_left << " ext right " << ext_right << " ref_tmp_segm_size " << ref_tmp_segm_size << " ref_segm_size " << ref_segm_size << " ref_segm " << ref_segm << std::endl; + alignment = align_segment(aligner, r_tmp, ref_segm, ref_start, ext_left, ext_right, consistent_nam, is_rc); + alignment.ref_id = nam.ref_id; + } + + + // TODO: Several breakpoints. To complicated for probably little gain if short reads, maybe implement later.. +// // Left and right breakpoint +// int left_bp_read = n.query_s + k; +// int right_bp_read = n.query_e - k; +// left_bp_read = left_bp_read > T ? left_bp_read : -1; +// right_bp_read = read_len - right_bp_read > T ? read_len - right_bp_read : -1; +// +// // Decide where to break +// bool break1, break2; +// if ( ((read_len - right_bp_read - left_bp_read) > T) && (left_bp_read >= T) && (right_bp_read >= T) ){ // Use two breakpoints ---------------X|------------------X|--------------- +// break1 = true; +// break2 = true; +// std::cerr << "CASE1 " << std::endl; +// } else if ( (left_bp_read >= T) && (right_bp_read >= T) ) { // Use one breakpoint ---------------------------X--------X-------------------------- +// break1 = left_bp_read >= right_bp_read ? true : false; // ------------------------------------X|--------X-------------------- +// break2 = left_bp_read < right_bp_read ? true : false; // ---------------------X--------|X----------------------------------- +// std::cerr << "CASE2 " << std::endl; +// } else if ( (left_bp_read >= T) && ( (read_len - left_bp_read) >= T) ) { // Use one breakpoint -----------------------X|-------------------X------- +// break1 = true; +// break2 = false; +// std::cerr << "CASE3 " << std::endl; +// } else if ( ((right_bp_read) > T) && (read_len - right_bp_read >= T) ) { // Use one breakpoint ----------X-------------------X|--------------------------- +// break1 = false; +// break2 = true; +// std::cerr << "CASE4 " << std::endl; +// } else { // No breakpoints ------X--------------------------------------X-------- +// break1 = false; +// break2 = false; +// std::cerr << "CASE5 " << std::endl; +// } +// +// std::cerr << "BREAKPOINTS: LEFT: " << left_bp_read << " RIGHT: " << right_bp_read << " BREAKING LEFT: " << break1 << " BREAKING RIGHT: " << break2 << std::endl; +// std::cerr << "MAPPING POS n.ref_s: " << n.ref_s << " n.ref_e: " << n.ref_e << " n.query_s " << n.query_s << " n.query_e: " << n.query_e << std::endl; +// std::cerr << " " << std::endl; + + +// // left alignment +// ref_end = n.ref_s + k; +// ref_start = ref_end - ext_left; +// ext_left = ref_end < 50 ? ref_end : 50; +// +// ref_tmp_segm_size = n.query_s + k; +// ext_right = 0; +// +// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; +// ref_segm = ref_seqs[n.ref_id].substr(ref_end - ext_left, ref_segm_size); +// alignment alignment_segm_left; +// alignment_segm_left.ref_id = n.ref_id; +// read_segm = r_tmp.substr(0, n.query_s+k); +// +// std::cerr << "LEFT: ref_start: " << ref_start << " LEFT: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; +// +// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, alignment_segm_left, tot_ksw_aligned); +// +// +// // center alignment +// ref_tmp_start = n.ref_s - n.query_s; +// ref_start = ref_tmp_start > 0 ? ref_tmp_start : 0; +// ext_left = 0; +// +// ref_tmp_segm_size = n.ref_e - n.ref_s; +// ext_right = 0; +// +// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; +// ref_segm = ref_seqs[n.ref_id].substr(ref_start, ref_segm_size); +// alignment alignment_segm_center; +// alignment_segm_center.ref_id = n.ref_id; +// read_segm = r_tmp.substr(n.query_s, n.query_e - n.query_s); +// std::cerr << "CENTER: ref_start: " << ref_start << " CENTER: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; +// +// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, alignment_segm_center, tot_ksw_aligned); +// +// +// // right alignment +// ext_left = 0; +// ref_start = n.ref_e - k; +// +// ext_right = ref_len - (n.ref_e +1) < 50 ? ref_len - (n.ref_e +1) : 50; +// ref_tmp_segm_size = n.ref_e + ext_right - ref_start; +// +// ref_segm_size = ref_tmp_segm_size + ext_left + ext_right; +// ref_segm = ref_seqs[n.ref_id].substr(n.ref_e - ext_left, ref_segm_size); +// alignment alignment_segm_right; +// alignment_segm_left.ref_id = n.ref_id; +// read_segm = r_tmp.substr(0, n.query_s+k); +// std::cerr << "RIGHT: ref_start: " << ref_start << " RIGHT: ref_end: " << ref_end << " ref_segm_size " << ref_segm_size << " read segm size: " << read_segm.length() << std::endl; +// +// align_segment(aln_params, read_segm, ref_segm, read_segm.length(), ref_segm_size, ref_start, ext_left, ext_right, consistent_nam, is_rc, alignment_segm_right, tot_ksw_aligned); +// +// std::cout << alignment_segm_left.cigar << " " << alignment_segm_center.cigar << " " << alignment_segm_right.cigar << std::endl; +// +// alignment.ref_id = n.ref_id; +// alignment.cigar = alignment_segm_left.cigar + alignment_segm_center.cigar + alignment_segm_right.cigar; +// alignment.ed = alignment_segm_left.ed + alignment_segm_center.ed + alignment_segm_right.ed; +// alignment.sw_score = alignment_segm_left.sw_score + alignment_segm_center.sw_score + alignment_segm_right.sw_score; +// alignment.ref_start = alignment_segm_left.ref_start; +// alignment.is_rc = alignment_segm_left.is_rc; +// alignment.is_unaligned = false; +// alignment.aln_score = alignment.sw_score; +// std::cout << "Joint: " << alignment.cigar << std::endl; + + return alignment; +} diff --git a/tests/test_sam.cpp b/tests/test_sam.cpp index 0177dd6a..b54a195d 100644 --- a/tests/test_sam.cpp +++ b/tests/test_sam.cpp @@ -50,8 +50,8 @@ TEST_CASE("Sam::add") { aln.is_unaligned = false; aln.is_rc = true; aln.ref_start = 2; - aln.ed = 3; - aln.aln_score = 9; + aln.edit_distance = 3; + aln.score = 9; aln.mapq = 55; aln.cigar = Cigar("2S2=1X3=3S"); @@ -87,8 +87,8 @@ TEST_CASE("Pair with one unmapped SAM record") { aln1.is_unaligned = false; aln1.ref_start = 2; aln1.is_rc = true; - aln1.ed = 17; - aln1.aln_score = 9; + aln1.edit_distance = 17; + aln1.score = 9; aln1.cigar = Cigar("2M"); Alignment aln2; @@ -143,8 +143,8 @@ TEST_CASE("TLEN zero when reads map to different contigs") { aln1.is_unaligned = false; aln1.ref_start = 2; aln1.is_rc = false; - aln1.ed = 17; - aln1.aln_score = 9; + aln1.edit_distance = 17; + aln1.score = 9; aln1.cigar = Cigar("2M"); Alignment aln2; @@ -152,8 +152,8 @@ TEST_CASE("TLEN zero when reads map to different contigs") { aln2.ref_id = 1; aln2.ref_start = 3; aln2.is_rc = false; - aln2.ed = 2; - aln2.aln_score = 4; + aln2.edit_distance = 2; + aln2.score = 4; aln2.cigar = Cigar("3M"); klibpp::KSeq record1;