Skip to content

Commit

Permalink
Assign single-end multimappers to a more random mapping site
Browse files Browse the repository at this point in the history
by shuffling the top-scoring NAMs

See #359
  • Loading branch information
marcelm committed Nov 16, 2023
1 parent 8c74210 commit 76995c2
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 4 deletions.
26 changes: 24 additions & 2 deletions src/aln.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "aln.hpp"

#include <algorithm>
#include <iostream>
#include <math.h>
#include <sstream>
Expand Down Expand Up @@ -958,6 +959,22 @@ void InsertSizeDistribution::update(int dist) {
}
}

/* Shuffle the top-scoring NAMs. Input must be sorted by score.
*
* This helps to ensure we pick a random location in case there are multiple
* equally good ones.
*/
void shuffle_top_nams(std::vector<Nam>& nams, std::minstd_rand& random_engine) {
if (nams.empty()) {
return;
}
auto best_score = nams[0].score;
auto it = std::find_if(nams.begin(), nams.end(), [&](const Nam& nam) { return nam.score != best_score; });
if (it != nams.end()) {
std::shuffle(nams.begin(), it, random_engine);
}
}

void align_PE_read(
const KSeq &record1,
const KSeq &record2,
Expand Down Expand Up @@ -1039,7 +1056,8 @@ void align_SE_read(
const MappingParameters &map_param,
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index
const StrobemerIndex& index,
std::minstd_rand& random_engine
) {
Details details;
Timer strobe_timer;
Expand All @@ -1059,12 +1077,16 @@ void align_SE_read(
}
statistics.tot_time_rescue += rescue_timer.duration();
}

details.nams = nams.size();

Timer nam_sort_timer;
std::sort(nams.begin(), nams.end(), by_score<Nam>);

shuffle_top_nams(nams, random_engine);

statistics.tot_sort_nams += nam_sort_timer.duration();


Timer extend_timer;
if (!map_param.is_sam_out) {
output_hits_paf(outstring, nams, record.name, references,
Expand Down
4 changes: 3 additions & 1 deletion src/aln.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <string>
#include <vector>
#include <random>
#include "kseq++/kseq++.hpp"
#include "index.hpp"
#include "refs.hpp"
Expand Down Expand Up @@ -111,7 +112,8 @@ void align_SE_read(
const MappingParameters& map_param,
const IndexParameters& index_parameters,
const References& references,
const StrobemerIndex& index
const StrobemerIndex& index,
std::minstd_rand& random_engine
);

// Private declarations, only here because we need them in tests
Expand Down
5 changes: 4 additions & 1 deletion src/pc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ void perform_task(
) {
bool eof = false;
Aligner aligner{aln_params};
std::minstd_rand random_engine;
while (!eof) {
std::vector<klibpp::KSeq> records1;
std::vector<klibpp::KSeq> records2;
Expand All @@ -161,6 +162,8 @@ void perform_task(
sam_out.reserve(7*map_param.r * (records1.size() + records3.size()));
Sam sam{sam_out, references, map_param.cigar_ops, read_group_id, map_param.output_unmapped, map_param.details};
InsertSizeDistribution isize_est;
// Use chunk index as random seed for reproducibility
random_engine.seed(chunk_index);
for (size_t i = 0; i < records1.size(); ++i) {
auto record1 = records1[i];
auto record2 = records2[i];
Expand All @@ -172,7 +175,7 @@ void perform_task(
}
for (size_t i = 0; i < records3.size(); ++i) {
auto record = records3[i];
align_SE_read(record, sam, sam_out, statistics, aligner, map_param, index_parameters, references, index);
align_SE_read(record, sam, sam_out, statistics, aligner, map_param, index_parameters, references, index, random_engine);
statistics.n_reads++;
}
output_buffer.output_records(std::move(sam_out), chunk_index);
Expand Down

0 comments on commit 76995c2

Please sign in to comment.