From 25d6e3e040052706197d5efef967e94f31ccebb1 Mon Sep 17 00:00:00 2001 From: Hannes Hauswedell Date: Tue, 21 Nov 2023 00:00:41 +0100 Subject: [PATCH] WIP --- src/search_algo.hpp | 58 ++++++++++++++++++++--------------- src/search_datastructures.hpp | 6 +++- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/search_algo.hpp b/src/search_algo.hpp index 67394d442..1b70f56dd 100644 --- a/src/search_algo.hpp +++ b/src/search_algo.hpp @@ -872,6 +872,10 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH) record.matches.resize(lH.options.maxMatches); } lH.stats.hitsFinal += record.matches.size(); + for (auto it = record.matches.begin(), it2 = std::next(record.matches.begin()); it2 != record.matches.end(); + ++it, ++it2) + if (it->_n_sId != it2->_n_sId) + ++lH.stats.pairs; // compute LCA if (lH.options.computeLCA) @@ -922,7 +926,8 @@ inline void _widenMatch(Match & m, TLocalHolder const & lH) uint64_t band = _bandSize(lH.transQrySeqs[m.qryId].size()); // end on subject is beginning plus full query length plus band - m.subjEnd = std::min(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size()); + m.subjEnd = + std::min(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size()); // account for band in subj start m.subjStart = (band < m.subjStart) ? m.subjStart - band : 0; @@ -1125,19 +1130,19 @@ inline void _performAlignment(TDepSetH & depSetH, } template -inline void _widenAndPreprocessMatches(TLocalHolder & lH) +inline void _widenAndPreprocessMatches(std::span & matches, TLocalHolder & lH) { - auto before = lH.matches.size(); + auto before = matches.size(); - for (Match & m : lH.matches) + for (Match & m : matches) _widenMatch(m, lH); - std::ranges::sort(lH.matches); + std::ranges::sort(matches); - if (lH.matches.size() > 1) + if (matches.size() > 1) { // pairwise merge from left to right - for (auto it = lH.matches.begin(); it < lH.matches.end() - 1; ++it) + for (auto it = matches.begin(); it < matches.end() - 1; ++it) { Match & l = *it; Match & r = *(it + 1); @@ -1149,7 +1154,7 @@ inline void _widenAndPreprocessMatches(TLocalHolder & lH) } // pairwise "swallow" from right to left - for (auto it = lH.matches.rbegin(); it < lH.matches.rend() - 1; ++it) + for (auto it = matches.rbegin(); it < matches.rend() - 1; ++it) { Match & l = *it; Match & r = *(it + 1); @@ -1159,14 +1164,14 @@ inline void _widenAndPreprocessMatches(TLocalHolder & lH) } } - auto const ret = std::ranges::unique(lH.matches); - lH.matches.erase(ret.begin(), ret.end()); - lH.stats.hitsDuplicate += (before - lH.matches.size()); + auto [new_end, old_end] = std::ranges::unique(matches); // move non-uniq to the end + matches = std::span{matches.begin(), new_end}; // "resize" of the span + lH.stats.hitsDuplicate += (before - matches.size()); } } template -inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bsDirection::fwd) +inline void iterateMatchesFullSimd(std::span lambdaMatches, TLocalHolder & lH, bsDirection const dir) { using TGlobalHolder = typename TLocalHolder::TGlobalHolder; using TBlastMatch = typename TLocalHolder::TBlastMatch; @@ -1176,7 +1181,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs // statistics #ifdef LAMBDA_MICRO_STATS ++lH.stats.numQueryWithExt; - lH.stats.numExtScore += seqan::length(lH.matches); + lH.stats.numExtScore += seqan::length(lambdaMatches); double start = sysTime(); #endif @@ -1186,19 +1191,12 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs seqan::StringSet::Type> depSetV; // pre-sort and filter - _widenAndPreprocessMatches(lH); + _widenAndPreprocessMatches(lambdaMatches, lH); // create blast matches from Lambda matches std::list blastMatches; - for (Match const & m : lH.matches) + for (Match const & m : lambdaMatches) { - /* TODO we need to externalise this because right now we sort and merge hits that are not need in BS mode */ - // In BS-mode, skip those results that have wrong orientation - if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS) - { - if ((dir == bsDirection::fwd && (m.subjId % 2)) || (dir == bsDirection::rev && !(m.subjId % 2))) - continue; - } // create blastmatch in list without copy or move blastMatches.emplace_back(lH.qryIds[m.qryId / TGlobalHolder::qryNumFrames], const_gH.indexFile.ids[m.subjId / TGlobalHolder::sbjNumFrames]); @@ -1362,12 +1360,24 @@ inline void writeRecords(TLocalHolder & lH) template inline void iterateMatches(TLocalHolder & lH) { - iterateMatchesFullSimd(lH, bsDirection::fwd); if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS) { - iterateMatchesFullSimd(lH, bsDirection::rev); + std::ranges::sort(lH.matches, + [](Match const & l, Match const & r) { + return std::tuple{l.subjId % 2, l} < + std::tuple{r.subjId % 2, r}; + }); + + auto it = std::ranges::find_if(lH.matches, [](Match const & m) { return m.subjId % 2; }); + + iterateMatchesFullSimd(std::span{lH.matches.begin(), it}, lH, bsDirection::fwd); + iterateMatchesFullSimd(std::span{it, lH.matches.end()}, lH, bsDirection::rev); lH.blastMatches.sort([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId; }); } + else + { + iterateMatchesFullSimd(lH.matches, lH, bsDirection::fwd); + } } //----------------------------------------------------------------------- diff --git a/src/search_datastructures.hpp b/src/search_datastructures.hpp index 6ee5fc621..aaed45a10 100644 --- a/src/search_datastructures.hpp +++ b/src/search_datastructures.hpp @@ -112,6 +112,7 @@ struct StatsHolder // final uint64_t hitsFinal; uint64_t qrysWithHit; + uint64_t pairs; #ifdef LAMBDA_MICRO_STATS // times @@ -149,6 +150,7 @@ struct StatsHolder hitsFinal = 0; qrysWithHit = 0; + pairs = 0; #ifdef LAMBDA_MICRO_STATS seedLengths.clear(); @@ -181,6 +183,7 @@ struct StatsHolder hitsFinal += rhs.hitsFinal; qrysWithHit += rhs.qrysWithHit; + pairs += rhs.pairs; #ifdef LAMBDA_MICRO_STATS seqan::append(seedLengths, rhs.seedLengths); @@ -280,7 +283,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options) if (options.verbosity >= 1) { auto const w = seqan::_numberOfDigits(stats.hitsFinal); - std::cout << "Number of valid hits: " << std::setw(w) << stats.hitsFinal + std::cout << "Number of total hits: " << std::setw(w) << stats.hitsFinal + << "\nNumber of Query-Subject pairs: " << std::setw(w) << stats.pairs << "\nNumber of Queries with at least one valid hit: " << std::setw(w) << stats.qrysWithHit << "\n"; } }