From 25d6e3e040052706197d5efef967e94f31ccebb1 Mon Sep 17 00:00:00 2001
From: Hannes Hauswedell
Date: Tue, 21 Nov 2023 00:00:41 +0100
Subject: [PATCH] WIP
---
src/search_algo.hpp | 58 ++++++++++++++++++++---------------
src/search_datastructures.hpp | 6 +++-
2 files changed, 39 insertions(+), 25 deletions(-)
diff --git a/src/search_algo.hpp b/src/search_algo.hpp
index 67394d442..1b70f56dd 100644
--- a/src/search_algo.hpp
+++ b/src/search_algo.hpp
@@ -872,6 +872,10 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
record.matches.resize(lH.options.maxMatches);
}
lH.stats.hitsFinal += record.matches.size();
+ for (auto it = record.matches.begin(), it2 = std::next(record.matches.begin()); it2 != record.matches.end();
+ ++it, ++it2)
+ if (it->_n_sId != it2->_n_sId)
+ ++lH.stats.pairs;
// compute LCA
if (lH.options.computeLCA)
@@ -922,7 +926,8 @@ inline void _widenMatch(Match & m, TLocalHolder const & lH)
uint64_t band = _bandSize(lH.transQrySeqs[m.qryId].size());
// end on subject is beginning plus full query length plus band
- m.subjEnd = std::min(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size());
+ m.subjEnd =
+ std::min(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size());
// account for band in subj start
m.subjStart = (band < m.subjStart) ? m.subjStart - band : 0;
@@ -1125,19 +1130,19 @@ inline void _performAlignment(TDepSetH & depSetH,
}
template
-inline void _widenAndPreprocessMatches(TLocalHolder & lH)
+inline void _widenAndPreprocessMatches(std::span & matches, TLocalHolder & lH)
{
- auto before = lH.matches.size();
+ auto before = matches.size();
- for (Match & m : lH.matches)
+ for (Match & m : matches)
_widenMatch(m, lH);
- std::ranges::sort(lH.matches);
+ std::ranges::sort(matches);
- if (lH.matches.size() > 1)
+ if (matches.size() > 1)
{
// pairwise merge from left to right
- for (auto it = lH.matches.begin(); it < lH.matches.end() - 1; ++it)
+ for (auto it = matches.begin(); it < matches.end() - 1; ++it)
{
Match & l = *it;
Match & r = *(it + 1);
@@ -1149,7 +1154,7 @@ inline void _widenAndPreprocessMatches(TLocalHolder & lH)
}
// pairwise "swallow" from right to left
- for (auto it = lH.matches.rbegin(); it < lH.matches.rend() - 1; ++it)
+ for (auto it = matches.rbegin(); it < matches.rend() - 1; ++it)
{
Match & l = *it;
Match & r = *(it + 1);
@@ -1159,14 +1164,14 @@ inline void _widenAndPreprocessMatches(TLocalHolder & lH)
}
}
- auto const ret = std::ranges::unique(lH.matches);
- lH.matches.erase(ret.begin(), ret.end());
- lH.stats.hitsDuplicate += (before - lH.matches.size());
+ auto [new_end, old_end] = std::ranges::unique(matches); // move non-uniq to the end
+ matches = std::span{matches.begin(), new_end}; // "resize" of the span
+ lH.stats.hitsDuplicate += (before - matches.size());
}
}
template
-inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
+inline void iterateMatchesFullSimd(std::span lambdaMatches, TLocalHolder & lH, bsDirection const dir)
{
using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
using TBlastMatch = typename TLocalHolder::TBlastMatch;
@@ -1176,7 +1181,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
// statistics
#ifdef LAMBDA_MICRO_STATS
++lH.stats.numQueryWithExt;
- lH.stats.numExtScore += seqan::length(lH.matches);
+ lH.stats.numExtScore += seqan::length(lambdaMatches);
double start = sysTime();
#endif
@@ -1186,19 +1191,12 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
seqan::StringSet::Type> depSetV;
// pre-sort and filter
- _widenAndPreprocessMatches(lH);
+ _widenAndPreprocessMatches(lambdaMatches, lH);
// create blast matches from Lambda matches
std::list blastMatches;
- for (Match const & m : lH.matches)
+ for (Match const & m : lambdaMatches)
{
- /* TODO we need to externalise this because right now we sort and merge hits that are not need in BS mode */
- // In BS-mode, skip those results that have wrong orientation
- if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
- {
- if ((dir == bsDirection::fwd && (m.subjId % 2)) || (dir == bsDirection::rev && !(m.subjId % 2)))
- continue;
- }
// create blastmatch in list without copy or move
blastMatches.emplace_back(lH.qryIds[m.qryId / TGlobalHolder::qryNumFrames],
const_gH.indexFile.ids[m.subjId / TGlobalHolder::sbjNumFrames]);
@@ -1362,12 +1360,24 @@ inline void writeRecords(TLocalHolder & lH)
template
inline void iterateMatches(TLocalHolder & lH)
{
- iterateMatchesFullSimd(lH, bsDirection::fwd);
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
{
- iterateMatchesFullSimd(lH, bsDirection::rev);
+ std::ranges::sort(lH.matches,
+ [](Match const & l, Match const & r) {
+ return std::tuple{l.subjId % 2, l} <
+ std::tuple{r.subjId % 2, r};
+ });
+
+ auto it = std::ranges::find_if(lH.matches, [](Match const & m) { return m.subjId % 2; });
+
+ iterateMatchesFullSimd(std::span{lH.matches.begin(), it}, lH, bsDirection::fwd);
+ iterateMatchesFullSimd(std::span{it, lH.matches.end()}, lH, bsDirection::rev);
lH.blastMatches.sort([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId; });
}
+ else
+ {
+ iterateMatchesFullSimd(lH.matches, lH, bsDirection::fwd);
+ }
}
//-----------------------------------------------------------------------
diff --git a/src/search_datastructures.hpp b/src/search_datastructures.hpp
index 6ee5fc621..aaed45a10 100644
--- a/src/search_datastructures.hpp
+++ b/src/search_datastructures.hpp
@@ -112,6 +112,7 @@ struct StatsHolder
// final
uint64_t hitsFinal;
uint64_t qrysWithHit;
+ uint64_t pairs;
#ifdef LAMBDA_MICRO_STATS
// times
@@ -149,6 +150,7 @@ struct StatsHolder
hitsFinal = 0;
qrysWithHit = 0;
+ pairs = 0;
#ifdef LAMBDA_MICRO_STATS
seedLengths.clear();
@@ -181,6 +183,7 @@ struct StatsHolder
hitsFinal += rhs.hitsFinal;
qrysWithHit += rhs.qrysWithHit;
+ pairs += rhs.pairs;
#ifdef LAMBDA_MICRO_STATS
seqan::append(seedLengths, rhs.seedLengths);
@@ -280,7 +283,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
if (options.verbosity >= 1)
{
auto const w = seqan::_numberOfDigits(stats.hitsFinal);
- std::cout << "Number of valid hits: " << std::setw(w) << stats.hitsFinal
+ std::cout << "Number of total hits: " << std::setw(w) << stats.hitsFinal
+ << "\nNumber of Query-Subject pairs: " << std::setw(w) << stats.pairs
<< "\nNumber of Queries with at least one valid hit: " << std::setw(w) << stats.qrysWithHit << "\n";
}
}