From 8622dfc5a368af4698b3b7558602f0ff60b3e9a3 Mon Sep 17 00:00:00 2001 From: Itolstoganov Date: Tue, 16 Jan 2024 19:42:28 +0100 Subject: [PATCH 1/4] Switch to multi-context seeds hash function This only changes the hash function. Multi-context seeds are not used during lookup. --- src/arguments.hpp | 3 ++- src/cmdline.cpp | 1 + src/cmdline.hpp | 1 + src/dumpstrobes.cpp | 5 +++-- src/index.cpp | 2 +- src/indexparameters.cpp | 14 ++++++++++---- src/indexparameters.hpp | 14 +++++++++----- src/main.cpp | 4 +++- src/randstrobes.cpp | 23 +++++++++++++++-------- src/randstrobes.hpp | 15 ++++++++++++--- 10 files changed, 57 insertions(+), 25 deletions(-) diff --git a/src/arguments.hpp b/src/arguments.hpp index 547a55b6..63d2409c 100644 --- a/src/arguments.hpp +++ b/src/arguments.hpp @@ -26,10 +26,11 @@ struct SeedingArguments { "results with non default values.", {'s'}} , bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)" "[determined from reference size]", {'b'}} + , aux_len{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"aux-len"}} { } args::ArgumentParser& parser; - args::ValueFlag r, m, k, l, u, c, s, bits; + args::ValueFlag r, m, k, l, u, c, s, bits, aux_len; }; #endif diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 4222401d..ec5f7e70 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -119,6 +119,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; } if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; } if (seeding.bits) { opt.bits = args::get(seeding.bits); } + if (seeding.aux_len) { opt.aux_len = args::get(seeding.aux_len); } // Alignment // if (n) { n = args::get(n); } diff --git a/src/cmdline.hpp b/src/cmdline.hpp index df117e26..f780ba3d 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -52,6 +52,7 @@ struct CommandLineOptions { int u { 7 }; int s { 16 }; int c { 8 }; + int aux_len{24}; // Alignment int A { 2 }; diff --git a/src/dumpstrobes.cpp b/src/dumpstrobes.cpp index 4a8a1d88..fcf10bc9 100644 --- a/src/dumpstrobes.cpp +++ b/src/dumpstrobes.cpp @@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) { } // Seeding - int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}; + int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, aux_len{24}; int max_seed_len{}; bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false}; @@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) { l_set ? l : IndexParameters::DEFAULT, u_set ? u : IndexParameters::DEFAULT, c_set ? c : IndexParameters::DEFAULT, - max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT + max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT, + aux_len ? aux_len : IndexParameters::DEFAULT ); logger.info() << index_parameters << '\n'; diff --git a/src/index.cpp b/src/index.cpp index 94e0e7cf..d1e8af35 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -304,7 +304,7 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) { chunk.push_back(randstrobe); } for (auto randstrobe : chunk) { - RefRandstrobe::packed_t packed = ref_index << 8; + RefRandstrobe::packed_t packed = (ref_index << 9) | (randstrobe.main_is_first << 8); packed = packed + (randstrobe.strobe2_pos - randstrobe.strobe1_pos); randstrobes[offset++] = RefRandstrobe{randstrobe.hash, randstrobe.strobe1_pos, packed}; } diff --git a/src/indexparameters.cpp b/src/indexparameters.cpp index 0c655903..b8ed42cd 100644 --- a/src/indexparameters.cpp +++ b/src/indexparameters.cpp @@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const { && this->q == other.q && this->max_dist == other.max_dist && this->w_min == other.w_min - && this->w_max == other.w_max; + && this->w_max == other.w_max + && this->aux_len == other.aux_len; } /* Pre-defined index parameters that work well for a certain @@ -48,7 +49,7 @@ static std::vector profiles = { * k, s, l, u, c and max_seed_len can be used to override determined parameters * by setting them to a value other than IndexParameters::DEFAULT. */ -IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) { +IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int aux_len) { const int default_c = 8; size_t canonical_read_length = 50; for (const auto& p : profiles) { @@ -78,8 +79,11 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, max_dist = max_seed_len - k; // convert to distance in start positions } int q = std::pow(2, c == DEFAULT ? default_c : c) - 1; + if (aux_len == DEFAULT) { + aux_len = 24; + } - return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist); + return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len); } void IndexParameters::write(std::ostream& os) const { @@ -90,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const { write_int_to_ostream(os, randstrobe.u); write_int_to_ostream(os, randstrobe.q); write_int_to_ostream(os, randstrobe.max_dist); + write_int_to_ostream(os, randstrobe.aux_len); } IndexParameters IndexParameters::read(std::istream& is) { @@ -100,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) { int u = read_int_from_istream(is); int q = read_int_from_istream(is); int max_dist = read_int_from_istream(is); - return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist); + int aux_len = read_int_from_istream(is); + return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len); } bool IndexParameters::operator==(const IndexParameters& other) const { diff --git a/src/indexparameters.hpp b/src/indexparameters.hpp index 75e6c258..1e6bc58e 100644 --- a/src/indexparameters.hpp +++ b/src/indexparameters.hpp @@ -43,14 +43,16 @@ struct RandstrobeParameters { const int max_dist; const unsigned w_min; const unsigned w_max; + const unsigned aux_len; - RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max) + RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned aux_len) : l(l) , u(u) , q(q) , max_dist(max_dist) , w_min(w_min) , w_max(w_max) + , aux_len(aux_len) { verify(); } @@ -65,6 +67,9 @@ struct RandstrobeParameters { if (w_min > w_max) { throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)"); } + if (aux_len > 63) { + throw BadParameter("aux_len is larger than 63"); + } } }; @@ -77,16 +82,15 @@ class IndexParameters { static const int DEFAULT = std::numeric_limits::min(); - IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist) + IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int aux_len) : canonical_read_length(canonical_read_length) , syncmer(k, s) - , randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u) + , randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, aux_len) { } static IndexParameters from_read_length( - int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT - ); + int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int aux_len = DEFAULT); static IndexParameters read(std::istream& os); std::string filename_extension() const; void write(std::ostream& os) const; diff --git a/src/main.cpp b/src/main.cpp index 61509baf..b5eb5986 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -179,7 +179,8 @@ int run_strobealign(int argc, char **argv) { opt.l_set ? opt.l : IndexParameters::DEFAULT, opt.u_set ? opt.u : IndexParameters::DEFAULT, opt.c_set ? opt.c : IndexParameters::DEFAULT, - opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT + opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT, + opt.aux_len ? opt.aux_len : IndexParameters::DEFAULT ); logger.debug() << index_parameters << '\n'; AlignmentParameters aln_params; @@ -228,6 +229,7 @@ int run_strobealign(int argc, char **argv) { throw InvalidFasta("Too many reference sequences. Current maximum is " + std::to_string(RefRandstrobe::max_number_of_references)); } + logger.debug() << "Auxiliary hash length: " << index_parameters.randstrobe.aux_len << "\n"; StrobemerIndex index(references, index_parameters, opt.bits); if (opt.use_index) { // Read the index from a file diff --git a/src/randstrobes.cpp b/src/randstrobes.cpp index 89273507..88d5f837 100644 --- a/src/randstrobes.cpp +++ b/src/randstrobes.cpp @@ -45,8 +45,12 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) { return xxh64(packed); } -static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_hash_t hash2) { - return hash1 + hash2; +static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_hash_t hash2, size_t aux_len) { + // Make the function symmetric + if (hash1 > hash2) { + std::swap(hash1, hash2); + } + return ((hash1 >> aux_len) << aux_len) ^ (hash2 >> (64 - aux_len)); } std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) { @@ -131,7 +135,8 @@ std::vector canonical_syncmers( } std::ostream& operator<<(std::ostream& os, const Randstrobe& randstrobe) { - os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos=" << randstrobe.strobe2_pos << ")"; + os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos=" + << randstrobe.strobe2_pos << ", main_is_first=" << randstrobe.main_is_first << ")"; return os; } @@ -145,11 +150,13 @@ std::ostream& operator<<(std::ostream& os, const QueryRandstrobe& randstrobe) { return os; } -Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2) { +Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2, int aux_len) { + bool main_is_first = strobe1.hash < strobe2.hash; return Randstrobe{ - randstrobe_hash(strobe1.hash, strobe2.hash), + randstrobe_hash(strobe1.hash, strobe2.hash, aux_len), static_cast(strobe1.position), - static_cast(strobe2.position) + static_cast(strobe2.position), + main_is_first }; } @@ -175,7 +182,7 @@ Randstrobe RandstrobeIterator::get(unsigned int strobe1_index) const { } } - return make_randstrobe(strobe1, strobe2); + return make_randstrobe(strobe1, strobe2, aux_len); } Randstrobe RandstrobeGenerator::next() { @@ -207,7 +214,7 @@ Randstrobe RandstrobeGenerator::next() { } syncmers.pop_front(); - return make_randstrobe(strobe1, strobe2); + return make_randstrobe(strobe1, strobe2, aux_len); } /* diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp index 7a79e8ad..83a905c9 100644 --- a/src/randstrobes.hpp +++ b/src/randstrobes.hpp @@ -39,8 +39,12 @@ struct RefRandstrobe { return lhs < rhs; } + bool main_is_first() const { + return (m_packed >> bit_alloc) & 1; + } + int reference_index() const { - return m_packed >> bit_alloc; + return m_packed >> (bit_alloc + 1); } int strobe2_offset() const { @@ -74,6 +78,7 @@ struct Randstrobe { randstrobe_hash_t hash; unsigned int strobe1_pos; unsigned int strobe2_pos; + bool main_is_first; bool operator==(const Randstrobe& other) const { return hash == other.hash && strobe1_pos == other.strobe1_pos && strobe2_pos == other.strobe2_pos; @@ -107,6 +112,7 @@ class RandstrobeIterator { , w_max(parameters.w_max) , q(parameters.q) , max_dist(parameters.max_dist) + , aux_len(parameters.aux_len) { if (w_min > w_max) { throw std::invalid_argument("w_min is greater than w_max"); @@ -128,7 +134,8 @@ class RandstrobeIterator { const unsigned w_max; const uint64_t q; const unsigned int max_dist; - unsigned int strobe1_index = 0; + const unsigned int aux_len; + unsigned strobe1_index = 0; }; std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer); @@ -176,10 +183,11 @@ class RandstrobeGenerator { , w_max(randstrobe_parameters.w_max) , q(randstrobe_parameters.q) , max_dist(randstrobe_parameters.max_dist) + , aux_len(randstrobe_parameters.aux_len) { } Randstrobe next(); - Randstrobe end() const { return Randstrobe{0, 0, 0}; } + Randstrobe end() const { return Randstrobe{0, 0, 0, false}; } private: SyncmerIterator syncmer_iterator; @@ -187,6 +195,7 @@ class RandstrobeGenerator { const unsigned w_max; const uint64_t q; const unsigned int max_dist; + const unsigned int aux_len; std::deque syncmers; }; From 226aba1d87f0d67b5e5e5bf44a82f279015d8dda Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 24 May 2024 14:14:40 +0200 Subject: [PATCH 2/4] Slightly shorter help for --aux-len --- src/arguments.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arguments.hpp b/src/arguments.hpp index 63d2409c..931326c9 100644 --- a/src/arguments.hpp +++ b/src/arguments.hpp @@ -26,7 +26,7 @@ struct SeedingArguments { "results with non default values.", {'s'}} , bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)" "[determined from reference size]", {'b'}} - , aux_len{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"aux-len"}} + , aux_len{parser, "INT", "No. of bits to use from secondary strobe hash [24]", {"aux-len"}} { } args::ArgumentParser& parser; From e5879ae3af1620faa864b2b6f654365769e196f2 Mon Sep 17 00:00:00 2001 From: ksahlin Date: Fri, 19 Apr 2024 11:10:41 +0200 Subject: [PATCH 3/4] Fix max_number_of_references --- src/randstrobes.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp index 83a905c9..bfbaaebc 100644 --- a/src/randstrobes.hpp +++ b/src/randstrobes.hpp @@ -58,7 +58,7 @@ struct RefRandstrobe { packed_t m_packed; // packed representation of ref_index and strobe offset public: - static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc)) - 1; + static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc - 1)) - 1; // bit_alloc - 1 because 1 bit to main_is_first() }; struct QueryRandstrobe { From a8a4958ce708102e22bf5971e8b600005f313dce Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Tue, 1 Oct 2024 18:03:05 +0200 Subject: [PATCH 4/4] Bump .sti file format version --- src/index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.cpp b/src/index.cpp index d1e8af35..0ae9c9f1 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -21,7 +21,7 @@ #include static Logger& logger = Logger::get(); -static const uint32_t STI_FILE_FORMAT_VERSION = 2; +static const uint32_t STI_FILE_FORMAT_VERSION = 3; namespace {