Skip to content

Commit

Permalink
Merge pull request #447 from ksahlin/mcs-hashfunc
Browse files Browse the repository at this point in the history
Switch to multi-context seeds hash function
  • Loading branch information
marcelm authored Oct 2, 2024
2 parents 726d1ac + a8a4958 commit e6808d2
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 27 deletions.
3 changes: 2 additions & 1 deletion src/arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ struct SeedingArguments {
"results with non default values.", {'s'}}
, bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)"
"[determined from reference size]", {'b'}}
, aux_len{parser, "INT", "No. of bits to use from secondary strobe hash [24]", {"aux-len"}}
{
}
args::ArgumentParser& parser;
args::ValueFlag<int> r, m, k, l, u, c, s, bits;
args::ValueFlag<int> r, m, k, l, u, c, s, bits, aux_len;
};

#endif
1 change: 1 addition & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; }
if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; }
if (seeding.bits) { opt.bits = args::get(seeding.bits); }
if (seeding.aux_len) { opt.aux_len = args::get(seeding.aux_len); }

// Alignment
// if (n) { n = args::get(n); }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ struct CommandLineOptions {
int u { 7 };
int s { 16 };
int c { 8 };
int aux_len{24};

// Alignment
int A { 2 };
Expand Down
5 changes: 3 additions & 2 deletions src/dumpstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) {
}

// Seeding
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7};
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, aux_len{24};
int max_seed_len{};

bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false};
Expand All @@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) {
l_set ? l : IndexParameters::DEFAULT,
u_set ? u : IndexParameters::DEFAULT,
c_set ? c : IndexParameters::DEFAULT,
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT,
aux_len ? aux_len : IndexParameters::DEFAULT
);

logger.info() << index_parameters << '\n';
Expand Down
4 changes: 2 additions & 2 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <sstream>

static Logger& logger = Logger::get();
static const uint32_t STI_FILE_FORMAT_VERSION = 2;
static const uint32_t STI_FILE_FORMAT_VERSION = 3;


namespace {
Expand Down Expand Up @@ -304,7 +304,7 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) {
chunk.push_back(randstrobe);
}
for (auto randstrobe : chunk) {
RefRandstrobe::packed_t packed = ref_index << 8;
RefRandstrobe::packed_t packed = (ref_index << 9) | (randstrobe.main_is_first << 8);
packed = packed + (randstrobe.strobe2_pos - randstrobe.strobe1_pos);
randstrobes[offset++] = RefRandstrobe{randstrobe.hash, randstrobe.strobe1_pos, packed};
}
Expand Down
14 changes: 10 additions & 4 deletions src/indexparameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const {
&& this->q == other.q
&& this->max_dist == other.max_dist
&& this->w_min == other.w_min
&& this->w_max == other.w_max;
&& this->w_max == other.w_max
&& this->aux_len == other.aux_len;
}

/* Pre-defined index parameters that work well for a certain
Expand Down Expand Up @@ -48,7 +49,7 @@ static std::vector<Profile> profiles = {
* k, s, l, u, c and max_seed_len can be used to override determined parameters
* by setting them to a value other than IndexParameters::DEFAULT.
*/
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) {
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int aux_len) {
const int default_c = 8;
size_t canonical_read_length = 50;
for (const auto& p : profiles) {
Expand Down Expand Up @@ -78,8 +79,11 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
max_dist = max_seed_len - k; // convert to distance in start positions
}
int q = std::pow(2, c == DEFAULT ? default_c : c) - 1;
if (aux_len == DEFAULT) {
aux_len = 24;
}

return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
}

void IndexParameters::write(std::ostream& os) const {
Expand All @@ -90,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const {
write_int_to_ostream(os, randstrobe.u);
write_int_to_ostream(os, randstrobe.q);
write_int_to_ostream(os, randstrobe.max_dist);
write_int_to_ostream(os, randstrobe.aux_len);
}

IndexParameters IndexParameters::read(std::istream& is) {
Expand All @@ -100,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) {
int u = read_int_from_istream(is);
int q = read_int_from_istream(is);
int max_dist = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
int aux_len = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
}

bool IndexParameters::operator==(const IndexParameters& other) const {
Expand Down
14 changes: 9 additions & 5 deletions src/indexparameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,16 @@ struct RandstrobeParameters {
const int max_dist;
const unsigned w_min;
const unsigned w_max;
const unsigned aux_len;

RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max)
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned aux_len)
: l(l)
, u(u)
, q(q)
, max_dist(max_dist)
, w_min(w_min)
, w_max(w_max)
, aux_len(aux_len)
{
verify();
}
Expand All @@ -65,6 +67,9 @@ struct RandstrobeParameters {
if (w_min > w_max) {
throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)");
}
if (aux_len > 63) {
throw BadParameter("aux_len is larger than 63");
}
}
};

Expand All @@ -77,16 +82,15 @@ class IndexParameters {

static const int DEFAULT = std::numeric_limits<int>::min();

IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist)
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int aux_len)
: canonical_read_length(canonical_read_length)
, syncmer(k, s)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, aux_len)
{
}

static IndexParameters from_read_length(
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT
);
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int aux_len = DEFAULT);
static IndexParameters read(std::istream& os);
std::string filename_extension() const;
void write(std::ostream& os) const;
Expand Down
4 changes: 3 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ int run_strobealign(int argc, char **argv) {
opt.l_set ? opt.l : IndexParameters::DEFAULT,
opt.u_set ? opt.u : IndexParameters::DEFAULT,
opt.c_set ? opt.c : IndexParameters::DEFAULT,
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT,
opt.aux_len ? opt.aux_len : IndexParameters::DEFAULT
);
logger.debug() << index_parameters << '\n';
AlignmentParameters aln_params;
Expand Down Expand Up @@ -228,6 +229,7 @@ int run_strobealign(int argc, char **argv) {
throw InvalidFasta("Too many reference sequences. Current maximum is " + std::to_string(RefRandstrobe::max_number_of_references));
}

logger.debug() << "Auxiliary hash length: " << index_parameters.randstrobe.aux_len << "\n";
StrobemerIndex index(references, index_parameters, opt.bits);
if (opt.use_index) {
// Read the index from a file
Expand Down
23 changes: 15 additions & 8 deletions src/randstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,12 @@ static inline syncmer_hash_t syncmer_smer_hash(uint64_t packed) {
return xxh64(packed);
}

static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_hash_t hash2) {
return hash1 + hash2;
static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_hash_t hash2, size_t aux_len) {
// Make the function symmetric
if (hash1 > hash2) {
std::swap(hash1, hash2);
}
return ((hash1 >> aux_len) << aux_len) ^ (hash2 >> (64 - aux_len));
}

std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) {
Expand Down Expand Up @@ -131,7 +135,8 @@ std::vector<Syncmer> canonical_syncmers(
}

std::ostream& operator<<(std::ostream& os, const Randstrobe& randstrobe) {
os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos=" << randstrobe.strobe2_pos << ")";
os << "Randstrobe(hash=" << randstrobe.hash << ", strobe1_pos=" << randstrobe.strobe1_pos << ", strobe2_pos="
<< randstrobe.strobe2_pos << ", main_is_first=" << randstrobe.main_is_first << ")";
return os;
}

Expand All @@ -145,11 +150,13 @@ std::ostream& operator<<(std::ostream& os, const QueryRandstrobe& randstrobe) {
return os;
}

Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2) {
Randstrobe make_randstrobe(Syncmer strobe1, Syncmer strobe2, int aux_len) {
bool main_is_first = strobe1.hash < strobe2.hash;
return Randstrobe{
randstrobe_hash(strobe1.hash, strobe2.hash),
randstrobe_hash(strobe1.hash, strobe2.hash, aux_len),
static_cast<uint32_t>(strobe1.position),
static_cast<uint32_t>(strobe2.position)
static_cast<uint32_t>(strobe2.position),
main_is_first
};
}

Expand All @@ -175,7 +182,7 @@ Randstrobe RandstrobeIterator::get(unsigned int strobe1_index) const {
}
}

return make_randstrobe(strobe1, strobe2);
return make_randstrobe(strobe1, strobe2, aux_len);
}

Randstrobe RandstrobeGenerator::next() {
Expand Down Expand Up @@ -207,7 +214,7 @@ Randstrobe RandstrobeGenerator::next() {
}
syncmers.pop_front();

return make_randstrobe(strobe1, strobe2);
return make_randstrobe(strobe1, strobe2, aux_len);
}

/*
Expand Down
17 changes: 13 additions & 4 deletions src/randstrobes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,12 @@ struct RefRandstrobe {
return lhs < rhs;
}

bool main_is_first() const {
return (m_packed >> bit_alloc) & 1;
}

int reference_index() const {
return m_packed >> bit_alloc;
return m_packed >> (bit_alloc + 1);
}

int strobe2_offset() const {
Expand All @@ -54,7 +58,7 @@ struct RefRandstrobe {
packed_t m_packed; // packed representation of ref_index and strobe offset

public:
static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc)) - 1;
static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc - 1)) - 1; // bit_alloc - 1 because 1 bit to main_is_first()
};

struct QueryRandstrobe {
Expand All @@ -74,6 +78,7 @@ struct Randstrobe {
randstrobe_hash_t hash;
unsigned int strobe1_pos;
unsigned int strobe2_pos;
bool main_is_first;

bool operator==(const Randstrobe& other) const {
return hash == other.hash && strobe1_pos == other.strobe1_pos && strobe2_pos == other.strobe2_pos;
Expand Down Expand Up @@ -107,6 +112,7 @@ class RandstrobeIterator {
, w_max(parameters.w_max)
, q(parameters.q)
, max_dist(parameters.max_dist)
, aux_len(parameters.aux_len)
{
if (w_min > w_max) {
throw std::invalid_argument("w_min is greater than w_max");
Expand All @@ -128,7 +134,8 @@ class RandstrobeIterator {
const unsigned w_max;
const uint64_t q;
const unsigned int max_dist;
unsigned int strobe1_index = 0;
const unsigned int aux_len;
unsigned strobe1_index = 0;
};

std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer);
Expand Down Expand Up @@ -176,17 +183,19 @@ class RandstrobeGenerator {
, w_max(randstrobe_parameters.w_max)
, q(randstrobe_parameters.q)
, max_dist(randstrobe_parameters.max_dist)
, aux_len(randstrobe_parameters.aux_len)
{ }

Randstrobe next();
Randstrobe end() const { return Randstrobe{0, 0, 0}; }
Randstrobe end() const { return Randstrobe{0, 0, 0, false}; }

private:
SyncmerIterator syncmer_iterator;
const unsigned w_min;
const unsigned w_max;
const uint64_t q;
const unsigned int max_dist;
const unsigned int aux_len;
std::deque<Syncmer> syncmers;
};

Expand Down

0 comments on commit e6808d2

Please sign in to comment.