Skip to content

Commit

Permalink
[MISC] Use uint64_t::max() for merged bins
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Sep 3, 2024
1 parent 0cab5ec commit 450c9eb
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 26 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ int main()
auto & result1 = agent.membership_for(query1, 2u);

// query1 hits in user_bin_1 and user_bin_3, which have the IDs 0 and 2, respectively.
for (int64_t hit_user_bin : result1)
for (uint64_t hit_user_bin : result1)
std::cout << hit_user_bin << ' '; // The results are not sorted: 2 0
std::cout << '\n';

Expand All @@ -154,7 +154,7 @@ int main()
agent.sort_results(); // Sort the results.

// query2 hits in user_bin_1 and user_bin_2, which have the IDs 0 and 1, respectively.
for (int64_t hit_user_bin : result2)
for (uint64_t hit_user_bin : result2)
std::cout << hit_user_bin << ' '; // The results are sorted: 0 1
std::cout << '\n';
}
Expand Down
2 changes: 1 addition & 1 deletion include/hibf/build/update_user_bins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace seqan::hibf::build
/*!\brief Updates user bins stored in HIBF.
* \ingroup hibf_build
*/
inline void update_user_bins(std::vector<int64_t> & filename_indices, layout::layout::user_bin const & record)
inline void update_user_bins(std::vector<uint64_t> & filename_indices, layout::layout::user_bin const & record)
{
    // Writes `record.idx` into the `record.number_of_technical_bins` consecutive entries of `filename_indices`
    // that start at offset `record.storage_TB_id`.
    // NOTE(review): there is no bounds check here; presumably the caller sizes `filename_indices` to cover the
    // whole range - confirm against the call site.
    auto const first_technical_bin = filename_indices.begin() + record.storage_TB_id;
    auto const technical_bin_count = record.number_of_technical_bins;

    std::fill_n(first_technical_bin, technical_bin_count, record.idx);
}
Expand Down
4 changes: 3 additions & 1 deletion include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ struct config
* Since the data to construct the (H)IBF is given by a function object `seqan::hibf::config::input_fn`,
* the number of user bins to consider must be given via this option.
*
* Value must be neither `0` nor `std::numeric_limits<uint64_t>::max()`.
*
* \include test/snippet/hibf/config_number_of_user_bins.cpp
*
* In this example, `12` user bins would be inserted into the (H)IBF, each only storing the hash `42`.
Expand Down Expand Up @@ -288,7 +290,7 @@ struct config
/*!\brief Checks several variables of seqan::hibf::config and sets default values if necessary.
*
* Required options:
* * seqan::hibf::config::number_of_user_bins must be set to a value other than `0`.
* * seqan::hibf::config::number_of_user_bins must be neither `0` nor `std::numeric_limits<uint64_t>::max()`.
* * seqan::hibf::config::input_fn must be set.
*
* Constraints:
Expand Down
38 changes: 23 additions & 15 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
namespace seqan::hibf
{

namespace bin_kind
{

/*!\brief The value that indicates a merged bin.
 * \details
 * Equal to `std::numeric_limits<uint64_t>::max()`. An entry of `ibf_bin_to_user_bin_id` holding this value marks
 * a merged bin, so the value cannot be used as a regular user bin ID.
 *
 * Declared `inline constexpr` (instead of `static constexpr`) so that every translation unit including this
 * header refers to the same entity rather than to its own internal-linkage copy.
 */
inline constexpr uint64_t merged{std::numeric_limits<uint64_t>::max()};

} // namespace bin_kind

/*!\brief The Hierarchical Interleaved Bloom Filter (HIBF) - Fast answers to set-membership queries for multiple bins.
* \ingroup hibf
* \details
Expand Down Expand Up @@ -202,16 +210,16 @@ class hierarchical_interleaved_bloom_filter
* If `j != i` is returned, there is a lower level IBF, bin `b` is a merged bin, and `j` is the ID of the lower
* level IBF in ibf_vector.
*/
std::vector<std::vector<int64_t>> next_ibf_id;
std::vector<std::vector<uint64_t>> next_ibf_id;

/*!\brief Stores for each bin in each IBF of the HIBF the user bin ID.
* \details
* Assume we look up a bin `b` in IBF `i`, i.e. `ibf_bin_to_user_bin_id[i][b]`.
* If `-1` is returned, bin `b` is a merged bin, there is no single user bin, we need to look into the
* lower level IBF.
* If `seqan::hibf::bin_kind::merged` is returned, bin `b` is a merged bin, there is no single user bin, we need
* to look into the lower level IBF.
* Otherwise, the returned value `j` is the corresponding user bin ID.
*/
std::vector<std::vector<int64_t>> ibf_bin_to_user_bin_id{};
std::vector<std::vector<uint64_t>> ibf_bin_to_user_bin_id{};

//!\brief Returns a membership_agent to be used for counting.
membership_agent_type membership_agent() const;
Expand Down Expand Up @@ -269,7 +277,7 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type

//!\brief Helper for recursive membership querying.
template <std::ranges::forward_range value_range_t>
void membership_for_impl(value_range_t && values, int64_t const ibf_idx, uint16_t const threshold)
void membership_for_impl(value_range_t && values, size_t const ibf_idx, uint16_t const threshold)
{
auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<uint16_t>();
auto & result = agent.bulk_count(values);
Expand All @@ -282,7 +290,7 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type

auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin];

if (current_filename_index < 0) // merged bin
if (current_filename_index == bin_kind::merged) // merged bin
{
if (sum >= threshold)
membership_for_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
Expand All @@ -299,7 +307,7 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type
}

//!\brief Stores the result of membership_for().
std::vector<int64_t> result_buffer;
std::vector<uint64_t> result_buffer;

public:
/*!\name Constructors, destructor and assignment
Expand Down Expand Up @@ -366,8 +374,8 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type
* seqan::hibf::hierarchical_interleaved_bloom_filter::membership_agent for each thread.
*/
template <std::ranges::forward_range value_range_t>
[[nodiscard]] std::vector<int64_t> const & membership_for(value_range_t && values,
uint16_t const threshold) & noexcept
[[nodiscard]] std::vector<uint64_t> const & membership_for(value_range_t && values,
uint16_t const threshold) & noexcept
{
assert(hibf_ptr != nullptr);

Expand All @@ -377,16 +385,16 @@ class hierarchical_interleaved_bloom_filter::membership_agent_type

result_buffer.clear();

membership_for_impl(values, 0, threshold);
membership_for_impl(values, 0u, threshold);

return result_buffer;
}

// `membership_for` cannot be called on a temporary, since the object the returned reference points to
// is immediately destroyed.
template <std::ranges::range value_range_t>
[[nodiscard]] std::vector<int64_t> const & membership_for(value_range_t && values,
uint16_t const threshold) && noexcept = delete;
[[nodiscard]] std::vector<uint64_t> const & membership_for(value_range_t && values,
uint16_t const threshold) && noexcept = delete;
//!\}
};

Expand All @@ -405,7 +413,7 @@ class hierarchical_interleaved_bloom_filter::counting_agent_type

//!\brief Helper for recursive bulk counting.
template <std::ranges::forward_range value_range_t>
void bulk_count_impl(value_range_t && values, int64_t const ibf_idx, size_t const threshold)
void bulk_count_impl(value_range_t && values, size_t const ibf_idx, size_t const threshold)
{
auto agent = hibf_ptr->ibf_vector[ibf_idx].template counting_agent<value_t>();
auto & result = agent.bulk_count(values);
Expand All @@ -417,7 +425,7 @@ class hierarchical_interleaved_bloom_filter::counting_agent_type
sum += result[bin];
auto const current_filename_index = hibf_ptr->ibf_bin_to_user_bin_id[ibf_idx][bin];

if (current_filename_index < 0) // merged bin
if (current_filename_index == bin_kind::merged) // merged bin
{
if (sum >= threshold)
bulk_count_impl(values, hibf_ptr->next_ibf_id[ibf_idx][bin], threshold);
Expand Down Expand Up @@ -498,7 +506,7 @@ class hierarchical_interleaved_bloom_filter::counting_agent_type

std::ranges::fill(result_buffer, static_cast<value_t>(0));

bulk_count_impl(values, 0, threshold);
bulk_count_impl(values, 0u, threshold);

return result_buffer;
}
Expand Down
6 changes: 5 additions & 1 deletion src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ void config::validate_and_set_defaults()
if (number_of_user_bins == 0u)
throw std::invalid_argument{"[HIBF CONFIG ERROR] You did not set the required config::number_of_user_bins."};

if (number_of_user_bins == 18'446'744'073'709'551'615ULL) // std::numeric_limits<uint64_t>::max() = bin_kind::merged
throw std::invalid_argument{"[HIBF CONFIG ERROR] The maximum possible config::number_of_user_bins "
"is 18446744073709551614."};

if (number_of_hash_functions == 0u || number_of_hash_functions > 5u)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::number_of_hash_functions must be in [1,5]."};

Expand Down Expand Up @@ -97,7 +101,7 @@ void config::validate_and_set_defaults()
}
else if (tmax > 18'446'744'073'709'551'552ULL) // next_multiple_of_64 would not fit in size_t. Underflowed by user?
{
throw std::invalid_argument{"[HIBF CONFIG ERROR] The maximum possible tmax is 18446744073709551552."};
throw std::invalid_argument{"[HIBF CONFIG ERROR] The maximum possible config::tmax is 18446744073709551552."};
}
else if (tmax % 64 != 0)
{
Expand Down
4 changes: 2 additions & 2 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
{
size_t const ibf_pos{data.request_ibf_idx()};

std::vector<int64_t> ibf_positions(current_node.number_of_technical_bins, ibf_pos);
std::vector<int64_t> filename_indices(current_node.number_of_technical_bins, -1);
std::vector<uint64_t> ibf_positions(current_node.number_of_technical_bins, ibf_pos);
std::vector<uint64_t> filename_indices(current_node.number_of_technical_bins, bin_kind::merged);
robin_hood::unordered_flat_set<uint64_t> kmers{};

auto initialise_max_bin_kmers = [&]() -> size_t
Expand Down
4 changes: 2 additions & 2 deletions test/snippet/readme.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ int main()
auto & result1 = agent.membership_for(query1, 2u);

// query1 hits in user_bin_1 and user_bin_3, which have the IDs 0 and 2, respectively.
for (int64_t hit_user_bin : result1)
for (uint64_t hit_user_bin : result1)
std::cout << hit_user_bin << ' '; // The results are not sorted: 2 0
std::cout << '\n';

Expand All @@ -88,7 +88,7 @@ int main()
agent.sort_results(); // Sort the results.

// query2 hits in user_bin_1 and user_bin_2, which have the IDs 0 and 1, respectively.
for (int64_t hit_user_bin : result2)
for (uint64_t hit_user_bin : result2)
std::cout << hit_user_bin << ' '; // The results are sorted: 0 1
std::cout << '\n';
}
11 changes: 9 additions & 2 deletions test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,16 @@ TEST(config_test, validate_and_set_defaults)
check_error_message(configuration, "[HIBF CONFIG ERROR] You did not set the required config::input_fn.");
}

// number_of_user_bins cannot be 0
// number_of_user_bins cannot be 0 or bin_kind::merged (18'446'744'073'709'551'615ULL)
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn};
check_error_message(configuration,
"[HIBF CONFIG ERROR] You did not set the required config::number_of_user_bins.");

configuration.number_of_user_bins = 18'446'744'073'709'551'615ULL;
check_error_message(configuration,
"[HIBF CONFIG ERROR] The maximum possible config::number_of_user_bins "
"is 18446744073709551614.");
}

// number_of_hash_functions must be in [1,5]
Expand Down Expand Up @@ -255,7 +260,9 @@ TEST(config_test, validate_and_set_defaults)
.number_of_user_bins = 1u,
.tmax = 18'446'744'073'709'551'553ULL};

check_error_message(configuration, "[HIBF CONFIG ERROR] The maximum possible tmax is 18446744073709551552.");
check_error_message(configuration,
"[HIBF CONFIG ERROR] The maximum possible config::tmax "
"is 18446744073709551552.");
}

// Given tmax is not a multiple of 64
Expand Down

0 comments on commit 450c9eb

Please sign in to comment.