From 0f5dc35efefc0d463ab58b98bf8e533592f3d45c Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 30 Oct 2024 15:18:43 +0100 Subject: [PATCH] dynamic --- include/hibf/build/insert_into_ibf.hpp | 3 +- include/hibf/config.hpp | 20 +++++- .../hierarchical_interleaved_bloom_filter.hpp | 20 ++++++ include/hibf/interleaved_bloom_filter.hpp | 29 ++++++-- include/hibf/layout/hierarchical_binning.hpp | 3 +- include/hibf/misc/empty_bins_by_fraction.hpp | 26 +++++++ include/hibf/misc/insert_iterator.hpp | 10 +++ src/build/construct_ibf.cpp | 2 +- src/build/insert_into_ibf.cpp | 40 +++++++++-- src/config.cpp | 17 ++++- src/hierarchical_interleaved_bloom_filter.cpp | 17 +++-- src/interleaved_bloom_filter.cpp | 68 ++++++++++++++----- src/layout/hierarchical_binning.cpp | 13 ++-- .../interleaved_bloom_filter_benchmark.cpp | 5 +- test/unit/hibf/config_test.cpp | 63 ++++++++++++++++- .../hibf/interleaved_bloom_filter_test.cpp | 38 +++++------ .../hibf/layout/hierarchical_binning_test.cpp | 37 ++++++++++ 17 files changed, 338 insertions(+), 73 deletions(-) create mode 100644 include/hibf/misc/empty_bins_by_fraction.hpp diff --git a/include/hibf/build/insert_into_ibf.hpp b/include/hibf/build/insert_into_ibf.hpp index e8d41853..85d6d419 100644 --- a/include/hibf/build/insert_into_ibf.hpp +++ b/include/hibf/build/insert_into_ibf.hpp @@ -21,7 +21,8 @@ namespace seqan::hibf::build * \details * Automatically does naive splitting if number_of_bins > 1. */ -void insert_into_ibf(robin_hood::unordered_flat_set const & kmers, +void insert_into_ibf(build_data const & data, + robin_hood::unordered_flat_set const & kmers, size_t const number_of_bins, size_t const bin_index, seqan::hibf::interleaved_bloom_filter & ibf, diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index cbf6d224..5a0b526d 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -40,6 +40,7 @@ namespace seqan::hibf * | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] | * | Layout | seqan::hibf::config::sketch_bits | 12 | | * | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset | + * | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout | * | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | | * | Layout | seqan::hibf::config::alpha | 1.2 | | * | Layout | seqan::hibf::config::disable_estimate_union | false | | @@ -230,6 +231,9 @@ struct config */ size_t tmax{}; + //!\brief The percentage of empty bins in the layout. + double empty_bin_fraction{}; + /*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm. * * The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of @@ -302,6 +306,7 @@ struct config * * seqan::hibf::config::threads must be greater than `0`. * * seqan::hibf::config::sketch_bits must be in `[5,32]`. * * seqan::hibf::config::tmax must be at most `18446744073709551552`. + * * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`. * * seqan::hibf::config::alpha must be positive. * * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`. * @@ -324,6 +329,7 @@ struct config threads == other.threads && sketch_bits == other.sketch_bits && tmax == other.tmax && + empty_bin_fraction == other.empty_bin_fraction && alpha == other.alpha && max_rearrangement_ratio == other.max_rearrangement_ratio && disable_estimate_union == other.disable_estimate_union && @@ -331,14 +337,18 @@ struct config // clang-format on } + bool validated{false}; + private: friend class cereal::access; + static constexpr uint32_t version{2}; + template void serialize(archive_t & archive) { - uint32_t version{1}; - archive(CEREAL_NVP(version)); + uint32_t parsed_version{version}; + archive(cereal::make_nvp("version", parsed_version)); archive(CEREAL_NVP(number_of_user_bins)); archive(CEREAL_NVP(number_of_hash_functions)); @@ -348,10 +358,16 @@ struct config archive(CEREAL_NVP(sketch_bits)); archive(CEREAL_NVP(tmax)); + + if (parsed_version > 1u) + archive(CEREAL_NVP(empty_bin_fraction)); + archive(CEREAL_NVP(alpha)); archive(CEREAL_NVP(max_rearrangement_ratio)); archive(CEREAL_NVP(disable_estimate_union)); archive(CEREAL_NVP(disable_rearrangement)); + if (parsed_version > 1u) + archive(CEREAL_NVP(validated)); } }; diff --git a/include/hibf/hierarchical_interleaved_bloom_filter.hpp b/include/hibf/hierarchical_interleaved_bloom_filter.hpp index 3b3f3715..7b58886a 100644 --- a/include/hibf/hierarchical_interleaved_bloom_filter.hpp +++ b/include/hibf/hierarchical_interleaved_bloom_filter.hpp @@ -33,6 +33,8 @@ namespace bin_kind //!\brief The value that indicates a merged bin. static constexpr uint64_t merged{std::numeric_limits::max()}; +//!\brief The value that indicates a deleted bin. +static constexpr uint64_t deleted{std::numeric_limits::max() - 1u}; } // namespace bin_kind @@ -212,6 +214,23 @@ class hierarchical_interleaved_bloom_filter */ std::vector> next_ibf_id; + struct previous_ibf_id_pair + { + size_t ibf_idx{}; + size_t bin_idx{}; + + friend constexpr auto operator<=>(previous_ibf_id_pair const &, previous_ibf_id_pair const &) = default; + + template + void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive) + { + archive(ibf_idx); + archive(bin_idx); + } + }; + + std::vector prev_ibf_id; + /*!\brief Stores for each bin in each IBF of the HIBF the user bin ID. * \details * Assume we look up a bin `b` in IBF `i`, i.e. `ibf_bin_to_user_bin_id[i][b]`. @@ -251,6 +270,7 @@ class hierarchical_interleaved_bloom_filter archive(ibf_vector); archive(next_ibf_id); archive(ibf_bin_to_user_bin_id); + archive(prev_ibf_id); } /*!\name Timer diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index 4e5dfbeb..e7d27fe8 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -22,6 +22,7 @@ #include // for CEREAL_SERIALIZE_FUNCTION_NAME #include // for base_class +#include // for vector #include // for cereal_archive #include // for aligned_allocator @@ -193,7 +194,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector //!\brief Helper function to reduce code-duplication between emplace and emplace_exists. template - inline auto emplace_impl(size_t const value, bin_index const bin) noexcept; + inline void emplace_impl(size_t const value, bin_index const bin) noexcept; public: class membership_agent_type; // documented upon definition below @@ -257,10 +258,9 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector /*!\brief Inserts a value into a specific bin and returns whether the value already existed. * \param[in] value The raw numeric value to process. * \param[in] bin The bin index to insert into. - * \returns `true` if the value already existed, `false` otherwise. * \sa seqan::hibf::interleaved_bloom_filter::emplace */ - [[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept; + void emplace_exists(size_t const value, bin_index const bin) noexcept; /*!\brief Clears a specific bin. * \param[in] bin The bin index to clear. @@ -292,7 +292,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector "The reference type of the range to clear must be seqan::hibf::bin_index."); #ifndef NDEBUG for (auto && bin : bin_range) - assert(bin.value < bins); + assert(bin.value < technical_bins); #endif // NDEBUG for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i) @@ -300,8 +300,14 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector (*this)[bin.value + offset] = 0; } + /*!\brief Sets the number of bins stored in the Interleaved Bloom Filter. + * \param[in] new_bin_count The new number of bins. + * \returns `true` if the number of bins was set, `false` if the number of bins was not set. + */ + bool set_bin_count(bin_count const new_bin_count); + /*!\brief Increases the number of bins stored in the Interleaved Bloom Filter. - * \param[in] new_bins_ The new number of bins. + * \param[in] new_bin_count The new number of bins. * \throws std::invalid_argument If passed number of bins is smaller than current number of bins. * * \attention The new number of bins must be greater or equal to the current number of bins. @@ -322,7 +328,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector * * \include test/snippet/ibf/interleaved_bloom_filter_increase_bin_number_to.cpp */ - void increase_bin_number_to(bin_count const new_bins_); + void increase_bin_number_to(bin_count const new_bin_count); //!\} /*!\name Lookup @@ -378,6 +384,12 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector return bins; } + //!\brief Sets the number of bins to a new value. + void overwrite_bin_count(size_t const new_count) noexcept + { + bins = new_count; + } + /*!\brief Returns the size of a single bin that the Interleaved Bloom Filter manages. * \returns The size in bits of a single bin. */ @@ -398,7 +410,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector /*!\name Comparison operators * \{ */ - constexpr bool operator==(interleaved_bloom_filter const &) const = default; + HIBF_CONSTEXPR_VECTOR bool operator==(interleaved_bloom_filter const &) const = default; //!\} /*!\name Access @@ -414,6 +426,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector using base_t::data; //!\} + std::vector occupancy{}; + /*!\cond DEV * \brief Serialisation support function. * \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive. @@ -431,6 +445,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector archive(bin_words); archive(hash_funs); archive(cereal::base_class(this)); + archive(occupancy); } //!\endcond }; diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp index 551173d4..147c1966 100644 --- a/include/hibf/layout/hierarchical_binning.hpp +++ b/include/hibf/layout/hierarchical_binning.hpp @@ -68,7 +68,8 @@ class hierarchical_binning config{config_}, data{std::addressof(data_)}, num_user_bins{data->positions.size()}, - num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)} + num_technical_bins{data->previous.empty() ? config.tmax + : std::min(needed_technical_bins(num_user_bins), config.tmax)} { assert(data != nullptr); } diff --git a/include/hibf/misc/empty_bins_by_fraction.hpp b/include/hibf/misc/empty_bins_by_fraction.hpp new file mode 100644 index 00000000..7d520e62 --- /dev/null +++ b/include/hibf/misc/empty_bins_by_fraction.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +namespace seqan::hibf +{ + +/*!\brief Returns the number of empty bins that should be created by a given fraction of the total number of bins. + * \param[in] tmax The total number of bins. + * \param[in] fraction The fraction of the total number of bins that should be empty. + * \ingroup hibf + * \sa https://godbolt.org/z/cMjbM39vj + */ +[[nodiscard]] constexpr size_t empty_bins_by_fraction(size_t const tmax, double const fraction) noexcept +{ + return std::clamp(tmax * fraction, 1, tmax - 2) - (fraction == 0.0); +} + +} // namespace seqan::hibf diff --git a/include/hibf/misc/insert_iterator.hpp b/include/hibf/misc/insert_iterator.hpp index 268120dc..d34f50cf 100644 --- a/include/hibf/misc/insert_iterator.hpp +++ b/include/hibf/misc/insert_iterator.hpp @@ -54,6 +54,12 @@ class insert_iterator type{data_type::ibf} {} + explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index, bool) : + ptr{std::addressof(ibf)}, + ibf_bin_index{ibf_bin_index}, + type{data_type::ibf2} + {} + explicit constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function} {} @@ -72,6 +78,9 @@ class insert_iterator case data_type::ibf: static_cast(ptr)->emplace(value, static_cast(ibf_bin_index)); break; + case data_type::ibf2: + static_cast(ptr)->emplace_exists(value, static_cast(ibf_bin_index)); + break; default: assert(type == data_type::function); static_cast(ptr)->operator()(value); @@ -102,6 +111,7 @@ class insert_iterator unordered_set, sketch, ibf, + ibf2, function }; diff --git a/src/build/construct_ibf.cpp b/src/build/construct_ibf.cpp index d44e7cca..ce91f853 100644 --- a/src/build/construct_ibf.cpp +++ b/src/build/construct_ibf.cpp @@ -55,7 +55,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s local_index_allocation_timer.stop(); data.index_allocation_timer += local_index_allocation_timer; - insert_into_ibf(kmers, number_of_bins, ibf_node.max_bin_index, ibf, data.fill_ibf_timer); + insert_into_ibf(data, kmers, number_of_bins, ibf_node.max_bin_index, ibf, data.fill_ibf_timer); if (!is_root) update_parent_kmers(parent_kmers, kmers, data.merge_kmers_timer); diff --git a/src/build/insert_into_ibf.cpp b/src/build/insert_into_ibf.cpp index 45e2d432..47f61b17 100644 --- a/src/build/insert_into_ibf.cpp +++ b/src/build/insert_into_ibf.cpp @@ -23,7 +23,8 @@ namespace seqan::hibf::build { // automatically does naive splitting if number_of_bins > 1 -void insert_into_ibf(robin_hood::unordered_flat_set const & kmers, +void insert_into_ibf(build_data const & data, + robin_hood::unordered_flat_set const & kmers, size_t const number_of_bins, size_t const bin_index, seqan::hibf::interleaved_bloom_filter & ibf, @@ -32,16 +33,42 @@ void insert_into_ibf(robin_hood::unordered_flat_set const & kmers, size_t const chunk_size = divide_and_ceil(kmers.size(), number_of_bins); size_t chunk_number{}; + bool const use_exists = data.config.empty_bin_fraction > 0.0; + serial_timer local_fill_ibf_timer{}; local_fill_ibf_timer.start(); - for (auto chunk : kmers | seqan::stl::views::chunk(chunk_size)) + auto chunk_view = seqan::stl::views::chunk(kmers, chunk_size); + for (auto && chunk : chunk_view) { assert(chunk_number < number_of_bins); seqan::hibf::bin_index const bin_idx{bin_index + chunk_number}; ++chunk_number; - for (size_t const value : chunk) - ibf.emplace(value, bin_idx); + if (use_exists) + { + for (auto && value : chunk) + ibf.emplace_exists(value, bin_idx); + } + else + { + for (auto && value : chunk) + ibf.emplace(value, bin_idx); + } + } + + assert(chunk_view.size() <= number_of_bins); + if (use_exists && chunk_view.size() < number_of_bins) + { + size_t const diff = number_of_bins - chunk_view.size(); + auto it = ibf.occupancy.begin() + bin_index + chunk_view.size(); + assert(std::ranges::all_of(it, + it + diff, + [](size_t value) + { + return value == 0u; + })); + std::ranges::fill_n(it, diff, 1u); } + local_fill_ibf_timer.stop(); fill_ibf_timer += local_fill_ibf_timer; } @@ -54,7 +81,10 @@ void insert_into_ibf(build_data const & data, serial_timer local_fill_ibf_timer{}; local_user_bin_io_timer.start(); local_fill_ibf_timer.start(); - data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id}); + if (data.config.empty_bin_fraction > 0.0) + data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id, true}); + else + data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id}); local_user_bin_io_timer.stop(); local_fill_ibf_timer.stop(); data.user_bin_io_timer += local_user_bin_io_timer; diff --git a/src/config.cpp b/src/config.cpp index 36ae0753..148dfc13 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -14,9 +14,10 @@ #include // for JSONInputArchive, JSONOutputArchive #include // for make_nvp, InputArchive, OutputArchive -#include // for config -#include // for meta_header, meta_hibf_config_end, meta_hibf_config_start -#include // for next_multiple_of_64 +#include // for config +#include // for meta_header, meta_hibf_config_end, meta_hibf_config_start +#include // for empty_bins_by_fraction +#include // for next_multiple_of_64 namespace seqan::hibf { @@ -63,6 +64,9 @@ void config::write_to(std::ostream & stream) const void config::validate_and_set_defaults() { + if (validated) + return; + if (!input_fn) throw std::invalid_argument{"[HIBF CONFIG ERROR] You did not set the required config::input_fn."}; @@ -111,6 +115,11 @@ void config::validate_and_set_defaults() << "anyway, so we increased your number of technical bins to " << tmax << ".\n"; } + if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0) + throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."}; + + tmax -= empty_bins_by_fraction(tmax, empty_bin_fraction); + if (alpha < 0.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."}; @@ -119,6 +128,8 @@ void config::validate_and_set_defaults() if (disable_estimate_union || max_rearrangement_ratio == 0.0) disable_rearrangement = true; + + validated = true; } } // namespace seqan::hibf diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 0c5bc5bf..72710a15 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -42,7 +42,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, robin_hood::unordered_flat_set & parent_kmers, layout::graph::node const & current_node, build::build_data & data, - bool is_root) + bool is_root, + size_t const parent_ibf_idx = 0u) { size_t const ibf_pos{data.request_ibf_idx()}; @@ -68,7 +69,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, kmers, current_node.children[current_node.favourite_child_idx.value()], data, - false); + false, + ibf_pos); return 1; } else // max bin is not a merged bin @@ -124,13 +126,14 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, auto & child = children[index]; robin_hood::unordered_flat_set kmers{}; - size_t const ibf_pos = hierarchical_build(hibf, kmers, child, data, false); + size_t const new_ibf_pos = hierarchical_build(hibf, kmers, child, data, false, ibf_pos); auto parent_bin_index = child.parent_bin_index; + hibf.prev_ibf_id[new_ibf_pos] = {.ibf_idx = parent_ibf_idx, .bin_idx = parent_bin_index}; { size_t const mutex_id{parent_bin_index / 64}; std::lock_guard guard{local_ibf_mutex[mutex_id]}; - technical_bin_to_ibf_id[parent_bin_index] = ibf_pos; - build::insert_into_ibf(kmers, 1, parent_bin_index, ibf, data.fill_ibf_timer); + technical_bin_to_ibf_id[parent_bin_index] = new_ibf_pos; + build::insert_into_ibf(data, kmers, 1, parent_bin_index, ibf, data.fill_ibf_timer); if (!is_root) build::update_parent_kmers(parent_kmers, kmers, data.merge_kmers_timer); } @@ -152,7 +155,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf, else { compute_kmers(kmers, data, record); - build::insert_into_ibf(kmers, + build::insert_into_ibf(data, + kmers, record.number_of_technical_bins, record.storage_TB_id, ibf, @@ -184,6 +188,7 @@ void build_index(hierarchical_interleaved_bloom_filter & hibf, hibf.ibf_vector.resize(number_of_ibfs); hibf.ibf_bin_to_user_bin_id.resize(number_of_ibfs); + hibf.prev_ibf_id.resize(number_of_ibfs); hibf.next_ibf_id.resize(number_of_ibfs); build::build_data data{.config = config, .ibf_graph = {hibf_layout}}; diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index d86615b1..a0e41728 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -47,6 +47,7 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, bin_words = divide_and_ceil(bins, 64u); technical_bins = bin_words * 64u; resize(technical_bins * bin_size_); + occupancy.resize(technical_bins, 0u); } size_t find_biggest_bin(config const & configuration) @@ -97,6 +98,20 @@ size_t max_bin_size(config & configuration, size_t const max_bin_elements) .elements = max_size}); } +template +inline void +loop_dispatch(seqan::hibf::interleaved_bloom_filter & ibf, config const & configuration, size_t const chunk_size) +{ +#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) + for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) + { + if constexpr (use_exists) + configuration.input_fn(i, insert_iterator{ibf, i, true}); + else + configuration.input_fn(i, insert_iterator{ibf, i}); + } +} + // config validation is done by max_bin_size interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_t const max_bin_elements) : interleaved_bloom_filter{seqan::hibf::bin_count{configuration.number_of_user_bins}, @@ -106,15 +121,14 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_ // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) size_t const chunk_size = std::clamp(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u); -#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) - for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) - { - configuration.input_fn(i, insert_iterator{*this, i}); - } + if (configuration.empty_bin_fraction > 0.0) + loop_dispatch(*this, configuration, chunk_size); + else + loop_dispatch(*this, configuration, chunk_size); } template -inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept +inline void interleaved_bloom_filter::emplace_impl(size_t const value, bin_index const bin) noexcept { assert(bin.value < bins); @@ -131,11 +145,15 @@ inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index seqan::hibf::bit_vector::reference bit_reference{(*this)[idx]}; if constexpr (check_exists) exists &= bit_reference; - bit_reference = 1; + bit_reference = true; }; if constexpr (check_exists) - return exists; + { + // Seems to be faster than occupancy[bin.value] += !exists because memory access might be mitigated. + if (!exists) + ++occupancy[bin.value]; + } }; [[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept @@ -143,36 +161,48 @@ inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index return emplace_impl(value, bin); } -[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept +[[gnu::always_inline]] void interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept { return emplace_impl(value, bin); } void interleaved_bloom_filter::clear(bin_index const bin) noexcept { - assert(bin.value < bins); + assert(bin.value < technical_bins); for (size_t idx = bin.value, i = 0; i < bin_size_; idx += technical_bins, ++i) (*this)[idx] = 0; } -void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count const new_bins_) +bool interleaved_bloom_filter::set_bin_count(seqan::hibf::bin_count const new_bin_count) { - size_t const new_bins = new_bins_.value; + size_t const new_bins = new_bin_count.value; + size_t const new_bin_words = divide_and_ceil(new_bins, 64u); + + if (new_bin_words > bin_words) + return false; + + bins = new_bins; + return true; +} - if (new_bins < bins) +void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count const new_bin_count) +{ + if (new_bin_count.value < bins) throw std::invalid_argument{"The number of new bins must be >= the current number of bins."}; + if (set_bin_count(new_bin_count)) + return; + + size_t const new_bins = new_bin_count.value; size_t const new_bin_words = divide_and_ceil(new_bins, 64u); + assert(new_bins > bins); bins = new_bins; - if (new_bin_words == bin_words) // No need for internal resize if bin_words does not change. - return; - size_t const new_technical_bins = new_bin_words * 64u; size_t const new_bit_size = bin_size_ * new_technical_bins; size_t const old_bit_size = size(); - size_t const delta = new_technical_bins - technical_bins + 64; + size_t const delta = new_technical_bins - technical_bins + 64u; resize(new_bit_size); uint64_t * const ptr = data(); @@ -180,7 +210,7 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con // old new // |-------------|---------| // Backwards copy blocks of size (old_)technical_bins such that the new blocks are of size new_technical_bins. - for (size_t new_block_end = new_bit_size, old_block_end = old_bit_size; old_block_end > 0; + for (size_t new_block_end = new_bit_size, old_block_end = old_bit_size; old_block_end > 0u; new_block_end -= new_technical_bins, old_block_end -= technical_bins) { size_t const stop = new_block_end - new_technical_bins; @@ -196,6 +226,8 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con bin_words = new_bin_words; technical_bins = new_technical_bins; + + occupancy.resize(technical_bins, 0u); } [[gnu::always_inline]] bit_vector const & diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp index ab058cc0..9ae1511b 100644 --- a/src/layout/hierarchical_binning.cpp +++ b/src/layout/hierarchical_binning.cpp @@ -18,6 +18,7 @@ #include // for layout #include // for simple_binning #include // for divide_and_ceil +#include // for empty_bins_by_fraction #include // for next_multiple_of_64 #include // for concurrent_timer #include // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY @@ -79,12 +80,13 @@ size_t hierarchical_binning::execute() [[nodiscard]] size_t hierarchical_binning::needed_technical_bins(size_t const requested_num_ub) const { - return std::min(next_multiple_of_64(requested_num_ub), config.tmax); + size_t const next_multiple = next_multiple_of_64(requested_num_ub); + return next_multiple - empty_bins_by_fraction(next_multiple, config.empty_bin_fraction); } [[nodiscard]] size_t hierarchical_binning::max_merge_levels(size_t const num_ubs_in_merge) const { - size_t const lower_lvl_tbs = needed_technical_bins(num_ubs_in_merge); + size_t const lower_lvl_tbs = std::min(needed_technical_bins(num_ubs_in_merge), config.tmax); double const levels = std::log(num_ubs_in_merge) / std::log(lower_lvl_tbs); return static_cast(std::ceil(levels)); } @@ -407,8 +409,10 @@ void hierarchical_binning::update_libf_data(data_store & libf_data, size_t const size_t hierarchical_binning::add_lower_level(data_store & libf_data) const { + size_t const number_of_user_bins = libf_data.positions.size(); + // now do the binning for the low-level IBF: - if (libf_data.positions.size() > config.tmax) + if (number_of_user_bins > config.tmax) { // recursively call hierarchical binning if there are still too many UBs return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin @@ -416,7 +420,8 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const else { // use simple binning to distribute remaining UBs - return simple_binning{libf_data, 0}.execute(); // return id of maximum technical bin + size_t const number_of_technical_bins = needed_technical_bins(number_of_user_bins); + return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin } } diff --git a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp index 6c6c819d..83944283 100644 --- a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp +++ b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp @@ -107,18 +107,15 @@ inline void emplace_benchmark_impl(::benchmark::State & state) for (auto _ : state) { size_t bin_index = 0u; - [[maybe_unused]] size_t result{}; for (auto && chunk : seqan::stl::views::chunk(values, chunk_size)) { for (auto value : chunk) if constexpr (check_exists) - result += ibf.emplace_exists(value, seqan::hibf::bin_index{bin_index}); + ibf.emplace_exists(value, seqan::hibf::bin_index{bin_index}); else ibf.emplace(value, seqan::hibf::bin_index{bin_index}); ++bin_index; } - if constexpr (check_exists) - benchmark::DoNotOptimize(result); } state.counters["elements"] = elements_per_second(number_of_elements); diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index 43d98fca..09f04635 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -37,7 +37,7 @@ TEST(config_test, write_to) std::string const expected_file{"@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 1,\n" + "@ \"version\": 2,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -45,10 +45,12 @@ TEST(config_test, write_to) "@ \"threads\": 31,\n" "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.0,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" - "@ \"disable_rearrangement\": false\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"validated\": false\n" "@ }\n" "@}\n" "@HIBF_CONFIG_END\n"}; @@ -57,6 +59,47 @@ TEST(config_test, write_to) } TEST(config_test, read_from) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_fpr\": 0.0001,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.5,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"validated\": true\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + seqan::hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_fpr, 0.0001); + EXPECT_EQ(configuration.relaxed_fpr, 0.3); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.5); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); + EXPECT_EQ(configuration.validated, true); +} + +TEST(config_test, read_from_v1) { std::stringstream ss{"@HIBF_CONFIG\n" "@{\n" @@ -87,6 +130,7 @@ TEST(config_test, read_from) EXPECT_EQ(configuration.threads, 31); EXPECT_EQ(configuration.sketch_bits, 8); EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.0); EXPECT_EQ(configuration.alpha, 1.0); EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); EXPECT_EQ(configuration.disable_estimate_union, true); @@ -286,6 +330,21 @@ TEST(config_test, validate_and_set_defaults) "increased your number of technical bins to 64.\n"); } + // empty_bin_fraction must be in [0.0,1.0) + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .empty_bin_fraction = -0.1}; + EXPECT_THROW_MSG(configuration.validate_and_set_defaults(), + std::invalid_argument, + "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + + configuration.empty_bin_fraction = 1.0; + EXPECT_THROW_MSG(configuration.validate_and_set_defaults(), + std::invalid_argument, + "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + } + // alpha must be positive { seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1}; diff --git a/test/unit/hibf/interleaved_bloom_filter_test.cpp b/test/unit/hibf/interleaved_bloom_filter_test.cpp index 9a3d35a8..42caf4ea 100644 --- a/test/unit/hibf/interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/interleaved_bloom_filter_test.cpp @@ -198,25 +198,25 @@ TEST(ibf_test, emplace) } } -TEST(ibf_test, emplace_exists) -{ - // 1. Construct and emplace - seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u}, - seqan::hibf::bin_size{512}, - seqan::hibf::hash_function_count{2u}}; - - for (size_t bin_idx : std::views::iota(0, 64)) - for (size_t hash : std::views::iota(0, 64)) - ibf.emplace(hash, seqan::hibf::bin_index{bin_idx}); - - // 2. Test for correctness - for (size_t bin_idx : std::views::iota(0, 64)) - for (size_t hash : std::views::iota(0, 64)) - ASSERT_TRUE(ibf.emplace_exists(hash, seqan::hibf::bin_index{bin_idx})); - - for (size_t bin_idx : std::views::iota(64, 128)) - ASSERT_FALSE(ibf.emplace_exists(0u, seqan::hibf::bin_index{bin_idx})); -} +// TEST(ibf_test, emplace_exists) +// { +// // 1. Construct and emplace +// seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{128u}, +// seqan::hibf::bin_size{512}, +// seqan::hibf::hash_function_count{2u}}; + +// for (size_t bin_idx : std::views::iota(0, 64)) +// for (size_t hash : std::views::iota(0, 64)) +// ibf.emplace(hash, seqan::hibf::bin_index{bin_idx}); + +// // 2. Test for correctness +// for (size_t bin_idx : std::views::iota(0, 64)) +// for (size_t hash : std::views::iota(0, 64)) +// ASSERT_TRUE(ibf.emplace_exists(hash, seqan::hibf::bin_index{bin_idx})); + +// for (size_t bin_idx : std::views::iota(64, 128)) +// ASSERT_FALSE(ibf.emplace_exists(0u, seqan::hibf::bin_index{bin_idx})); +// } TEST(ibf_test, clear) { diff --git a/test/unit/hibf/layout/hierarchical_binning_test.cpp b/test/unit/hibf/layout/hierarchical_binning_test.cpp index 8a94d304..17bf31b0 100644 --- a/test/unit/hibf/layout/hierarchical_binning_test.cpp +++ b/test/unit/hibf/layout/hierarchical_binning_test.cpp @@ -68,6 +68,43 @@ TEST(hierarchical_binning_test, small_example) EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); } +TEST(hierarchical_binning_test, small_example_with_empty_bins) +{ + seqan::hibf::config config; + config.tmax = 4; + config.disable_estimate_union = true; // also disables rearrangement + config.empty_bin_fraction = 0.001; + + seqan::hibf::layout::layout hibf_layout{}; + std::vector kmer_counts{500, 1000, 500, 500, 500, 500, 500, 500}; + + seqan::hibf::layout::data_store data{.hibf_layout = &hibf_layout, .kmer_counts = &kmer_counts}; + + data.fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); + + seqan::hibf::layout::hierarchical_binning algo{data, config}; + EXPECT_EQ(algo.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3 + + std::vector expected_max_bins{{{2}, 0}, {{3}, 43}}; + + // clang-format off + std::vector expected_user_bins{{{} , 0 , 1 , 7}, + {{} , 1 , 1 , 6}, + {{2}, 0 , 22 - 1, 3}, + {{2}, 22 - 1, 21 , 4}, + {{2}, 43 - 1, 21 , 5}, + {{3}, 0 , 42 + 1, 1}, + {{3}, 42 + 1, 11 - 1, 0}, + {{3}, 53 , 11 - 1, 2}}; + // clang-format on + + EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins); + EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); +} + TEST(hierarchical_binning_test, another_example) { seqan::hibf::config config;