From 5de8aa9ed2c12607a7c394dd4c891091d5bbd6ad Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 23 Oct 2024 13:29:01 +0200 Subject: [PATCH 1/4] [MISC] Extend insert_iterator --- include/hibf/interleaved_bloom_filter.hpp | 5 +- include/hibf/misc/insert_iterator.hpp | 74 ++++++++++++++++------- src/interleaved_bloom_filter.cpp | 20 +++--- test/unit/hibf/CMakeLists.txt | 1 + test/unit/hibf/insert_iterator_test.cpp | 58 ++++++++++++++++++ 5 files changed, 126 insertions(+), 32 deletions(-) create mode 100644 test/unit/hibf/insert_iterator_test.cpp diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp index e73a52cf..4e5dfbeb 100644 --- a/include/hibf/interleaved_bloom_filter.hpp +++ b/include/hibf/interleaved_bloom_filter.hpp @@ -24,7 +24,6 @@ #include // for base_class #include // for cereal_archive -#include // for config #include // for aligned_allocator #include // for bit_vector #include // for counting_vector @@ -33,6 +32,10 @@ namespace seqan::hibf { + +// config.hpp -> misc/insert_iterator.hpp (Needs interleaved_bloom_filter to be a complete class) +struct config; + /*!\brief A strong type that represents the number of bins for the seqan::hibf::interleaved_bloom_filter. * \ingroup ibf * \qualifier strong diff --git a/include/hibf/misc/insert_iterator.hpp b/include/hibf/misc/insert_iterator.hpp index 903869a0..3cd595f6 100644 --- a/include/hibf/misc/insert_iterator.hpp +++ b/include/hibf/misc/insert_iterator.hpp @@ -13,7 +13,9 @@ #include // for vector #include // for unordered_flat_set, hash +#include #include +#include // IWYU pragma: private, include @@ -29,32 +31,51 @@ class insert_iterator using pointer = void; using reference = void; - insert_iterator() = delete; - insert_iterator(insert_iterator const &) = default; - insert_iterator(insert_iterator &&) = default; - insert_iterator & operator=(insert_iterator const &) = default; - insert_iterator & operator=(insert_iterator &&) = default; - ~insert_iterator() = default; + constexpr insert_iterator() = default; + constexpr insert_iterator(insert_iterator const &) = default; + constexpr insert_iterator(insert_iterator &&) = default; + constexpr insert_iterator & operator=(insert_iterator const &) = default; + constexpr insert_iterator & operator=(insert_iterator &&) = default; + constexpr ~insert_iterator() = default; - explicit constexpr insert_iterator(robin_hood::unordered_flat_set & set) : - set{std::addressof(set)}, - is_set{true} + using set_t = robin_hood::unordered_flat_set; + using sketch_t = sketch::hyperloglog; + using ibf_t = interleaved_bloom_filter; + using function_t = std::function; + + explicit constexpr insert_iterator(set_t & set) : ptr{std::addressof(set)}, type{data_type::unordered_set} + {} + + explicit constexpr insert_iterator(sketch_t & sketch) : ptr{std::addressof(sketch)}, type{data_type::sketch} {} - explicit constexpr insert_iterator(std::vector & vec) : vec{std::addressof(vec)}, is_set{false} + explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index) : + ptr{std::addressof(ibf)}, + ibf_bin_index{ibf_bin_index}, + type{data_type::ibf} {} - insert_iterator & operator=(uint64_t const value) noexcept + explicit constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function} + {} + + [[gnu::always_inline, gnu::flatten]] inline insert_iterator & operator=(uint64_t const value) noexcept { - if (is_set) - { - assert(set != nullptr); - set->emplace(value); - } - else + assert(ptr != nullptr); + + switch (type) { - assert(vec != nullptr); - vec->emplace_back(value); + case data_type::unordered_set: + static_cast(ptr)->emplace(value); + break; + case data_type::sketch: + static_cast(ptr)->add(value); + break; + case data_type::ibf: + static_cast(ptr)->emplace(value, static_cast(ibf_bin_index)); + break; + default: + assert(type == data_type::function); + static_cast(ptr)->operator()(value); } return *this; } @@ -75,9 +96,18 @@ class insert_iterator } private: - robin_hood::unordered_flat_set * set{nullptr}; - std::vector * vec{nullptr}; - bool is_set{false}; + void * ptr{nullptr}; + + enum class data_type : uint8_t + { + unordered_set, + sketch, + ibf, + function + }; + + size_t ibf_bin_index{}; + data_type type{}; }; } // namespace seqan::hibf diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index 195ad487..d48f69d9 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -22,6 +22,11 @@ namespace seqan::hibf { +#if HIBF_COMPILER_IS_GCC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wattributes" +#endif // HIBF_COMPILER_IS_GCC + interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, seqan::hibf::bin_size size, seqan::hibf::hash_function_count funs) @@ -118,12 +123,12 @@ inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index return exists; }; -void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept +[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept { return emplace_impl(value, bin); } -bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept +[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept { return emplace_impl(value, bin); } @@ -178,16 +183,9 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con technical_bins = new_technical_bins; } -#if HIBF_COMPILER_IS_GCC -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wattributes" -#endif // HIBF_COMPILER_IS_GCC [[gnu::always_inline]] bit_vector const & interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const value) & noexcept { -#if HIBF_COMPILER_IS_GCC -# pragma GCC diagnostic pop -#endif // HIBF_COMPILER_IS_GCC assert(ibf_ptr != nullptr); assert(result_buffer.size() == ibf_ptr->bin_count()); @@ -276,4 +274,8 @@ interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const valu return result_buffer; } +#if HIBF_COMPILER_IS_GCC +# pragma GCC diagnostic pop +#endif // HIBF_COMPILER_IS_GCC + } // namespace seqan::hibf diff --git a/test/unit/hibf/CMakeLists.txt b/test/unit/hibf/CMakeLists.txt index ee8ff489..49da8e71 100644 --- a/test/unit/hibf/CMakeLists.txt +++ b/test/unit/hibf/CMakeLists.txt @@ -9,6 +9,7 @@ hibf_test (config_test.cpp) hibf_test (counting_vector_test.cpp) hibf_test (counting_vector_avx512_test.cpp) hibf_test (hierarchical_interleaved_bloom_filter_test.cpp) +hibf_test (insert_iterator_test.cpp) hibf_test (interleaved_bloom_filter_test.cpp) hibf_test (interleaved_bloom_filter_avx512_test.cpp) hibf_test (path_test.cpp) diff --git a/test/unit/hibf/insert_iterator_test.cpp b/test/unit/hibf/insert_iterator_test.cpp new file mode 100644 index 00000000..fc89e66e --- /dev/null +++ b/test/unit/hibf/insert_iterator_test.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for Message, TestPartResult, AssertionResult, Test, EXPECT_EQ, Capture... + +#include +#include +#include +#include // for expect_range_eq, EXPECT_RANGE_EQ + +static constexpr std::array values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + +TEST(insert_iterator_test, unordered_set) +{ + robin_hood::unordered_flat_set target; + seqan::hibf::insert_iterator it{target}; + std::ranges::copy(values, it); + EXPECT_EQ(target.size(), 10u); +} + +TEST(insert_iterator_test, sketch) +{ + seqan::hibf::sketch::hyperloglog target{5u}; + seqan::hibf::insert_iterator it{target}; + std::ranges::copy(values, it); + EXPECT_NEAR(target.estimate(), 11.99, 0.001); +} + +TEST(insert_iterator_test, ibf) +{ + seqan::hibf::interleaved_bloom_filter target{seqan::hibf::bin_count{8u}, + seqan::hibf::bin_size{8u}, + seqan::hibf::hash_function_count{1u}}; + for (size_t i = 0; i < 3; ++i) + { + seqan::hibf::insert_iterator it{target, i}; + std::ranges::copy(values, it); + } + + auto agent = target.counting_agent(); + auto & result = agent.bulk_count(values); + std::vector const expected{10, 10, 10, 0, 0, 0, 0, 0}; + EXPECT_RANGE_EQ(result, expected); +} + +TEST(insert_iterator_test, function) +{ + robin_hood::unordered_flat_set target; + std::function fun = [&target](size_t const value) + { + target.emplace(value); + target.emplace((1u + value) * 11u); + }; + seqan::hibf::insert_iterator it{fun}; + std::ranges::copy(values, it); + EXPECT_EQ(target.size(), 20); +} From a5f579cf37140863e0ad48f2247caea55f9fe009 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Thu, 24 Oct 2024 11:24:04 +0200 Subject: [PATCH 2/4] [INFRA] merge coverage reports --- .github/workflows/ci_coverage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci_coverage.yml b/.github/workflows/ci_coverage.yml index cd9e000a..99004455 100644 --- a/.github/workflows/ci_coverage.yml +++ b/.github/workflows/ci_coverage.yml @@ -81,6 +81,7 @@ jobs: --exclude-unreachable-branches \ --exclude-throw-branches \ --exclude-noncode-lines \ + --merge-mode-functions separate \ -j \ --cobertura \ --output ${GITHUB_WORKSPACE}/build/coverage_report.xml From 7f336c3429cffce996a6e77024ece61e5f9c4307 Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 23 Oct 2024 13:30:05 +0200 Subject: [PATCH 3/4] [MISC] Use new insert_iterator in more places --- src/build/insert_into_ibf.cpp | 14 +- src/interleaved_bloom_filter.cpp | 9 +- src/sketch/compute_sketches.cpp | 21 +-- test/performance/ibf/CMakeLists.txt | 1 + ...ed_bloom_filter_construction_benchmark.cpp | 154 ++++++++++++++++++ .../sketch/compute_sketches_benchmark.cpp | 21 ++- 6 files changed, 186 insertions(+), 34 deletions(-) create mode 100644 test/performance/ibf/interleaved_bloom_filter_construction_benchmark.cpp diff --git a/src/build/insert_into_ibf.cpp b/src/build/insert_into_ibf.cpp index 4a5b0784..07b45b78 100644 --- a/src/build/insert_into_ibf.cpp +++ b/src/build/insert_into_ibf.cpp @@ -51,20 +51,14 @@ void insert_into_ibf(build_data const & data, layout::layout::user_bin const & record, seqan::hibf::interleaved_bloom_filter & ibf) { - auto const bin_index = seqan::hibf::bin_index{static_cast(record.storage_TB_id)}; - std::vector values; - serial_timer local_user_bin_io_timer{}; - local_user_bin_io_timer.start(); - data.config.input_fn(record.idx, insert_iterator{values}); - local_user_bin_io_timer.stop(); - data.user_bin_io_timer += local_user_bin_io_timer; - serial_timer local_fill_ibf_timer{}; + local_user_bin_io_timer.start(); local_fill_ibf_timer.start(); - for (auto && value : values) - ibf.emplace(value, bin_index); + data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id}); + local_user_bin_io_timer.stop(); local_fill_ibf_timer.stop(); + data.user_bin_io_timer += local_user_bin_io_timer; data.fill_ibf_timer += local_fill_ibf_timer; } diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index d48f69d9..bd837831 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -85,16 +85,11 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_ { // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) size_t const chunk_size = std::clamp(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u); - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) private(kmers) +#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) { - kmers.clear(); - configuration.input_fn(i, insert_iterator{kmers}); - - for (uint64_t const hash : kmers) - emplace(hash, seqan::hibf::bin_index{i}); + configuration.input_fn(i, insert_iterator{*this, i}); } } diff --git a/src/sketch/compute_sketches.cpp b/src/sketch/compute_sketches.cpp index 5faa299d..cd7d9ab8 100644 --- a/src/sketch/compute_sketches.cpp +++ b/src/sketch/compute_sketches.cpp @@ -27,21 +27,18 @@ namespace seqan::hibf::sketch void compute_sketches(config const & config, std::vector & hll_sketches) { // compute hll_sketches - hll_sketches.resize(config.number_of_user_bins); + hll_sketches.resize(config.number_of_user_bins, config.sketch_bits); + + assert(std::ranges::all_of(hll_sketches, + [bits = config.sketch_bits](hyperloglog const & sketch) + { + return sketch.data_size() == (1ULL << bits); + })); - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers) +#pragma omp parallel for schedule(dynamic) num_threads(config.threads) for (size_t i = 0; i < config.number_of_user_bins; ++i) { - seqan::hibf::sketch::hyperloglog hll_sketch(config.sketch_bits); - - kmers.clear(); - config.input_fn(i, insert_iterator{kmers}); - - for (auto k_hash : kmers) - hll_sketch.add(k_hash); - - hll_sketches[i] = std::move(hll_sketch); + config.input_fn(i, insert_iterator{hll_sketches[i]}); } } diff --git a/test/performance/ibf/CMakeLists.txt b/test/performance/ibf/CMakeLists.txt index 498044fc..df882bf3 100644 --- a/test/performance/ibf/CMakeLists.txt +++ b/test/performance/ibf/CMakeLists.txt @@ -5,3 +5,4 @@ hibf_benchmark (bit_vector_benchmark.cpp) hibf_benchmark (bit_vector_serialisation_benchmark.cpp) hibf_benchmark (interleaved_bloom_filter_benchmark.cpp) +hibf_benchmark (interleaved_bloom_filter_construction_benchmark.cpp) diff --git a/test/performance/ibf/interleaved_bloom_filter_construction_benchmark.cpp b/test/performance/ibf/interleaved_bloom_filter_construction_benchmark.cpp new file mode 100644 index 00000000..24b69227 --- /dev/null +++ b/test/performance/ibf/interleaved_bloom_filter_construction_benchmark.cpp @@ -0,0 +1,154 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for State, Benchmark, AddCustomContext, Counter, BENCHMARK + +#include // for __fn, generate +#include // for log, ceil, exp +#include // for size_t +#include // for equal_to +#include // for uniform_int_distribution, mt19937_64 +#include // for transform_view, iota_view, __range_adaptor_closure_t, __fn +#include // for to_string, basic_string +#include // for tuple, make_tuple +#include // for move, pair +#include // for vector + +#include // for config +#include // for hash, unordered_map +#include // for chunk, chunk_fn, chunk_view +#include // for operator| +#include // for bin_index, interleaved_bloom_filter, bin_count, bin_size +#include // for divide_and_ceil +#include // for HIBF_HAS_AVX512 +#include // for operator""_MiB + +using namespace seqan::hibf::test::literals; +static constexpr size_t total_ibf_size_in_bytes{1_MiB}; +static constexpr size_t number_of_hash_functions{2u}; +static constexpr double false_positive_rate{0.05}; + +inline benchmark::Counter ibf_size(size_t const bit_size) +{ + return benchmark::Counter(bit_size / 8, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); +} + +// This computes how many elements need to be inserted into the IBF to achieve the desired false positive rate for the +// given size. +// The `number_of_elements` many generated values are used for both constructing and querying the IBF. +static /* cmath not constexpr in libc++ */ size_t number_of_elements = []() +{ + size_t const bits = 8u * total_ibf_size_in_bytes; + double const numerator = -std::log(1 - std::exp(std::log(false_positive_rate) / number_of_hash_functions)) * bits; + return std::ceil(numerator / number_of_hash_functions); +}(); + +static auto get_value(size_t const bins) +{ + size_t const chunk_size = seqan::hibf::divide_and_ceil(number_of_elements, bins); + return seqan::stl::views::chunk(std::views::iota(size_t{}, number_of_elements), chunk_size); +} + +void manual_construct(::benchmark::State & state) +{ + size_t const bins = state.range(0); + size_t const bits = 8u * total_ibf_size_in_bytes / bins; + + auto values = get_value(bins); + + for (auto _ : state) + { + seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{bins}, + seqan::hibf::bin_size{bits}, + seqan::hibf::hash_function_count{number_of_hash_functions}}; + + for (size_t bin_index = 0u; bin_index < bins; ++bin_index) + { + for (auto value : values[bin_index]) + ibf.emplace(value, seqan::hibf::bin_index{bin_index}); + } + + state.counters["IBF_size"] = ibf_size(ibf.bit_size()); + + benchmark::DoNotOptimize(ibf); + } +} + +void config_construct(::benchmark::State & state) +{ + size_t const bins = state.range(0); + + auto values = get_value(bins); + + seqan::hibf::config config{.input_fn = + [&values](size_t const user_bin_id, seqan::hibf::insert_iterator && it) + { + for (auto const value : values[user_bin_id]) + it = value; + }, + .number_of_user_bins = bins, + .number_of_hash_functions = number_of_hash_functions, + .maximum_fpr = false_positive_rate}; + + for (auto _ : state) + { + seqan::hibf::interleaved_bloom_filter ibf{config}; + + state.counters["IBF_size"] = ibf_size(ibf.bit_size()); + + benchmark::DoNotOptimize(ibf); + } +} + +void config_and_max_construct(::benchmark::State & state) +{ + size_t const bins = state.range(0); + + auto values = get_value(bins); + size_t const max_bin_size = values[0].size(); + + seqan::hibf::config config{.input_fn = + [&values](size_t const user_bin_id, seqan::hibf::insert_iterator && it) + { + for (auto const value : values[user_bin_id]) + it = value; + }, + .number_of_user_bins = bins, + .number_of_hash_functions = number_of_hash_functions, + .maximum_fpr = false_positive_rate}; + + for (auto _ : state) + { + seqan::hibf::interleaved_bloom_filter ibf{config, max_bin_size}; + + state.counters["IBF_size"] = ibf_size(ibf.bit_size()); + + benchmark::DoNotOptimize(ibf); + } +} + +BENCHMARK(manual_construct)->RangeMultiplier(2)->Range(64, 1024); +BENCHMARK(config_construct)->RangeMultiplier(2)->Range(64, 1024); +BENCHMARK(config_and_max_construct)->RangeMultiplier(2)->Range(64, 1024); + +// This is a hack to add custom context information to the benchmark output. +// The alternative would be to do it in the main(). However, this would require +// not using the BENCHMARK_MAIN macro. +[[maybe_unused]] static bool foo = []() +{ + benchmark::AddCustomContext("IBF size in bytes", std::to_string(total_ibf_size_in_bytes)); + benchmark::AddCustomContext("Number of hash functions", std::to_string(number_of_hash_functions)); + benchmark::AddCustomContext("False positive rate", std::to_string(false_positive_rate)); + benchmark::AddCustomContext("Number of elements", std::to_string(number_of_elements)); + benchmark::AddCustomContext("HIBF_HAS_AVX512", HIBF_HAS_AVX512 ? "true" : "false"); + benchmark::AddCustomContext("AVX512 support", +#if __AVX512F__ && __AVX512BW__ + "true"); +#else + "false"); +#endif + return true; +}(); + +BENCHMARK_MAIN(); diff --git a/test/performance/sketch/compute_sketches_benchmark.cpp b/test/performance/sketch/compute_sketches_benchmark.cpp index 7e6c2358..f2cd0f64 100644 --- a/test/performance/sketch/compute_sketches_benchmark.cpp +++ b/test/performance/sketch/compute_sketches_benchmark.cpp @@ -13,6 +13,11 @@ #include // for hyperloglog #include // for minhashes +inline benchmark::Counter elements_per_second(size_t const count) +{ + return benchmark::Counter(count, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); +} + enum class sketch : uint8_t { Hyperloglog, @@ -22,30 +27,36 @@ enum class sketch : uint8_t template void compute_sketches(benchmark::State & state) { + static constexpr uint64_t elements_per_bin = 10000; auto create_hashes = [&](size_t const ub_id, seqan::hibf::insert_iterator it) { // 0 = [0, 10000] // 1 = [10000, 20000] // 1 = [20000, 30000] - for (size_t i = ub_id * 10000; i < (ub_id + 1) * 10000; ++i) + for (size_t i = ub_id * elements_per_bin; i < (ub_id + 1) * elements_per_bin; ++i) it = i; }; - [[maybe_unused]] std::vector minhash_sketches; - std::vector hyperloglog_sketches; - seqan::hibf::config config{}; - config.number_of_user_bins = 16; + config.number_of_user_bins = 64; config.input_fn = create_hashes; config.sketch_bits = 12; + [[maybe_unused]] std::vector minhash_sketches; + std::vector hyperloglog_sketches(config.number_of_user_bins, config.sketch_bits); + for (auto _ : state) { if constexpr (sketch_t == sketch::MinHashes) seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches, minhash_sketches); else seqan::hibf::sketch::compute_sketches(config, hyperloglog_sketches); + + benchmark::DoNotOptimize(hyperloglog_sketches); + benchmark::ClobberMemory(); } + + state.counters["elements"] = elements_per_second(elements_per_bin * config.number_of_user_bins); } BENCHMARK_TEMPLATE(compute_sketches, sketch::Hyperloglog); From 76202c179bc8f13dc8258875ccdc007de50c64af Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 23 Oct 2024 13:31:27 +0200 Subject: [PATCH 4/4] [MISC] IBF construction: Use sketches to determine biggest bin, use exact counts for biggest bin --- src/interleaved_bloom_filter.cpp | 53 ++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index bd837831..cdebb3cc 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -13,11 +13,13 @@ #include // for bin_size_in_bits #include // for config, insert_iterator -#include // for unordered_flat_set #include // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_... #include // for bit_vector #include // for divide_and_ceil -#include // for HIBF_COMPILER_IS_GCC +#include +#include // for HIBF_COMPILER_IS_GCC +#include // for compute_sketches +#include // for hyperloglog namespace seqan::hibf { @@ -48,29 +50,48 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, resize(technical_bins * bin_size_); } -size_t max_bin_size(config & configuration, size_t const max_bin_elements) +size_t find_biggest_bin(config const & configuration) { - configuration.validate_and_set_defaults(); - + size_t bin_id{}; size_t max_size{}; + seqan::hibf::sketch::hyperloglog sketch{configuration.sketch_bits}; - if (max_bin_elements == 0u) +#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) firstprivate(sketch) + for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) { - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers) - for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) - { - kmers.clear(); - configuration.input_fn(i, insert_iterator{kmers}); + sketch.reset(); + configuration.input_fn(i, insert_iterator{sketch}); + size_t const estimate = sketch.estimate(); #pragma omp critical - max_size = std::max(max_size, kmers.size()); + { + if (estimate > max_size) + { + max_size = estimate; + bin_id = i; + } } } - else + + return bin_id; +} + +size_t max_bin_size(config & configuration, size_t const max_bin_elements) +{ + configuration.validate_and_set_defaults(); + + size_t const max_size = [&]() { - max_size = max_bin_elements; - } + if (max_bin_elements != 0u) + return max_bin_elements; + + // Use sketches to determine biggest bin. + size_t const max_bin_id = find_biggest_bin(configuration); + // Get exact count for biggest bin. Sketch estimate's accuracy depends on configuration.sketch_bits + robin_hood::unordered_flat_set kmers{}; + configuration.input_fn(max_bin_id, insert_iterator{kmers}); + return kmers.size(); + }(); return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, // .hash_count = configuration.number_of_hash_functions,