diff --git a/src/interleaved_bloom_filter.cpp b/src/interleaved_bloom_filter.cpp index ab3ae1fd..05fc89e8 100644 --- a/src/interleaved_bloom_filter.cpp +++ b/src/interleaved_bloom_filter.cpp @@ -13,11 +13,12 @@ #include // for bin_size_in_bits #include // for config, insert_iterator -#include // for unordered_flat_set #include // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_... #include // for bit_vector #include // for divide_and_ceil #include // for HIBF_COMPILER_IS_GCC +#include // for compute_sketches +#include // for hyperloglog namespace seqan::hibf { @@ -43,29 +44,48 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_, resize(technical_bins * bin_size_); } -size_t max_bin_size(config & configuration, size_t const max_bin_elements) +size_t find_biggest_bin(config const & configuration) { - configuration.validate_and_set_defaults(); - + size_t bin_id{}; size_t max_size{}; + seqan::hibf::sketch::hyperloglog sketch{configuration.sketch_bits}; - if (max_bin_elements == 0u) +#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) firstprivate(sketch) + for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) { - robin_hood::unordered_flat_set kmers; -#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers) - for (size_t i = 0u; i < configuration.number_of_user_bins; ++i) - { - kmers.clear(); - configuration.input_fn(i, insert_iterator{kmers}); + sketch.reset(); + configuration.input_fn(i, insert_iterator{sketch}); + size_t const estimate = sketch.estimate(); #pragma omp critical - max_size = std::max(max_size, kmers.size()); + { + if (estimate > max_size) + { + max_size = estimate; + bin_id = i; + } } } - else + + return bin_id; +} + +size_t max_bin_size(config & configuration, size_t const max_bin_elements) +{ + configuration.validate_and_set_defaults(); + + size_t const max_size = [&]() { - max_size = max_bin_elements; - } + if (max_bin_elements != 0u) + return max_bin_elements; + + // Use sketches to determine biggest bin. + size_t const max_bin_id = find_biggest_bin(configuration); + // Get exact count for biggest bin. Sketch estimate's accuracy depends on configuration.sketch_bits + robin_hood::unordered_flat_set kmers{}; + configuration.input_fn(max_bin_id, insert_iterator{kmers}); + return kmers.size(); + }(); return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, // .hash_count = configuration.number_of_hash_functions,