Skip to content

Commit

Permalink
[MISC] IBF construction: Use sketches to determine biggest bin, use exact counts for biggest bin
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 23, 2024
1 parent 6a67a48 commit 2ed20d8
Showing 1 changed file with 35 additions and 15 deletions.
50 changes: 35 additions & 15 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@

#include <hibf/build/bin_size_in_bits.hpp> // for bin_size_in_bits
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_...
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf
{
Expand All @@ -43,29 +44,48 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
resize(technical_bins * bin_size_);
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
size_t find_biggest_bin(config const & configuration)
{
configuration.validate_and_set_defaults();

size_t bin_id{};
size_t max_size{};
seqan::hibf::sketch::hyperloglog sketch{configuration.sketch_bits};

if (max_bin_elements == 0u)
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) firstprivate(sketch)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});
sketch.reset();
configuration.input_fn(i, insert_iterator{sketch});

size_t const estimate = sketch.estimate();
#pragma omp critical
max_size = std::max(max_size, kmers.size());
{
if (estimate > max_size)
{
max_size = estimate;
bin_id = i;
}
}
}
else

return bin_id;
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
{
configuration.validate_and_set_defaults();

size_t const max_size = [&]()
{
max_size = max_bin_elements;
}
if (max_bin_elements != 0u)
return max_bin_elements;

// Use sketches to determine biggest bin.
size_t const max_bin_id = find_biggest_bin(configuration);
// Get the exact count for the biggest bin, since the accuracy of the sketch estimate depends on configuration.sketch_bits.
robin_hood::unordered_flat_set<uint64_t> kmers{};
configuration.input_fn(max_bin_id, insert_iterator{kmers});
return kmers.size();
}();

return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, //
.hash_count = configuration.number_of_hash_functions,
Expand Down

0 comments on commit 2ed20d8

Please sign in to comment.