Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MISC] Extend insert_iterator #237

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ jobs:
--exclude-unreachable-branches \
--exclude-throw-branches \
--exclude-noncode-lines \
--merge-mode-functions separate \
-j \
--cobertura \
--output ${GITHUB_WORKSPACE}/build/coverage_report.xml
Expand Down
5 changes: 4 additions & 1 deletion include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include <cereal/types/base_class.hpp> // for base_class

#include <hibf/cereal/concepts.hpp> // for cereal_archive
#include <hibf/config.hpp> // for config
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/counting_vector.hpp> // for counting_vector
Expand All @@ -33,6 +32,10 @@

namespace seqan::hibf
{

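// Forward declaration instead of including <hibf/config.hpp>: config.hpp pulls in misc/insert_iterator.hpp,
// which in turn needs interleaved_bloom_filter to be a complete class, so including config.hpp here would be circular.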
struct config;

/*!\brief A strong type that represents the number of bins for the seqan::hibf::interleaved_bloom_filter.
* \ingroup ibf
* \qualifier strong
Expand Down
74 changes: 52 additions & 22 deletions include/hibf/misc/insert_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
#include <vector> // for vector

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set, hash
#include <hibf/interleaved_bloom_filter.hpp>
#include <hibf/platform.hpp>
#include <hibf/sketch/hyperloglog.hpp>

// IWYU pragma: private, include <hibf/config.hpp>

Expand All @@ -29,32 +31,51 @@ class insert_iterator
using pointer = void;
using reference = void;

insert_iterator() = delete;
insert_iterator(insert_iterator const &) = default;
insert_iterator(insert_iterator &&) = default;
insert_iterator & operator=(insert_iterator const &) = default;
insert_iterator & operator=(insert_iterator &&) = default;
~insert_iterator() = default;
constexpr insert_iterator() = default;
constexpr insert_iterator(insert_iterator const &) = default;
constexpr insert_iterator(insert_iterator &&) = default;
constexpr insert_iterator & operator=(insert_iterator const &) = default;
constexpr insert_iterator & operator=(insert_iterator &&) = default;
constexpr ~insert_iterator() = default;

explicit constexpr insert_iterator(robin_hood::unordered_flat_set<uint64_t> & set) :
set{std::addressof(set)},
is_set{true}
using set_t = robin_hood::unordered_flat_set<uint64_t>;
using sketch_t = sketch::hyperloglog;
using ibf_t = interleaved_bloom_filter;
using function_t = std::function<void(uint64_t const)>;

explicit constexpr insert_iterator(set_t & set) : ptr{std::addressof(set)}, type{data_type::unordered_set}
{}

explicit constexpr insert_iterator(sketch_t & sketch) : ptr{std::addressof(sketch)}, type{data_type::sketch}
{}

explicit constexpr insert_iterator(std::vector<uint64_t> & vec) : vec{std::addressof(vec)}, is_set{false}
explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index) :
ptr{std::addressof(ibf)},
ibf_bin_index{ibf_bin_index},
type{data_type::ibf}
{}

insert_iterator & operator=(uint64_t const value) noexcept
explicit constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function}
{}

[[gnu::always_inline, gnu::flatten]] inline insert_iterator & operator=(uint64_t const value) noexcept
{
if (is_set)
{
assert(set != nullptr);
set->emplace(value);
}
else
assert(ptr != nullptr);

switch (type)
{
assert(vec != nullptr);
vec->emplace_back(value);
case data_type::unordered_set:
static_cast<set_t *>(ptr)->emplace(value);
break;
case data_type::sketch:
static_cast<sketch_t *>(ptr)->add(value);
break;
case data_type::ibf:
static_cast<ibf_t *>(ptr)->emplace(value, static_cast<bin_index>(ibf_bin_index));
break;
default:
assert(type == data_type::function);
static_cast<function_t *>(ptr)->operator()(value);
}
return *this;
}
Expand All @@ -75,9 +96,18 @@ class insert_iterator
}

private:
robin_hood::unordered_flat_set<uint64_t> * set{nullptr};
std::vector<uint64_t> * vec{nullptr};
bool is_set{false};
void * ptr{nullptr};

enum class data_type : uint8_t
{
unordered_set,
sketch,
ibf,
function
};

size_t ibf_bin_index{};
data_type type{};
};

} // namespace seqan::hibf
14 changes: 4 additions & 10 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,14 @@ void insert_into_ibf(build_data const & data,
layout::layout::user_bin const & record,
seqan::hibf::interleaved_bloom_filter & ibf)
{
auto const bin_index = seqan::hibf::bin_index{static_cast<size_t>(record.storage_TB_id)};
std::vector<uint64_t> values;

serial_timer local_user_bin_io_timer{};
local_user_bin_io_timer.start();
data.config.input_fn(record.idx, insert_iterator{values});
local_user_bin_io_timer.stop();
data.user_bin_io_timer += local_user_bin_io_timer;

serial_timer local_fill_ibf_timer{};
local_user_bin_io_timer.start();
local_fill_ibf_timer.start();
for (auto && value : values)
ibf.emplace(value, bin_index);
data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id});
local_user_bin_io_timer.stop();
local_fill_ibf_timer.stop();
data.user_bin_io_timer += local_user_bin_io_timer;
data.fill_ibf_timer += local_fill_ibf_timer;
}

Expand Down
82 changes: 50 additions & 32 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@

#include <hibf/build/bin_size_in_bits.hpp> // for bin_size_in_bits
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_...
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/misc/insert_iterator.hpp>
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf
{

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC

interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs)
Expand All @@ -43,29 +50,48 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
resize(technical_bins * bin_size_);
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
size_t find_biggest_bin(config const & configuration)
{
configuration.validate_and_set_defaults();

size_t bin_id{};
size_t max_size{};
seqan::hibf::sketch::hyperloglog sketch{configuration.sketch_bits};

if (max_bin_elements == 0u)
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) firstprivate(sketch)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});
sketch.reset();
configuration.input_fn(i, insert_iterator{sketch});

size_t const estimate = sketch.estimate();
#pragma omp critical
max_size = std::max(max_size, kmers.size());
{
if (estimate > max_size)
{
max_size = estimate;
bin_id = i;
}
}
}
else

return bin_id;
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
{
configuration.validate_and_set_defaults();

size_t const max_size = [&]()
{
max_size = max_bin_elements;
}
if (max_bin_elements != 0u)
return max_bin_elements;

// Use sketches to determine biggest bin.
size_t const max_bin_id = find_biggest_bin(configuration);
// Get the exact count for the biggest bin: the sketch only yields an estimate whose accuracy depends on
// configuration.sketch_bits.
robin_hood::unordered_flat_set<uint64_t> kmers{};
configuration.input_fn(max_bin_id, insert_iterator{kmers});
return kmers.size();
}();

return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, //
.hash_count = configuration.number_of_hash_functions,
Expand All @@ -80,16 +106,11 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
{
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);
robin_hood::unordered_flat_set<uint64_t> kmers;

#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) private(kmers)
#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});

for (uint64_t const hash : kmers)
emplace(hash, seqan::hibf::bin_index{i});
configuration.input_fn(i, insert_iterator{*this, i});
}
}

Expand Down Expand Up @@ -118,12 +139,12 @@ inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index
return exists;
};

void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<false>(value, bin);
}

bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<true>(value, bin);
}
Expand Down Expand Up @@ -178,16 +199,9 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con
technical_bins = new_technical_bins;
}

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC
[[gnu::always_inline]] bit_vector const &
interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const value) & noexcept
{
#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic pop
#endif // HIBF_COMPILER_IS_GCC
assert(ibf_ptr != nullptr);
assert(result_buffer.size() == ibf_ptr->bin_count());

Expand Down Expand Up @@ -276,4 +290,8 @@ interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const valu
return result_buffer;
}

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic pop
#endif // HIBF_COMPILER_IS_GCC

} // namespace seqan::hibf
21 changes: 9 additions & 12 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,18 @@ namespace seqan::hibf::sketch
void compute_sketches(config const & config, std::vector<sketch::hyperloglog> & hll_sketches)
{
// compute hll_sketches
hll_sketches.resize(config.number_of_user_bins);
hll_sketches.resize(config.number_of_user_bins, config.sketch_bits);

assert(std::ranges::all_of(hll_sketches,
[bits = config.sketch_bits](hyperloglog const & sketch)
{
return sketch.data_size() == (1ULL << bits);
}));

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
#pragma omp parallel for schedule(dynamic) num_threads(config.threads)
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
seqan::hibf::sketch::hyperloglog hll_sketch(config.sketch_bits);

kmers.clear();
config.input_fn(i, insert_iterator{kmers});

for (auto k_hash : kmers)
hll_sketch.add(k_hash);

hll_sketches[i] = std::move(hll_sketch);
config.input_fn(i, insert_iterator{hll_sketches[i]});
}
}

Expand Down
1 change: 1 addition & 0 deletions test/performance/ibf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
hibf_benchmark (bit_vector_benchmark.cpp)
hibf_benchmark (bit_vector_serialisation_benchmark.cpp)
hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
hibf_benchmark (interleaved_bloom_filter_construction_benchmark.cpp)
Loading