From 088b02c2e5bbcd269415cb60924a147f1ff0de1d Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Wed, 23 Oct 2024 13:29:01 +0200 Subject: [PATCH] [MISC] Extend insert_iterator --- include/hibf/misc/insert_iterator.hpp | 73 ++++++++++++++++--------- src/CMakeLists.txt | 1 + src/misc/insert_iterator.cpp | 65 ++++++++++++++++++++++ test/unit/hibf/CMakeLists.txt | 1 + test/unit/hibf/insert_iterator_test.cpp | 58 ++++++++++++++++++++ 5 files changed, 171 insertions(+), 27 deletions(-) create mode 100644 src/misc/insert_iterator.cpp create mode 100644 test/unit/hibf/insert_iterator_test.cpp diff --git a/include/hibf/misc/insert_iterator.hpp b/include/hibf/misc/insert_iterator.hpp index 903869a0..72719598 100644 --- a/include/hibf/misc/insert_iterator.hpp +++ b/include/hibf/misc/insert_iterator.hpp @@ -20,6 +20,17 @@ namespace seqan::hibf { +// hibf/interleaved_bloom_filter.hpp includes config.hpp, which includes insert_iterator.hpp +// Hence, we need a forward declaration. +class interleaved_bloom_filter; + +namespace sketch +{ + +class hyperloglog; + +} + class insert_iterator { public: @@ -29,35 +40,34 @@ class insert_iterator using pointer = void; using reference = void; - insert_iterator() = delete; - insert_iterator(insert_iterator const &) = default; - insert_iterator(insert_iterator &&) = default; - insert_iterator & operator=(insert_iterator const &) = default; - insert_iterator & operator=(insert_iterator &&) = default; - ~insert_iterator() = default; + constexpr insert_iterator() = default; + constexpr insert_iterator(insert_iterator const &) = default; + constexpr insert_iterator(insert_iterator &&) = default; + constexpr insert_iterator & operator=(insert_iterator const &) = default; + constexpr insert_iterator & operator=(insert_iterator &&) = default; + constexpr ~insert_iterator() = default; + + using set_t = robin_hood::unordered_flat_set; + using sketch_t = sketch::hyperloglog; + using ibf_t = interleaved_bloom_filter; + using function_t = std::function; - explicit constexpr insert_iterator(robin_hood::unordered_flat_set & set) : - set{std::addressof(set)}, - is_set{true} + explicit constexpr insert_iterator(set_t & set) : ptr{std::addressof(set)}, type{data_type::unordered_set} {} - explicit constexpr insert_iterator(std::vector & vec) : vec{std::addressof(vec)}, is_set{false} + explicit constexpr insert_iterator(sketch_t & sketch) : ptr{std::addressof(sketch)}, type{data_type::sketch} {} - insert_iterator & operator=(uint64_t const value) noexcept - { - if (is_set) - { - assert(set != nullptr); - set->emplace(value); - } - else - { - assert(vec != nullptr); - vec->emplace_back(value); - } - return *this; - } + explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index) : + ptr{std::addressof(ibf)}, + ibf_bin_index{ibf_bin_index}, + type{data_type::ibf} + {} + + constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function} + {} + + insert_iterator & operator=(uint64_t const value) noexcept; [[nodiscard]] constexpr insert_iterator & operator*() noexcept { @@ -75,9 +85,18 @@ class insert_iterator } private: - robin_hood::unordered_flat_set * set{nullptr}; - std::vector * vec{nullptr}; - bool is_set{false}; + void * ptr{nullptr}; + + enum class data_type : uint8_t + { + unordered_set, + sketch, + ibf, + function + }; + + size_t ibf_bin_index{}; + data_type type{}; }; } // namespace seqan::hibf diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9d240b38..aab05f8f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ set (HIBF_SOURCE_FILES sketch/compute_sketches.cpp layout/graph.cpp layout/hierarchical_binning.cpp + misc/insert_iterator.cpp misc/print.cpp sketch/toolbox.cpp sketch/hyperloglog.cpp diff --git a/src/misc/insert_iterator.cpp b/src/misc/insert_iterator.cpp new file mode 100644 index 00000000..c04d9025 --- /dev/null +++ b/src/misc/insert_iterator.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for assert +#include // for ceil, sqrt +#include // for function +#include // for operator<<, basic_ostream, basic_istream, getline, stringstream +#include // for basic_stringstream +#include // for invalid_argument +#include // for char_traits, string +#include // for operator==, basic_string_view, string_view + +#include // for JSONInputArchive, JSONOutputArchive +#include // for make_nvp, InputArchive, OutputArchive + +#include +#include // for insert_iterator +#include + +namespace seqan::hibf +{ + +// Inlining std::function produces overhead which affects the other cases. +[[gnu::noinline]] void invoke_without_inlining(void * const ptr, uint64_t const value) +{ + static_cast(ptr)->operator()(value); +} + +#if HIBF_COMPILER_IS_GCC +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wattributes" +#endif // HIBF_COMPILER_IS_GCC +[[gnu::always_inline]] insert_iterator & insert_iterator::operator=(uint64_t const value) noexcept +{ +#if HIBF_COMPILER_IS_GCC +# pragma GCC diagnostic pop +#endif // HIBF_COMPILER_IS_GCC + assert(ptr != nullptr); + + switch (type) + { + case data_type::unordered_set: + static_cast(ptr)->emplace(value); + break; + case data_type::sketch: + static_cast(ptr)->add(value); + break; + case data_type::ibf: + static_cast(ptr)->emplace(value, static_cast(ibf_bin_index)); + break; + case data_type::function: + invoke_without_inlining(ptr, value); + break; + default: // GCOVR_EXCL_LINE +#ifndef NDEBUG + assert(false); // GCOVR_EXCL_LINE +#else + __builtin_unreachable(); +#endif + } + return *this; +} + +} // namespace seqan::hibf diff --git a/test/unit/hibf/CMakeLists.txt b/test/unit/hibf/CMakeLists.txt index ee8ff489..49da8e71 100644 --- a/test/unit/hibf/CMakeLists.txt +++ b/test/unit/hibf/CMakeLists.txt @@ -9,6 +9,7 @@ hibf_test (config_test.cpp) hibf_test (counting_vector_test.cpp) hibf_test (counting_vector_avx512_test.cpp) hibf_test (hierarchical_interleaved_bloom_filter_test.cpp) +hibf_test (insert_iterator_test.cpp) hibf_test (interleaved_bloom_filter_test.cpp) hibf_test (interleaved_bloom_filter_avx512_test.cpp) hibf_test (path_test.cpp) diff --git a/test/unit/hibf/insert_iterator_test.cpp b/test/unit/hibf/insert_iterator_test.cpp new file mode 100644 index 00000000..fc89e66e --- /dev/null +++ b/test/unit/hibf/insert_iterator_test.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for Message, TestPartResult, AssertionResult, Test, EXPECT_EQ, Capture... + +#include +#include +#include +#include // for expect_range_eq, EXPECT_RANGE_EQ + +static constexpr std::array values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + +TEST(insert_iterator_test, unordered_set) +{ + robin_hood::unordered_flat_set target; + seqan::hibf::insert_iterator it{target}; + std::ranges::copy(values, it); + EXPECT_EQ(target.size(), 10u); +} + +TEST(insert_iterator_test, sketch) +{ + seqan::hibf::sketch::hyperloglog target{5u}; + seqan::hibf::insert_iterator it{target}; + std::ranges::copy(values, it); + EXPECT_NEAR(target.estimate(), 11.99, 0.001); +} + +TEST(insert_iterator_test, ibf) +{ + seqan::hibf::interleaved_bloom_filter target{seqan::hibf::bin_count{8u}, + seqan::hibf::bin_size{8u}, + seqan::hibf::hash_function_count{1u}}; + for (size_t i = 0; i < 3; ++i) + { + seqan::hibf::insert_iterator it{target, i}; + std::ranges::copy(values, it); + } + + auto agent = target.counting_agent(); + auto & result = agent.bulk_count(values); + std::vector const expected{10, 10, 10, 0, 0, 0, 0, 0}; + EXPECT_RANGE_EQ(result, expected); +} + +TEST(insert_iterator_test, function) +{ + robin_hood::unordered_flat_set target; + std::function fun = [&target](size_t const value) + { + target.emplace(value); + target.emplace((1u + value) * 11u); + }; + seqan::hibf::insert_iterator it{fun}; + std::ranges::copy(values, it); + EXPECT_EQ(target.size(), 20); +}