Skip to content

Commit

Permalink
[MISC] Extend insert_iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 23, 2024
1 parent 25539c9 commit 088b02c
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 27 deletions.
73 changes: 46 additions & 27 deletions include/hibf/misc/insert_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@
namespace seqan::hibf
{

// hibf/interleaved_bloom_filter.hpp includes config.hpp, which in turn includes insert_iterator.hpp.
// Including that header here would be circular; hence, we need a forward declaration.
class interleaved_bloom_filter;

namespace sketch
{

class hyperloglog;

}

class insert_iterator
{
public:
Expand All @@ -29,35 +40,34 @@ class insert_iterator
using pointer = void;
using reference = void;

insert_iterator() = delete;
insert_iterator(insert_iterator const &) = default;
insert_iterator(insert_iterator &&) = default;
insert_iterator & operator=(insert_iterator const &) = default;
insert_iterator & operator=(insert_iterator &&) = default;
~insert_iterator() = default;
constexpr insert_iterator() = default;
constexpr insert_iterator(insert_iterator const &) = default;
constexpr insert_iterator(insert_iterator &&) = default;
constexpr insert_iterator & operator=(insert_iterator const &) = default;
constexpr insert_iterator & operator=(insert_iterator &&) = default;
constexpr ~insert_iterator() = default;

using set_t = robin_hood::unordered_flat_set<uint64_t>;
using sketch_t = sketch::hyperloglog;
using ibf_t = interleaved_bloom_filter;
using function_t = std::function<void(uint64_t const)>;

explicit constexpr insert_iterator(robin_hood::unordered_flat_set<uint64_t> & set) :
set{std::addressof(set)},
is_set{true}
explicit constexpr insert_iterator(set_t & set) : ptr{std::addressof(set)}, type{data_type::unordered_set}
{}

explicit constexpr insert_iterator(std::vector<uint64_t> & vec) : vec{std::addressof(vec)}, is_set{false}
explicit constexpr insert_iterator(sketch_t & sketch) : ptr{std::addressof(sketch)}, type{data_type::sketch}
{}

insert_iterator & operator=(uint64_t const value) noexcept
{
if (is_set)
{
assert(set != nullptr);
set->emplace(value);
}
else
{
assert(vec != nullptr);
vec->emplace_back(value);
}
return *this;
}
explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index) :
ptr{std::addressof(ibf)},
ibf_bin_index{ibf_bin_index},
type{data_type::ibf}
{}

constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function}
{}

insert_iterator & operator=(uint64_t const value) noexcept;

[[nodiscard]] constexpr insert_iterator & operator*() noexcept
{
Expand All @@ -75,9 +85,18 @@ class insert_iterator
}

private:
robin_hood::unordered_flat_set<uint64_t> * set{nullptr};
std::vector<uint64_t> * vec{nullptr};
bool is_set{false};
void * ptr{nullptr};

enum class data_type : uint8_t
{
unordered_set,
sketch,
ibf,
function
};

size_t ibf_bin_index{};
data_type type{};
};

} // namespace seqan::hibf
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ set (HIBF_SOURCE_FILES
sketch/compute_sketches.cpp
layout/graph.cpp
layout/hierarchical_binning.cpp
misc/insert_iterator.cpp
misc/print.cpp
sketch/toolbox.cpp
sketch/hyperloglog.cpp
Expand Down
65 changes: 65 additions & 0 deletions src/misc/insert_iterator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <cassert> // for assert
#include <cmath> // for ceil, sqrt
#include <functional> // for function
#include <iostream> // for operator<<, basic_ostream, basic_istream, getline, stringstream
#include <sstream> // for basic_stringstream
#include <stdexcept> // for invalid_argument
#include <string> // for char_traits, string
#include <string_view> // for operator==, basic_string_view, string_view

#include <cereal/archives/json.hpp> // for JSONInputArchive, JSONOutputArchive
#include <cereal/cereal.hpp> // for make_nvp, InputArchive, OutputArchive

#include <hibf/interleaved_bloom_filter.hpp>
#include <hibf/misc/insert_iterator.hpp> // for insert_iterator
#include <hibf/sketch/hyperloglog.hpp>

namespace seqan::hibf
{

// Inlining std::function produces overhead which affects the other cases.
[[gnu::noinline]] void invoke_without_inlining(void * const ptr, uint64_t const value)
{
static_cast<typename insert_iterator::function_t *>(ptr)->operator()(value);
}

// Assigns `value` to the target this iterator refers to and returns `*this`.
//
// `ptr` is a type-erased pointer to the target; `type` records which target it is:
//   * unordered_set: `value` is emplaced into the set.
//   * sketch:        `value` is added to the sketch.
//   * ibf:           `value` is emplaced into bin `ibf_bin_index` of the interleaved bloom filter.
//   * function:      the stored std::function is called with `value` (out of line).
//
// On GCC, -Wattributes is silenced only while the declarator with [[gnu::always_inline]] is parsed.
// NOTE(review): presumably GCC warns because always_inline is applied to a function that is not
// declared inline - confirm.
#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC
[[gnu::always_inline]] insert_iterator & insert_iterator::operator=(uint64_t const value) noexcept
{
// Restore the diagnostic state that was saved before the function signature.
#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic pop
#endif // HIBF_COMPILER_IS_GCC
// Debug-only check: the iterator must refer to a target before it can be assigned to.
assert(ptr != nullptr);

// Recover the concrete target type from the type-erased pointer.
switch (type)
{
case data_type::unordered_set:
static_cast<set_t *>(ptr)->emplace(value);
break;
case data_type::sketch:
static_cast<sketch_t *>(ptr)->add(value);
break;
case data_type::ibf:
static_cast<ibf_t *>(ptr)->emplace(value, static_cast<bin_index>(ibf_bin_index));
break;
case data_type::function:
// Routed through a non-inlined helper instead of calling the std::function here.
invoke_without_inlining(ptr, value);
break;
// No other enumerator exists: fail the assertion in debug builds, and mark the branch as
// unreachable for the compiler when NDEBUG is defined.
default: // GCOVR_EXCL_LINE
#ifndef NDEBUG
assert(false); // GCOVR_EXCL_LINE
#else
__builtin_unreachable();
#endif
}
return *this;
}

} // namespace seqan::hibf
1 change: 1 addition & 0 deletions test/unit/hibf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ hibf_test (config_test.cpp)
hibf_test (counting_vector_test.cpp)
hibf_test (counting_vector_avx512_test.cpp)
hibf_test (hierarchical_interleaved_bloom_filter_test.cpp)
hibf_test (insert_iterator_test.cpp)
hibf_test (interleaved_bloom_filter_test.cpp)
hibf_test (interleaved_bloom_filter_avx512_test.cpp)
hibf_test (path_test.cpp)
Expand Down
58 changes: 58 additions & 0 deletions test/unit/hibf/insert_iterator_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <gtest/gtest.h> // for Message, TestPartResult, AssertionResult, Test, EXPECT_EQ, Capture...

#include <hibf/interleaved_bloom_filter.hpp>
#include <hibf/misc/insert_iterator.hpp>
#include <hibf/sketch/hyperloglog.hpp>
#include <hibf/test/expect_range_eq.hpp> // for expect_range_eq, EXPECT_RANGE_EQ

// Input shared by all tests below: the ten distinct values 0 through 9.
static constexpr std::array<size_t, 10> values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};

TEST(insert_iterator_test, unordered_set)
{
    // Copy all values through an insert_iterator that targets a hash set.
    robin_hood::unordered_flat_set<uint64_t> collected;
    std::ranges::copy(values, seqan::hibf::insert_iterator{collected});

    // The ten input values are distinct, so each one must be present exactly once.
    EXPECT_EQ(collected.size(), 10u);
}

TEST(insert_iterator_test, sketch)
{
    // Copy all values through an insert_iterator that targets a HyperLogLog sketch.
    seqan::hibf::sketch::hyperloglog hll{5u};
    std::ranges::copy(values, seqan::hibf::insert_iterator{hll});

    // The sketch only estimates the cardinality, hence the comparison with a tolerance.
    EXPECT_NEAR(hll.estimate(), 11.99, 0.001);
}

TEST(insert_iterator_test, ibf)
{
    seqan::hibf::interleaved_bloom_filter ibf{seqan::hibf::bin_count{8u},
                                              seqan::hibf::bin_size{8u},
                                              seqan::hibf::hash_function_count{1u}};

    // Insert all values into each of the bins 0, 1 and 2; bins 3 to 7 receive nothing.
    for (size_t bin = 0; bin < 3; ++bin)
        std::ranges::copy(values, seqan::hibf::insert_iterator{ibf, bin});

    // Count, per bin, how many of the ten values are reported as contained.
    auto counter = ibf.counting_agent<uint8_t>();
    auto & counts = counter.bulk_count(values);

    std::vector<uint8_t> const expected{10, 10, 10, 0, 0, 0, 0, 0};
    EXPECT_RANGE_EQ(counts, expected);
}

TEST(insert_iterator_test, function)
{
    robin_hood::unordered_flat_set<uint64_t> target;

    // Each assigned value inserts two elements: the value itself and (value + 1) * 11.
    // For the inputs 0..9 these are 0..9 and 11, 22, ..., 110, i.e. 20 distinct elements.
    // The parameter type matches the std::function signature and the set's element type.
    std::function<void(uint64_t const)> fun = [&target](uint64_t const value)
    {
        target.emplace(value);
        target.emplace((1u + value) * 11u);
    };

    seqan::hibf::insert_iterator it{fun};
    std::ranges::copy(values, it);

    // Unsigned literal: size() is unsigned, so this avoids a signed/unsigned comparison
    // inside EXPECT_EQ and is consistent with the other tests.
    EXPECT_EQ(target.size(), 20u);
}

0 comments on commit 088b02c

Please sign in to comment.