Skip to content

Commit

Permalink
dynamic
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 30, 2024
1 parent 2c1488a commit 0f5dc35
Show file tree
Hide file tree
Showing 17 changed files with 338 additions and 73 deletions.
3 changes: 2 additions & 1 deletion include/hibf/build/insert_into_ibf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ namespace seqan::hibf::build
* \details
* Automatically does naive splitting if number_of_bins > 1.
*/
void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,
void insert_into_ibf(build_data const & data,
robin_hood::unordered_flat_set<uint64_t> const & kmers,
size_t const number_of_bins,
size_t const bin_index,
seqan::hibf::interleaved_bloom_filter & ibf,
Expand Down
20 changes: 18 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace seqan::hibf
* | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] |
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -230,6 +231,9 @@ struct config
*/
size_t tmax{};

//!\brief The fraction of bins in the layout that are kept empty. Must be in `[0.0,1.0)`; `0.0` requests no empty bins.
double empty_bin_fraction{};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -302,6 +306,7 @@ struct config
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
* * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`.
* * seqan::hibf::config::alpha must be positive.
* * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`.
*
Expand All @@ -324,21 +329,26 @@ struct config
threads == other.threads &&
sketch_bits == other.sketch_bits &&
tmax == other.tmax &&
empty_bin_fraction == other.empty_bin_fraction &&
alpha == other.alpha &&
max_rearrangement_ratio == other.max_rearrangement_ratio &&
disable_estimate_union == other.disable_estimate_union &&
disable_rearrangement == other.disable_rearrangement;
// clang-format on
}

bool validated{false};

private:
friend class cereal::access;

static constexpr uint32_t version{2};

template <typename archive_t>
void serialize(archive_t & archive)
{
uint32_t version{1};
archive(CEREAL_NVP(version));
uint32_t parsed_version{version};
archive(cereal::make_nvp("version", parsed_version));

archive(CEREAL_NVP(number_of_user_bins));
archive(CEREAL_NVP(number_of_hash_functions));
Expand All @@ -348,10 +358,16 @@ struct config

archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
archive(CEREAL_NVP(empty_bin_fraction));

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
archive(CEREAL_NVP(disable_rearrangement));
if (parsed_version > 1u)
archive(CEREAL_NVP(validated));
}
};

Expand Down
20 changes: 20 additions & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ namespace bin_kind

//!\brief The value that indicates a merged bin.
static constexpr uint64_t merged{std::numeric_limits<uint64_t>::max()};
//!\brief The value that indicates a deleted bin.
static constexpr uint64_t deleted{std::numeric_limits<uint64_t>::max() - 1u};

} // namespace bin_kind

Expand Down Expand Up @@ -212,6 +214,23 @@ class hierarchical_interleaved_bloom_filter
*/
std::vector<std::vector<uint64_t>> next_ibf_id;

//!\brief A pair of indices: an IBF index (`ibf_idx`) and a bin index (`bin_idx`).
//! NOTE(review): presumably identifies the parent IBF and the bin inside it that refers to a given IBF — confirm.
struct previous_ibf_id_pair
{
//!\brief Index of an IBF; value-initialised to `0`.
size_t ibf_idx{};
//!\brief Index of a bin; value-initialised to `0`.
size_t bin_idx{};

//!\brief Defaulted three-way comparison: compares `ibf_idx` first, then `bin_idx` (member declaration order).
friend constexpr auto operator<=>(previous_ibf_id_pair const &, previous_ibf_id_pair const &) = default;

/*!\brief Serialisation support function.
 * \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive.
 * \param[in] archive The archive being serialised from/to.
 * \details `ibf_idx` is passed to the archive first, followed by `bin_idx`.
 */
template <seqan::hibf::cereal_archive archive_t>
void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
{
archive(ibf_idx);
archive(bin_idx);
}
};

std::vector<previous_ibf_id_pair> prev_ibf_id;

/*!\brief Stores for each bin in each IBF of the HIBF the user bin ID.
* \details
* Assume we look up a bin `b` in IBF `i`, i.e. `ibf_bin_to_user_bin_id[i][b]`.
Expand Down Expand Up @@ -251,6 +270,7 @@ class hierarchical_interleaved_bloom_filter
archive(ibf_vector);
archive(next_ibf_id);
archive(ibf_bin_to_user_bin_id);
archive(prev_ibf_id);
}

/*!\name Timer
Expand Down
29 changes: 22 additions & 7 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
#include <cereal/types/base_class.hpp> // for base_class
#include <cereal/types/vector.hpp> // for vector

#include <hibf/cereal/concepts.hpp> // for cereal_archive
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
Expand Down Expand Up @@ -193,7 +194,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector

//!\brief Helper function to reduce code-duplication between emplace and emplace_exists.
template <bool check_exists>
inline auto emplace_impl(size_t const value, bin_index const bin) noexcept;
inline void emplace_impl(size_t const value, bin_index const bin) noexcept;

public:
class membership_agent_type; // documented upon definition below
Expand Down Expand Up @@ -257,10 +258,9 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
/*!\brief Inserts a value into a specific bin. Unlike the name suggests, nothing is returned (the return type is `void`).
* \param[in] value The raw numeric value to process.
* \param[in] bin The bin index to insert into.
* \returns `true` if the value already existed, `false` otherwise.
* \sa seqan::hibf::interleaved_bloom_filter::emplace
*/
[[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept;
void emplace_exists(size_t const value, bin_index const bin) noexcept;

/*!\brief Clears a specific bin.
* \param[in] bin The bin index to clear.
Expand Down Expand Up @@ -292,16 +292,22 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
"The reference type of the range to clear must be seqan::hibf::bin_index.");
#ifndef NDEBUG
for (auto && bin : bin_range)
assert(bin.value < bins);
assert(bin.value < technical_bins);
#endif // NDEBUG

for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
for (auto && bin : bin_range)
(*this)[bin.value + offset] = 0;
}

/*!\brief Sets the number of bins stored in the Interleaved Bloom Filter.
* \param[in] new_bin_count The new number of bins.
* \returns `true` if the number of bins was set, `false` if the number of bins was not set.
*/
bool set_bin_count(bin_count const new_bin_count);

/*!\brief Increases the number of bins stored in the Interleaved Bloom Filter.
* \param[in] new_bins_ The new number of bins.
* \param[in] new_bin_count The new number of bins.
* \throws std::invalid_argument If passed number of bins is smaller than current number of bins.
*
* \attention The new number of bins must be greater or equal to the current number of bins.
Expand All @@ -322,7 +328,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
*
* \include test/snippet/ibf/interleaved_bloom_filter_increase_bin_number_to.cpp
*/
void increase_bin_number_to(bin_count const new_bins_);
void increase_bin_number_to(bin_count const new_bin_count);
//!\}

/*!\name Lookup
Expand Down Expand Up @@ -378,6 +384,12 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
return bins;
}

/*!\brief Sets the number of bins to a new value.
 * \param[in] new_count The value to store as the number of bins.
 * \details Only the stored bin count is overwritten; this function performs no other work and no validation
 *          of `new_count`.
 *          NOTE(review): callers presumably must keep `new_count` consistent with the allocated bins — confirm.
 */
void overwrite_bin_count(size_t const new_count) noexcept
{
bins = new_count;
}

/*!\brief Returns the size of a single bin that the Interleaved Bloom Filter manages.
* \returns The size in bits of a single bin.
*/
Expand All @@ -398,7 +410,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
/*!\name Comparison operators
* \{
*/
constexpr bool operator==(interleaved_bloom_filter const &) const = default;
HIBF_CONSTEXPR_VECTOR bool operator==(interleaved_bloom_filter const &) const = default;
//!\}

/*!\name Access
Expand All @@ -414,6 +426,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
using base_t::data;
//!\}

std::vector<size_t> occupancy{};

/*!\cond DEV
* \brief Serialisation support function.
* \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive.
Expand All @@ -431,6 +445,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
archive(bin_words);
archive(hash_funs);
archive(cereal::base_class<base_t>(this));
archive(occupancy);
}
//!\endcond
};
Expand Down
3 changes: 2 additions & 1 deletion include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class hierarchical_binning
config{config_},
data{std::addressof(data_)},
num_user_bins{data->positions.size()},
num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)}
num_technical_bins{data->previous.empty() ? config.tmax
: std::min<size_t>(needed_technical_bins(num_user_bins), config.tmax)}
{
assert(data != nullptr);
}
Expand Down
26 changes: 26 additions & 0 deletions include/hibf/misc/empty_bins_by_fraction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <algorithm>
#include <cstddef>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the number of empty bins that should be created by a given fraction of the total number of bins.
 * \param[in] tmax The total number of bins.
 * \param[in] fraction The fraction of the total number of bins that should be empty.
 * \returns `0` if `fraction` is `0.0`. Otherwise `tmax * fraction`, truncated towards zero and clamped to
 *          `[1, tmax - 2]`. For `tmax < 3` the only possible non-zero result is `1`.
 * \ingroup hibf
 * \sa https://godbolt.org/z/cMjbM39vj
 */
[[nodiscard]] constexpr size_t empty_bins_by_fraction(size_t const tmax, double const fraction) noexcept
{
    // A fraction of zero means that no empty bins were requested at all.
    if (fraction == 0.0)
        return 0u;

    // std::clamp has undefined behaviour if lo > hi, which would happen for tmax == 2 (hi == 0).
    // For tmax < 2, `tmax - 2` would additionally wrap around. Pin the upper bound to at least 1.
    size_t const upper_bound = (tmax > 2u) ? tmax - 2u : 1u;

    // At least one empty bin once a non-zero fraction is requested.
    return std::clamp<size_t>(tmax * fraction, 1u, upper_bound);
}

} // namespace seqan::hibf
10 changes: 10 additions & 0 deletions include/hibf/misc/insert_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ class insert_iterator
type{data_type::ibf}
{}

/*!\brief Constructs an insert_iterator that targets bin `ibf_bin_index` of `ibf` and is tagged `data_type::ibf2`.
 * \param[in] ibf The Interleaved Bloom Filter to insert into; only its address is stored.
 * \param[in] ibf_bin_index The bin index that assigned values are inserted into.
 * \details With `data_type::ibf2`, assigned values are forwarded to `emplace_exists` instead of `emplace`.
 *          The third, unnamed `bool` parameter only selects this overload; its value is never read, so passing
 *          `false` also results in `data_type::ibf2`.
 *          NOTE(review): consider a dedicated tag type or honouring the value of the `bool`.
 */
explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index, bool) :
ptr{std::addressof(ibf)},
ibf_bin_index{ibf_bin_index},
type{data_type::ibf2}
{}

Check warning on line 61 in include/hibf/misc/insert_iterator.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/insert_iterator.hpp#L57-L61

Added lines #L57 - L61 were not covered by tests

explicit constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function}
{}

Expand All @@ -72,6 +78,9 @@ class insert_iterator
case data_type::ibf:
static_cast<ibf_t *>(ptr)->emplace(value, static_cast<bin_index>(ibf_bin_index));
break;
case data_type::ibf2:
static_cast<ibf_t *>(ptr)->emplace_exists(value, static_cast<bin_index>(ibf_bin_index));
break;

Check warning on line 83 in include/hibf/misc/insert_iterator.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/insert_iterator.hpp#L81-L83

Added lines #L81 - L83 were not covered by tests
default:
assert(type == data_type::function);
static_cast<function_t *>(ptr)->operator()(value);
Expand Down Expand Up @@ -102,6 +111,7 @@ class insert_iterator
unordered_set,
sketch,
ibf,
ibf2,
function
};

Expand Down
2 changes: 1 addition & 1 deletion src/build/construct_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ seqan::hibf::interleaved_bloom_filter construct_ibf(robin_hood::unordered_flat_s
local_index_allocation_timer.stop();
data.index_allocation_timer += local_index_allocation_timer;

insert_into_ibf(kmers, number_of_bins, ibf_node.max_bin_index, ibf, data.fill_ibf_timer);
insert_into_ibf(data, kmers, number_of_bins, ibf_node.max_bin_index, ibf, data.fill_ibf_timer);
if (!is_root)
update_parent_kmers(parent_kmers, kmers, data.merge_kmers_timer);

Expand Down
40 changes: 35 additions & 5 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ namespace seqan::hibf::build
{

// automatically does naive splitting if number_of_bins > 1
void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,
void insert_into_ibf(build_data const & data,
robin_hood::unordered_flat_set<uint64_t> const & kmers,
size_t const number_of_bins,
size_t const bin_index,
seqan::hibf::interleaved_bloom_filter & ibf,
Expand All @@ -32,16 +33,42 @@ void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,
size_t const chunk_size = divide_and_ceil(kmers.size(), number_of_bins);
size_t chunk_number{};

bool const use_exists = data.config.empty_bin_fraction > 0.0;

serial_timer local_fill_ibf_timer{};
local_fill_ibf_timer.start();
for (auto chunk : kmers | seqan::stl::views::chunk(chunk_size))
auto chunk_view = seqan::stl::views::chunk(kmers, chunk_size);
for (auto && chunk : chunk_view)
{
assert(chunk_number < number_of_bins);
seqan::hibf::bin_index const bin_idx{bin_index + chunk_number};
++chunk_number;
for (size_t const value : chunk)
ibf.emplace(value, bin_idx);
if (use_exists)
{
for (auto && value : chunk)
ibf.emplace_exists(value, bin_idx);

Check warning on line 49 in src/build/insert_into_ibf.cpp

View check run for this annotation

Codecov / codecov/patch

src/build/insert_into_ibf.cpp#L49

Added line #L49 was not covered by tests
}
else
{
for (auto && value : chunk)
ibf.emplace(value, bin_idx);
}
}

assert(chunk_view.size() <= number_of_bins);
if (use_exists && chunk_view.size() < number_of_bins)
{
size_t const diff = number_of_bins - chunk_view.size();
auto it = ibf.occupancy.begin() + bin_index + chunk_view.size();
assert(std::ranges::all_of(it,
it + diff,
[](size_t value)
{
return value == 0u;
}));
std::ranges::fill_n(it, diff, 1u);

Check warning on line 69 in src/build/insert_into_ibf.cpp

View check run for this annotation

Codecov / codecov/patch

src/build/insert_into_ibf.cpp#L69

Added line #L69 was not covered by tests
}

local_fill_ibf_timer.stop();
fill_ibf_timer += local_fill_ibf_timer;
}
Expand All @@ -54,7 +81,10 @@ void insert_into_ibf(build_data const & data,
serial_timer local_fill_ibf_timer{};
local_user_bin_io_timer.start();
local_fill_ibf_timer.start();
data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id});
if (data.config.empty_bin_fraction > 0.0)
data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id, true});
else
data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id});
local_user_bin_io_timer.stop();
local_fill_ibf_timer.stop();
data.user_bin_io_timer += local_user_bin_io_timer;
Expand Down
Loading

0 comments on commit 0f5dc35

Please sign in to comment.