Skip to content

Commit

Permalink
[WIP] Dynamic
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 15, 2024
1 parent dbbfb3d commit c8757dd
Show file tree
Hide file tree
Showing 12 changed files with 228 additions and 34 deletions.
20 changes: 18 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace seqan::hibf
* | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] |
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -230,6 +231,9 @@ struct config
*/
size_t tmax{};

//!\brief The fraction of bins in the layout that should be empty. Must be in `[0.0,1.0)`.
double empty_bin_fraction{};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -302,6 +306,7 @@ struct config
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
* * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`.
* * seqan::hibf::config::alpha must be positive.
* * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`.
*
Expand All @@ -324,21 +329,26 @@ struct config
threads == other.threads &&
sketch_bits == other.sketch_bits &&
tmax == other.tmax &&
empty_bin_fraction == other.empty_bin_fraction &&
alpha == other.alpha &&
max_rearrangement_ratio == other.max_rearrangement_ratio &&
disable_estimate_union == other.disable_estimate_union &&
disable_rearrangement == other.disable_rearrangement;
// clang-format on
}

//!\brief Set to `true` once validate_and_set_defaults() has run to completion; later calls then return immediately.
bool validated{false};
private:
friend class cereal::access;

static constexpr uint32_t version{2};


template <typename archive_t>
void serialize(archive_t & archive)
{
uint32_t version{1};
archive(CEREAL_NVP(version));
uint32_t parsed_version{version};
archive(cereal::make_nvp("version", parsed_version));

archive(CEREAL_NVP(number_of_user_bins));
archive(CEREAL_NVP(number_of_hash_functions));
Expand All @@ -348,10 +358,16 @@ struct config

archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
archive(CEREAL_NVP(empty_bin_fraction));

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
archive(CEREAL_NVP(disable_rearrangement));
if (parsed_version > 1u)
archive(CEREAL_NVP(validated));
}
};

Expand Down
18 changes: 18 additions & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ namespace bin_kind

//!\brief The value that indicates a merged bin.
static constexpr uint64_t merged{std::numeric_limits<uint64_t>::max()};
//!\brief The value that indicates a deleted bin.
static constexpr uint64_t deleted{std::numeric_limits<uint64_t>::max() - 1u};

} // namespace bin_kind

Expand Down Expand Up @@ -212,6 +214,21 @@ class hierarchical_interleaved_bloom_filter
*/
std::vector<std::vector<uint64_t>> next_ibf_id;

/*!\brief Addresses a single bin by the index of its IBF and the index of the bin inside that IBF.
 * \details
 * NOTE(review): judging by the name, this is meant to point from an IBF back to the bin of the IBF one
 * level above it — confirm against the code that fills `prev_ibf_id`.
 */
struct previous_ibf_id_pair
{
    //!\brief Index of the referenced IBF.
    size_t ibf_idx{};
    //!\brief Index of the referenced bin within that IBF.
    size_t bin_idx{};

    /*!\brief Serialisation support function.
     * \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive.
     * \param[in] archive The archive being serialised from/to.
     *
     * The members are processed in declaration order: first `ibf_idx`, then `bin_idx`.
     */
    template <seqan::hibf::cereal_archive archive_t>
    void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
    {
        archive(ibf_idx, bin_idx);
    }
};

std::vector<previous_ibf_id_pair> prev_ibf_id;

/*!\brief Stores for each bin in each IBF of the HIBF the user bin ID.
* \details
* Assume we look up a bin `b` in IBF `i`, i.e. `ibf_bin_to_user_bin_id[i][b]`.
Expand Down Expand Up @@ -251,6 +268,7 @@ class hierarchical_interleaved_bloom_filter
archive(ibf_vector);
archive(next_ibf_id);
archive(ibf_bin_to_user_bin_id);
archive(prev_ibf_id);
}

/*!\name Timer
Expand Down
20 changes: 16 additions & 4 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector

//!\brief Helper function to reduce code-duplication between emplace and emplace_exists.
template <bool check_exists>
inline auto emplace_impl(size_t const value, bin_index const bin) noexcept;
inline void emplace_impl(size_t const value, bin_index const bin) noexcept;

public:
class membership_agent_type; // documented upon definition below
Expand Down Expand Up @@ -249,7 +249,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
* \returns `true` if the value already existed, `false` otherwise.
* \sa seqan::hibf::interleaved_bloom_filter::emplace
*/
[[nodiscard]] bool emplace_exists(size_t const value, bin_index const bin) noexcept;
void emplace_exists(size_t const value, bin_index const bin) noexcept;

/*!\brief Clears a specific bin.
* \param[in] bin The bin index to clear.
Expand Down Expand Up @@ -281,14 +281,16 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
"The reference type of the range to clear must be seqan::hibf::bin_index.");
#ifndef NDEBUG
for (auto && bin : bin_range)
assert(bin.value < bins);
assert(bin.value < technical_bins);
#endif // NDEBUG

for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
for (auto && bin : bin_range)
(*this)[bin.value + offset] = 0;
}

bool set_bin_count(bin_count const new_bin_count);

/*!\brief Increases the number of bins stored in the Interleaved Bloom Filter.
* \param[in] new_bins_ The new number of bins.
* \throws std::invalid_argument If passed number of bins is smaller than current number of bins.
Expand All @@ -311,7 +313,7 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
*
* \include test/snippet/ibf/interleaved_bloom_filter_increase_bin_number_to.cpp
*/
void increase_bin_number_to(bin_count const new_bins_);
void increase_bin_number_to(bin_count const new_bin_count);
//!\}

/*!\name Lookup
Expand Down Expand Up @@ -367,6 +369,11 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
return bins;
}

/*!\brief Overwrites the stored number of bins.
 * \param[in] new_count The value to store as the number of bins.
 * \details
 * Only the `bins` member is assigned; this function neither checks `new_count` nor touches any other
 * member or the underlying data.
 */
void overwrite_bin_count(size_t const new_count) noexcept
{
bins = new_count;
}

/*!\brief Returns the size of a single bin that the Interleaved Bloom Filter manages.
* \returns The size in bits of a single bin.
*/
Expand Down Expand Up @@ -403,6 +410,9 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
using base_t::data;
//!\}

std::vector<size_t> occupancy{};
bit_vector occupied_bins{};

/*!\cond DEV
* \brief Serialisation support function.
* \tparam archive_t Type of `archive`; must satisfy seqan::hibf::cereal_archive.
Expand All @@ -420,6 +430,8 @@ class interleaved_bloom_filter : private seqan::hibf::bit_vector
archive(bin_words);
archive(hash_funs);
archive(cereal::base_class<base_t>(this));
archive(occupancy);
archive(occupied_bins);
}
//!\endcond
};
Expand Down
3 changes: 2 additions & 1 deletion include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class hierarchical_binning
config{config_},
data{std::addressof(data_)},
num_user_bins{data->positions.size()},
num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)}
num_technical_bins{data->previous.empty() ? config.tmax
: std::min<size_t>(needed_technical_bins(num_user_bins), config.tmax)}
{
assert(data != nullptr);
}
Expand Down
26 changes: 26 additions & 0 deletions include/hibf/misc/empty_bins_by_fraction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <algorithm>
#include <cstddef>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the number of empty bins that should be created by a given fraction of the total number of bins.
 * \param[in] tmax The total number of bins.
 * \param[in] fraction The fraction of the total number of bins that should be empty.
 * \returns `0` if `fraction` is not positive or if `tmax` is smaller than `3`;
 *          otherwise `tmax * fraction` rounded towards zero and limited to the range `[1, tmax - 2]`.
 * \ingroup hibf
 * \details
 * Any positive fraction yields at least one empty bin, and the result never exceeds `tmax - 2`.
 * For `tmax < 3` the range `[1, tmax - 2]` is empty (and `tmax - 2` would wrap around for `tmax < 2`), hence no
 * empty bin is requested in that case.
 * \sa https://godbolt.org/z/cMjbM39vj
 */
[[nodiscard]] constexpr size_t empty_bins_by_fraction(size_t const tmax, double const fraction) noexcept
{
    // std::clamp requires lo <= hi; with fewer than three bins the upper bound `tmax - 2` is below 1 or wraps around.
    if (tmax < 3u)
        return 0u;

    // A non-positive fraction requests no empty bins. This also avoids converting a negative value to size_t.
    if (fraction <= 0.0)
        return 0u;

    size_t const requested = static_cast<size_t>(tmax * fraction);
    return std::clamp<size_t>(requested, 1u, tmax - 2u);
}

} // namespace seqan::hibf
4 changes: 2 additions & 2 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void insert_into_ibf(robin_hood::unordered_flat_set<uint64_t> const & kmers,
seqan::hibf::bin_index const bin_idx{bin_index + chunk_number};
++chunk_number;
for (size_t const value : chunk)
ibf.emplace(value, bin_idx);
ibf.emplace_exists(value, bin_idx);
}
local_fill_ibf_timer.stop();
fill_ibf_timer += local_fill_ibf_timer;
Expand All @@ -63,7 +63,7 @@ void insert_into_ibf(build_data const & data,
serial_timer local_fill_ibf_timer{};
local_fill_ibf_timer.start();
for (auto && value : values)
ibf.emplace(value, bin_index);
ibf.emplace_exists(value, bin_index);
local_fill_ibf_timer.stop();
data.fill_ibf_timer += local_fill_ibf_timer;
}
Expand Down
17 changes: 14 additions & 3 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
#include <cereal/archives/json.hpp> // for JSONInputArchive, JSONOutputArchive
#include <cereal/cereal.hpp> // for make_nvp, InputArchive, OutputArchive

#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/empty_bins_by_fraction.hpp> // for empty_bins_by_fraction
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64

namespace seqan::hibf
{
Expand Down Expand Up @@ -63,6 +64,9 @@ void config::write_to(std::ostream & stream) const

void config::validate_and_set_defaults()
{
if (validated)
return;

if (!input_fn)
throw std::invalid_argument{"[HIBF CONFIG ERROR] You did not set the required config::input_fn."};

Expand Down Expand Up @@ -111,6 +115,11 @@ void config::validate_and_set_defaults()
<< "anyway, so we increased your number of technical bins to " << tmax << ".\n";
}

if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

tmax -= empty_bins_by_fraction(tmax, empty_bin_fraction);

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand All @@ -119,6 +128,8 @@ void config::validate_and_set_defaults()

if (disable_estimate_union || max_rearrangement_ratio == 0.0)
disable_rearrangement = true;

validated = true;
}

} // namespace seqan::hibf
10 changes: 7 additions & 3 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
robin_hood::unordered_flat_set<uint64_t> & parent_kmers,
layout::graph::node const & current_node,
build::build_data & data,
bool is_root)
bool is_root,
size_t const parent_ibf_idx = 0u)
{
size_t const ibf_pos{data.request_ibf_idx()};

Expand All @@ -68,7 +69,8 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
kmers,
current_node.children[current_node.favourite_child_idx.value()],
data,
false);
false,
ibf_pos);
return 1;
}
else // max bin is not a merged bin
Expand Down Expand Up @@ -124,8 +126,9 @@ size_t hierarchical_build(hierarchical_interleaved_bloom_filter & hibf,
auto & child = children[index];

robin_hood::unordered_flat_set<uint64_t> kmers{};
size_t const ibf_pos = hierarchical_build(hibf, kmers, child, data, false);
size_t const ibf_pos = hierarchical_build(hibf, kmers, child, data, false, ibf_pos);
auto parent_bin_index = child.parent_bin_index;
hibf.prev_ibf_id[ibf_pos] = {.ibf_idx = parent_ibf_idx, .bin_idx = parent_bin_index};
{
size_t const mutex_id{parent_bin_index / 64};
std::lock_guard<std::mutex> guard{local_ibf_mutex[mutex_id]};
Expand Down Expand Up @@ -184,6 +187,7 @@ void build_index(hierarchical_interleaved_bloom_filter & hibf,

hibf.ibf_vector.resize(number_of_ibfs);
hibf.ibf_bin_to_user_bin_id.resize(number_of_ibfs);
hibf.prev_ibf_id.resize(number_of_ibfs);
hibf.next_ibf_id.resize(number_of_ibfs);

build::build_data data{.config = config, .ibf_graph = {hibf_layout}};
Expand Down
Loading

0 comments on commit c8757dd

Please sign in to comment.