Skip to content

Commit

Permalink
[FEATURE] Allow empty bins in layout
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Aug 19, 2024
1 parent f4321f8 commit cf5e52d
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 11 deletions.
16 changes: 14 additions & 2 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace seqan::hibf
* | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] |
* | Layout | seqan::hibf::config::sketch_bits | 12 | |
* | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset |
* | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout |
* | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | |
* | Layout | seqan::hibf::config::alpha | 1.2 | |
* | Layout | seqan::hibf::config::disable_estimate_union | false | |
Expand Down Expand Up @@ -228,6 +229,9 @@ struct config
*/
size_t tmax{};

//!\brief The percentage of empty bins in the layout.
double empty_bin_fraction{};

/*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm.
*
* The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of
Expand Down Expand Up @@ -300,6 +304,7 @@ struct config
* * seqan::hibf::config::threads must be greater than `0`.
* * seqan::hibf::config::sketch_bits must be in `[5,32]`.
* * seqan::hibf::config::tmax must be at most `18446744073709551552`.
* * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`.
* * seqan::hibf::config::alpha must be positive.
* * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`.
*
Expand All @@ -322,6 +327,7 @@ struct config
threads == other.threads &&
sketch_bits == other.sketch_bits &&
tmax == other.tmax &&
empty_bin_fraction == other.empty_bin_fraction &&
alpha == other.alpha &&
max_rearrangement_ratio == other.max_rearrangement_ratio &&
disable_estimate_union == other.disable_estimate_union &&
Expand All @@ -332,11 +338,13 @@ struct config
private:
friend class cereal::access;

static constexpr uint32_t version{2};

template <typename archive_t>
void serialize(archive_t & archive)
{
uint32_t version{1};
archive(CEREAL_NVP(version));
uint32_t parsed_version{version};
archive(cereal::make_nvp("version", parsed_version));

archive(CEREAL_NVP(number_of_user_bins));
archive(CEREAL_NVP(number_of_hash_functions));
Expand All @@ -346,6 +354,10 @@ struct config

archive(CEREAL_NVP(sketch_bits));
archive(CEREAL_NVP(tmax));

if (parsed_version > 1u)
archive(CEREAL_NVP(empty_bin_fraction));

archive(CEREAL_NVP(alpha));
archive(CEREAL_NVP(max_rearrangement_ratio));
archive(CEREAL_NVP(disable_estimate_union));
Expand Down
3 changes: 2 additions & 1 deletion include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ class hierarchical_binning
config{config_},
data{std::addressof(data_)},
num_user_bins{data->positions.size()},
num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)}
num_technical_bins{data->previous.empty() ? config.tmax
: std::min<size_t>(needed_technical_bins(num_user_bins), config.tmax)}
{
assert(data != nullptr);
}
Expand Down
26 changes: 26 additions & 0 deletions include/hibf/misc/empty_bins_by_fraction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <algorithm>
#include <cstddef>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

/*!\brief Returns the number of empty bins that should be created by a given fraction of the total number of bins.
* \param[in] tmax The total number of bins.
* \param[in] fraction The fraction of the total number of bins that should be empty.
* \ingroup hibf
* \sa https://godbolt.org/z/cMjbM39vj
*/
[[nodiscard]] constexpr size_t empty_bins_by_fraction(size_t const tmax, double const fraction) noexcept
{
return std::clamp<size_t>(tmax * fraction, 1, tmax - 1) - (fraction == 0.0);
}

} // namespace seqan::hibf
12 changes: 9 additions & 3 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
#include <cereal/archives/json.hpp> // for JSONInputArchive, JSONOutputArchive
#include <cereal/cereal.hpp> // for make_nvp, InputArchive, OutputArchive

#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/config.hpp> // for config
#include <hibf/layout/prefixes.hpp> // for meta_header, meta_hibf_config_end, meta_hibf_config_start
#include <hibf/misc/empty_bins_by_fraction.hpp> // for empty_bins_by_fraction
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64

namespace seqan::hibf
{
Expand Down Expand Up @@ -107,6 +108,11 @@ void config::validate_and_set_defaults()
<< "anyway, so we increased your number of technical bins to " << tmax << ".\n";
}

if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."};

tmax -= empty_bins_by_fraction(tmax, empty_bin_fraction);

if (alpha < 0.0)
throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."};

Expand Down
13 changes: 9 additions & 4 deletions src/layout/hierarchical_binning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/layout/simple_binning.hpp> // for simple_binning
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/misc/empty_bins_by_fraction.hpp> // for empty_bins_by_fraction
#include <hibf/misc/next_multiple_of_64.hpp> // for next_multiple_of_64
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/platform.hpp> // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY
Expand Down Expand Up @@ -77,12 +78,13 @@ size_t hierarchical_binning::execute()

[[nodiscard]] size_t hierarchical_binning::needed_technical_bins(size_t const requested_num_ub) const
{
return std::min<size_t>(next_multiple_of_64(requested_num_ub), config.tmax);
size_t const next_multiple = next_multiple_of_64(requested_num_ub);
return next_multiple - empty_bins_by_fraction(next_multiple, config.empty_bin_fraction);
}

[[nodiscard]] size_t hierarchical_binning::max_merge_levels(size_t const num_ubs_in_merge) const
{
size_t const lower_lvl_tbs = needed_technical_bins(num_ubs_in_merge);
size_t const lower_lvl_tbs = std::min<size_t>(needed_technical_bins(num_ubs_in_merge), config.tmax);
double const levels = std::log(num_ubs_in_merge) / std::log(lower_lvl_tbs);
return static_cast<size_t>(std::ceil(levels));
}
Expand Down Expand Up @@ -405,16 +407,19 @@ void hierarchical_binning::update_libf_data(data_store & libf_data, size_t const

size_t hierarchical_binning::add_lower_level(data_store & libf_data) const
{
size_t const number_of_user_bins = libf_data.positions.size();

// now do the binning for the low-level IBF:
if (libf_data.positions.size() > config.tmax)
if (number_of_user_bins > config.tmax)
{
// recursively call hierarchical binning if there are still too many UBs
return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin
}
else
{
// use simple binning to distribute remaining UBs
return simple_binning{libf_data, 0}.execute(); // return id of maximum technical bin
size_t const number_of_technical_bins = needed_technical_bins(number_of_user_bins);
return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin
}
}

Expand Down
54 changes: 53 additions & 1 deletion test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ TEST(config_test, write_to)
std::string const expected_file{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 1,\n"
"@ \"version\": 2,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.0,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
Expand All @@ -57,6 +58,45 @@ TEST(config_test, write_to)
}

TEST(config_test, read_from)
{
std::stringstream ss{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 2,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_fpr\": 0.0001,\n"
"@ \"relaxed_fpr\": 0.3,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"empty_bin_fraction\": 0.5,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
"@ \"disable_rearrangement\": false\n"
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"};

seqan::hibf::config configuration;
configuration.read_from(ss);

EXPECT_EQ(configuration.number_of_user_bins, 123456789);
EXPECT_EQ(configuration.number_of_hash_functions, 4);
EXPECT_EQ(configuration.maximum_fpr, 0.0001);
EXPECT_EQ(configuration.relaxed_fpr, 0.3);
EXPECT_EQ(configuration.threads, 31);
EXPECT_EQ(configuration.sketch_bits, 8);
EXPECT_EQ(configuration.tmax, 128);
EXPECT_EQ(configuration.empty_bin_fraction, 0.5);
EXPECT_EQ(configuration.alpha, 1.0);
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
EXPECT_EQ(configuration.disable_estimate_union, true);
EXPECT_EQ(configuration.disable_rearrangement, false);
}

TEST(config_test, read_from_v1)
{
std::stringstream ss{"@HIBF_CONFIG\n"
"@{\n"
Expand Down Expand Up @@ -87,6 +127,7 @@ TEST(config_test, read_from)
EXPECT_EQ(configuration.threads, 31);
EXPECT_EQ(configuration.sketch_bits, 8);
EXPECT_EQ(configuration.tmax, 128);
EXPECT_EQ(configuration.empty_bin_fraction, 0.0);
EXPECT_EQ(configuration.alpha, 1.0);
EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333);
EXPECT_EQ(configuration.disable_estimate_union, true);
Expand Down Expand Up @@ -272,6 +313,17 @@ TEST(config_test, validate_and_set_defaults)
"increased your number of technical bins to 64.\n");
}

// empty_bin_fraction must be in [0.0,1.0)
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn,
.number_of_user_bins = 1u,
.empty_bin_fraction = -0.1};
check_error_message(configuration, "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");

configuration.empty_bin_fraction = 1.0;
check_error_message(configuration, "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0).");
}

// alpha must be positive
{
seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1};
Expand Down
34 changes: 34 additions & 0 deletions test/unit/hibf/layout/hierarchical_binning_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,40 @@ TEST(hierarchical_binning_test, small_example)
EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins);
}

TEST(hierarchical_binning_test, small_example_with_empty_bins)
{
seqan::hibf::config config;
config.tmax = 4;
config.disable_estimate_union = true; // also disables rearrangement
config.empty_bin_fraction = 0.001;

seqan::hibf::layout::layout hibf_layout{};
std::vector<size_t> kmer_counts{500, 1000, 500, 500, 500, 500, 500, 500};

seqan::hibf::layout::data_store data{.hibf_layout = &hibf_layout, .kmer_counts = &kmer_counts};

data.fpr_correction =
seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax});
seqan::hibf::layout::hierarchical_binning algo{data, config};
EXPECT_EQ(algo.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3

std::vector<seqan::hibf::layout::layout::max_bin> expected_max_bins{{{1}, 0}, {{2}, 0}};

// clang-format off
std::vector<seqan::hibf::layout::layout::user_bin> expected_user_bins{{{}, 0, 1, 7},
{{1}, 0, 22 - 1, 4},
{{1}, 22 - 1, 21, 5},
{{1}, 43 - 1, 21, 6},
{{2}, 0, 22 - 1, 0},
{{2}, 22 - 1, 21, 2},
{{2}, 43 - 1, 21, 3},
{{}, 3, 1, 1}};
// clang-format on

EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins);
EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins);
}

TEST(hierarchical_binning_test, another_example)
{
seqan::hibf::config config;
Expand Down

0 comments on commit cf5e52d

Please sign in to comment.