diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index 8a19ba81..b8a0df19 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -40,6 +40,7 @@ namespace seqan::hibf * | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] | * | Layout | seqan::hibf::config::sketch_bits | 12 | | * | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset | + * | Layout | seqan::hibf::config::empty_bin_fraction | 0.0 | Dynamic Layout | * | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | | * | Layout | seqan::hibf::config::alpha | 1.2 | | * | Layout | seqan::hibf::config::disable_estimate_union | false | | @@ -228,6 +229,9 @@ struct config */ size_t tmax{}; + //!\brief The percentage of empty bins in the layout. + double empty_bin_fraction{}; + /*!\brief A scaling factor to influence the amount of merged bins produced by the layout algorithm. * * The layout algorithm optimizes the space consumption of the resulting HIBF, but currently has no means of @@ -300,6 +304,7 @@ struct config * * seqan::hibf::config::threads must be greater than `0`. * * seqan::hibf::config::sketch_bits must be in `[5,32]`. * * seqan::hibf::config::tmax must be at most `18446744073709551552`. + * * seqan::hibf::config::empty_bin_fraction must be in `[0.0,1.0)`. * * seqan::hibf::config::alpha must be positive. * * seqan::hibf::config::max_rearrangement_ratio must be in `[0.0,1.0]`. * @@ -322,6 +327,7 @@ struct config threads == other.threads && sketch_bits == other.sketch_bits && tmax == other.tmax && + empty_bin_fraction == other.empty_bin_fraction && alpha == other.alpha && max_rearrangement_ratio == other.max_rearrangement_ratio && disable_estimate_union == other.disable_estimate_union && @@ -332,11 +338,13 @@ struct config private: friend class cereal::access; + static constexpr uint32_t version{2}; + template void serialize(archive_t & archive) { - uint32_t version{1}; - archive(CEREAL_NVP(version)); + uint32_t parsed_version{version}; + archive(cereal::make_nvp("version", parsed_version)); archive(CEREAL_NVP(number_of_user_bins)); archive(CEREAL_NVP(number_of_hash_functions)); @@ -346,6 +354,10 @@ struct config archive(CEREAL_NVP(sketch_bits)); archive(CEREAL_NVP(tmax)); + + if (parsed_version > 1u) + archive(CEREAL_NVP(empty_bin_fraction)); + archive(CEREAL_NVP(alpha)); archive(CEREAL_NVP(max_rearrangement_ratio)); archive(CEREAL_NVP(disable_estimate_union)); diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp index 652f7f27..7557b265 100644 --- a/include/hibf/layout/hierarchical_binning.hpp +++ b/include/hibf/layout/hierarchical_binning.hpp @@ -117,7 +117,8 @@ class hierarchical_binning config{config_}, data{std::addressof(data_)}, num_user_bins{data->positions.size()}, - num_technical_bins{data->previous.empty() ? config.tmax : needed_technical_bins(num_user_bins)} + num_technical_bins{data->previous.empty() ? config.tmax + : std::min(needed_technical_bins(num_user_bins), config.tmax)} { assert(data != nullptr); } diff --git a/include/hibf/misc/empty_bins_by_fraction.hpp b/include/hibf/misc/empty_bins_by_fraction.hpp new file mode 100644 index 00000000..778ac22f --- /dev/null +++ b/include/hibf/misc/empty_bins_by_fraction.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include +#include + +#include + +namespace seqan::hibf +{ + +/*!\brief Returns the number of empty bins that should be created by a given fraction of the total number of bins. + * \param[in] tmax The total number of bins. + * \param[in] fraction The fraction of the total number of bins that should be empty. + * \ingroup hibf + * \sa https://godbolt.org/z/cMjbM39vj + */ +[[nodiscard]] constexpr size_t empty_bins_by_fraction(size_t const tmax, double const fraction) noexcept +{ + return std::clamp(tmax * fraction, 1, tmax - 1) - (fraction == 0.0); +} + +} // namespace seqan::hibf diff --git a/src/config.cpp b/src/config.cpp index b82069d6..5f324c5e 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -14,9 +14,10 @@ #include // for JSONInputArchive, JSONOutputArchive #include // for make_nvp, InputArchive, OutputArchive -#include // for config -#include // for meta_header, meta_hibf_config_end, meta_hibf_config_start -#include // for next_multiple_of_64 +#include // for config +#include // for meta_header, meta_hibf_config_end, meta_hibf_config_start +#include // for empty_bins_by_fraction +#include // for next_multiple_of_64 namespace seqan::hibf { @@ -107,6 +108,11 @@ void config::validate_and_set_defaults() << "anyway, so we increased your number of technical bins to " << tmax << ".\n"; } + if (empty_bin_fraction < 0.0 || empty_bin_fraction >= 1.0) + throw std::invalid_argument{"[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."}; + + tmax -= empty_bins_by_fraction(tmax, empty_bin_fraction); + if (alpha < 0.0) throw std::invalid_argument{"[HIBF CONFIG ERROR] config::alpha must be positive."}; diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp index 0c588c32..151aa800 100644 --- a/src/layout/hierarchical_binning.cpp +++ b/src/layout/hierarchical_binning.cpp @@ -17,6 +17,7 @@ #include // for layout #include // for simple_binning #include // for divide_and_ceil +#include // for empty_bins_by_fraction #include // for next_multiple_of_64 #include // for concurrent_timer #include // for HIBF_WORKAROUND_GCC_BOGUS_MEMCPY @@ -77,12 +78,13 @@ size_t hierarchical_binning::execute() [[nodiscard]] size_t hierarchical_binning::needed_technical_bins(size_t const requested_num_ub) const { - return std::min(next_multiple_of_64(requested_num_ub), config.tmax); + size_t const next_multiple = next_multiple_of_64(requested_num_ub); + return next_multiple - empty_bins_by_fraction(next_multiple, config.empty_bin_fraction); } [[nodiscard]] size_t hierarchical_binning::max_merge_levels(size_t const num_ubs_in_merge) const { - size_t const lower_lvl_tbs = needed_technical_bins(num_ubs_in_merge); + size_t const lower_lvl_tbs = std::min(needed_technical_bins(num_ubs_in_merge), config.tmax); double const levels = std::log(num_ubs_in_merge) / std::log(lower_lvl_tbs); return static_cast(std::ceil(levels)); } @@ -405,8 +407,10 @@ void hierarchical_binning::update_libf_data(data_store & libf_data, size_t const size_t hierarchical_binning::add_lower_level(data_store & libf_data) const { + size_t const number_of_user_bins = libf_data.positions.size(); + // now do the binning for the low-level IBF: - if (libf_data.positions.size() > config.tmax) + if (number_of_user_bins > config.tmax) { // recursively call hierarchical binning if there are still too many UBs return hierarchical_binning{libf_data, config}.execute(); // return id of maximum technical bin @@ -414,7 +418,8 @@ size_t hierarchical_binning::add_lower_level(data_store & libf_data) const else { // use simple binning to distribute remaining UBs - return simple_binning{libf_data, 0}.execute(); // return id of maximum technical bin + size_t const number_of_technical_bins = needed_technical_bins(number_of_user_bins); + return simple_binning{libf_data, number_of_technical_bins}.execute(); // return id of maximum technical bin } } diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index af576a00..20e31d39 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -37,7 +37,7 @@ TEST(config_test, write_to) std::string const expected_file{"@HIBF_CONFIG\n" "@{\n" "@ \"hibf_config\": {\n" - "@ \"version\": 1,\n" + "@ \"version\": 2,\n" "@ \"number_of_user_bins\": 123456789,\n" "@ \"number_of_hash_functions\": 4,\n" "@ \"maximum_fpr\": 0.0001,\n" @@ -45,6 +45,7 @@ TEST(config_test, write_to) "@ \"threads\": 31,\n" "@ \"sketch_bits\": 8,\n" "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.0,\n" "@ \"alpha\": 1.0,\n" "@ \"max_rearrangement_ratio\": 0.333,\n" "@ \"disable_estimate_union\": true,\n" @@ -57,6 +58,45 @@ TEST(config_test, write_to) } TEST(config_test, read_from) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 2,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_fpr\": 0.0001,\n" + "@ \"relaxed_fpr\": 0.3,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"empty_bin_fraction\": 0.5,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + seqan::hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_fpr, 0.0001); + EXPECT_EQ(configuration.relaxed_fpr, 0.3); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.5); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); +} + +TEST(config_test, read_from_v1) { std::stringstream ss{"@HIBF_CONFIG\n" "@{\n" @@ -87,6 +127,7 @@ TEST(config_test, read_from) EXPECT_EQ(configuration.threads, 31); EXPECT_EQ(configuration.sketch_bits, 8); EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.empty_bin_fraction, 0.0); EXPECT_EQ(configuration.alpha, 1.0); EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); EXPECT_EQ(configuration.disable_estimate_union, true); @@ -272,6 +313,17 @@ TEST(config_test, validate_and_set_defaults) "increased your number of technical bins to 64.\n"); } + // empty_bin_fraction must be in [0.0,1.0) + { + seqan::hibf::config configuration{.input_fn = dummy_input_fn, + .number_of_user_bins = 1u, + .empty_bin_fraction = -0.1}; + check_error_message(configuration, "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + + configuration.empty_bin_fraction = 1.0; + check_error_message(configuration, "[HIBF CONFIG ERROR] config::empty_bin_fraction must be in [0.0,1.0)."); + } + // alpha must be positive { seqan::hibf::config configuration{.input_fn = dummy_input_fn, .number_of_user_bins = 1u, .alpha = -0.1}; diff --git a/test/unit/hibf/layout/hierarchical_binning_test.cpp b/test/unit/hibf/layout/hierarchical_binning_test.cpp index 95e57978..4bd86b7c 100644 --- a/test/unit/hibf/layout/hierarchical_binning_test.cpp +++ b/test/unit/hibf/layout/hierarchical_binning_test.cpp @@ -45,6 +45,40 @@ TEST(hierarchical_binning_test, small_example) EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); } +TEST(hierarchical_binning_test, small_example_with_empty_bins) +{ + seqan::hibf::config config; + config.tmax = 4; + config.disable_estimate_union = true; // also disables rearrangement + config.empty_bin_fraction = 0.001; + + seqan::hibf::layout::layout hibf_layout{}; + std::vector kmer_counts{500, 1000, 500, 500, 500, 500, 500, 500}; + + seqan::hibf::layout::data_store data{.hibf_layout = &hibf_layout, .kmer_counts = &kmer_counts}; + + data.fpr_correction = + seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + seqan::hibf::layout::hierarchical_binning algo{data, config}; + EXPECT_EQ(algo.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3 + + std::vector expected_max_bins{{{1}, 0}, {{2}, 0}}; + + // clang-format off + std::vector expected_user_bins{{{}, 0, 1, 7}, + {{1}, 0, 22 - 1, 4}, + {{1}, 22 - 1, 21, 5}, + {{1}, 43 - 1, 21, 6}, + {{2}, 0, 22 - 1, 0}, + {{2}, 22 - 1, 21, 2}, + {{2}, 43 - 1, 21, 3}, + {{}, 3, 1, 1}}; + // clang-format on + + EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins); + EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); +} + TEST(hierarchical_binning_test, another_example) { seqan::hibf::config config;