diff --git a/include/hibf/layout/compute_fpr_correction.hpp b/include/hibf/layout/compute_fpr_correction.hpp index b6d205c5..86b4c9cf 100644 --- a/include/hibf/layout/compute_fpr_correction.hpp +++ b/include/hibf/layout/compute_fpr_correction.hpp @@ -27,6 +27,6 @@ struct fpr_correction_parameters * \ingroup hibf_layout * \sa https://godbolt.org/z/zTj1v9W94 */ -std::vector compute_fpr_correction(fpr_correction_parameters const & params); +[[nodiscard]] std::vector compute_fpr_correction(fpr_correction_parameters const & params); } // namespace seqan::hibf::layout diff --git a/include/hibf/layout/compute_relaxed_fpr_correction.hpp b/include/hibf/layout/compute_relaxed_fpr_correction.hpp new file mode 100644 index 00000000..d911f5ff --- /dev/null +++ b/include/hibf/layout/compute_relaxed_fpr_correction.hpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#pragma once + +#include // for size_t + +#include + +namespace seqan::hibf::layout +{ + +/*!\brief Contains parameters for compute_relaxed_fpr_correction. + * \ingroup hibf_layout + * \qualifier strong + */ +struct relaxed_fpr_correction_parameters +{ + double fpr{}; + double relaxed_fpr{}; + size_t hash_count{}; +}; + +/*!\brief Precompute size correction factor for merged bins which are allowed to have a relaxed FPR. + * \ingroup hibf_layout + */ +[[nodiscard]] double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params); + +} // namespace seqan::hibf::layout diff --git a/include/hibf/layout/data_store.hpp b/include/hibf/layout/data_store.hpp index 12736602..67e281c6 100644 --- a/include/hibf/layout/data_store.hpp +++ b/include/hibf/layout/data_store.hpp @@ -76,6 +76,9 @@ struct data_store //!\brief The false positive correction based on fp_rate, num_hash_functions and requested_max_tb. std::vector fpr_correction{}; + //!\brief The correction factor for merged bins which are allowed to have a relaxed FPR. + double relaxed_fpr_correction{}; + //!\brief Information about previous levels of the IBF if the algorithm is called recursively. previous_level previous{}; diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp index 652f7f27..25b28cbc 100644 --- a/include/hibf/layout/hierarchical_binning.hpp +++ b/include/hibf/layout/hierarchical_binning.hpp @@ -37,10 +37,8 @@ class hierarchical_binning //!\brief Simplifies passing the parameters needed for tracking the maximum technical bin. struct maximum_bin_tracker { - size_t max_id{}; //!< The ID of the technical bin with maximal size. - size_t max_size{}; //!< The maximum technical bin size seen so far. - size_t max_split_id{}; //!< The ID of the split bin with maximal size (if any). - size_t max_split_size{}; //!< The maximum split bin size seen so far. + size_t max_id{}; //!< The ID of the technical bin with maximal size. + size_t max_size{}; //!< The maximum technical bin size seen so far. void update_max(size_t const new_id, size_t const new_size) { @@ -50,52 +48,6 @@ class hierarchical_binning max_size = new_size; } } - - //!\brief Split cardinality `new_size` must already account for fpr-correction. - void update_split_max(size_t const new_id, size_t const new_size) - { - if (new_size > max_split_size) - { - max_split_id = new_id; - max_split_size = new_size; - } - } - - /*!\brief Decides which bin is reported as the maximum bin. - *\param config The HIBF configuration. - *\return The chosen max bin id. - * - * As a HIBF feature, the merged bin FPR can differ from the overall maximum FPR. Merged bins in an HIBF layout - * will always be followed by one or more lower-level IBFs that will have split bins or single bins (split = 1) - * to recover the original user bins. - * - * We need to make sure, though, that downsizing merged bins does not affect split bins. - * Therefore, we check if choosing a merged bin as the max bin violates the minimum_bits needed for split bins. - * If so, we can report the largest split bin as the max bin as it will choose the correct size and downsize - * larger merged bins only a little. - */ - size_t choose_max_bin(seqan::hibf::config const & config) - { - if (max_id == max_split_id) // Overall max bin is a split bin. - return max_id; - - // Split cardinality `max_split_size` already accounts for fpr correction. - // The minimum size of the TBs of this IBF to ensure the maximum_false_positive_rate for split bins. - size_t const minimum_bits{build::bin_size_in_bits({.fpr = config.maximum_fpr, - .hash_count = config.number_of_hash_functions, - .elements = max_split_size})}; - - // The potential size of the TBs of this IBF given the allowed merged bin FPR. - size_t const merged_bits{build::bin_size_in_bits({.fpr = config.relaxed_fpr, // - .hash_count = config.number_of_hash_functions, - .elements = max_size})}; - - // If split and merged bits are the same, we prefer merged bins. Better for build parallelisation. - if ((minimum_bits > merged_bits)) - return max_split_id; - - return max_id; - } }; public: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5afb658f..5b64369c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ set (HIBF_SOURCE_FILES layout/layout.cpp layout/compute_fpr_correction.cpp layout/compute_layout.cpp + layout/compute_relaxed_fpr_correction.cpp sketch/compute_sketches.cpp layout/graph.cpp layout/hierarchical_binning.cpp diff --git a/src/layout/compute_layout.cpp b/src/layout/compute_layout.cpp index 064930d0..9b5c70d9 100644 --- a/src/layout/compute_layout.cpp +++ b/src/layout/compute_layout.cpp @@ -9,15 +9,16 @@ #include // for move #include // for vector -#include // for config -#include // for compute_fpr_correction -#include // for compute_layout -#include // for data_store -#include // for hierarchical_binning -#include // for layout -#include // for iota_vector -#include // for concurrent_timer -#include // for hyperloglog +#include // for config +#include // for compute_fpr_correction +#include // for compute_layout +#include // for compute_relaxed_fpr_correction +#include // for data_store +#include // for hierarchical_binning +#include // for layout +#include // for iota_vector +#include // for concurrent_timer +#include // for hyperloglog namespace seqan::hibf::layout { @@ -46,6 +47,10 @@ layout compute_layout(config const & config, .hash_count = config.number_of_hash_functions, .t_max = config.tmax}); + store.relaxed_fpr_correction = compute_relaxed_fpr_correction({.fpr = config.maximum_fpr, // + .relaxed_fpr = config.relaxed_fpr, + .hash_count = config.number_of_hash_functions}); + store.hibf_layout->top_level_max_bin_id = seqan::hibf::layout::hierarchical_binning{store, config}.execute(); union_estimation_timer = store.union_estimation_timer; rearrangement_timer = store.rearrangement_timer; diff --git a/src/layout/compute_relaxed_fpr_correction.cpp b/src/layout/compute_relaxed_fpr_correction.cpp new file mode 100644 index 00000000..78358c25 --- /dev/null +++ b/src/layout/compute_relaxed_fpr_correction.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin +// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik +// SPDX-License-Identifier: BSD-3-Clause + +#include // for assert +#include // for log1p, exp, log + +#include // for compute_fpr_correction + +namespace seqan::hibf::layout +{ + +double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params) +{ + assert(params.fpr > 0.0 && params.fpr <= 1.0); + assert(params.relaxed_fpr > 0.0 && params.relaxed_fpr <= 1.0); + assert(params.hash_count > 0u); + assert(params.fpr <= params.relaxed_fpr); + + double const numerator = std::log1p(-std::exp(std::log(params.fpr) / params.hash_count)); + double const denominator = std::log1p(-std::exp(std::log(params.relaxed_fpr) / params.hash_count)); + double const relaxed_fpr_correction = numerator / denominator; + + assert(relaxed_fpr_correction > 0.0 && relaxed_fpr_correction <= 1.0); + return relaxed_fpr_correction; +} + +} // namespace seqan::hibf::layout diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp index 0c588c32..6a30f7ec 100644 --- a/src/layout/hierarchical_binning.cpp +++ b/src/layout/hierarchical_binning.cpp @@ -118,7 +118,7 @@ void hierarchical_binning::initialization(std::vector> & mat for (size_t j = 1; j < num_user_bins; ++j) { sum += (*data->kmer_counts)[data->positions[j]]; - matrix[0][j] = data->union_estimates[j]; + matrix[0][j] = data->union_estimates[j] * data->relaxed_fpr_correction; ll_matrix[0][j] = max_merge_levels(j + 1) * sum; trace[0][j] = {0u, j - 1}; // unnecessary? } @@ -130,7 +130,7 @@ void hierarchical_binning::initialization(std::vector> & mat assert(j < data->positions.size()); assert(data->positions[j] < data->kmer_counts->size()); sum += (*data->kmer_counts)[data->positions[j]]; - matrix[0][j] = sum; + matrix[0][j] = sum * data->relaxed_fpr_correction; ll_matrix[0][j] = max_merge_levels(j + 1) * sum; trace[0][j] = {0u, j - 1}; // unnecessary? } @@ -206,12 +206,13 @@ void hierarchical_binning::recursion(std::vector> & matrix, size_t j_prime{j - 1}; size_t weight{current_weight}; - auto get_weight = [&]() + auto get_weight = [&]() -> size_t { // if we use the union estimate we plug in that value instead of the sum (weight) // union_estimates[j_prime] is the union of {j_prime, ..., j} // the + 1 is necessary because j_prime is decremented directly after weight is updated - return config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1]; + size_t const uncorrected = config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1]; + return data->relaxed_fpr_correction * uncorrected; }; // if the user bin j-1 was not split into multiple technical bins! @@ -278,7 +279,7 @@ void hierarchical_binning::backtrack_merged_bin(size_t trace_j, if (!config.disable_estimate_union) kmer_count = sketch.estimate(); // overwrite kmer_count high_level_max_id/size bin - max_tracker.update_max(bin_id, kmer_count); + max_tracker.update_max(bin_id, kmer_count * data->relaxed_fpr_correction); // std::cout << "]: " << kmer_count << std::endl; } @@ -302,7 +303,6 @@ void hierarchical_binning::backtrack_split_bin(size_t trace_j, size_t const cardinality_per_bin = divide_and_ceil(corrected_cardinality, number_of_bins); max_tracker.update_max(bin_id, cardinality_per_bin); - max_tracker.update_split_max(bin_id, cardinality_per_bin); // std::cout << "split " << trace_j << " into " << number_of_bins << ": " << cardinality_per_bin << std::endl; } @@ -359,7 +359,7 @@ size_t hierarchical_binning::backtracking(std::vectorkmer_counts, .sketches = data->sketches, .positions = {data->positions[trace_j]}, - .fpr_correction = data->fpr_correction}; + .fpr_correction = data->fpr_correction, + .relaxed_fpr_correction = data->relaxed_fpr_correction}; return libf_data; } diff --git a/test/unit/hibf/layout/hierarchical_binning_test.cpp b/test/unit/hibf/layout/hierarchical_binning_test.cpp index 95e57978..035c5827 100644 --- a/test/unit/hibf/layout/hierarchical_binning_test.cpp +++ b/test/unit/hibf/layout/hierarchical_binning_test.cpp @@ -7,12 +7,13 @@ #include // for size_t #include // for vector, allocator -#include // for config -#include // for compute_fpr_correction -#include // for data_store -#include // for hierarchical_binning -#include // for layout -#include // for expect_range_eq, EXPECT_RANGE_EQ +#include // for config +#include // for compute_fpr_correction +#include // for compute_relaxed_fpr_correction +#include // for data_store +#include // for hierarchical_binning +#include // for layout +#include // for expect_range_eq, EXPECT_RANGE_EQ TEST(hierarchical_binning_test, small_example) { @@ -27,19 +28,22 @@ TEST(hierarchical_binning_test, small_example) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); + seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 3u); // #HIGH_LEVEL_IBF max_bin_id:3 - std::vector expected_max_bins{{{1}, 22}, {{2}, 22}}; + std::vector expected_max_bins{{{2}, 22}, {{3}, 0}}; std::vector expected_user_bins{{{}, 0, 1, 7}, - {{1}, 0, 22, 4}, - {{1}, 22, 21, 5}, - {{1}, 43, 21, 6}, - {{2}, 0, 22, 0}, - {{2}, 22, 21, 2}, - {{2}, 43, 21, 3}, - {{}, 3, 1, 1}}; + {{}, 1, 1, 6}, + {{2}, 0, 22, 3}, + {{2}, 22, 21, 4}, + {{2}, 43, 21, 5}, + {{3}, 0, 42, 1}, + {{3}, 42, 11, 0}, + {{3}, 53, 11, 2}}; EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins); EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); @@ -57,6 +61,8 @@ TEST(hierarchical_binning_test, another_example) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 1u); // #HIGH_LEVEL_IBF max_bin_id:1 @@ -88,6 +94,8 @@ TEST(hierarchical_binning_test, high_level_max_bin_id_is_0) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 0u); // #HIGH_LEVEL_IBF max_bin_id:1 @@ -113,6 +121,8 @@ TEST(hierarchical_binning_test, knuts_example) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 1u); @@ -141,21 +151,20 @@ TEST(hierarchical_binning_test, four_level_hibf) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 1u); // #HIGH_LEVEL_IBF max_bin_id:1 - std::vector expected_max_bins{{{0, 0, 0, 0}, 33}, - {{0, 0, 0}, 1}, - {{0, 0}, 1}, - {{0}, 1}}; + std::vector expected_max_bins{{{0, 0}, 33}, {{0, 1}, 0}, {{0}, 1}, {{1}, 0}}; - std::vector expected_user_bins{{{0, 0, 0, 0}, 0, 33, 4}, - {{0, 0, 0, 0}, 33, 31, 5}, - {{0, 0, 0}, 1, 1, 3}, - {{0, 0}, 1, 1, 2}, - {{0}, 1, 1, 1}, - {{}, 1, 1, 0}}; + std::vector expected_user_bins{{{0, 0}, 0, 33, 4}, + {{0, 0}, 33, 31, 5}, + {{0, 1}, 0, 57, 2}, + {{0, 1}, 57, 7, 3}, + {{1}, 0, 53, 0}, + {{1}, 53, 11, 1}}; EXPECT_RANGE_EQ(hibf_layout.max_bins, expected_max_bins); EXPECT_RANGE_EQ(hibf_layout.user_bins, expected_user_bins); @@ -174,6 +183,8 @@ TEST(hierarchical_binning_test, tb0_is_a_merged_bin) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 0u); @@ -202,6 +213,8 @@ TEST(hierarchical_binning_test, tb0_is_a_merged_bin_and_leads_to_recursive_call) data.fpr_correction = seqan::hibf::layout::compute_fpr_correction({.fpr = 0.05, .hash_count = 2, .t_max = config.tmax}); + data.relaxed_fpr_correction = + seqan::hibf::layout::compute_relaxed_fpr_correction({.fpr = 0.05, .relaxed_fpr = 0.3, .hash_count = 2}); seqan::hibf::layout::hierarchical_binning algo{data, config}; EXPECT_EQ(algo.execute(), 0u);