Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Include relaxed FPR into the DP algorithm. #224

Merged
merged 4 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/hibf/layout/compute_fpr_correction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ struct fpr_correction_parameters
* \ingroup hibf_layout
* \sa https://godbolt.org/z/zTj1v9W94
*/
std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);
[[nodiscard]] std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);

} // namespace seqan::hibf::layout
30 changes: 30 additions & 0 deletions include/hibf/layout/compute_relaxed_fpr_correction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <cstddef> // for size_t

#include <hibf/platform.hpp>

namespace seqan::hibf::layout
{

/*!\brief Contains parameters for compute_relaxed_fpr_correction.
* \ingroup hibf_layout
* \qualifier strong
*/
struct relaxed_fpr_correction_parameters
{
double fpr{};
double relaxed_fpr{};
size_t hash_count{};
};

/*!\brief Precompute size correction factor for merged bins which are allowed to have a relaxed FPR.
* \ingroup hibf_layout
*/
[[nodiscard]] double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params);

} // namespace seqan::hibf::layout
3 changes: 3 additions & 0 deletions include/hibf/layout/data_store.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ struct data_store
//!\brief The false positive correction based on fp_rate, num_hash_functions and requested_max_tb.
std::vector<double> fpr_correction{};

//!\brief The correction factor for merged bins which are allowed to have a relaxed FPR.
double relaxed_fpr_correction{};

//!\brief Information about previous levels of the IBF if the algorithm is called recursively.
previous_level previous{};

Expand Down
52 changes: 2 additions & 50 deletions include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,8 @@ class hierarchical_binning
//!\brief Simplifies passing the parameters needed for tracking the maximum technical bin.
struct maximum_bin_tracker
{
size_t max_id{}; //!< The ID of the technical bin with maximal size.
size_t max_size{}; //!< The maximum technical bin size seen so far.
size_t max_split_id{}; //!< The ID of the split bin with maximal size (if any).
size_t max_split_size{}; //!< The maximum split bin size seen so far.
size_t max_id{}; //!< The ID of the technical bin with maximal size.
size_t max_size{}; //!< The maximum technical bin size seen so far.

void update_max(size_t const new_id, size_t const new_size)
{
Expand All @@ -50,52 +48,6 @@ class hierarchical_binning
max_size = new_size;
}
}

//!\brief Split cardinality `new_size` must already account for fpr-correction.
void update_split_max(size_t const new_id, size_t const new_size)
{
if (new_size > max_split_size)
{
max_split_id = new_id;
max_split_size = new_size;
}
}

/*!\brief Decides which bin is reported as the maximum bin.
*\param config The HIBF configuration.
*\return The chosen max bin id.
*
* As a HIBF feature, the merged bin FPR can differ from the overall maximum FPR. Merged bins in an HIBF layout
* will always be followed by one or more lower-level IBFs that will have split bins or single bins (split = 1)
* to recover the original user bins.
*
* We need to make sure, though, that downsizing merged bins does not affect split bins.
* Therefore, we check if choosing a merged bin as the max bin violates the minimum_bits needed for split bins.
* If so, we can report the largest split bin as the max bin as it will choose the correct size and downsize
* larger merged bins only a little.
*/
size_t choose_max_bin(seqan::hibf::config const & config)
{
if (max_id == max_split_id) // Overall max bin is a split bin.
return max_id;

// Split cardinality `max_split_size` already accounts for fpr correction.
// The minimum size of the TBs of this IBF to ensure the maximum_false_positive_rate for split bins.
size_t const minimum_bits{build::bin_size_in_bits({.fpr = config.maximum_fpr,
.hash_count = config.number_of_hash_functions,
.elements = max_split_size})};

// The potential size of the TBs of this IBF given the allowed merged bin FPR.
size_t const merged_bits{build::bin_size_in_bits({.fpr = config.relaxed_fpr, //
.hash_count = config.number_of_hash_functions,
.elements = max_size})};

// If split and merged bits are the same, we prefer merged bins. Better for build parallelisation.
if ((minimum_bits > merged_bits))
return max_split_id;

return max_id;
}
};

public:
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set (HIBF_SOURCE_FILES
layout/layout.cpp
layout/compute_fpr_correction.cpp
layout/compute_layout.cpp
layout/compute_relaxed_fpr_correction.cpp
sketch/compute_sketches.cpp
layout/graph.cpp
layout/hierarchical_binning.cpp
Expand Down
23 changes: 14 additions & 9 deletions src/layout/compute_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#include <utility> // for move
#include <vector> // for vector

#include <hibf/config.hpp> // for config
#include <hibf/layout/compute_fpr_correction.hpp> // for compute_fpr_correction
#include <hibf/layout/compute_layout.hpp> // for compute_layout
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/layout/hierarchical_binning.hpp> // for hierarchical_binning
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/misc/iota_vector.hpp> // for iota_vector
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
#include <hibf/config.hpp> // for config
#include <hibf/layout/compute_fpr_correction.hpp> // for compute_fpr_correction
#include <hibf/layout/compute_layout.hpp> // for compute_layout
#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_relaxed_fpr_correction
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/layout/hierarchical_binning.hpp> // for hierarchical_binning
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/misc/iota_vector.hpp> // for iota_vector
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf::layout
{
Expand Down Expand Up @@ -46,6 +47,10 @@
.hash_count = config.number_of_hash_functions,
.t_max = config.tmax});

store.relaxed_fpr_correction = compute_relaxed_fpr_correction({.fpr = config.maximum_fpr, //

Check warning on line 50 in src/layout/compute_layout.cpp

View check run for this annotation

Codecov / codecov/patch

src/layout/compute_layout.cpp#L50

Added line #L50 was not covered by tests
.relaxed_fpr = config.relaxed_fpr,
.hash_count = config.number_of_hash_functions});

store.hibf_layout->top_level_max_bin_id = seqan::hibf::layout::hierarchical_binning{store, config}.execute();
union_estimation_timer = store.union_estimation_timer;
rearrangement_timer = store.rearrangement_timer;
Expand Down
28 changes: 28 additions & 0 deletions src/layout/compute_relaxed_fpr_correction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <cassert> // for assert
#include <cmath> // for log1p, exp, log

#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_fpr_correction

namespace seqan::hibf::layout
{

double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params)
{
assert(params.fpr > 0.0 && params.fpr <= 1.0);
assert(params.relaxed_fpr > 0.0 && params.relaxed_fpr <= 1.0);
assert(params.hash_count > 0u);
assert(params.fpr <= params.relaxed_fpr);

double const numerator = std::log1p(-std::exp(std::log(params.fpr) / params.hash_count));
double const denominator = std::log1p(-std::exp(std::log(params.relaxed_fpr) / params.hash_count));
double const relaxed_fpr_correction = numerator / denominator;

assert(relaxed_fpr_correction > 0.0 && relaxed_fpr_correction <= 1.0);
return relaxed_fpr_correction;
}

} // namespace seqan::hibf::layout
17 changes: 9 additions & 8 deletions src/layout/hierarchical_binning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
for (size_t j = 1; j < num_user_bins; ++j)
{
sum += (*data->kmer_counts)[data->positions[j]];
matrix[0][j] = data->union_estimates[j];
matrix[0][j] = data->union_estimates[j] * data->relaxed_fpr_correction;
ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
trace[0][j] = {0u, j - 1}; // unnecessary?
}
Expand All @@ -130,7 +130,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
assert(j < data->positions.size());
assert(data->positions[j] < data->kmer_counts->size());
sum += (*data->kmer_counts)[data->positions[j]];
matrix[0][j] = sum;
matrix[0][j] = sum * data->relaxed_fpr_correction;
ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
trace[0][j] = {0u, j - 1}; // unnecessary?
}
Expand Down Expand Up @@ -206,12 +206,13 @@ void hierarchical_binning::recursion(std::vector<std::vector<size_t>> & matrix,
size_t j_prime{j - 1};
size_t weight{current_weight};

auto get_weight = [&]()
auto get_weight = [&]() -> size_t
{
// if we use the union estimate we plug in that value instead of the sum (weight)
// union_estimates[j_prime] is the union of {j_prime, ..., j}
// the + 1 is necessary because j_prime is decremented directly after weight is updated
return config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
size_t const uncorrected = config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
return data->relaxed_fpr_correction * uncorrected;
};

// if the user bin j-1 was not split into multiple technical bins!
Expand Down Expand Up @@ -278,7 +279,7 @@ void hierarchical_binning::backtrack_merged_bin(size_t trace_j,
if (!config.disable_estimate_union)
kmer_count = sketch.estimate(); // overwrite kmer_count high_level_max_id/size bin

max_tracker.update_max(bin_id, kmer_count);
max_tracker.update_max(bin_id, kmer_count * data->relaxed_fpr_correction);
// std::cout << "]: " << kmer_count << std::endl;
}

Expand All @@ -302,7 +303,6 @@ void hierarchical_binning::backtrack_split_bin(size_t trace_j,
size_t const cardinality_per_bin = divide_and_ceil(corrected_cardinality, number_of_bins);

max_tracker.update_max(bin_id, cardinality_per_bin);
max_tracker.update_split_max(bin_id, cardinality_per_bin);

// std::cout << "split " << trace_j << " into " << number_of_bins << ": " << cardinality_per_bin << std::endl;
}
Expand Down Expand Up @@ -359,7 +359,7 @@ size_t hierarchical_binning::backtracking(std::vector<std::vector<std::pair<size
backtrack_split_bin(trace_j, trace_i + 1, bin_id, max_tracker);
}

return max_tracker.choose_max_bin(config);
return max_tracker.max_id;
}

data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) const
Expand All @@ -369,7 +369,8 @@ data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) cons
.kmer_counts = data->kmer_counts,
.sketches = data->sketches,
.positions = {data->positions[trace_j]},
.fpr_correction = data->fpr_correction};
.fpr_correction = data->fpr_correction,
.relaxed_fpr_correction = data->relaxed_fpr_correction};

return libf_data;
}
Expand Down
Loading
Loading