Skip to content

Commit

Permalink
[MISC] Add fpr_quality util
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Aug 21, 2023
1 parent c3adc74 commit 7e76da1
Show file tree
Hide file tree
Showing 2 changed files with 246 additions and 0 deletions.
26 changes: 26 additions & 0 deletions util/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# ------------------------------------------------------------------------------------------------------------
# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
# shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md
# ------------------------------------------------------------------------------------------------------------

cmake_minimum_required (VERSION 3.10...3.22)
project (hibf_util CXX)

# Dependency: seqan::hibf
get_filename_component (HIBF_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE)
add_subdirectory ("${HIBF_ROOT_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/hibf_lib")

# Dependency: Sharg
include (FetchContent)
FetchContent_Declare (
sharg
URL "https://github.com/seqan/sharg-parser/releases/download/1.1.1/sharg-1.1.1-Source.tar.xz"
URL_HASH SHA256=7330f06501718e7871e55e5fd70d0e41472cc8b34bd0e3519f8c5547510c671c)
FetchContent_Populate (sharg)
list (APPEND CMAKE_PREFIX_PATH "${sharg_SOURCE_DIR}/build_system")
find_package (sharg 1.0 REQUIRED)

add_executable (fpr_quality fpr_quality.cpp)
target_link_libraries (fpr_quality seqan::hibf sharg::sharg)
220 changes: 220 additions & 0 deletions util/fpr_quality.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
// ------------------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md
// ------------------------------------------------------------------------------------------------------------

#include <iostream>
#include <random>

#include <hibf/contrib/robin_hood.hpp>
#include <hibf/detail/build/bin_size_in_bits.hpp>
#include <hibf/detail/layout/compute_fpr_correction.hpp>
#include <hibf/interleaved_bloom_filter.hpp>

#include <sharg/parser.hpp>

struct config
{
// Set by user
size_t kmer_size{12};
size_t elements{130};
size_t splits{5};
size_t hash{2};
double fpr{0.05};

// Internal
size_t number_of_kmers{};
size_t split_elements_per_bin{};
};

class positive_integer_validator
{
public:
using option_value_type = size_t;

positive_integer_validator() = default;
positive_integer_validator(positive_integer_validator const &) = default;
positive_integer_validator & operator=(positive_integer_validator const &) = default;
positive_integer_validator(positive_integer_validator &&) = default;
positive_integer_validator & operator=(positive_integer_validator &&) = default;
~positive_integer_validator() = default;

explicit positive_integer_validator(bool const is_zero_positive_) : is_zero_positive{is_zero_positive_}
{}

void operator()(option_value_type const & val) const
{
if (!is_zero_positive && !val)
throw sharg::validation_error{"The value must be a positive integer."};
}

std::string get_help_page_message() const
{
if (is_zero_positive)
return "Value must be a positive integer or 0.";
else
return "Value must be a positive integer.";
}

private:
bool is_zero_positive{false};
};

void init_parser(sharg::parser & parser, config & cfg)
{
parser.add_option(cfg.kmer_size,
sharg::config{.short_id = '\0',
.long_id = "kmer",
.description = "The k-mer size.",
.validator = sharg::arithmetic_range_validator{1, 31}});
parser.add_option(cfg.elements,
sharg::config{.short_id = '\0',
.long_id = "elements",
.description = "Number of elements to insert.",
.validator = positive_integer_validator{}});
parser.add_option(cfg.splits,
sharg::config{.short_id = '\0',
.long_id = "splits",
.description = "Number of bins to split into.",
.validator = positive_integer_validator{}});
parser.add_option(cfg.hash,
sharg::config{.short_id = '\0',
.long_id = "hash",
.description = "The number of hash functions to use.",
.validator = sharg::arithmetic_range_validator{1, 5}});
parser.add_option(cfg.fpr,
sharg::config{.short_id = '\0',
.long_id = "fpr",
.description = "The desired false positive rate.",
.validator = sharg::arithmetic_range_validator{0.0, 1.0}});
}

size_t split_bin_size_in_bits(config const & cfg)
{
return hibf::bin_size_in_bits(cfg.split_elements_per_bin, cfg.hash, cfg.fpr);
}

void print_results(size_t const fp_count, config const & cfg)
{
double const fpr = (cfg.number_of_kmers > cfg.elements)
? static_cast<double>(fp_count) / (cfg.number_of_kmers - cfg.elements)
: 0.0;
std::cout << "fp_count: " << fp_count << '\n' //
<< "fp_rate: " << std::fixed << std::setprecision(3) << fpr << '\n';
}

void single_tb(config const & cfg)
{
hibf::interleaved_bloom_filter ibf{hibf::bin_count{1u},
hibf::bin_size{hibf::bin_size_in_bits(cfg.elements, cfg.hash, cfg.fpr)},
hibf::hash_function_count{cfg.hash}};
auto agent = ibf.membership_agent();

// Generate elements many random kmer values.
robin_hood::unordered_set<uint64_t> inserted_values{};
std::random_device rd;
std::mt19937_64 gen(rd());
std::uniform_int_distribution<uint64_t> distrib(0ULL, cfg.number_of_kmers - 1u);

for (; inserted_values.size() < cfg.elements;)
inserted_values.emplace(distrib(gen));

for (uint64_t const value : inserted_values)
ibf.emplace(value, hibf::bin_index{0u});

// Check all possible kmer values.
size_t fp_count{};
for (uint64_t value{}; value < cfg.number_of_kmers; ++value)
{
auto & result = agent.bulk_contains(value);
fp_count += result[0];
}

// There are cfg.elements many TP.
fp_count -= cfg.elements;

print_results(fp_count, cfg);
}

void multiple_tb(config const & cfg, size_t const bin_size)
{
hibf::interleaved_bloom_filter ibf{hibf::bin_count{cfg.splits},
hibf::bin_size{bin_size},
hibf::hash_function_count{cfg.hash}};
auto agent = ibf.membership_agent();

// Generate elements many random kmer values.
robin_hood::unordered_set<uint64_t> all_values{};
std::random_device rd;
std::mt19937_64 gen(rd());
std::uniform_int_distribution<uint64_t> distrib(0ULL, cfg.number_of_kmers - 1u);

for (; all_values.size() < cfg.elements;)
all_values.emplace(distrib(gen));

// Distribute across all bins.
size_t counter{};
for (uint64_t const value : all_values)
ibf.emplace(value, hibf::bin_index{counter++ / cfg.split_elements_per_bin});

// Check all possible kmer values.
size_t fp_count{};
for (uint64_t value{}; value < cfg.number_of_kmers; ++value)
{
auto & result = agent.bulk_contains(value);
// A hit in any of the split bins is a FP. We only count one.
size_t local_fp{};
for (size_t i{}; i < cfg.splits; ++i)
{
local_fp += result[i];
}
fp_count += local_fp > 0u;
}

// There are cfg.elements many TP.
fp_count -= cfg.elements;

print_results(fp_count, cfg);
}

int main(int argc, char ** argv)
{
sharg::parser parser{"fpr_quality", argc, argv, sharg::update_notifications::off};
parser.info.author = "Enrico Seiler";
parser.info.short_copyright = "BSD 3-Clause License";
parser.info.short_description = "Inserts a given amount of k-mers into an IBF and queries all possible k-mers. "
"Reports the resulting FPR for both single and split bins.";
config cfg{};
init_parser(parser, cfg);
parser.parse();

cfg.number_of_kmers = (1ULL << (2 * cfg.kmer_size));

if (cfg.elements > cfg.number_of_kmers)
{
std::cout << "[WARNING] Inserting more elements than there are possible k-mers. "
<< "Setting number of elements to number of possible k-mers.\n\n";
cfg.elements = cfg.number_of_kmers;
}

cfg.split_elements_per_bin = (cfg.elements + cfg.splits - 1) / cfg.splits; // ceil for positive integers

std::cout << "kmer: " << cfg.kmer_size << '\n';
std::cout << "elements: " << cfg.elements << '\n';
std::cout << "splits: " << cfg.splits << '\n';
std::cout << "hash: " << cfg.hash << '\n';
std::cout << "fpr: " << cfg.fpr << "\n\n";

std::cout << "=== Single bin ===\n";
single_tb(cfg);

std::cout << "=== Split into " << cfg.splits << " bins ===\n";
multiple_tb(cfg, split_bin_size_in_bits(cfg));

std::cout << "=== Split into " << cfg.splits << " bins corrected ===\n";
double const fpr_correction =
hibf::layout::compute_fpr_correction({.fpr = cfg.fpr, .hash_count = cfg.hash, .t_max = cfg.splits})[cfg.splits];
multiple_tb(cfg, std::ceil(split_bin_size_in_bits(cfg) * fpr_correction));
}

0 comments on commit 7e76da1

Please sign in to comment.