generated from seqan/library-template
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #47 from eseiler/misc/util
[MISC] Add fpr_quality util
- Loading branch information
Showing
4 changed files
with
328 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
name: Util Linux | ||
|
||
on: | ||
push: | ||
branches: | ||
- 'main' | ||
pull_request: | ||
types: | ||
- unlabeled | ||
workflow_dispatch: | ||
|
||
concurrency: | ||
group: util-${{ github.event.pull_request.number || github.ref }} | ||
cancel-in-progress: ${{ github.event_name != 'push' }} | ||
|
||
env: | ||
TZ: Europe/Berlin | ||
|
||
defaults: | ||
run: | ||
shell: bash -Eexuo pipefail {0} | ||
|
||
jobs: | ||
build: | ||
name: ${{ matrix.name }} | ||
runs-on: ubuntu-22.04 | ||
timeout-minutes: 120 | ||
if: github.repository_owner == 'seqan' || github.event_name == 'workflow_dispatch' || github.event.label.name == 'lint' | ||
strategy: | ||
fail-fast: true | ||
matrix: | ||
include: | ||
- name: "gcc13" | ||
compiler: "gcc-13" | ||
|
||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
with: | ||
fetch-depth: 1 | ||
submodules: true | ||
|
||
- name: Setup toolchain | ||
uses: seqan/actions/setup-toolchain@main | ||
with: | ||
compiler: ${{ matrix.compiler }} | ||
ccache_size: 75M | ||
|
||
- name: Install CMake | ||
uses: seqan/actions/setup-cmake@main | ||
with: | ||
cmake: 3.16.9 | ||
|
||
- name: Configure util | ||
run: | | ||
mkdir build | ||
cd build | ||
cmake ../util -DCMAKE_BUILD_TYPE=Release \ | ||
-DHIBF_NATIVE_BUILD=OFF | ||
- name: Build util | ||
run: | | ||
ccache -z | ||
cd build | ||
make -k -j2 | ||
ccache -sv | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# ------------------------------------------------------------------------------------------------------------ | ||
# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
# shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md | ||
# ------------------------------------------------------------------------------------------------------------ | ||
|
||
cmake_minimum_required (VERSION 3.10...3.22) | ||
project (hibf_util CXX) | ||
|
||
# Dependency: seqan::hibf | ||
get_filename_component (HIBF_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE) | ||
add_subdirectory ("${HIBF_ROOT_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/hibf_lib") | ||
|
||
# Dependency: Sharg | ||
include (FetchContent) | ||
FetchContent_Declare ( | ||
sharg | ||
URL "https://github.com/seqan/sharg-parser/releases/download/1.1.1/sharg-1.1.1-Source.tar.xz" | ||
URL_HASH SHA256=7330f06501718e7871e55e5fd70d0e41472cc8b34bd0e3519f8c5547510c671c) | ||
FetchContent_Populate (sharg) | ||
list (APPEND CMAKE_PREFIX_PATH "${sharg_SOURCE_DIR}/build_system") | ||
find_package (sharg 1.0 REQUIRED) | ||
|
||
## ccache | ||
include ("${HIBF_ROOT_DIR}/test/cmake/hibf_require_ccache.cmake") | ||
hibf_require_ccache () | ||
|
||
add_executable (fpr_quality fpr_quality.cpp) | ||
target_link_libraries (fpr_quality seqan::hibf sharg::sharg) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
// ------------------------------------------------------------------------------------------------------------ | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md | ||
// ------------------------------------------------------------------------------------------------------------ | ||
|
||
#include <iostream> | ||
#include <random> | ||
|
||
#include <hibf/contrib/robin_hood.hpp> | ||
#include <hibf/detail/build/bin_size_in_bits.hpp> | ||
#include <hibf/detail/layout/compute_fpr_correction.hpp> | ||
#include <hibf/interleaved_bloom_filter.hpp> | ||
|
||
#include <sharg/parser.hpp> | ||
|
||
struct config | ||
{ | ||
// Set by user | ||
size_t kmer_size{12}; | ||
size_t elements{130}; | ||
size_t splits{5}; | ||
size_t hash{2}; | ||
double fpr{0.05}; | ||
|
||
// Internal | ||
size_t number_of_kmers{}; | ||
size_t split_elements_per_bin{}; | ||
}; | ||
|
||
class positive_integer_validator | ||
{ | ||
public: | ||
using option_value_type = size_t; | ||
|
||
positive_integer_validator() = default; | ||
positive_integer_validator(positive_integer_validator const &) = default; | ||
positive_integer_validator & operator=(positive_integer_validator const &) = default; | ||
positive_integer_validator(positive_integer_validator &&) = default; | ||
positive_integer_validator & operator=(positive_integer_validator &&) = default; | ||
~positive_integer_validator() = default; | ||
|
||
explicit positive_integer_validator(bool const is_zero_positive_) : is_zero_positive{is_zero_positive_} | ||
{} | ||
|
||
void operator()(option_value_type const & val) const | ||
{ | ||
if (!is_zero_positive && !val) | ||
throw sharg::validation_error{"The value must be a positive integer."}; | ||
} | ||
|
||
std::string get_help_page_message() const | ||
{ | ||
if (is_zero_positive) | ||
return "Value must be a positive integer or 0."; | ||
else | ||
return "Value must be a positive integer."; | ||
} | ||
|
||
private: | ||
bool is_zero_positive{false}; | ||
}; | ||
|
||
void init_parser(sharg::parser & parser, config & cfg) | ||
{ | ||
parser.add_option(cfg.kmer_size, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "kmer", | ||
.description = "The k-mer size.", | ||
.validator = sharg::arithmetic_range_validator{1, 31}}); | ||
parser.add_option(cfg.elements, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "elements", | ||
.description = "Number of elements to insert.", | ||
.validator = positive_integer_validator{}}); | ||
parser.add_option(cfg.splits, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "splits", | ||
.description = "Number of bins to split into.", | ||
.validator = positive_integer_validator{}}); | ||
parser.add_option(cfg.hash, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "hash", | ||
.description = "The number of hash functions to use.", | ||
.validator = sharg::arithmetic_range_validator{1, 5}}); | ||
parser.add_option(cfg.fpr, | ||
sharg::config{.short_id = '\0', | ||
.long_id = "fpr", | ||
.description = "The desired false positive rate.", | ||
.validator = sharg::arithmetic_range_validator{0.0, 1.0}}); | ||
} | ||
|
||
size_t split_bin_size_in_bits(config const & cfg) | ||
{ | ||
return hibf::bin_size_in_bits(cfg.split_elements_per_bin, cfg.hash, cfg.fpr); | ||
} | ||
|
||
void print_results(size_t const fp_count, config const & cfg) | ||
{ | ||
double const fpr = (cfg.number_of_kmers > cfg.elements) | ||
? static_cast<double>(fp_count) / (cfg.number_of_kmers - cfg.elements) | ||
: 0.0; | ||
std::cout << "fp_count: " << fp_count << '\n' // | ||
<< "fp_rate: " << std::fixed << std::setprecision(3) << fpr << '\n'; | ||
} | ||
|
||
void single_tb(config const & cfg) | ||
{ | ||
hibf::interleaved_bloom_filter ibf{hibf::bin_count{1u}, | ||
hibf::bin_size{hibf::bin_size_in_bits(cfg.elements, cfg.hash, cfg.fpr)}, | ||
hibf::hash_function_count{cfg.hash}}; | ||
auto agent = ibf.membership_agent(); | ||
|
||
// Generate elements many random kmer values. | ||
robin_hood::unordered_set<uint64_t> inserted_values{}; | ||
std::random_device rd; | ||
std::mt19937_64 gen(rd()); | ||
std::uniform_int_distribution<uint64_t> distrib(0ULL, cfg.number_of_kmers - 1u); | ||
|
||
for (; inserted_values.size() < cfg.elements;) | ||
inserted_values.emplace(distrib(gen)); | ||
|
||
for (uint64_t const value : inserted_values) | ||
ibf.emplace(value, hibf::bin_index{0u}); | ||
|
||
// Check all possible kmer values. | ||
size_t fp_count{}; | ||
for (uint64_t value{}; value < cfg.number_of_kmers; ++value) | ||
{ | ||
auto & result = agent.bulk_contains(value); | ||
fp_count += result[0]; | ||
} | ||
|
||
// There are cfg.elements many TP. | ||
fp_count -= cfg.elements; | ||
|
||
print_results(fp_count, cfg); | ||
} | ||
|
||
void multiple_tb(config const & cfg, size_t const bin_size) | ||
{ | ||
hibf::interleaved_bloom_filter ibf{hibf::bin_count{cfg.splits}, | ||
hibf::bin_size{bin_size}, | ||
hibf::hash_function_count{cfg.hash}}; | ||
auto agent = ibf.membership_agent(); | ||
|
||
// Generate elements many random kmer values. | ||
robin_hood::unordered_set<uint64_t> all_values{}; | ||
std::random_device rd; | ||
std::mt19937_64 gen(rd()); | ||
std::uniform_int_distribution<uint64_t> distrib(0ULL, cfg.number_of_kmers - 1u); | ||
|
||
for (; all_values.size() < cfg.elements;) | ||
all_values.emplace(distrib(gen)); | ||
|
||
// Distribute across all bins. | ||
size_t counter{}; | ||
for (uint64_t const value : all_values) | ||
ibf.emplace(value, hibf::bin_index{counter++ / cfg.split_elements_per_bin}); | ||
|
||
// Check all possible kmer values. | ||
size_t fp_count{}; | ||
for (uint64_t value{}; value < cfg.number_of_kmers; ++value) | ||
{ | ||
auto & result = agent.bulk_contains(value); | ||
// A hit in any of the split bins is a FP. We only count one. | ||
size_t local_fp{}; | ||
for (size_t i{}; i < cfg.splits; ++i) | ||
{ | ||
local_fp += result[i]; | ||
} | ||
fp_count += local_fp > 0u; | ||
} | ||
|
||
// There are cfg.elements many TP. | ||
fp_count -= cfg.elements; | ||
|
||
print_results(fp_count, cfg); | ||
} | ||
|
||
int main(int argc, char ** argv) | ||
{ | ||
sharg::parser parser{"fpr_quality", argc, argv, sharg::update_notifications::off}; | ||
parser.info.author = "Enrico Seiler"; | ||
parser.info.short_copyright = "BSD 3-Clause License"; | ||
parser.info.short_description = "Inserts a given amount of k-mers into an IBF and queries all possible k-mers. " | ||
"Reports the resulting FPR for both single and split bins."; | ||
config cfg{}; | ||
init_parser(parser, cfg); | ||
parser.parse(); | ||
|
||
cfg.number_of_kmers = (1ULL << (2 * cfg.kmer_size)); | ||
|
||
if (cfg.elements > cfg.number_of_kmers) | ||
{ | ||
std::cout << "[WARNING] Inserting more elements than there are possible k-mers. " | ||
<< "Setting number of elements to number of possible k-mers.\n\n"; | ||
cfg.elements = cfg.number_of_kmers; | ||
} | ||
|
||
cfg.split_elements_per_bin = (cfg.elements + cfg.splits - 1) / cfg.splits; // ceil for positive integers | ||
|
||
std::cout << "kmer: " << cfg.kmer_size << '\n'; | ||
std::cout << "elements: " << cfg.elements << '\n'; | ||
std::cout << "splits: " << cfg.splits << '\n'; | ||
std::cout << "hash: " << cfg.hash << '\n'; | ||
std::cout << "fpr: " << cfg.fpr << "\n\n"; | ||
|
||
std::cout << "=== Single bin ===\n"; | ||
single_tb(cfg); | ||
|
||
std::cout << "=== Split into " << cfg.splits << " bins ===\n"; | ||
multiple_tb(cfg, split_bin_size_in_bits(cfg)); | ||
|
||
std::cout << "=== Split into " << cfg.splits << " bins corrected ===\n"; | ||
double const fpr_correction = | ||
hibf::layout::compute_fpr_correction({.fpr = cfg.fpr, .hash_count = cfg.hash, .t_max = cfg.splits})[cfg.splits]; | ||
multiple_tb(cfg, std::ceil(split_bin_size_in_bits(cfg) * fpr_correction)); | ||
} |
6e6f5a2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Successfully deployed to the following URLs:
hibf – ./
hibf.vercel.app
hibf-seqan.vercel.app
hibf-git-main-seqan.vercel.app