diff --git a/.github/workflows/ci_lint.yml b/.github/workflows/ci_lint.yml index 6dd43133..7d79e2c2 100644 --- a/.github/workflows/ci_lint.yml +++ b/.github/workflows/ci_lint.yml @@ -57,12 +57,21 @@ jobs: steps: - name: "Cancel Documentation" run: echo "Cancelling Documentation" + cancel_util: + name: Cancel running Workflows + concurrency: + group: util-${{ github.event.pull_request.number }} + cancel-in-progress: true + runs-on: ubuntu-22.04 + steps: + - name: "Cancel Util" + run: echo "Cancelling Util" lint: name: Lint concurrency: group: lint-${{ github.event.pull_request.number }} cancel-in-progress: true - needs: [cancel_linux, cancel_macos, cancel_misc, cancel_coverage, cancel_documentation] + needs: [cancel_linux, cancel_macos, cancel_misc, cancel_coverage, cancel_documentation, cancel_util] runs-on: ubuntu-22.04 timeout-minutes: 15 steps: diff --git a/.github/workflows/ci_util.yml b/.github/workflows/ci_util.yml new file mode 100644 index 00000000..3bd5bdd3 --- /dev/null +++ b/.github/workflows/ci_util.yml @@ -0,0 +1,68 @@ +name: Util Linux + +on: + push: + branches: + - 'main' + pull_request: + types: + - unlabeled + workflow_dispatch: + +concurrency: + group: util-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.event_name != 'push' }} + +env: + TZ: Europe/Berlin + +defaults: + run: + shell: bash -Eexuo pipefail {0} + +jobs: + build: + name: ${{ matrix.name }} + runs-on: ubuntu-22.04 + timeout-minutes: 120 + if: github.repository_owner == 'seqan' || github.event_name == 'workflow_dispatch' || github.event.label.name == 'lint' + strategy: + fail-fast: true + matrix: + include: + - name: "gcc13" + compiler: "gcc-13" + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 1 + submodules: true + + - name: Setup toolchain + uses: seqan/actions/setup-toolchain@main + with: + compiler: ${{ matrix.compiler }} + ccache_size: 75M + + - name: Install CMake + uses: seqan/actions/setup-cmake@main + with: + cmake: 3.16.9 + + - name: Configure util + run: | + mkdir build + cd build + cmake ../util -DCMAKE_BUILD_TYPE=Release \ + -DHIBF_NATIVE_BUILD=OFF + + - name: Build util + run: | + ccache -z + cd build + make -k -j2 + ccache -sv + + diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt new file mode 100644 index 00000000..906d9703 --- /dev/null +++ b/util/CMakeLists.txt @@ -0,0 +1,30 @@ +# ------------------------------------------------------------------------------------------------------------ +# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +# shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md +# ------------------------------------------------------------------------------------------------------------ + +cmake_minimum_required (VERSION 3.10...3.22) +project (hibf_util CXX) + +# Dependency: seqan::hibf +get_filename_component (HIBF_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE) +add_subdirectory ("${HIBF_ROOT_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/hibf_lib") + +# Dependency: Sharg +include (FetchContent) +FetchContent_Declare ( + sharg + URL "https://github.com/seqan/sharg-parser/releases/download/1.1.1/sharg-1.1.1-Source.tar.xz" + URL_HASH SHA256=7330f06501718e7871e55e5fd70d0e41472cc8b34bd0e3519f8c5547510c671c) +FetchContent_Populate (sharg) +list (APPEND CMAKE_PREFIX_PATH "${sharg_SOURCE_DIR}/build_system") +find_package (sharg 1.0 REQUIRED) + +## ccache +include ("${HIBF_ROOT_DIR}/test/cmake/hibf_require_ccache.cmake") +hibf_require_ccache () + +add_executable (fpr_quality fpr_quality.cpp) +target_link_libraries (fpr_quality seqan::hibf sharg::sharg) diff --git a/util/fpr_quality.cpp b/util/fpr_quality.cpp new file mode 100644 index 00000000..e66b92bd --- /dev/null +++ b/util/fpr_quality.cpp @@ -0,0 +1,220 @@ +// ------------------------------------------------------------------------------------------------------------ +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/Hierarchical_Interleaved_Bloomfilter/blob/main/LICENSE.md +// ------------------------------------------------------------------------------------------------------------ + +#include +#include + +#include +#include +#include +#include + +#include + +struct config +{ + // Set by user + size_t kmer_size{12}; + size_t elements{130}; + size_t splits{5}; + size_t hash{2}; + double fpr{0.05}; + + // Internal + size_t number_of_kmers{}; + size_t split_elements_per_bin{}; +}; + +class positive_integer_validator +{ +public: + using option_value_type = size_t; + + positive_integer_validator() = default; + positive_integer_validator(positive_integer_validator const &) = default; + positive_integer_validator & operator=(positive_integer_validator const &) = default; + positive_integer_validator(positive_integer_validator &&) = default; + positive_integer_validator & operator=(positive_integer_validator &&) = default; + ~positive_integer_validator() = default; + + explicit positive_integer_validator(bool const is_zero_positive_) : is_zero_positive{is_zero_positive_} + {} + + void operator()(option_value_type const & val) const + { + if (!is_zero_positive && !val) + throw sharg::validation_error{"The value must be a positive integer."}; + } + + std::string get_help_page_message() const + { + if (is_zero_positive) + return "Value must be a positive integer or 0."; + else + return "Value must be a positive integer."; + } + +private: + bool is_zero_positive{false}; +}; + +void init_parser(sharg::parser & parser, config & cfg) +{ + parser.add_option(cfg.kmer_size, + sharg::config{.short_id = '\0', + .long_id = "kmer", + .description = "The k-mer size.", + .validator = sharg::arithmetic_range_validator{1, 31}}); + parser.add_option(cfg.elements, + sharg::config{.short_id = '\0', + .long_id = "elements", + .description = "Number of elements to insert.", + .validator = positive_integer_validator{}}); + parser.add_option(cfg.splits, + sharg::config{.short_id = '\0', + .long_id = "splits", + .description = "Number of bins to split into.", + .validator = positive_integer_validator{}}); + parser.add_option(cfg.hash, + sharg::config{.short_id = '\0', + .long_id = "hash", + .description = "The number of hash functions to use.", + .validator = sharg::arithmetic_range_validator{1, 5}}); + parser.add_option(cfg.fpr, + sharg::config{.short_id = '\0', + .long_id = "fpr", + .description = "The desired false positive rate.", + .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); +} + +size_t split_bin_size_in_bits(config const & cfg) +{ + return hibf::bin_size_in_bits(cfg.split_elements_per_bin, cfg.hash, cfg.fpr); +} + +void print_results(size_t const fp_count, config const & cfg) +{ + double const fpr = (cfg.number_of_kmers > cfg.elements) + ? static_cast(fp_count) / (cfg.number_of_kmers - cfg.elements) + : 0.0; + std::cout << "fp_count: " << fp_count << '\n' // + << "fp_rate: " << std::fixed << std::setprecision(3) << fpr << '\n'; +} + +void single_tb(config const & cfg) +{ + hibf::interleaved_bloom_filter ibf{hibf::bin_count{1u}, + hibf::bin_size{hibf::bin_size_in_bits(cfg.elements, cfg.hash, cfg.fpr)}, + hibf::hash_function_count{cfg.hash}}; + auto agent = ibf.membership_agent(); + + // Generate elements many random kmer values. + robin_hood::unordered_set inserted_values{}; + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution distrib(0ULL, cfg.number_of_kmers - 1u); + + for (; inserted_values.size() < cfg.elements;) + inserted_values.emplace(distrib(gen)); + + for (uint64_t const value : inserted_values) + ibf.emplace(value, hibf::bin_index{0u}); + + // Check all possible kmer values. + size_t fp_count{}; + for (uint64_t value{}; value < cfg.number_of_kmers; ++value) + { + auto & result = agent.bulk_contains(value); + fp_count += result[0]; + } + + // There are cfg.elements many TP. + fp_count -= cfg.elements; + + print_results(fp_count, cfg); +} + +void multiple_tb(config const & cfg, size_t const bin_size) +{ + hibf::interleaved_bloom_filter ibf{hibf::bin_count{cfg.splits}, + hibf::bin_size{bin_size}, + hibf::hash_function_count{cfg.hash}}; + auto agent = ibf.membership_agent(); + + // Generate elements many random kmer values. + robin_hood::unordered_set all_values{}; + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution distrib(0ULL, cfg.number_of_kmers - 1u); + + for (; all_values.size() < cfg.elements;) + all_values.emplace(distrib(gen)); + + // Distribute across all bins. + size_t counter{}; + for (uint64_t const value : all_values) + ibf.emplace(value, hibf::bin_index{counter++ / cfg.split_elements_per_bin}); + + // Check all possible kmer values. + size_t fp_count{}; + for (uint64_t value{}; value < cfg.number_of_kmers; ++value) + { + auto & result = agent.bulk_contains(value); + // A hit in any of the split bins is a FP. We only count one. + size_t local_fp{}; + for (size_t i{}; i < cfg.splits; ++i) + { + local_fp += result[i]; + } + fp_count += local_fp > 0u; + } + + // There are cfg.elements many TP. + fp_count -= cfg.elements; + + print_results(fp_count, cfg); +} + +int main(int argc, char ** argv) +{ + sharg::parser parser{"fpr_quality", argc, argv, sharg::update_notifications::off}; + parser.info.author = "Enrico Seiler"; + parser.info.short_copyright = "BSD 3-Clause License"; + parser.info.short_description = "Inserts a given amount of k-mers into an IBF and queries all possible k-mers. " + "Reports the resulting FPR for both single and split bins."; + config cfg{}; + init_parser(parser, cfg); + parser.parse(); + + cfg.number_of_kmers = (1ULL << (2 * cfg.kmer_size)); + + if (cfg.elements > cfg.number_of_kmers) + { + std::cout << "[WARNING] Inserting more elements than there are possible k-mers. " + << "Setting number of elements to number of possible k-mers.\n\n"; + cfg.elements = cfg.number_of_kmers; + } + + cfg.split_elements_per_bin = (cfg.elements + cfg.splits - 1) / cfg.splits; // ceil for positive integers + + std::cout << "kmer: " << cfg.kmer_size << '\n'; + std::cout << "elements: " << cfg.elements << '\n'; + std::cout << "splits: " << cfg.splits << '\n'; + std::cout << "hash: " << cfg.hash << '\n'; + std::cout << "fpr: " << cfg.fpr << "\n\n"; + + std::cout << "=== Single bin ===\n"; + single_tb(cfg); + + std::cout << "=== Split into " << cfg.splits << " bins ===\n"; + multiple_tb(cfg, split_bin_size_in_bits(cfg)); + + std::cout << "=== Split into " << cfg.splits << " bins corrected ===\n"; + double const fpr_correction = + hibf::layout::compute_fpr_correction({.fpr = cfg.fpr, .hash_count = cfg.hash, .t_max = cfg.splits})[cfg.splits]; + multiple_tb(cfg, std::ceil(split_bin_size_in_bits(cfg) * fpr_correction)); +}