Merge pull request #45 from eseiler/misc/perf

[MISC] Improve bulk_contains performance
seqan · Aug 24, 2023 · 5caf0e3 · 5caf0e3 · vercel · Aug 24, 2023
2 parents 6ccdaaf + d16026b
commit 5caf0e3
Show file tree

Hide file tree

Showing 6 changed files with 297 additions and 27 deletions.
diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml
@@ -54,6 +54,7 @@ jobs:
             compiler: "intel"
             build: unit
             build_type: Release
+            cxx_flags: "-Xclang=-Wno-pass-failed"
 
     steps:
       - name: Checkout
@@ -83,7 +84,8 @@ jobs:
           cd build
           cmake ../test/${{ matrix.build }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
                                             -DHIBF_NATIVE_BUILD=OFF \
-                                            -DHIBF_VERBOSE_TESTS=OFF
+                                            -DHIBF_VERBOSE_TESTS=OFF \
+                                            -DCMAKE_CXX_FLAGS="${{ matrix.cxx_flags }}"
           make -j2 gtest_build
 
       - name: Build tests

diff --git a/include/hibf/interleaved_bloom_filter.hpp b/include/hibf/interleaved_bloom_filter.hpp
@@ -654,6 +654,9 @@ class interleaved_bloom_filter::membership_agent_type
     //!\brief A pointer to the augmented hibf::interleaved_bloom_filter.
     ibf_t const * ibf_ptr{nullptr};
 
+    //!\brief Stores access positions of augmented hibf::interleaved_bloom_filter.
+    std::array<size_t, 5> bloom_filter_indices;
+
 public:
     /*!\name Constructors, destructor and assignment
      * \{
@@ -701,24 +704,113 @@ class interleaved_bloom_filter::membership_agent_type
         assert(ibf_ptr != nullptr);
         assert(result_buffer.size() == ibf_ptr->bin_count());
 
-        std::array<size_t, 5> bloom_filter_indices;
-        std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
+        // Needed for auto-vectorization of loop. ibf_ptr->bin_words could change between loops.
+        size_t const bin_words = ibf_ptr->bin_words;
+        size_t const hash_funs = ibf_ptr->hash_funs;
+
+#ifndef NDEBUG
+        assert(bin_words != 0u);
+        assert(hash_funs != 0u);
+#else
+        // Removes case for bin_words == 0u. The same statement inside the switch-case wouldn't have that effect.
+        if (bin_words == 0u)
+            __builtin_unreachable();
+        if (hash_funs == 0u)
+            __builtin_unreachable();
+#endif
+
+        for (size_t i = 0; i < hash_funs; ++i)
+            bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, ibf_ptr->hash_seeds[i]) >> 6;
 
-        for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
-            bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
+        uint64_t * const raw = result_buffer.raw_data().data(); // TODO: std::assume_aligned<64> once memory-aligned
+        uint64_t const * const ibf_data = ibf_ptr->data.data(); // TODO: std::assume_aligned<64> once memory-aligned
+        std::memcpy(raw, ibf_data + bloom_filter_indices[0], sizeof(uint64_t) * bin_words);
 
-        for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
+        // https://godbolt.org/z/1nbhvqeGj
+        // Having the loop inside is faster.
+        // GCOVR_EXCL_START
+        switch (bin_words)
         {
-            size_t tmp{-1ULL};
-            for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
+        case 1u: // 1 AND (64 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
             {
-                assert(bloom_filter_indices[i] < ibf_ptr->data.size());
-                tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
-                bloom_filter_indices[i] += 64;
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+                raw[0] &= ibf_raw[0];
+            }
+            break;
+        case 2u: // 1 SSE4 instruction (128 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 2u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 3u: // 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 3u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 4u: // 1 AVX2 instruction (256 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 4u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 5u: // 1 AVX2 instruction (256 bit) + 1 AND (64 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 5u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 6u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 6u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 7u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 7u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        case 8u: // 1 AVX512 instruction (512 bit)
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < 8u; ++batch)
+                    raw[batch] &= ibf_raw[batch];
+            }
+            break;
+        default: // Auto vectorize. Might create different versions.
+            for (size_t i = 1; i < hash_funs; ++i)
+            {
+                uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
+#pragma omp simd
+                for (size_t batch = 0; batch < bin_words; ++batch)
+                    raw[batch] &= ibf_raw[batch];
             }
-
-            result_buffer.data.set_int(batch << 6, tmp);
         }
+        // GCOVR_EXCL_STOP
 
         return result_buffer;
     }

diff --git a/test/cmake/hibf_require_benchmark.cmake b/test/cmake/hibf_require_benchmark.cmake
@@ -12,25 +12,25 @@ cmake_minimum_required (VERSION 3.16)
 macro (hibf_require_benchmark)
     enable_testing ()
 
-    set (benchmark_version "1.8.0")
-    set (gbenchmark_git_tag "v${benchmark_version}")
+    set (HIBF_BENCHMARK_TAG "v1.8.2")
 
-    find_package (benchmark ${benchmark_version} EXACT QUIET)
+    find_package (benchmark QUIET)
 
-    if (NOT benchmark_FOUND)
-        message (STATUS "Fetching Google Benchmark ${benchmark_version}")
+    # Also ensure that Google Benchmark is fetched for the latest library cron, which sets the tag to "main".
+    if (NOT benchmark_FOUND OR "${HIBF_BENCHMARK_TAG}" STREQUAL "main")
+        message (STATUS "Fetching Google Benchmark ${HIBF_BENCHMARK_TAG}")
 
         include (FetchContent)
         FetchContent_Declare (
             gbenchmark_fetch_content
             GIT_REPOSITORY "https://github.com/google/benchmark.git"
-            GIT_TAG "${gbenchmark_git_tag}")
+            GIT_TAG "${HIBF_BENCHMARK_TAG}")
         option (BENCHMARK_ENABLE_TESTING "" OFF)
         option (BENCHMARK_ENABLE_WERROR "" OFF) # Does not apply to Debug builds.
         option (BENCHMARK_ENABLE_INSTALL "" OFF)
         FetchContent_MakeAvailable (gbenchmark_fetch_content)
     else ()
-        message (STATUS "Found Google Benchmark ${benchmark_version}")
+        message (STATUS "  Test dependency:            Google Benchmark ${benchmark_VERSION} found.")
     endif ()
 
     # NOTE: google benchmark's CMakeLists.txt already defines Shlwapi

diff --git a/test/cmake/hibf_require_test.cmake b/test/cmake/hibf_require_test.cmake
@@ -12,24 +12,24 @@ cmake_minimum_required (VERSION 3.16)
 macro (hibf_require_test)
     enable_testing ()
 
-    set (gtest_version "1.13.0")
-    set (gtest_git_tag "v${gtest_version}")
+    set (HIBF_GTEST_TAG "v1.13.0")
 
-    find_package (GTest ${gtest_version} EXACT QUIET)
+    find_package (GTest QUIET)
 
-    if (NOT GTest_FOUND)
-        message (STATUS "Fetching Google Test ${gtest_version}")
+    # Also ensure that Google Test is fetched for the latest library cron, which sets the tag to "main".
+    if (NOT GTest_FOUND OR "${HIBF_GTEST_TAG}" STREQUAL "main")
+        message (STATUS "Fetching Google Test ${HIBF_GTEST_TAG}")
 
         include (FetchContent)
         FetchContent_Declare (
             gtest_fetch_content
             GIT_REPOSITORY "https://github.com/google/googletest.git"
-            GIT_TAG "${gtest_git_tag}")
+            GIT_TAG "${HIBF_GTEST_TAG}")
         option (BUILD_GMOCK "" OFF)
         option (INSTALL_GTEST "" OFF)
         FetchContent_MakeAvailable (gtest_fetch_content)
     else ()
-        message (STATUS "Found Google Test ${gtest_version}")
+        message (STATUS "  Test dependency:            Google Test ${GTest_VERSION} found.")
     endif ()
 
     if (NOT TARGET gtest_build)

diff --git a/test/performance/ibf/CMakeLists.txt b/test/performance/ibf/CMakeLists.txt
@@ -0,0 +1 @@
+hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
diff --git a/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp b/test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
@@ -0,0 +1,175 @@
+// -----------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
+// -----------------------------------------------------------------------------------------------------
+
+#include <benchmark/benchmark.h>
+
+#include <hibf/contrib/std/to.hpp>
+#include <hibf/contrib/std/zip_view.hpp>
+#include <hibf/interleaved_bloom_filter.hpp>
+
+//!\brief Wraps `count` in a benchmark::Counter using the kIsIterationInvariantRate flag and the kIs1000 unit base.
+//!\param[in] count Number of items handled in one benchmark iteration.
+//!\returns The configured counter, ready to be stored in `state.counters`.
+inline benchmark::Counter hashes_per_second(size_t const count)
+{
+    using counter_t = benchmark::Counter;
+
+    counter_t rate(count, counter_t::kIsIterationInvariantRate, counter_t::OneK::kIs1000);
+    return rate;
+}
+
+#if 1
+//!\brief Adds the small argument rows to benchmark `b`; each row is {bins, bin_size, hash_num, sequence_length}.
+static void arguments(benchmark::internal::Benchmark * b)
+{
+    // Total size: 1MiB
+    // NOTE(review): bins * bin_size is not the same for every row (64 * 2^17 = 2^23, 1024 * 2^10 = 2^20) - confirm.
+    constexpr long long hash_num = 2;
+    constexpr long long sequence_length = 1LL << 17;
+
+    // {bins, bin_size}, registered in this order.
+    constexpr long long layouts[][2] = {{64, 1LL << 17},
+                                        {128, 1LL << 16},
+                                        {192, 1LL << 16},
+                                        {256, 1LL << 15},
+                                        {1024, 1LL << 10}};
+
+    for (auto const & layout : layouts)
+        b->Args({layout[0], layout[1], hash_num, sequence_length});
+}
+#else
+//!\brief Adds the large argument rows to benchmark `b`; each row is {bins, bin_size, hash_num, sequence_length}.
+static void arguments(benchmark::internal::Benchmark * b)
+{
+    // Total size: 1GiB
+    // NOTE(review): bins * bin_size is not the same for every row (64 * 2^27 = 2^33, 1024 * 2^20 = 2^30) - confirm.
+    constexpr long long hash_num = 2;
+    constexpr long long sequence_length = 1LL << 27;
+
+    // {bins, bin_size}, registered in this order.
+    constexpr long long layouts[][2] = {{64, 1LL << 27},
+                                        {128, 1LL << 26},
+                                        {192, 1LL << 26},
+                                        {256, 1LL << 25},
+                                        {1024, 1LL << 20}};
+
+    for (auto const & layout : layouts)
+        b->Args({layout[0], layout[1], hash_num, sequence_length});
+}
+#endif
+
+/*!\brief Builds the fixture shared by all benchmarks in this file.
+ * \param[in] state Benchmark state. `state.range(0)` to `state.range(3)` are read as the number of bins, the bin
+ *                  size, the number of hash functions and the sequence length, in that order.
+ * \returns A tuple `(bin_indices, hash_values, ibf)`:
+ *          `bin_indices` holds `sequence_length` values drawn uniformly from `[0, bins - 1]`,
+ *          `hash_values` holds `sequence_length` values drawn uniformly from the full `size_t` range, and
+ *          `ibf` is a hibf::interleaved_bloom_filter constructed from the bin count, bin size and hash function count.
+ *
+ * The random engine uses a fixed seed, so every call yields the same fixture for the same arguments.
+ */
+auto set_up(::benchmark::State const & state)
+{
+    size_t const bins = state.range(0);
+    size_t const bits = state.range(1);
+    size_t const hash_num = state.range(2);
+    size_t const sequence_length = state.range(3);
+
+    // A single engine lives for the whole set-up so that its state advances with every draw.
+    // Constructing (and thereby re-seeding) an engine for each drawn value would return the same number every time,
+    // which makes all benchmarks touch one single bin and one single hash position.
+    std::mt19937_64 engine{0ULL};
+
+    // Returns `sequence_length` values drawn uniformly from [0, max_value].
+    auto generate = [sequence_length, &engine](size_t const max_value = std::numeric_limits<size_t>::max())
+    {
+        std::uniform_int_distribution<size_t> distr{0u, max_value};
+        std::vector<size_t> result(sequence_length);
+
+        std::ranges::generate(result,
+                              [&distr, &engine]()
+                              {
+                                  return distr(engine);
+                              });
+        return result;
+    };
+
+    // Declaration order fixes which part of the random stream each vector receives.
+    std::vector<size_t> const bin_indices{generate(bins - 1)};
+    std::vector<size_t> const hash_values{generate()};
+
+    hibf::interleaved_bloom_filter ibf{hibf::bin_count{bins},
+                                       hibf::bin_size{bits},
+                                       hibf::hash_function_count{hash_num}};
+
+    return std::make_tuple(bin_indices, hash_values, ibf);
+}
+
+void emplace_benchmark(::benchmark::State & state) // Measures ibf.emplace() over every (hash, bin) pair from set_up().
+{
+    auto && [bin_indices, hash_values, ibf] = set_up(state);
+
+    for (auto _ : state) // One iteration inserts the whole zipped sequence; the filter is not cleared in between.
+    {
+        for (auto [hash, bin] : seqan::std::views::zip(hash_values, bin_indices))
+            ibf.emplace(hash, hibf::bin_index{bin});
+    }
+
+    state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values)); // Inserts per iteration.
+}
+
+//!\brief Measures clearing every bin of the filter, one `ibf.clear(bin)` call per bin.
+//!\param[in,out] state Benchmark state; `state.range(0)` is used as the number of bins.
+void clear_benchmark(::benchmark::State & state)
+{
+    auto && [bin_indices, hash_values, ibf] = set_up(state);
+    // Only the filter is needed here.
+    static_cast<void>(bin_indices);
+    static_cast<void>(hash_values);
+
+    // One hibf::bin_index for each bin: 0, 1, ..., bins - 1.
+    size_t const bins = static_cast<size_t>(state.range(0));
+    std::vector<hibf::bin_index> bin_range;
+    bin_range.reserve(bins);
+    for (size_t i = 0; i < bins; ++i)
+        bin_range.push_back(hibf::bin_index{i});
+
+    for (auto _ : state)
+    {
+        for (auto bin : bin_range)
+            ibf.clear(bin);
+    }
+
+    state.counters["bins/sec"] = hashes_per_second(bin_range.size());
+}
+
+//!\brief Measures clearing every bin of the filter with a single `ibf.clear(bin_range)` call per iteration.
+//!\param[in,out] state Benchmark state; `state.range(0)` is used as the number of bins.
+void clear_range_benchmark(::benchmark::State & state)
+{
+    auto && [bin_indices, hash_values, ibf] = set_up(state);
+    // Only the filter is needed here.
+    static_cast<void>(bin_indices);
+    static_cast<void>(hash_values);
+
+    // One hibf::bin_index for each bin: 0, 1, ..., bins - 1.
+    size_t const bins = static_cast<size_t>(state.range(0));
+    std::vector<hibf::bin_index> bin_range;
+    bin_range.reserve(bins);
+    for (size_t i = 0; i < bins; ++i)
+        bin_range.push_back(hibf::bin_index{i});
+
+    for (auto _ : state)
+    {
+        ibf.clear(bin_range);
+    }
+
+    state.counters["bins/sec"] = hashes_per_second(bin_range.size());
+}
+
+void bulk_contains_benchmark(::benchmark::State & state) // Measures one agent.bulk_contains() call per hash value.
+{
+    auto && [bin_indices, hash_values, ibf] = set_up(state);
+
+    for (auto [hash, bin] : seqan::std::views::zip(hash_values, bin_indices)) // Fill the filter before measuring.
+        ibf.emplace(hash, hibf::bin_index{bin});
+
+    auto agent = ibf.membership_agent(); // Created once and reused for every query.
+    for (auto _ : state)
+    {
+        for (auto hash : hash_values)
+        {
+            [[maybe_unused]] auto & res = agent.bulk_contains(hash); // The result is bound but never inspected.
+            benchmark::ClobberMemory();
+        }
+    }
+
+    state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values)); // Queries per iteration.
+}
+
+void bulk_count_benchmark(::benchmark::State & state) // Measures one agent.bulk_count() call over all hash values.
+{
+    auto && [bin_indices, hash_values, ibf] = set_up(state);
+
+    for (auto [hash, bin] : seqan::std::views::zip(hash_values, bin_indices)) // Fill the filter before measuring.
+        ibf.emplace(hash, hibf::bin_index{bin});
+
+    auto agent = ibf.counting_agent(); // Created once and reused for every iteration.
+    for (auto _ : state)
+    {
+        [[maybe_unused]] auto & res = agent.bulk_count(hash_values); // The result is bound but never inspected.
+        benchmark::ClobberMemory();
+    }
+
+    state.counters["hashes/sec"] = hashes_per_second(std::ranges::size(hash_values)); // Hashes passed per iteration.
+}
+
+BENCHMARK(emplace_benchmark)->Apply(arguments); // Every benchmark is configured via the shared arguments() callback.
+BENCHMARK(clear_benchmark)->Apply(arguments);
+BENCHMARK(clear_range_benchmark)->Apply(arguments);
+BENCHMARK(bulk_contains_benchmark)->Apply(arguments);
+BENCHMARK(bulk_count_benchmark)->Apply(arguments);
+
+BENCHMARK_MAIN(); // Google Benchmark's entry-point macro for this executable.