Skip to content

Commit

Permalink
Merge pull request #45 from eseiler/misc/perf
Browse files Browse the repository at this point in the history
[MISC] Improve bulk_contains performance
  • Loading branch information
eseiler authored Aug 24, 2023
2 parents 6ccdaaf + d16026b commit 5caf0e3
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 27 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
compiler: "intel"
build: unit
build_type: Release
cxx_flags: "-Xclang=-Wno-pass-failed"

steps:
- name: Checkout
Expand Down Expand Up @@ -83,7 +84,8 @@ jobs:
cd build
cmake ../test/${{ matrix.build }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DHIBF_NATIVE_BUILD=OFF \
-DHIBF_VERBOSE_TESTS=OFF
-DHIBF_VERBOSE_TESTS=OFF \
-DCMAKE_CXX_FLAGS="${{ matrix.cxx_flags }}"
make -j2 gtest_build
- name: Build tests
Expand Down
116 changes: 104 additions & 12 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,9 @@ class interleaved_bloom_filter::membership_agent_type
//!\brief A pointer to the augmented hibf::interleaved_bloom_filter.
ibf_t const * ibf_ptr{nullptr};

//!\brief Stores access positions of augmented hibf::interleaved_bloom_filter.
std::array<size_t, 5> bloom_filter_indices;

public:
/*!\name Constructors, destructor and assignment
* \{
Expand Down Expand Up @@ -701,24 +704,113 @@ class interleaved_bloom_filter::membership_agent_type
assert(ibf_ptr != nullptr);
assert(result_buffer.size() == ibf_ptr->bin_count());

std::array<size_t, 5> bloom_filter_indices;
std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
// Needed for auto-vectorization of loop. ibf_ptr->bin_words could change between loops.
size_t const bin_words = ibf_ptr->bin_words;
size_t const hash_funs = ibf_ptr->hash_funs;

#ifndef NDEBUG
assert(bin_words != 0u);
assert(hash_funs != 0u);
#else
// Removes case for bin_words == 0u. The same statement inside the switch-case wouldn't have that effect.
if (bin_words == 0u)
__builtin_unreachable();
if (hash_funs == 0u)
__builtin_unreachable();
#endif

for (size_t i = 0; i < hash_funs; ++i)
bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, ibf_ptr->hash_seeds[i]) >> 6;

for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
uint64_t * const raw = result_buffer.raw_data().data(); // TODO: std::assume_aligned<64> once memory-aligned
uint64_t const * const ibf_data = ibf_ptr->data.data(); // TODO: std::assume_aligned<64> once memory-aligned
std::memcpy(raw, ibf_data + bloom_filter_indices[0], sizeof(uint64_t) * bin_words);

for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
// https://godbolt.org/z/1nbhvqeGj
// Having the loop inside is faster.
// GCOVR_EXCL_START
switch (bin_words)
{
size_t tmp{-1ULL};
for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
case 1u: // 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
assert(bloom_filter_indices[i] < ibf_ptr->data.size());
tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
bloom_filter_indices[i] += 64;
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
raw[0] &= ibf_raw[0];
}
break;
case 2u: // 1 SSE4 instruction (128 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 2u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 3u: // 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 3u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 4u: // 1 AVX2 instruction (256 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 4u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 5u: // 1 AVX2 instruction (256 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 5u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 6u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 6u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 7u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 7u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 8u: // 1 AVX512 instruction (512 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 8u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
default: // Auto vectorize. Might create different versions.
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < bin_words; ++batch)
raw[batch] &= ibf_raw[batch];
}

result_buffer.data.set_int(batch << 6, tmp);
}
// GCOVR_EXCL_STOP

return result_buffer;
}
Expand Down
14 changes: 7 additions & 7 deletions test/cmake/hibf_require_benchmark.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,25 @@ cmake_minimum_required (VERSION 3.16)
macro (hibf_require_benchmark)
enable_testing ()

set (benchmark_version "1.8.0")
set (gbenchmark_git_tag "v${benchmark_version}")
set (HIBF_BENCHMARK_TAG "v1.8.2")

find_package (benchmark ${benchmark_version} EXACT QUIET)
find_package (benchmark QUIET)

if (NOT benchmark_FOUND)
message (STATUS "Fetching Google Benchmark ${benchmark_version}")
# Also ensure that Google Benchmark is fetched for the latest library cron, which sets the tag to "main".
if (NOT benchmark_FOUND OR "${HIBF_BENCHMARK_TAG}" STREQUAL "main")
message (STATUS "Fetching Google Benchmark ${HIBF_BENCHMARK_TAG}")

include (FetchContent)
FetchContent_Declare (
gbenchmark_fetch_content
GIT_REPOSITORY "https://github.com/google/benchmark.git"
GIT_TAG "${gbenchmark_git_tag}")
GIT_TAG "${HIBF_BENCHMARK_TAG}")
option (BENCHMARK_ENABLE_TESTING "" OFF)
option (BENCHMARK_ENABLE_WERROR "" OFF) # Does not apply to Debug builds.
option (BENCHMARK_ENABLE_INSTALL "" OFF)
FetchContent_MakeAvailable (gbenchmark_fetch_content)
else ()
message (STATUS "Found Google Benchmark ${benchmark_version}")
message (STATUS " Test dependency: Google Benchmark ${benchmark_VERSION} found.")
endif ()

# NOTE: google benchmark's CMakeLists.txt already defines Shlwapi
Expand Down
14 changes: 7 additions & 7 deletions test/cmake/hibf_require_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,24 @@ cmake_minimum_required (VERSION 3.16)
macro (hibf_require_test)
enable_testing ()

set (gtest_version "1.13.0")
set (gtest_git_tag "v${gtest_version}")
set (HIBF_GTEST_TAG "v1.13.0")

find_package (GTest ${gtest_version} EXACT QUIET)
find_package (GTest QUIET)

if (NOT GTest_FOUND)
message (STATUS "Fetching Google Test ${gtest_version}")
# Also ensure that Google Test is fetched for the latest library cron, which sets the tag to "main".
if (NOT GTest_FOUND OR "${HIBF_GTEST_TAG}" STREQUAL "main")
message (STATUS "Fetching Google Test ${HIBF_GTEST_TAG}")

include (FetchContent)
FetchContent_Declare (
gtest_fetch_content
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "${gtest_git_tag}")
GIT_TAG "${HIBF_GTEST_TAG}")
option (BUILD_GMOCK "" OFF)
option (INSTALL_GTEST "" OFF)
FetchContent_MakeAvailable (gtest_fetch_content)
else ()
message (STATUS "Found Google Test ${gtest_version}")
message (STATUS " Test dependency: Google Test ${GTest_VERSION} found.")
endif ()

if (NOT TARGET gtest_build)
Expand Down
1 change: 1 addition & 0 deletions test/performance/ibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
175 changes: 175 additions & 0 deletions test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#include <benchmark/benchmark.h>

#include <algorithm>
#include <cstddef>
#include <limits>
#include <random>
#include <ranges>
#include <tuple>
#include <utility>
#include <vector>

#include <hibf/contrib/std/to.hpp>
#include <hibf/contrib/std/zip_view.hpp>
#include <hibf/interleaved_bloom_filter.hpp>

/*!\brief Wraps an item count in a benchmark counter that is reported as a rate.
 * \param count Number of items (hashes, bins, ...) handled in one benchmark iteration.
 * \returns A benchmark::Counter carrying `count`, flagged `kIsIterationInvariantRate` and using a base of 1000
 *          for unit prefixes.
 */
inline benchmark::Counter hashes_per_second(size_t const count)
{
    auto const rate_flags = benchmark::Counter::kIsIterationInvariantRate;
    auto const unit_base = benchmark::Counter::OneK::kIs1000;

    return benchmark::Counter(count, rate_flags, unit_base);
}

/*!\brief Adds the argument tuples that every benchmark in this file is run with.
 * \param b The benchmark that receives the argument tuples.
 * \details Each tuple is {bins, bin_size, hash_num, sequence_length}; `set_up` reads them via `state.range(0..3)`.
 *          Two configurations exist and differ only in that every size exponent of the large one is 10 higher.
 *          The active configuration is selected at compile time by the `#if` below.
 */
static void arguments(benchmark::internal::Benchmark * b)
{
#if 1
    // Total size: 1MiB
    constexpr long long scale = 0;
#else
    // Total size: 1GiB
    constexpr long long scale = 10;
#endif

    constexpr long long sequence_length = 1LL << (17 + scale);

    // bins, bin_size, hash_num, sequence_length
    b->Args({64, 1LL << (17 + scale), 2, sequence_length});
    b->Args({128, 1LL << (16 + scale), 2, sequence_length});
    b->Args({192, 1LL << (16 + scale), 2, sequence_length});
    b->Args({256, 1LL << (15 + scale), 2, sequence_length});
    b->Args({1024, 1LL << (10 + scale), 2, sequence_length});
}

/*!\brief Creates the input data and the (empty) interleaved Bloom filter for one benchmark run.
 * \param state Benchmark state; `range(0)` is the number of bins, `range(1)` the bin size in bits,
 *              `range(2)` the number of hash functions and `range(3)` the number of generated values.
 * \returns A tuple `(bin_indices, hash_values, ibf)`:
 *          `bin_indices` holds `range(3)` values drawn uniformly from `[0, bins - 1]`,
 *          `hash_values` holds `range(3)` values drawn uniformly from the full `size_t` range,
 *          `ibf` is a freshly constructed filter with the requested dimensions.
 * \details The random engine is seeded with a fixed value, so the generated data is the same for every run.
 */
auto set_up(::benchmark::State const & state)
{
    size_t const bins = state.range(0);
    size_t const bits = state.range(1);
    size_t const hash_num = state.range(2);
    size_t const sequence_length = state.range(3);

    // A single engine must be shared by all draws. Re-creating (and thereby re-seeding) it for every element
    // would make each draw return the very same number.
    std::mt19937_64 engine{0ULL};

    auto generate = [&engine, sequence_length](size_t const max_value = std::numeric_limits<size_t>::max())
    {
        std::uniform_int_distribution<size_t> distr{size_t{0}, max_value};
        std::vector<size_t> result(sequence_length);

        std::ranges::generate(result,
                              [&distr, &engine]()
                              {
                                  return distr(engine);
                              });
        return result;
    };

    std::vector<size_t> bin_indices{generate(bins - 1)};
    std::vector<size_t> hash_values{generate()};

    hibf::interleaved_bloom_filter ibf{hibf::bin_count{bins},
                                       hibf::bin_size{bits},
                                       hibf::hash_function_count{hash_num}};

    // Move into the tuple to avoid deep-copying the two vectors and the filter.
    return std::make_tuple(std::move(bin_indices), std::move(hash_values), std::move(ibf));
}

/*!\brief Benchmarks `emplace`: every iteration inserts each generated value into its paired bin.
 * \param state Benchmark state; supplies the dimensions via `set_up` and receives the "hashes/sec" counter.
 */
void emplace_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);

    // Pair the i-th value with the i-th bin index.
    auto value_bin_pairs = seqan::std::views::zip(values, target_bins);

    for (auto _ : state)
    {
        for (auto [value, bin_id] : value_bin_pairs)
            filter.emplace(value, hibf::bin_index{bin_id});
    }

    size_t const values_per_iteration = std::ranges::size(values);
    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

/*!\brief Benchmarks `clear` on single bins: every iteration clears each bin with its own call.
 * \param state Benchmark state; `range(0)` is the number of bins. Receives the "bins/sec" counter.
 */
void clear_benchmark(::benchmark::State & state)
{
    auto && [bin_indices, hash_values, ibf] = set_up(state);
    // Only the filter is needed here; silence unused-variable warnings for the generated data.
    (void)bin_indices;
    (void)hash_values;

    size_t const bin_count = static_cast<size_t>(state.range(0));

    // Both iota bounds are size_t: a 32-bit start value combined with a 64-bit bound would let the counter
    // wrap around before reaching a bound larger than UINT_MAX.
    std::vector<hibf::bin_index> bin_range = std::views::iota(size_t{0}, bin_count)
                                           | std::views::transform(
                                                 [](size_t i)
                                                 {
                                                     return hibf::bin_index{i};
                                                 })
                                           | seqan::std::ranges::to<std::vector>();

    for (auto _ : state)
    {
        for (auto bin : bin_range)
            ibf.clear(bin);
    }

    state.counters["bins/sec"] = hashes_per_second(std::ranges::size(bin_range));
}

/*!\brief Benchmarks `clear` on a range of bins: every iteration clears all bins with one call.
 * \param state Benchmark state; `range(0)` is the number of bins. Receives the "bins/sec" counter.
 */
void clear_range_benchmark(::benchmark::State & state)
{
    auto && [bin_indices, hash_values, ibf] = set_up(state);
    // Only the filter is needed here; silence unused-variable warnings for the generated data.
    (void)bin_indices;
    (void)hash_values;

    size_t const bin_count = static_cast<size_t>(state.range(0));

    // Both iota bounds are size_t: a 32-bit start value combined with a 64-bit bound would let the counter
    // wrap around before reaching a bound larger than UINT_MAX.
    std::vector<hibf::bin_index> bin_range = std::views::iota(size_t{0}, bin_count)
                                           | std::views::transform(
                                                 [](size_t i)
                                                 {
                                                     return hibf::bin_index{i};
                                                 })
                                           | seqan::std::ranges::to<std::vector>();

    for (auto _ : state)
    {
        ibf.clear(bin_range);
    }

    state.counters["bins/sec"] = hashes_per_second(std::ranges::size(bin_range));
}

/*!\brief Benchmarks `bulk_contains`: every iteration queries the membership agent once per generated value.
 * \param state Benchmark state; supplies the dimensions via `set_up` and receives the "hashes/sec" counter.
 * \details The filter is filled with all (value, bin) pairs before the timed loop starts.
 */
void bulk_contains_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);

    // Untimed preparation: insert every value into its paired bin.
    for (auto [value, bin_id] : seqan::std::views::zip(values, target_bins))
    {
        filter.emplace(value, hibf::bin_index{bin_id});
    }

    auto agent = filter.membership_agent();

    for (auto _ : state)
    {
        for (auto query : values)
        {
            [[maybe_unused]] auto & hits = agent.bulk_contains(query);
            benchmark::ClobberMemory();
        }
    }

    size_t const values_per_iteration = std::ranges::size(values);
    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

/*!\brief Benchmarks `bulk_count`: every iteration counts all generated values with one call on the counting agent.
 * \param state Benchmark state; supplies the dimensions via `set_up` and receives the "hashes/sec" counter.
 * \details The filter is filled with all (value, bin) pairs before the timed loop starts.
 */
void bulk_count_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);

    // Untimed preparation: insert every value into its paired bin.
    for (auto [value, bin_id] : seqan::std::views::zip(values, target_bins))
    {
        filter.emplace(value, hibf::bin_index{bin_id});
    }

    auto agent = filter.counting_agent();

    for (auto _ : state)
    {
        [[maybe_unused]] auto & counts = agent.bulk_count(values);
        benchmark::ClobberMemory();
    }

    size_t const values_per_iteration = std::ranges::size(values);
    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

// Register every benchmark; `arguments` supplies the same {bins, bin_size, hash_num, sequence_length}
// tuples to each of them.
BENCHMARK(emplace_benchmark)->Apply(arguments);
BENCHMARK(clear_benchmark)->Apply(arguments);
BENCHMARK(clear_range_benchmark)->Apply(arguments);
BENCHMARK(bulk_contains_benchmark)->Apply(arguments);
BENCHMARK(bulk_count_benchmark)->Apply(arguments);

// Google Benchmark macro that provides the program entry point.
BENCHMARK_MAIN();

1 comment on commit 5caf0e3

@vercel
Copy link

@vercel vercel bot commented on 5caf0e3 Aug 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

hibf – ./

hibf.vercel.app
hibf-seqan.vercel.app
hibf-git-main-seqan.vercel.app

Please sign in to comment.