Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] PHIBF #188

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/hibf/layout/compute_layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ namespace seqan::hibf::layout
* \param[in] config The configuration to compute the layout with.
* \param[in] kmer_counts The vector that will store the kmer counts (estimations).
* \param[in] sketches The vector that will store the sketches.
* \param[in] positions The vector that will store the positions.
* \param[in,out] union_estimation_timer The timer that measures the union estimation time.
* \param[in,out] rearrangement_timer The timer that measures the rearrangement time.
* \returns layout
*/
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches,
std::vector<size_t> && positions,
concurrent_timer & union_estimation_timer,
concurrent_timer & rearrangement_timer);

Expand Down
61 changes: 61 additions & 0 deletions include/hibf/misc/partition.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <bit>
#include <cstddef>
#include <cstdint>

#include <hibf/platform.hpp>

namespace seqan::hibf
{

struct partition_toolbox
{
partition_toolbox(partition_toolbox const &) = default;
partition_toolbox(partition_toolbox &&) = default;
partition_toolbox & operator=(partition_toolbox const &) = default;
partition_toolbox & operator=(partition_toolbox &&) = default;
~partition_toolbox() = default;

explicit partition_toolbox(size_t const parts) : partitions{parts}

Check warning on line 24 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L24

Added line #L24 was not covered by tests
{
size_t const suffixes = next_power_of_four(partitions);
size_t const suffixes_per_part = suffixes / partitions;
mask = suffixes - 1;
shift_value = std::countr_zero(suffixes_per_part);

Check warning on line 29 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L26-L29

Added lines #L26 - L29 were not covered by tests
}

size_t partitions{};
size_t mask{};
int shift_value{};

// The number of suffixes is a power of four.
// The number of partitions is a power of two.
// The number of suffixes is greater than or equal to the number of partitions.
// This means that the number of suffixes per partition is always a power of two.
// Therefore, we can do a right shift instead of the division in:
// (hash & mask) / suffixes_per_part == partition
// The compiler cannot optimize the division to a right shift because suffixes_per_part is a runtime variable.
constexpr size_t hash_partition(uint64_t const hash) const

Check warning on line 43 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L43

Added line #L43 was not covered by tests
{
return (hash & mask) >> shift_value;

Check warning on line 45 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L45

Added line #L45 was not covered by tests
}

static constexpr size_t next_power_of_four(size_t number)

Check warning on line 48 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L48

Added line #L48 was not covered by tests
{
if (number == 0ULL || number == 1ULL)

Check warning on line 50 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L50

Added line #L50 was not covered by tests
return 1ULL; // GCOVR_EXCL_LINE

--number;
int const highest_set_bit_pos = std::bit_width(number);
int const shift_amount = (highest_set_bit_pos + (highest_set_bit_pos & 1)) - 2;

Check warning on line 55 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L53-L55

Added lines #L53 - L55 were not covered by tests
// ( Next multiple of two ) 4 has two zeros
return 0b0100ULL << shift_amount;

Check warning on line 57 in include/hibf/misc/partition.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/partition.hpp#L57

Added line #L57 was not covered by tests
}
};

} // namespace seqan::hibf
5 changes: 5 additions & 0 deletions include/hibf/sketch/compute_sketches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,9 @@ void compute_sketches(config const & config,
std::vector<size_t> & kmer_counts,
std::vector<sketch::hyperloglog> & sketches);

void compute_sketches(config const & config,
std::vector<std::vector<size_t>> & kmer_counts,
std::vector<std::vector<sketch::hyperloglog>> & sketches,
size_t const number_of_partitions);

} // namespace seqan::hibf::sketch
9 changes: 9 additions & 0 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,19 @@
return count == 0u;
}));

std::vector<size_t> positions = [&kmer_counts]()

Check warning on line 221 in src/hierarchical_interleaved_bloom_filter.cpp

View check run for this annotation

Codecov / codecov/patch

src/hierarchical_interleaved_bloom_filter.cpp#L221

Added line #L221 was not covered by tests
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

layout_dp_algorithm_timer.start();
auto layout = layout::compute_layout(configuration,
kmer_counts,
sketches,
std::move(positions),
layout_union_estimation_timer,
layout_rearrangement_timer);
layout_dp_algorithm_timer.stop();
Expand Down
24 changes: 22 additions & 2 deletions src/layout/compute_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,15 @@
layout compute_layout(config const & config,
std::vector<size_t> const & kmer_counts,
std::vector<sketch::hyperloglog> const & sketches,
std::vector<size_t> && positions,
concurrent_timer & union_estimation_timer,
concurrent_timer & rearrangement_timer)
{
assert(kmer_counts.size() == sketches.size());
assert(positions.size() <= sketches.size());
assert(sketches.size() == config.number_of_user_bins);
assert(*std::ranges::max_element(positions) <= config.number_of_user_bins);

layout resulting_layout{};

// The output streams facilitate writing the layout file in hierarchical structure.
Expand All @@ -37,7 +43,8 @@
data_store store{.false_positive_rate = config.maximum_fpr,
.hibf_layout = &resulting_layout,
.kmer_counts = std::addressof(kmer_counts),
.sketches = std::addressof(sketches)};
.sketches = std::addressof(sketches),
.positions = std::move(positions)};

store.fpr_correction = compute_fpr_correction({.fpr = config.maximum_fpr, //
.hash_count = config.number_of_hash_functions,
Expand Down Expand Up @@ -66,7 +73,20 @@
concurrent_timer union_estimation_timer;
concurrent_timer rearrangement_timer;

return compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
std::vector<size_t> positions = [&kmer_counts]()

Check warning on line 76 in src/layout/compute_layout.cpp

View check run for this annotation

Codecov / codecov/patch

src/layout/compute_layout.cpp#L76

Added line #L76 was not covered by tests
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

return compute_layout(config,
kmer_counts,
sketches,
std::move(positions),
union_estimation_timer,
rearrangement_timer);
}

} // namespace seqan::hibf::layout
6 changes: 4 additions & 2 deletions src/layout/layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,10 @@ void seqan::hibf::layout::layout::read_from(std::istream & stream)

assert(line == prefix::layout_column_names);

// parse the rest of the file
while (std::getline(stream, line))
// parse the rest of the file until either
// 1) the end of the file is reached
// 2) Another header line starts, which indicates a partitioned layout
while (stream.good() && static_cast<char>(stream.peek()) != prefix::layout_header[0] && std::getline(stream, line))
user_bins.emplace_back(parse_layout_line(line));
}

Expand Down
36 changes: 34 additions & 2 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#include <functional> // for function
#include <vector> // for vector

#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/misc/partition.hpp>
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/estimate_kmer_counts.hpp> // for estimate_kmer_counts
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
Expand Down Expand Up @@ -43,4 +44,35 @@
sketch::estimate_kmer_counts(sketches, kmer_counts);
}

void compute_sketches(config const & config,

Check warning on line 47 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L47

Added line #L47 was not covered by tests
std::vector<std::vector<size_t>> & cardinalities,
std::vector<std::vector<sketch::hyperloglog>> & sketches,
size_t const number_of_partitions)
{
partition_toolbox partition_helper{number_of_partitions};

Check warning on line 52 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L52

Added line #L52 was not covered by tests

sketches.resize(number_of_partitions);
cardinalities.resize(number_of_partitions);

Check warning on line 55 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L54-L55

Added lines #L54 - L55 were not covered by tests

for (size_t i = 0; i < number_of_partitions; ++i)

Check warning on line 57 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L57

Added line #L57 was not covered by tests
{
sketches[i].resize(config.number_of_user_bins, seqan::hibf::sketch::hyperloglog(config.sketch_bits));
cardinalities[i].resize(config.number_of_user_bins);

Check warning on line 60 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L59-L60

Added lines #L59 - L60 were not covered by tests
}

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)

Check warning on line 64 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L63-L64

Added lines #L63 - L64 were not covered by tests
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
kmers.clear();
config.input_fn(i, insert_iterator{kmers});

for (auto k_hash : kmers)
sketches[partition_helper.hash_partition(k_hash)][i].add(k_hash);
}

for (size_t i = 0; i < number_of_partitions; ++i)
sketch::estimate_kmer_counts(sketches[i], cardinalities[i]);

Check warning on line 75 in src/sketch/compute_sketches.cpp

View check run for this annotation

Codecov / codecov/patch

src/sketch/compute_sketches.cpp#L74-L75

Added lines #L74 - L75 were not covered by tests
}

} // namespace seqan::hibf::sketch
17 changes: 15 additions & 2 deletions test/unit/hibf/layout/compute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <cstddef> // for size_t
#include <functional> // for function
#include <numeric> // for iota
#include <vector> // for vector, allocator

#include <hibf/config.hpp> // for insert_iterator, config
Expand Down Expand Up @@ -39,8 +40,20 @@ TEST(compute_layout, dispatch)
seqan::hibf::concurrent_timer union_estimation_timer{};
seqan::hibf::concurrent_timer rearrangement_timer{};

auto layout2 =
seqan::hibf::layout::compute_layout(config, kmer_counts, sketches, union_estimation_timer, rearrangement_timer);
std::vector<size_t> positions = [&kmer_counts]()
{
std::vector<size_t> ps;
ps.resize(kmer_counts.size());
std::iota(ps.begin(), ps.end(), 0);
return ps;
}();

auto layout2 = seqan::hibf::layout::compute_layout(config,
kmer_counts,
sketches,
std::move(positions),
union_estimation_timer,
rearrangement_timer);

EXPECT_TRUE(layout1 == layout2);
}
45 changes: 45 additions & 0 deletions test/unit/hibf/layout/layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,48 @@ TEST(layout_test, clear)
EXPECT_TRUE(layout.max_bins.empty());
EXPECT_TRUE(layout.user_bins.empty());
}

TEST(layout_test, read_from_partitioned_layout)
{
// layout consists of three partitions, written one after the other
std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
)layout_file"};

for (size_t i = 0; i < 3; ++i)
{
seqan::hibf::layout::layout layout;
layout.read_from(ss);

EXPECT_EQ(layout.top_level_max_bin_id, 111);
EXPECT_EQ(layout.max_bins[0], (seqan::hibf::layout::layout::max_bin{{0}, 0}));
EXPECT_EQ(layout.max_bins[1], (seqan::hibf::layout::layout::max_bin{{2}, 2}));
EXPECT_EQ(layout.max_bins[2], (seqan::hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22}));
EXPECT_EQ(layout.user_bins[0], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{}, 0, 1, 7}));
EXPECT_EQ(layout.user_bins[1], (seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1}, 0, 22, 4}));
EXPECT_EQ(layout.user_bins[2],
(seqan::hibf::layout::layout::user_bin{std::vector<size_t>{1, 2, 3, 4}, 22, 21, 5}));
}
}