Skip to content

Commit

Permalink
[FEATURE] Make the HIBF constructible from a layout file.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Aug 22, 2023
1 parent b89792b commit 304fa9c
Show file tree
Hide file tree
Showing 8 changed files with 256 additions and 0 deletions.
4 changes: 4 additions & 0 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstddef> // for size_t
#include <filesystem> // for path
#include <functional> // for function
#include <iosfwd> // for ostream
#include <iterator> // for insert_iterator

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
Expand Down Expand Up @@ -85,6 +86,9 @@ struct config
// bool compressed{false};
//!\}

//!\brief Restores this configuration from the JSON section of `stream` that is framed by `##CONFIG:` and `##ENDCONFIG`.
void read_from(std::istream & stream);
//!\brief Writes this configuration to `stream` as prefixed JSON lines framed by a `CONFIG:` and an `ENDCONFIG` line.
void write_to(std::ostream & stream) const;

private:
friend class cereal::access;

Expand Down
3 changes: 3 additions & 0 deletions include/hibf/detail/layout/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ struct layout
}
};

//!\brief Parses the layout header (max bin ids) and the user bin records from `stream` into this object.
void read_from(std::istream & stream);
//!\brief Writes the max bin header records, a column-name line and one line per user bin to `stream`.
void write_to(std::ostream & stream) const;

size_t top_level_max_bin_id{};
std::vector<max_bin> max_bins{};
std::vector<user_bin> user_bins{};
Expand Down
1 change: 1 addition & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class hierarchical_interleaved_bloom_filter
~hierarchical_interleaved_bloom_filter() = default; //!< Defaulted.

hierarchical_interleaved_bloom_filter(config const & configuration);
//!\brief Builds the index from the configuration and layout stored in `layout_filename`; `input_fn` is used as the configuration's input function.
hierarchical_interleaved_bloom_filter(std::function<void(size_t const, insert_iterator &&)> input_fn, std::filesystem::path const & layout_filename);
//!\}

//!\brief The individual interleaved Bloom filters.
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
set (HIBF_SOURCE_FILES
hierarchical_interleaved_bloom_filter.cpp
config.cpp
detail/layout/simple_binning.cpp
detail/layout/layout.cpp
detail/layout/execute.cpp
detail/layout/compute_fpr_correction.cpp
detail/layout/compute_layout.cpp
Expand Down
57 changes: 57 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <charconv>
#include <cassert>
#include <iostream>

#include <cereal/archives/json.hpp>

#include <hibf/config.hpp>
#include <hibf/detail/prefixes.hpp>

namespace hibf
{

/*!\brief Restores this configuration from the config section of a layout file.
 * \param[in,out] stream Input positioned at the `##CONFIG:` line.
 *
 * Expected input:
 *  1. a line `##CONFIG:`,
 *  2. the JSON document, every line prefixed with `prefix::header` and `prefix::header_config`,
 *  3. a line `##ENDCONFIG`.
 *
 * All lines up to and including `##ENDCONFIG` are consumed; the stream is left at the first line after
 * the config section so that a subsequent reader sees everything that follows it.
 * Format violations are checked with `assert` only.
 *
 * Not declared `inline`: the definition lives in this source file only, so it must be emitted here for
 * callers in other translation units.
 */
void config::read_from(std::istream & stream)
{
    std::string line;
    std::stringstream config_str;

    std::getline(stream, line);
    assert(line == "##CONFIG:");

    // Collect the JSON payload. The loop ends after it has read (and thereby consumed) "##ENDCONFIG".
    while (std::getline(stream, line) && line.size() >= 3
           && std::string_view{line}.substr(0, 1) == hibf::prefix::header
           && std::string_view{line}.substr(1, 1) == hibf::prefix::header_config && line != "##ENDCONFIG")
    {
        config_str << line.substr(2); // remove hibf::prefix::header & hibf::prefix::header_config
    }

    assert(line == "##ENDCONFIG");

    cereal::JSONInputArchive iarchive(config_str);
    iarchive(*this);

    // No further std::getline here: "##ENDCONFIG" was already consumed by the loop above. Reading another
    // line would silently drop the first line that follows the config section.
}

/*!\brief Writes this configuration as a prefixed JSON section to `stream`.
 * \param[in,out] stream The output stream to write to.
 *
 * Output:
 *  1. a line `<header><header_config>CONFIG:`,
 *  2. the JSON document produced by cereal (named value "config"), every line prefixed with
 *     `prefix::header` and `prefix::header_config`,
 *  3. a line `<header><header_config>ENDCONFIG`.
 *
 * Not declared `inline`: the definition lives in this source file only, so it must be emitted here for
 * callers in other translation units.
 */
void config::write_to(std::ostream & stream) const
{
    // write json to a temporary string stream with cereal
    std::stringstream config_stream{};
    cereal::JSONOutputArchive output(config_stream);
    output(cereal::make_nvp("config", *this));

    // write config
    stream << prefix::header << prefix::header_config << "CONFIG:\n";
    std::string line;
    while (std::getline(config_stream, line, '\n'))
        stream << prefix::header << prefix::header_config << line << '\n';

    // NOTE(review): presumably the archive only emits the final closing bracket when it is destroyed, and
    // `output` is still alive here - hence the bracket is added manually.
    stream << prefix::header << prefix::header_config << "}\n" // last closing bracket isn't written by loop above
           << prefix::header << prefix::header_config << "ENDCONFIG\n";
}

} // namespace hibf
148 changes: 148 additions & 0 deletions src/detail/layout/layout.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <charconv>
#include <cassert>
#include <iostream>

#include <hibf/config.hpp>
#include <hibf/detail/prefixes.hpp>

#include <hibf/detail/layout/layout.hpp>

namespace hibf::layout
{

/*!\brief Parses one user bin record of a layout file.
 * \param[in] current_line A record of the form
 *            `<user bin idx> TAB <tb idx>;...;<storage tb idx> TAB <#tb>;...;<#tb of storage bin>`.
 * \returns The parsed user bin.
 *
 * Column 1 is the user bin index. Column 2 lists the technical bin indices of all levels; the last entry
 * is the bin that stores the user bin (`storage_TB_id`), the preceding ones are `previous_TB_indices`.
 * Column 3 lists the number of technical bins per level; only the last entry is kept.
 *
 * Malformed lines violate the precondition (checked with `assert`). Without assertions, parsing stops at
 * the point where the line ends prematurely and the fields read so far are returned.
 */
inline hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
{
    hibf::layout::layout::user_bin result{};

    // std::from_chars works on character pointers, so parse via data() instead of string_view iterators.
    char const * field_end = current_line.data();
    char const * const buffer_end = field_end + current_line.size();

    size_t tmp{}; // integer buffer when reading numbers

    // read user bin index; it is the first column, so parsing starts at the very beginning of the line
    assert(field_end != buffer_end);
    field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
    result.idx = tmp;

    assert(field_end != buffer_end && *field_end == '\t');
    if (field_end == buffer_end) // nothing follows the user bin index
        return result;

    do // read bin_indices
    {
        ++field_end; // skip tab or ;
        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
        result.previous_TB_indices.push_back(tmp);
    }
    while (field_end != buffer_end && *field_end != '\t');

    // the last index is the technical bin that stores the user bin
    result.storage_TB_id = result.previous_TB_indices.back();
    result.previous_TB_indices.pop_back();

    assert(field_end != buffer_end);
    if (field_end == buffer_end) // the column with the number of technical bins is missing
        return result;

    do // read number of technical bins
    {
        ++field_end; // skip tab or ;
        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
        result.number_of_technical_bins = tmp; // only the last number really counts
    }
    while (field_end != buffer_end && *field_end != '\t');

    return result;
}

inline void hibf::layout::layout::read_from(std::istream & stream)
{
// parse header
auto parse_bin_indices = [](std::string_view const & buffer)
{
std::vector<size_t> result;

auto buffer_start = &buffer[0];
auto const buffer_end = buffer_start + buffer.size();

size_t tmp{};

while (buffer_start < buffer_end)
{
buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
++buffer_start; // skip ;
result.push_back(tmp);
}

return result;
};

auto parse_first_bin = [](std::string_view const & buffer)
{
size_t tmp{};
std::from_chars(&buffer[0], &buffer[0] + buffer.size(), tmp);
return tmp;
};

std::string line;
std::stringstream config_str;

assert(line[0] == '#'); // still reading header lines
assert(line.substr(1, hibf::prefix::high_level.size()) == hibf::prefix::high_level);

// parse High Level max bin index
assert(line.substr(hibf::prefix::high_level.size() + 2, 11) == "max_bin_id:");
std::string_view const hibf_max_bin_str{line.begin() + 27, line.end()};
top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);

// first read and parse header records, in order to sort them before adding them to the graph
while (std::getline(stream, line) && line.substr(0, 6) != "#FILES")
{
assert(line.substr(1, hibf::prefix::merged_bin.size()) == hibf::prefix::merged_bin);

// parse header line
std::string_view const indices_str{
line.begin() + 1 /*#*/ + hibf::prefix::merged_bin.size() + 1 /*_*/,
std::find(line.begin() + hibf::prefix::merged_bin.size() + 2, line.end(), ' ')};

assert(line.substr(hibf::prefix::merged_bin.size() + indices_str.size() + 3, 11) == "max_bin_id:");
std::string_view const max_id_str{line.begin() + hibf::prefix::merged_bin.size() + indices_str.size() + 14,
line.end()};

max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
}

// parse the rest of the file
std::string current_line;
while (std::getline(stream, current_line))
user_bins.emplace_back(parse_layout_line(current_line));

}

/*!\brief Writes this layout in its textual form to `stream`.
 * \param[in,out] stream The output stream to write to.
 *
 * Output, line by line:
 *  1. `<prefix::first_header_line> max_bin_id:<top_level_max_bin_id>`,
 *  2. one line per entry of `max_bins` (formatted by the stream operator of `max_bin`),
 *  3. the column-name line,
 *  4. one tab-separated line per user bin:
 *     `<idx> TAB <previous tb idx>;...;<storage tb idx> TAB 1;...;<number of technical bins>`.
 *
 * Not declared `inline`: the definition lives in this source file only, so it must be emitted here for
 * callers in other translation units.
 */
void hibf::layout::layout::write_to(std::ostream & stream) const
{
    // write layout header with max bin ids
    stream << prefix::first_header_line << " max_bin_id:" << top_level_max_bin_id << '\n';
    for (auto const & max_bin : max_bins)
        stream << max_bin << '\n';

    // write header line
    stream << prefix::header << "USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n";

    // write layout entries
    for (auto const & user_bin : user_bins)
    {
        stream << user_bin.idx << '\t';

        // technical bin indices of the upper levels, followed by the bin that stores the user bin
        for (auto bin : user_bin.previous_TB_indices)
            stream << bin << ';';
        stream << user_bin.storage_TB_id << '\t';

        // one entry per upper level, followed by the number of technical bins of the user bin itself
        for ([[maybe_unused]] auto && elem : user_bin.previous_TB_indices) // number of bins per merged level is 1
            stream << "1;";
        stream << user_bin.number_of_technical_bins;

        stream << '\n';
    }
}

} // namespace hibf::layout
13 changes: 13 additions & 0 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,17 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
build_index(*this, configuration, layout);
}

/*!\brief Constructs the index from a layout file.
 * \param[in] input_fn        The function that provides the content of a user bin; stored in the
 *                            configuration that is read from the file.
 * \param[in] layout_filename The file that contains the configuration followed by the layout.
 *
 * The configuration and the layout are read from `layout_filename` and the index is built from exactly
 * that layout; no new layout is computed.
 */
hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(
    std::function<void(size_t const, insert_iterator &&)> input_fn,
    std::filesystem::path const & layout_filename)
{
    // NOTE(review): a file that cannot be opened is not reported here; the read functions below only
    // check their input with assertions - consider an explicit check.
    std::ifstream layout_file{layout_filename};

    config configuration;
    layout::layout hibf_layout;
    configuration.read_from(layout_file); // the file starts with the config section ...
    hibf_layout.read_from(layout_file);   // ... followed by the layout

    configuration.input_fn = input_fn;

    // Build from the layout that was read. Recomputing a layout here would ignore the file's layout.
    build_index(*this, configuration, hibf_layout);
}

} // namespace hibf
28 changes: 28 additions & 0 deletions test/unit/hibf/detail/layout/layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,31 @@ TEST(layout_test, printing_user_bins)

EXPECT_EQ(ss.str(), expected);
}

// Fills a layout by hand, serialises it with layout::write_to and compares the text with the expected layout.
TEST(layout_test, write_to)
{
std::stringstream ss{};

hibf::layout::layout layout;

// header data: the max bin id of the top level and of three merged bins
layout.top_level_max_bin_id = 111;
layout.max_bins.emplace_back(std::vector<size_t>{}, 0);
layout.max_bins.emplace_back(std::vector<size_t>{2}, 2);
layout.max_bins.emplace_back(std::vector<size_t>{1, 2, 3, 4}, 22);
// three user bins: one on the top level, one below one merged bin, one below four merged bins
layout.user_bins.emplace_back(7, std::vector<size_t>{}, 1, 0);
layout.user_bins.emplace_back(4, std::vector<size_t>{1}, 22, 0);
layout.user_bins.emplace_back(5, std::vector<size_t>{1, 2, 3, 4}, 21, 22);

layout.write_to(ss);

// NOTE(review): layout::write_to also emits a column-name line ("USER_BIN_IDX ... NUMBER_OF_TECHNICAL_BINS")
// between the header records and the user bin records, and separates the columns with tabs. The expected
// text below contains no such line - confirm the expectation against the actual output.
std::string expected = R"layout_file(#HIBF_ max_bin_id:111
#MERGED_BIN_ max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
)layout_file";

EXPECT_EQ(ss.str(), expected);
}

0 comments on commit 304fa9c

Please sign in to comment.