Skip to content

Commit

Permalink
Merge pull request #50 from smehringer/from_layout
Browse files Browse the repository at this point in the history
[FEATURE] Make the HIBF constructible from a layout file.
  • Loading branch information
eseiler authored Aug 24, 2023
2 parents 5caf0e3 + a2fa58b commit 1e269ec
Show file tree
Hide file tree
Showing 12 changed files with 526 additions and 26 deletions.
8 changes: 4 additions & 4 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstddef> // for size_t
#include <filesystem> // for path
#include <functional> // for function
#include <iosfwd> // for ostream
#include <iterator> // for insert_iterator

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
Expand Down Expand Up @@ -78,13 +79,13 @@ struct config
// Related to k-mers
bool disable_cutoffs{false};

//!\brief If given, no layout algorithm is executed; the layout from the file is used for building instead.
std::filesystem::path layout_file{};

// Related to IBF
// bool compressed{false};
//!\}

void read_from(std::istream & stream);
void write_to(std::ostream & stream) const;

private:
friend class cereal::access;

Expand All @@ -107,7 +108,6 @@ struct config
archive(CEREAL_NVP(disable_rearrangement));

archive(CEREAL_NVP(disable_cutoffs));
archive(CEREAL_NVP(layout_file));
}
};

Expand Down
7 changes: 5 additions & 2 deletions include/hibf/detail/layout/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct layout
requires std::derived_from<stream_type, std::ostream>
friend stream_type & operator<<(stream_type & stream, max_bin const & object)
{
stream << prefix::header << prefix::merged_bin << '_';
stream << prefix::layout_header << prefix::layout_lower_level << '_';
auto it = object.previous_TB_indices.begin();
auto end = object.previous_TB_indices.end();
// If not empty, we join with ';'
Expand All @@ -37,7 +37,7 @@ struct layout
while (++it != end)
stream << ';' << *it;
}
stream << " max_bin_id:" << object.id;
stream << " " << prefix::layout_fullest_technical_bin_idx << object.id;

return stream;
}
Expand Down Expand Up @@ -86,6 +86,9 @@ struct layout
}
};

void read_from(std::istream & stream);
void write_to(std::ostream & stream) const;

size_t top_level_max_bin_id{};
std::vector<max_bin> max_bins{};
std::vector<user_bin> user_bins{};
Expand Down
66 changes: 50 additions & 16 deletions include/hibf/detail/prefixes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,55 @@

namespace hibf::prefix
{

constexpr std::string_view chopper{"chopper"};

constexpr std::string_view header{"#"};

constexpr std::string_view header_config{"#"};

constexpr std::string_view high_level{"HIGH_LEVEL_IBF"};

constexpr std::string_view first_header_line{"#HIGH_LEVEL_IBF"};
static_assert(first_header_line.starts_with(header));
static_assert(first_header_line.ends_with(high_level));

constexpr std::string_view merged_bin{"MERGED_BIN"};

constexpr std::string_view split_bin{"SPLIT_BIN"};
/* These prefixes are for writing the layout file
* It is structured like this:
*
* [0) Possibly metadata added by chopper/raptor-layout]
* 1) Metadata: the hibf config
* 2) Layout header: max bin ids for the merged bins
* 3) Layout content: Assignment of user bin idx to technical bin idx
*
* And marked like this:
* [0) First character is @; Start and End of meta data should be marked accordingly to (1)]
* 1) First character is @; Start and End are marked by @HIBF_CONFIG and @HIBF_CONFIG_END respectively
* 2) First character is #;
* 3) No mark, plain content.
*
* Example:
*
* ```
* @CHOPPER_USER_BINS
* @0 /path/to/file1.fa
* @CHOPPER_USER_BINS_END
* @CHOPPER_CONFIG
* @0 k = 20
* @CHOPPER_CONFIG_END
*
* ```
*/

constexpr std::string_view meta_header{"@"};

constexpr std::string_view meta_hibf_config_start{"@HIBF_CONFIG"};
static_assert(meta_hibf_config_start.starts_with(meta_header));

constexpr std::string_view meta_hibf_config_end{"@HIBF_CONFIG_END"};
static_assert(meta_hibf_config_end.starts_with(meta_header));

constexpr std::string_view layout_header{"#"};

constexpr std::string_view layout_top_level{"TOP_LEVEL_IBF"};

constexpr std::string_view layout_lower_level{"LOWER_LEVEL_IBF"};

constexpr std::string_view layout_fullest_technical_bin_idx{"fullest_technical_bin_idx:"};

constexpr std::string_view layout_first_header_line{"#TOP_LEVEL_IBF"};
static_assert(layout_first_header_line.starts_with(layout_header));
static_assert(layout_first_header_line.ends_with(layout_top_level));

constexpr std::string_view layout_column_names{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"};
static_assert(layout_column_names.starts_with(layout_header));

} // namespace hibf::prefix
10 changes: 10 additions & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ class hierarchical_interleaved_bloom_filter
~hierarchical_interleaved_bloom_filter() = default; //!< Defaulted.

hierarchical_interleaved_bloom_filter(config const & configuration);

/*!\brief [Advanced] Constructs the HIBF from a layout file (stream) and a given input function
* \details
* This constructor makes it possible to construct an hibf from a given layout file instead of calculating the
* layout based on the input function. A hibf::config object is not needed as it is assumed to be stored in the
* layout file. A layout file can be constructed manually or via chopper (https://github.com/seqan/chopper)
* or raptor-layout (https://github.com/seqan/raptor).
*/
hierarchical_interleaved_bloom_filter(std::function<void(size_t const, insert_iterator &&)> input_fn,
std::istream & layout_stream);
//!\}

//!\brief The individual interleaved Bloom filters.
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
set (HIBF_SOURCE_FILES
hierarchical_interleaved_bloom_filter.cpp
config.cpp
detail/layout/simple_binning.cpp
detail/layout/layout.cpp
detail/layout/execute.cpp
detail/layout/compute_fpr_correction.cpp
detail/layout/compute_layout.cpp
Expand Down
60 changes: 60 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <cassert>
#include <charconv>
#include <iostream>

#include <hibf/config.hpp>
#include <hibf/detail/prefixes.hpp>

#include <cereal/archives/json.hpp>

namespace hibf
{

void config::read_from(std::istream & stream)
{
std::string line;
std::stringstream config_str;

while (std::getline(stream, line) && line != prefix::meta_hibf_config_start)
;

assert(line == prefix::meta_hibf_config_start);

// TODO ##CONFIG: as prefix
while (std::getline(stream, line) && line != prefix::meta_hibf_config_end)
{
assert(line.size() >= 2);
assert(std::string_view{line}.substr(0, 1) == hibf::prefix::meta_header);
config_str << line.substr(1); // remove hibf::prefix::meta_header
}

assert(line == prefix::meta_hibf_config_end);

cereal::JSONInputArchive iarchive(config_str);
iarchive(*this);
}

void config::write_to(std::ostream & stream) const
{
// write json file to temprorary string stream with cereal
std::stringstream config_stream{};
cereal::JSONOutputArchive output(config_stream); // stream to cout
output(cereal::make_nvp("hibf_config", *this));

// write config
stream << prefix::meta_hibf_config_start << '\n';
std::string line;
while (std::getline(config_stream, line, '\n'))
stream << prefix::meta_header << line << '\n';
stream << prefix::meta_header << "}\n" // last closing bracket isn't written by loop above
<< prefix::meta_hibf_config_end << '\n';
}

} // namespace hibf
144 changes: 144 additions & 0 deletions src/detail/layout/layout.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <cassert>
#include <charconv>
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <system_error>
#include <vector>

#include <hibf/config.hpp>
#include <hibf/detail/layout/layout.hpp>
#include <hibf/detail/prefixes.hpp>

namespace hibf::layout
{

/*!\brief Parses one content line of a layout file into a user bin record.
 * \param[in] current_line A line with three tab separated columns:
 *                         `<user bin idx>\t<technical bin idx>[;<technical bin idx>...]\t<count>[;<count>...]`.
 * \returns The user bin with
 *          * `idx`: the number in the first column,
 *          * `storage_TB_id`: the last technical bin index of the second column,
 *          * `previous_TB_indices`: all other technical bin indices of the second column, in order,
 *          * `number_of_technical_bins`: the last number of the third column.
 * \throws std::runtime_error If a column is missing or a number cannot be parsed.
 * \details
 * Anything that follows the last number of the third column (other than a `;`) is ignored.
 * Malformed lines raise an exception instead of relying on `assert`, because a truncated line would otherwise
 * read past the end of the line when assertions are disabled.
 */
hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
{
    hibf::layout::layout::user_bin result{};

    // std::from_chars takes `char const *`; string(_view) iterators are only pointers on some standard libraries.
    char const * cursor = current_line.data();
    char const * const line_end = cursor + current_line.size();

    auto fail = [&current_line](std::string const & reason) -> std::runtime_error
    {
        return std::runtime_error{"Malformed layout line \"" + current_line + "\": " + reason};
    };

    // Reads a number at the cursor and advances the cursor behind it.
    auto read_number = [&](std::string const & what) -> size_t
    {
        size_t value{};
        auto const [next, error] = std::from_chars(cursor, line_end, value);
        if (error != std::errc{})
            throw fail("expected " + what + ".");
        cursor = next;
        return value;
    };

    // Bounds-safe check of the character at the cursor.
    auto at = [&](char const expected) -> bool
    {
        return cursor != line_end && *cursor == expected;
    };

    // column 1: user bin index
    result.idx = read_number("a user bin index");
    if (!at('\t'))
        throw fail("expected a tab after the user bin index.");

    // column 2: technical bin indices, one per level, joined with ';'
    do
    {
        ++cursor; // skip tab or ;
        result.previous_TB_indices.push_back(read_number("a technical bin index"));
    }
    while (at(';'));

    if (!at('\t'))
        throw fail("expected a tab after the technical bin indices.");

    // The last index is the bin the user bin is stored in; the ones before describe the path to it.
    result.storage_TB_id = result.previous_TB_indices.back();
    result.previous_TB_indices.pop_back();

    // column 3: number of technical bins, one per level, joined with ';'
    do
    {
        ++cursor; // skip tab or ;
        result.number_of_technical_bins = read_number("a number of technical bins"); // only the last one counts
    }
    while (at(';'));

    return result;
}

void hibf::layout::layout::read_from(std::istream & stream)
{
// parse header
auto parse_bin_indices = [](std::string_view const & buffer)
{
std::vector<size_t> result;

auto buffer_start = &buffer[0];
auto const buffer_end = buffer_start + buffer.size();

size_t tmp{};

while (buffer_start < buffer_end)
{
buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
++buffer_start; // skip ;
result.push_back(tmp);
}

return result;
};

auto parse_first_bin = [](std::string_view const & buffer)
{
size_t tmp{};
std::from_chars(&buffer[0], &buffer[0] + buffer.size(), tmp);
return tmp;
};

std::string line;

std::getline(stream, line); // get first line that is always the max bin index of the top level bin
assert(line.starts_with(prefix::layout_first_header_line));

// parse High Level max bin index
constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size();
assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size)
== prefix::layout_fullest_technical_bin_idx);
std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2
+ fullest_tbx_prefix_size,
line.end()};
top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);

// read and parse header records, in order to sort them before adding them to the graph
while (std::getline(stream, line) && line != prefix::layout_column_names)
{
assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level);

// parse header line
std::string_view const indices_str{
line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/,
std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')};

assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size)
== prefix::layout_fullest_technical_bin_idx);
std::string_view const max_id_str{line.begin() + prefix::layout_lower_level.size() + indices_str.size()
+ fullest_tbx_prefix_size + 3,
line.end()};

max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
}

assert(line == prefix::layout_column_names);

// parse the rest of the file
while (std::getline(stream, line))
user_bins.emplace_back(parse_layout_line(line));
}

void hibf::layout::layout::write_to(std::ostream & stream) const
{
// write layout header with max bin ids
stream << prefix::layout_first_header_line << " " << prefix::layout_fullest_technical_bin_idx
<< top_level_max_bin_id << '\n';
for (auto const & max_bin : max_bins)
stream << max_bin << '\n';

// write header line
stream << prefix::layout_column_names << '\n';

// write layout entries
for (auto const & user_bin : user_bins)
stream << user_bin << '\n';
}

} // namespace hibf::layout
15 changes: 15 additions & 0 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
build_index(*this, configuration, layout);
}

/*!\brief Builds the HIBF from a layout stream instead of computing a layout.
 * \param[in] input_fn The function that provides the content of the user bins; it is stored in the config that
 *                     is read from the stream.
 * \param[in,out] layout_stream The stream that holds the config followed by the layout. It is read sequentially:
 *                              first by hibf::config::read_from, then by hibf::layout::layout::read_from.
 */
hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(
    std::function<void(size_t const, insert_iterator &&)> input_fn,
    std::istream & layout_stream)
{
    config stored_config;
    layout::layout stored_layout;

    // Both parts come from the same stream, so the order matters: the config section precedes the layout.
    stored_config.read_from(layout_stream);
    stored_layout.read_from(layout_stream);

    // The input function is not part of the serialised config and has to be set by hand.
    stored_config.input_fn = input_fn;

    build_index(*this, stored_config, stored_layout);
}

} // namespace hibf
1 change: 1 addition & 0 deletions test/unit/hibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_subdirectories ()

hibf_test (config_test.cpp)
hibf_test (hierarchical_interleaved_bloom_filter_test.cpp)
hibf_test (interleaved_bloom_filter_test.cpp)
Loading

1 comment on commit 1e269ec

@vercel
Copy link

@vercel vercel bot commented on 1e269ec Aug 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

hibf – ./

hibf.vercel.app
hibf-seqan.vercel.app
hibf-git-main-seqan.vercel.app

Please sign in to comment.