diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp
index bdebda25..b6450842 100644
--- a/include/hibf/config.hpp
+++ b/include/hibf/config.hpp
@@ -11,6 +11,7 @@
 #include // for size_t
 #include // for path
 #include // for function
+#include // for ostream
 #include // for insert_iterator
 #include // for unordered_flat_set
 
@@ -85,6 +86,11 @@ struct config
     // bool compressed{false};
     //!\}
 
+    //!\brief Reads the config from `stream`; expects the header block that write_to() produces.
+    void read_from(std::istream & stream);
+    //!\brief Writes the config to `stream` as JSON, one prefixed header line per JSON line.
+    void write_to(std::ostream & stream) const;
+
 private:
     friend class cereal::access;
 
diff --git a/include/hibf/detail/layout/layout.hpp b/include/hibf/detail/layout/layout.hpp
index d884d237..193243e0 100644
--- a/include/hibf/detail/layout/layout.hpp
+++ b/include/hibf/detail/layout/layout.hpp
@@ -86,6 +86,11 @@ struct layout
         }
     };
 
+    //!\brief Reads the max bin header lines and the user bin records from `stream`.
+    void read_from(std::istream & stream);
+    //!\brief Writes the max bin header lines, the column header line and all user bin records to `stream`.
+    void write_to(std::ostream & stream) const;
+
     size_t top_level_max_bin_id{};
     std::vector max_bins{};
     std::vector user_bins{};
diff --git a/include/hibf/hierarchical_interleaved_bloom_filter.hpp b/include/hibf/hierarchical_interleaved_bloom_filter.hpp
index 5f026707..845ab18e 100644
--- a/include/hibf/hierarchical_interleaved_bloom_filter.hpp
+++ b/include/hibf/hierarchical_interleaved_bloom_filter.hpp
@@ -105,6 +105,8 @@ class hierarchical_interleaved_bloom_filter
     ~hierarchical_interleaved_bloom_filter() = default; //!< Defaulted.
 
     hierarchical_interleaved_bloom_filter(config const & configuration);
+    //!\brief Builds the index from the config and layout stored in `layout_filename`, using `input_fn` as input.
+    hierarchical_interleaved_bloom_filter(std::function input_fn, std::filesystem::path const & layout_filename);
     //!\}
 
     //!\brief The individual interleaved Bloom filters.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cb361587..a1f76fab 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,8 @@
 set (HIBF_SOURCE_FILES
      hierarchical_interleaved_bloom_filter.cpp
+     config.cpp
      detail/layout/simple_binning.cpp
+     detail/layout/layout.cpp
      detail/layout/execute.cpp
      detail/layout/compute_fpr_correction.cpp
      detail/layout/compute_layout.cpp
diff --git a/src/config.cpp b/src/config.cpp
new file mode 100644
index 00000000..f4bde3ed
--- /dev/null
+++ b/src/config.cpp
@@ -0,0 +1,63 @@
+// ---------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
+// ---------------------------------------------------------------------------------------------------
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace hibf
+{
+
+// Restores this config from the header block that config::write_to produces.
+void config::read_from(std::istream & stream)
+{
+    std::string line;
+    std::stringstream config_str;
+
+    std::getline(stream, line);
+    assert(line == "##CONFIG:");
+
+    // Collect the JSON payload. This loop also consumes the terminating "##ENDCONFIG" line.
+    while (std::getline(stream, line) && line.size() >= 3
+           && std::string_view{line}.substr(0, 1) == hibf::prefix::header
+           && std::string_view{line}.substr(1, 1) == hibf::prefix::header_config && line != "##ENDCONFIG")
+        config_str << line.substr(2); // remove hibf::prefix::header & hibf::prefix::header_config
+
+    assert(line == "##ENDCONFIG");
+
+    cereal::JSONInputArchive iarchive(config_str);
+    iarchive(*this);
+
+    // No further line is read here: "##ENDCONFIG" is already consumed, and the next line belongs to
+    // whatever follows the config block in the stream.
+}
+
+// Writes this config as JSON to `stream`; every line carries prefix::header and prefix::header_config.
+void config::write_to(std::ostream & stream) const
+{
+    // write json to temporary string stream with cereal
+    std::stringstream config_stream{};
+    {
+        // The archive completes the JSON document (last closing bracket) when it is destroyed.
+        cereal::JSONOutputArchive output(config_stream);
+        output(cereal::make_nvp("config", *this));
+    }
+
+    // write config
+    stream << prefix::header << prefix::header_config << "CONFIG:\n";
+    std::string line;
+    while (std::getline(config_stream, line, '\n'))
+        stream << prefix::header << prefix::header_config << line << '\n';
+    stream << prefix::header << prefix::header_config << "ENDCONFIG\n";
+}
+
+} // namespace hibf
diff --git a/src/detail/layout/layout.cpp b/src/detail/layout/layout.cpp
new file mode 100644
index 00000000..132f4b21
--- /dev/null
+++ b/src/detail/layout/layout.cpp
@@ -0,0 +1,164 @@
+// ---------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
+// ---------------------------------------------------------------------------------------------------
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+namespace hibf::layout
+{
+
+// Parses one user bin record of the form
+// <user_bin_idx> TAB <tb_idx>[;<tb_idx>...] TAB <num_tbs>[;<num_tbs>...], as written by layout::write_to.
+inline hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
+{
+    hibf::layout::layout::user_bin result{};
+
+    size_t tmp{}; // integer buffer when reading numbers
+
+    // initialize parsing
+    char const * field_end = current_line.data();
+    char const * const buffer_end = field_end + current_line.size();
+    assert(field_end != buffer_end);
+
+    // read user bin index; it is the first column, so nothing must be skipped before it
+    field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+    result.idx = tmp;
+    assert(field_end != buffer_end && *field_end == '\t');
+
+    do // read bin_indices
+    {
+        ++field_end; // skip tab or ;
+        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+        result.previous_TB_indices.push_back(tmp);
+    }
+    while (field_end != buffer_end && *field_end != '\t');
+
+    // the last parsed index is the storage technical bin; the ones before it stay in previous_TB_indices
+    result.storage_TB_id = result.previous_TB_indices.back();
+    result.previous_TB_indices.pop_back();
+
+    assert(field_end != buffer_end); // the column with the number of technical bins must follow
+
+    do // read number of technical bins
+    {
+        ++field_end; // skip tab or ;
+        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+        result.number_of_technical_bins = tmp; // only the last number really counts
+    }
+    while (field_end != buffer_end && *field_end != '\t');
+
+    return result;
+}
+
+// Reads a layout in the format written by layout::write_to: top level line, merged bin lines, records.
+void hibf::layout::layout::read_from(std::istream & stream)
+{
+    // parses a ';' separated list of numbers; an empty buffer yields an empty list
+    auto parse_bin_indices = [](std::string_view const & buffer)
+    {
+        std::vector<size_t> result;
+
+        char const * buffer_start = buffer.data();
+        char const * const buffer_end = buffer_start + buffer.size();
+
+        size_t tmp{};
+
+        while (buffer_start < buffer_end)
+        {
+            buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
+            ++buffer_start; // skip ;
+            result.push_back(tmp);
+        }
+
+        return result;
+    };
+
+    // parses the number at the start of buffer; yields 0 if there is none
+    auto parse_first_bin = [](std::string_view const & buffer)
+    {
+        size_t tmp{};
+        std::from_chars(buffer.data(), buffer.data() + buffer.size(), tmp);
+        return tmp;
+    };
+
+    // returns the part of str that starts at pos, or an empty view if str is shorter than pos
+    auto suffix_from = [](std::string_view const str, size_t const pos)
+    {
+        return str.substr(pos < str.size() ? pos : str.size());
+    };
+
+    constexpr std::string_view max_bin_id_key{"max_bin_id:"};
+
+    std::string line;
+
+    // the first line holds the max bin id of the top level
+    std::getline(stream, line);
+    assert(!line.empty() && line[0] == '#'); // still reading header lines
+    assert(line.substr(1, hibf::prefix::high_level.size()) == hibf::prefix::high_level);
+
+    // parse High Level max bin index
+    size_t const top_level_key_pos = 1 /*#*/ + hibf::prefix::high_level.size() + 1 /*space*/;
+    assert(suffix_from(line, top_level_key_pos).substr(0, max_bin_id_key.size()) == max_bin_id_key);
+    top_level_max_bin_id = parse_first_bin(suffix_from(line, top_level_key_pos + max_bin_id_key.size()));
+
+    // Parse the merged bin header records. The first line that is not such a record (the column header
+    // line written by write_to) is consumed by this loop and ends the header.
+    while (std::getline(stream, line) && !line.empty() && line[0] == '#'
+           && line.substr(1, hibf::prefix::merged_bin.size()) == hibf::prefix::merged_bin)
+    {
+        // the bin indices sit between "#<merged_bin>_" and the next space
+        size_t const indices_begin = 1 /*#*/ + hibf::prefix::merged_bin.size() + 1 /*_*/;
+        std::string_view const indices_and_rest = suffix_from(line, indices_begin);
+        std::string_view const indices_str = indices_and_rest.substr(0, indices_and_rest.find(' '));
+
+        size_t const key_pos = indices_begin + indices_str.size() + 1 /*space*/;
+        assert(suffix_from(line, key_pos).substr(0, max_bin_id_key.size()) == max_bin_id_key);
+        std::string_view const max_id_str = suffix_from(line, key_pos + max_bin_id_key.size());
+
+        max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
+    }
+
+    // parse the rest of the file
+    while (std::getline(stream, line))
+    {
+        if (!line.empty()) // tolerate blank lines, e.g. at the end of the stream
+            user_bins.emplace_back(parse_layout_line(line));
+    }
+}
+
+// Writes the layout: top level max bin line, one line per max bin, column header, one line per user bin.
+void hibf::layout::layout::write_to(std::ostream & stream) const
+{
+    // write layout header with max bin ids
+    stream << prefix::first_header_line << " max_bin_id:" << top_level_max_bin_id << '\n';
+    for (auto const & max_bin : max_bins)
+        stream << max_bin << '\n';
+
+    // write header line
+    stream << prefix::header << "USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n";
+
+    // write layout entries
+    for (auto const & user_bin : user_bins)
+    {
+        stream << user_bin.idx << '\t';
+        for (auto bin : user_bin.previous_TB_indices)
+            stream << bin << ';';
+        stream << user_bin.storage_TB_id << '\t';
+        for ([[maybe_unused]] auto && elem : user_bin.previous_TB_indices) // number of bins per merged level is 1
+            stream << "1;";
+        stream << user_bin.number_of_technical_bins;
+        stream << '\n';
+    }
+}
+
+} // namespace hibf::layout
diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp
index 8545010c..c9ca04ff 100644
--- a/src/hierarchical_interleaved_bloom_filter.cpp
+++ b/src/hierarchical_interleaved_bloom_filter.cpp
@@ -199,4 +199,20 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
     build_index(*this, configuration, layout);
 }
 
+hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(std::function input_fn, std::filesystem::path const & layout_filename)
+{
+    // read config and layout from file
+    // NOTE(review): a file that cannot be opened is not reported here; the reads below then see a failed stream.
+    std::ifstream layout_file{layout_filename};
+    config configuration;
+    layout::layout hibf_layout;
+    configuration.read_from(layout_file);
+    hibf_layout.read_from(layout_file);
+
+    configuration.input_fn = input_fn; // the input function is supplied by the caller, not by the file
+
+    // build from the layout that was read from the file instead of computing a new one
+    build_index(*this, configuration, hibf_layout);
+}
+
 } // namespace hibf
diff --git a/test/unit/hibf/detail/layout/layout_test.cpp b/test/unit/hibf/detail/layout/layout_test.cpp
index d3579c4d..85bb1a1f 100644
--- a/test/unit/hibf/detail/layout/layout_test.cpp
+++ b/test/unit/hibf/detail/layout/layout_test.cpp
@@ -49,3 +49,32 @@ TEST(layout_test, printing_user_bins)
 
     EXPECT_EQ(ss.str(), expected);
 }
+
+TEST(layout_test, write_to)
+{
+    std::stringstream ss{};
+
+    hibf::layout::layout layout;
+
+    layout.top_level_max_bin_id = 111;
+    layout.max_bins.emplace_back(std::vector{}, 0);
+    layout.max_bins.emplace_back(std::vector{2}, 2);
+    layout.max_bins.emplace_back(std::vector{1, 2, 3, 4}, 22);
+    layout.user_bins.emplace_back(7, std::vector{}, 1, 0);
+    layout.user_bins.emplace_back(4, std::vector{1}, 22, 0);
+    layout.user_bins.emplace_back(5, std::vector{1, 2, 3, 4}, 21, 22);
+
+    layout.write_to(ss);
+
+    std::string expected = R"layout_file(#HIBF_ max_bin_id:111
+#MERGED_BIN_ max_bin_id:0
+#MERGED_BIN_2 max_bin_id:2
+#MERGED_BIN_1;2;3;4 max_bin_id:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+)layout_file";
+
+    EXPECT_EQ(ss.str(), expected);
+}