Skip to content

Commit

Permalink
[FEATURE] Add layout::read_from and layout::write_to.
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Aug 23, 2023
1 parent 2661b91 commit 991c413
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 0 deletions.
3 changes: 3 additions & 0 deletions include/hibf/detail/layout/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ struct layout
}
};

//!\brief Reads a layout (header lines followed by user bin records) from `stream` and stores it in this object.
void read_from(std::istream & stream);
//!\brief Writes this layout (header lines followed by user bin records) to `stream`.
void write_to(std::ostream & stream) const;

size_t top_level_max_bin_id{};
std::vector<max_bin> max_bins{};
std::vector<user_bin> user_bins{};
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ set (HIBF_SOURCE_FILES
hierarchical_interleaved_bloom_filter.cpp
config.cpp
detail/layout/simple_binning.cpp
detail/layout/layout.cpp
detail/layout/execute.cpp
detail/layout/compute_fpr_correction.cpp
detail/layout/compute_layout.cpp
Expand Down
137 changes: 137 additions & 0 deletions src/detail/layout/layout.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <cassert>
#include <charconv>
#include <iostream>
#include <stdexcept>
#include <string>

#include <hibf/config.hpp>
#include <hibf/detail/layout/layout.hpp>
#include <hibf/detail/prefixes.hpp>

namespace hibf::layout
{

/*!\brief Parses a single user bin record of a layout file.
 * \param[in] current_line The record: `<idx>\t<tb>[;<tb>...]\t<n>[;<n>...]`. Anything after a further tab is ignored.
 * \returns The user bin described by the record.
 * \throws std::invalid_argument If the line is empty, a column separator is missing, or a technical bin index is
 *                               missing.
 *
 * \details
 * Column 1 is the user bin index. Column 2 is a `;`-separated list of technical bin indices: the last one becomes
 * `storage_TB_id`, all preceding ones become `previous_TB_indices`. Column 3 is a `;`-separated list of numbers, of
 * which only the last one is stored as `number_of_technical_bins`.
 *
 * Only the structure of the line is validated. If `std::from_chars` cannot parse a number, the value parsed last is
 * reused.
 */
hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
{
    hibf::layout::layout::user_bin result{};

    size_t tmp{}; // integer buffer when reading numbers

    // std::from_chars is only specified for `char const *`; string(_view) iterators need not be raw pointers.
    char const * field_end = current_line.data();
    char const * const buffer_end = field_end + current_line.size();

    // Asserts vanish in release builds and would let the parser run past the end of the line, hence the exception.
    auto const require = [&current_line](bool const condition, char const * const what)
    {
        if (!condition)
            throw std::invalid_argument{std::string{"Malformed layout line ("} + what + "): " + current_line};
    };

    require(field_end != buffer_end, "line is empty");

    // read user bin index
    field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
    result.idx = tmp;
    require(field_end != buffer_end && *field_end == '\t', "expected a tab after the user bin index");

    do // read bin_indices
    {
        ++field_end; // skip tab or ;
        require(field_end != buffer_end && *field_end != '\t', "expected a technical bin index");
        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
        result.previous_TB_indices.push_back(tmp);
    }
    while (field_end != buffer_end && *field_end != '\t');

    // The last index of the list is the technical bin that stores the user bin.
    result.storage_TB_id = result.previous_TB_indices.back();
    result.previous_TB_indices.pop_back();

    // The loop above stops either at a tab or at the end of the line; the latter means the third column is missing.
    require(field_end != buffer_end, "missing column with the number of technical bins");

    do // read number of technical bins
    {
        ++field_end; // skip tab or ;
        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
        result.number_of_technical_bins = tmp; // only the last number really counts
    }
    while (field_end != buffer_end && *field_end != '\t');

    return result;
}

void hibf::layout::layout::read_from(std::istream & stream)
{
// parse header
auto parse_bin_indices = [](std::string_view const & buffer)
{
std::vector<size_t> result;

auto buffer_start = &buffer[0];
auto const buffer_end = buffer_start + buffer.size();

size_t tmp{};

while (buffer_start < buffer_end)
{
buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
++buffer_start; // skip ;
result.push_back(tmp);
}

return result;
};

auto parse_first_bin = [](std::string_view const & buffer)
{
size_t tmp{};
std::from_chars(&buffer[0], &buffer[0] + buffer.size(), tmp);
return tmp;
};

std::string line;

std::getline(stream, line); // get first line that is always the max bin index of the top level bin
assert(line.substr(0, hibf::prefix::first_header_line.size()) == hibf::prefix::first_header_line);

// parse High Level max bin index
assert(line.substr(hibf::prefix::high_level.size() + 2, 11) == "max_bin_id:");
std::string_view const hibf_max_bin_str{line.begin() + 27, line.end()};
top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);

// read and parse header records, in order to sort them before adding them to the graph
while (std::getline(stream, line) && line != hibf::prefix::column_ids)
{
assert(line.substr(1, hibf::prefix::merged_bin.size()) == hibf::prefix::merged_bin);

// parse header line
std::string_view const indices_str{
line.begin() + 1 /*#*/ + hibf::prefix::merged_bin.size() + 1 /*_*/,
std::find(line.begin() + hibf::prefix::merged_bin.size() + 2, line.end(), ' ')};

assert(line.substr(hibf::prefix::merged_bin.size() + indices_str.size() + 3, 11) == "max_bin_id:");
std::string_view const max_id_str{line.begin() + hibf::prefix::merged_bin.size() + indices_str.size() + 14,
line.end()};

max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
}

assert(line == hibf::prefix::column_ids);

// parse the rest of the file
while (std::getline(stream, line))
user_bins.emplace_back(parse_layout_line(line));
}

void hibf::layout::layout::write_to(std::ostream & stream) const
{
// write layout header with max bin ids
stream << prefix::first_header_line << " max_bin_id:" << top_level_max_bin_id << '\n';
for (auto const & max_bin : max_bins)
stream << max_bin << '\n';

// write header line
stream << prefix::column_ids << '\n';

// write layout entries
for (auto const & user_bin : user_bins)
stream << user_bin << '\n';
}

} // namespace hibf::layout
54 changes: 54 additions & 0 deletions test/unit/hibf/detail/layout/layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <vector> // for vector

#include <hibf/detail/layout/layout.hpp> // for layout, operator<<
#include <hibf/test/expect_range_eq.hpp> // for expect_range_eq, EXPECT_RANGE_EQ

TEST(layout_test, printing_max_bins)
{
Expand Down Expand Up @@ -49,3 +50,56 @@ TEST(layout_test, printing_user_bins)

EXPECT_EQ(ss.str(), expected);
}

// Checks that layout::write_to emits the header lines, the column header and one line per user bin.
TEST(layout_test, write_to)
{
std::stringstream ss{};

hibf::layout::layout layout;

// Header data: the top-level max bin id and three max bin records.
layout.top_level_max_bin_id = 111;
layout.max_bins.emplace_back(std::vector<size_t>{0}, 0);
layout.max_bins.emplace_back(std::vector<size_t>{2}, 2);
layout.max_bins.emplace_back(std::vector<size_t>{1, 2, 3, 4}, 22);
// User bin records. Judging from the expected output below, the arguments appear to be
// (idx, previous TB indices, number of technical bins, storage TB id) -- confirm against the struct definition.
layout.user_bins.emplace_back(7, std::vector<size_t>{}, 1, 0);
layout.user_bins.emplace_back(4, std::vector<size_t>{1}, 22, 0);
layout.user_bins.emplace_back(5, std::vector<size_t>{1, 2, 3, 4}, 21, 22);

layout.write_to(ss);

// The raw string is compared verbatim, including the newline after the last record.
std::string expected = R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111
#MERGED_BIN_0 max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
)layout_file";

EXPECT_EQ(ss.str(), expected);
}

// Checks that layout::read_from restores the top-level max bin id, all max bin records and all user bin records
// from the textual layout format.
TEST(layout_test, read_from)
{
    std::stringstream ss{R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111
#MERGED_BIN_0 max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
5 1;2;3;4;22 1;1;1;1;21
)layout_file"};

    hibf::layout::layout layout;
    layout.read_from(ss);

    EXPECT_EQ(layout.top_level_max_bin_id, 111u);

    // Stop here if records are missing: the element accesses below would otherwise read out of bounds.
    ASSERT_EQ(layout.max_bins.size(), 3u);
    EXPECT_EQ(layout.max_bins[0], (hibf::layout::layout::max_bin{{0}, 0}));
    EXPECT_EQ(layout.max_bins[1], (hibf::layout::layout::max_bin{{2}, 2}));
    EXPECT_EQ(layout.max_bins[2], (hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22}));

    ASSERT_EQ(layout.user_bins.size(), 3u);
    EXPECT_EQ(layout.user_bins[0], (hibf::layout::layout::user_bin{7, std::vector<size_t>{}, 1, 0}));
    EXPECT_EQ(layout.user_bins[1], (hibf::layout::layout::user_bin{4, std::vector<size_t>{1}, 22, 0}));
    EXPECT_EQ(layout.user_bins[2], (hibf::layout::layout::user_bin{5, std::vector<size_t>{1, 2, 3, 4}, 21, 22}));
}

0 comments on commit 991c413

Please sign in to comment.