Skip to content

Commit

Permalink
adaptations
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Aug 23, 2023
1 parent 525c4a6 commit 1babecf
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 92 deletions.
4 changes: 2 additions & 2 deletions include/hibf/detail/layout/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct layout
requires std::derived_from<stream_type, std::ostream>
friend stream_type & operator<<(stream_type & stream, max_bin const & object)
{
stream << prefix::header << prefix::merged_bin << '_';
stream << prefix::layout_header << prefix::layout_lower_level << '_';
auto it = object.previous_TB_indices.begin();
auto end = object.previous_TB_indices.end();
// If not empty, we join with ';'
Expand All @@ -37,7 +37,7 @@ struct layout
while (++it != end)
stream << ';' << *it;
}
stream << " max_bin_id:" << object.id;
stream << " " << prefix::layout_fullest_technical_bin_idx << object.id;

return stream;
}
Expand Down
69 changes: 50 additions & 19 deletions include/hibf/detail/prefixes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,55 @@

namespace hibf::prefix
{

constexpr std::string_view chopper{"chopper"};

constexpr std::string_view header{"#"};

constexpr std::string_view header_config{"#"};

constexpr std::string_view high_level{"HIGH_LEVEL_IBF"};

constexpr std::string_view first_header_line{"#HIGH_LEVEL_IBF"};
// Compile-time checks that the composed literal stays consistent with its parts.
static_assert(first_header_line.starts_with(header));
static_assert(first_header_line.ends_with(high_level));

constexpr std::string_view merged_bin{"MERGED_BIN"};

constexpr std::string_view split_bin{"SPLIT_BIN"};

constexpr std::string_view column_ids{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"};
static_assert(column_ids.starts_with(header));
/* These prefixes are used when writing and reading the layout file.
 * The file is structured like this:
 *
 * [0) Optional metadata added by chopper/raptor-layout]
 *  1) Metadata: the hibf config
 *  2) Layout header: one line per (top-level or lower-level) IBF naming its fullest technical bin index
 *  3) Layout content: assignment of user bin index to technical bin index
 *
 * The sections are marked like this:
 * [0) First character is @; start and end of each metadata section should be marked in the same way as in (1)]
 *  1) First character is @; start and end are marked by @HIBF_CONFIG and @HIBF_CONFIG_END respectively
 *  2) First character is #
 *  3) No mark, plain content
 *
 * Example:
 *
 * ```
 * @CHOPPER_USER_BINS
 * @0 /path/to/file1.fa
 * @CHOPPER_USER_BINS_END
 * @CHOPPER_CONFIG
 * @0 k = 20
 * @CHOPPER_CONFIG_END
 * @HIBF_CONFIG
 * @{
 * @ ... JSON-serialised hibf config, every line prefixed with @ ...
 * @}
 * @HIBF_CONFIG_END
 * #TOP_LEVEL_IBF fullest_technical_bin_idx:111
 * #LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
 * #USER_BIN_IDX<tab>TECHNICAL_BIN_INDICES<tab>NUMBER_OF_TECHNICAL_BINS
 * ... layout content ...
 * ```
 */

// First character of every metadata line.
constexpr std::string_view meta_header{"@"};

// Line that opens the hibf config metadata section.
constexpr std::string_view meta_hibf_config_start{"@HIBF_CONFIG"};
static_assert(meta_hibf_config_start.starts_with(meta_header));

// Line that closes the hibf config metadata section.
constexpr std::string_view meta_hibf_config_end{"@HIBF_CONFIG_END"};
static_assert(meta_hibf_config_end.starts_with(meta_header));

// First character of every layout header line.
constexpr std::string_view layout_header{"#"};

// Name used in the first layout header line (see layout_first_header_line).
constexpr std::string_view layout_top_level{"TOP_LEVEL_IBF"};

// Name used in the remaining layout header lines; followed by '_' and the ';'-joined bin indices.
constexpr std::string_view layout_lower_level{"LOWER_LEVEL_IBF"};

// Key that directly precedes the bin index at the end of each layout header line.
constexpr std::string_view layout_fullest_technical_bin_idx{"fullest_technical_bin_idx:"};

// Start of the first layout header line: layout_header followed by layout_top_level.
constexpr std::string_view layout_first_header_line{"#TOP_LEVEL_IBF"};
static_assert(layout_first_header_line.starts_with(layout_header));
static_assert(layout_first_header_line.ends_with(layout_top_level));

// Tab-separated column names; this line separates the layout header from the layout content.
constexpr std::string_view layout_column_names{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"};
static_assert(layout_column_names.starts_with(layout_header));

} // namespace hibf::prefix
27 changes: 16 additions & 11 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,19 @@ void config::read_from(std::istream & stream)
std::string line;
std::stringstream config_str;

std::getline(stream, line);
assert(line == "##CONFIG:");
while(std::getline(stream, line) && line != prefix::meta_hibf_config_start);

while (std::getline(stream, line) && line.size() >= 3 && std::string_view{line}.substr(0, 1) == hibf::prefix::header
&& std::string_view{line}.substr(1, 1) == hibf::prefix::header_config && line != "##ENDCONFIG")
config_str << line.substr(2); // remove hibf::prefix::header & hibf::prefix::header_config
assert(line == prefix::meta_hibf_config_start);

assert(line == "##ENDCONFIG");
// TODO ##CONFIG: as prefix
while (std::getline(stream, line) && line != prefix::meta_hibf_config_end)
{
assert(line.size() >= 2);
assert(std::string_view{line}.substr(0, 1) == hibf::prefix::meta_header);
config_str << line.substr(1); // remove hibf::prefix::meta_header
}

assert(line == prefix::meta_hibf_config_end);

cereal::JSONInputArchive iarchive(config_str);
iarchive(*this);
Expand All @@ -40,15 +45,15 @@ void config::write_to(std::ostream & stream) const
// write the JSON to a temporary string stream with cereal
std::stringstream config_stream{};
cereal::JSONOutputArchive output(config_stream); // stream to cout
output(cereal::make_nvp("config", *this));
output(cereal::make_nvp("hibf_config", *this));

// write config
stream << prefix::header << prefix::header_config << "CONFIG:\n";
stream << prefix::meta_hibf_config_start << '\n';
std::string line;
while (std::getline(config_stream, line, '\n'))
stream << prefix::header << prefix::header_config << line << '\n';
stream << prefix::header << prefix::header_config << "}\n" // last closing bracket isn't written by loop above
<< prefix::header << prefix::header_config << "ENDCONFIG\n";
stream << prefix::meta_header << line << '\n';
stream << prefix::meta_header << "}\n" // last closing bracket isn't written by loop above
<< prefix::meta_hibf_config_end << '\n';
}

} // namespace hibf
26 changes: 14 additions & 12 deletions src/detail/layout/layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,31 +88,32 @@ void hibf::layout::layout::read_from(std::istream & stream)
std::string line;

std::getline(stream, line); // get first line that is always the max bin index of the top level bin
assert(line.substr(0, hibf::prefix::first_header_line.size()) == hibf::prefix::first_header_line);
assert(line.starts_with(prefix::layout_first_header_line));

// parse High Level max bin index
assert(line.substr(hibf::prefix::high_level.size() + 2, 11) == "max_bin_id:");
std::string_view const hibf_max_bin_str{line.begin() + 27, line.end()};
constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size();
assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size) == prefix::layout_fullest_technical_bin_idx);
std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2 + fullest_tbx_prefix_size, line.end()};
top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);

// read and parse header records, in order to sort them before adding them to the graph
while (std::getline(stream, line) && line != hibf::prefix::column_ids)
while (std::getline(stream, line) && line != prefix::layout_column_names)
{
assert(line.substr(1, hibf::prefix::merged_bin.size()) == hibf::prefix::merged_bin);
assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level);

// parse header line
std::string_view const indices_str{
line.begin() + 1 /*#*/ + hibf::prefix::merged_bin.size() + 1 /*_*/,
std::find(line.begin() + hibf::prefix::merged_bin.size() + 2, line.end(), ' ')};
line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/,
std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')};

assert(line.substr(hibf::prefix::merged_bin.size() + indices_str.size() + 3, 11) == "max_bin_id:");
std::string_view const max_id_str{line.begin() + hibf::prefix::merged_bin.size() + indices_str.size() + 14,
assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size) == prefix::layout_fullest_technical_bin_idx);
std::string_view const max_id_str{line.begin() + prefix::layout_lower_level.size() + indices_str.size() + fullest_tbx_prefix_size + 3,
line.end()};

max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
}

assert(line == hibf::prefix::column_ids);
assert(line == prefix::layout_column_names);

// parse the rest of the file
while (std::getline(stream, line))
Expand All @@ -122,12 +123,13 @@ void hibf::layout::layout::read_from(std::istream & stream)
void hibf::layout::layout::write_to(std::ostream & stream) const
{
// write layout header with max bin ids
stream << prefix::first_header_line << " max_bin_id:" << top_level_max_bin_id << '\n';
stream << prefix::layout_first_header_line << " "
<< prefix::layout_fullest_technical_bin_idx << top_level_max_bin_id << '\n';
for (auto const & max_bin : max_bins)
stream << max_bin << '\n';

// write header line
stream << prefix::column_ids << '\n';
stream << prefix::layout_column_names << '\n';

// write layout entries
for (auto const & user_bin : user_bins)
Expand Down
115 changes: 97 additions & 18 deletions test/unit/hibf/config_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,24 +28,103 @@ TEST(config_test, write_to)

configuration.write_to(ss);

std::string const expected_file{"##CONFIG:\n"
"##{\n"
"## \"config\": {\n"
"## \"version\": 1,\n"
"## \"number_of_user_bins\": 123456789,\n"
"## \"number_of_hash_functions\": 4,\n"
"## \"maximum_false_positive_rate\": 0.0001,\n"
"## \"threads\": 31,\n"
"## \"sketch_bits\": 8,\n"
"## \"tmax\": 128,\n"
"## \"alpha\": 1.0,\n"
"## \"max_rearrangement_ratio\": 0.333,\n"
"## \"disable_estimate_union\": true,\n"
"## \"disable_rearrangement\": false,\n"
"## \"disable_cutoffs\": false\n"
"## }\n"
"##}\n"
"##ENDCONFIG\n"};
std::string const expected_file{"@HIBF_CONFIG\n"
"@{\n"
"@ \"hibf_config\": {\n"
"@ \"version\": 1,\n"
"@ \"number_of_user_bins\": 123456789,\n"
"@ \"number_of_hash_functions\": 4,\n"
"@ \"maximum_false_positive_rate\": 0.0001,\n"
"@ \"threads\": 31,\n"
"@ \"sketch_bits\": 8,\n"
"@ \"tmax\": 128,\n"
"@ \"alpha\": 1.0,\n"
"@ \"max_rearrangement_ratio\": 0.333,\n"
"@ \"disable_estimate_union\": true,\n"
"@ \"disable_rearrangement\": false,\n"
"@ \"disable_cutoffs\": false\n"
"@ }\n"
"@}\n"
"@HIBF_CONFIG_END\n"};

EXPECT_EQ(ss.str(), expected_file);
}

// Verifies that config::read_from() restores the asserted fields from a stream that
// contains only the hibf config section (@HIBF_CONFIG ... @HIBF_CONFIG_END).
TEST(config_test, read_from)
{
    std::stringstream ss{"@HIBF_CONFIG\n"
                         "@{\n"
                         "@ \"hibf_config\": {\n"
                         "@ \"version\": 1,\n"
                         "@ \"number_of_user_bins\": 123456789,\n"
                         "@ \"number_of_hash_functions\": 4,\n"
                         "@ \"maximum_false_positive_rate\": 0.0001,\n"
                         "@ \"threads\": 31,\n"
                         "@ \"sketch_bits\": 8,\n"
                         "@ \"tmax\": 128,\n"
                         "@ \"alpha\": 1.0,\n"
                         "@ \"max_rearrangement_ratio\": 0.333,\n"
                         "@ \"disable_estimate_union\": true,\n"
                         "@ \"disable_rearrangement\": false,\n"
                         "@ \"disable_cutoffs\": false\n"
                         "@ }\n"
                         "@}\n"
                         "@HIBF_CONFIG_END\n"};

    hibf::config configuration;
    configuration.read_from(ss);

    // Integral fields must match exactly.
    EXPECT_EQ(configuration.number_of_user_bins, 123456789);
    EXPECT_EQ(configuration.number_of_hash_functions, 4);
    EXPECT_EQ(configuration.threads, 31);
    EXPECT_EQ(configuration.sketch_bits, 8);
    EXPECT_EQ(configuration.tmax, 128);

    // Floating-point fields went through a text round trip; compare with ULP tolerance
    // instead of operator==.
    EXPECT_DOUBLE_EQ(configuration.maximum_false_positive_rate, 0.0001);
    EXPECT_DOUBLE_EQ(configuration.alpha, 1.0);
    EXPECT_DOUBLE_EQ(configuration.max_rearrangement_ratio, 0.333);

    // Boolean flags.
    EXPECT_TRUE(configuration.disable_estimate_union);
    EXPECT_FALSE(configuration.disable_rearrangement);
    EXPECT_FALSE(configuration.disable_cutoffs);
}

// Verifies that config::read_from() skips unrelated metadata lines that precede the
// @HIBF_CONFIG marker and still restores the asserted fields.
TEST(config_test, read_from_with_more_meta)
{
    std::stringstream ss{"@blah some chopper stuff\n"
                         "@blah some chopper stuff\n"
                         "@blah some chopper stuff\n"
                         "@blah some chopper stuff\n"
                         "@blah some chopper stuff\n"
                         "@HIBF_CONFIG\n"
                         "@{\n"
                         "@ \"hibf_config\": {\n"
                         "@ \"version\": 1,\n"
                         "@ \"number_of_user_bins\": 123456789,\n"
                         "@ \"number_of_hash_functions\": 4,\n"
                         "@ \"maximum_false_positive_rate\": 0.0001,\n"
                         "@ \"threads\": 31,\n"
                         "@ \"sketch_bits\": 8,\n"
                         "@ \"tmax\": 128,\n"
                         "@ \"alpha\": 1.0,\n"
                         "@ \"max_rearrangement_ratio\": 0.333,\n"
                         "@ \"disable_estimate_union\": true,\n"
                         "@ \"disable_rearrangement\": false,\n"
                         "@ \"disable_cutoffs\": false\n"
                         "@ }\n"
                         "@}\n"
                         "@HIBF_CONFIG_END\n"};

    hibf::config configuration;
    configuration.read_from(ss);

    // Integral fields must match exactly.
    EXPECT_EQ(configuration.number_of_user_bins, 123456789);
    EXPECT_EQ(configuration.number_of_hash_functions, 4);
    EXPECT_EQ(configuration.threads, 31);
    EXPECT_EQ(configuration.sketch_bits, 8);
    EXPECT_EQ(configuration.tmax, 128);

    // Floating-point fields went through a text round trip; compare with ULP tolerance
    // instead of operator==.
    EXPECT_DOUBLE_EQ(configuration.maximum_false_positive_rate, 0.0001);
    EXPECT_DOUBLE_EQ(configuration.alpha, 1.0);
    EXPECT_DOUBLE_EQ(configuration.max_rearrangement_ratio, 0.333);

    // Boolean flags.
    EXPECT_TRUE(configuration.disable_estimate_union);
    EXPECT_FALSE(configuration.disable_rearrangement);
    EXPECT_FALSE(configuration.disable_cutoffs);
}
22 changes: 11 additions & 11 deletions test/unit/hibf/detail/layout/layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ TEST(layout_test, printing_max_bins)
for (auto const & mb : layout.max_bins)
ss << mb << "\n";

std::string expected = R"mb(#MERGED_BIN_ max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
std::string expected = R"mb(#LOWER_LEVEL_IBF_ fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
)mb";

EXPECT_EQ(ss.str(), expected);
Expand Down Expand Up @@ -67,10 +67,10 @@ TEST(layout_test, write_to)

layout.write_to(ss);

std::string expected = R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111
#MERGED_BIN_0 max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
std::string expected = R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
Expand All @@ -82,10 +82,10 @@ TEST(layout_test, write_to)

TEST(layout_test, read_from)
{
std::stringstream ss{R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111
#MERGED_BIN_0 max_bin_id:0
#MERGED_BIN_2 max_bin_id:2
#MERGED_BIN_1;2;3;4 max_bin_id:22
std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS
7 0 1
4 1;0 1;22
Expand Down
Loading

0 comments on commit 1babecf

Please sign in to comment.