diff --git a/include/hibf/detail/layout/layout.hpp b/include/hibf/detail/layout/layout.hpp index 193243e0..1f3dbc30 100644 --- a/include/hibf/detail/layout/layout.hpp +++ b/include/hibf/detail/layout/layout.hpp @@ -27,7 +27,7 @@ struct layout requires std::derived_from friend stream_type & operator<<(stream_type & stream, max_bin const & object) { - stream << prefix::header << prefix::merged_bin << '_'; + stream << prefix::layout_header << prefix::layout_lower_level << '_'; auto it = object.previous_TB_indices.begin(); auto end = object.previous_TB_indices.end(); // If not empty, we join with ';' @@ -37,7 +37,7 @@ struct layout while (++it != end) stream << ';' << *it; } - stream << " max_bin_id:" << object.id; + stream << " " << prefix::layout_fullest_technical_bin_idx << object.id; return stream; } diff --git a/include/hibf/detail/prefixes.hpp b/include/hibf/detail/prefixes.hpp index 88ec8d45..2b1fe6e8 100644 --- a/include/hibf/detail/prefixes.hpp +++ b/include/hibf/detail/prefixes.hpp @@ -6,24 +6,55 @@ namespace hibf::prefix { - -constexpr std::string_view chopper{"chopper"}; - -constexpr std::string_view header{"#"}; - -constexpr std::string_view header_config{"#"}; - -constexpr std::string_view high_level{"HIGH_LEVEL_IBF"}; - -constexpr std::string_view first_header_line{"#HIGH_LEVEL_IBF"}; -static_assert(first_header_line.starts_with(header)); -static_assert(first_header_line.ends_with(high_level)); - -constexpr std::string_view merged_bin{"MERGED_BIN"}; - -constexpr std::string_view split_bin{"SPLIT_BIN"}; - -constexpr std::string_view column_ids{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"}; -static_assert(column_ids.starts_with(header)); +/* These prefixes are for writing the layout file + + * It is structured like this: + * + * [0) Possibly metadata added by chopper/raptor-layout] + * 1) Metadata: the hibf config + * 2) Layout header: max bin ids for the merged bins + * 3) Layout content: Assignment of user bin idx to technical bin idx 
+ * + * And marked like this: + * [0) First character is @; Start and End of metadata should be marked according to (1)] + * 1) First character is @; Start and End are marked by @HIBF_CONFIG and @HIBF_CONFIG_END respectively + * 2) First character is #; + * 3) No mark, plain content. + * + * Example: + * + * ``` + * @CHOPPER_USER_BINS + * @0 /path/to/file1.fa + * @CHOPPER_USER_BINS_END + * @CHOPPER_CONFIG + * @0 k = 20 + * @CHOPPER_CONFIG_END + * + * ``` + */ + +constexpr std::string_view meta_header{"@"}; + +constexpr std::string_view meta_hibf_config_start{"@HIBF_CONFIG"}; +static_assert(meta_hibf_config_start.starts_with(meta_header)); + +constexpr std::string_view meta_hibf_config_end{"@HIBF_CONFIG_END"}; +static_assert(meta_hibf_config_end.starts_with(meta_header)); + +constexpr std::string_view layout_header{"#"}; + +constexpr std::string_view layout_top_level{"TOP_LEVEL_IBF"}; + +constexpr std::string_view layout_lower_level{"LOWER_LEVEL_IBF"}; + +constexpr std::string_view layout_fullest_technical_bin_idx{"fullest_technical_bin_idx:"}; + +constexpr std::string_view layout_first_header_line{"#TOP_LEVEL_IBF"}; +static_assert(layout_first_header_line.starts_with(layout_header)); +static_assert(layout_first_header_line.ends_with(layout_top_level)); + +constexpr std::string_view layout_column_names{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"}; +static_assert(layout_column_names.starts_with(layout_header)); } // namespace hibf::prefix diff --git a/src/config.cpp b/src/config.cpp index c56b7ba6..a0290c78 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -22,14 +22,19 @@ void config::read_from(std::istream & stream) std::string line; std::stringstream config_str; - std::getline(stream, line); - assert(line == "##CONFIG:"); + while(std::getline(stream, line) && line != prefix::meta_hibf_config_start); - while (std::getline(stream, line) && line.size() >= 3 && std::string_view{line}.substr(0, 1) == hibf::prefix::header - && 
std::string_view{line}.substr(1, 1) == hibf::prefix::header_config && line != "##ENDCONFIG") - config_str << line.substr(2); // remove hibf::prefix::header & hibf::prefix::header_config + assert(line == prefix::meta_hibf_config_start); - assert(line == "##ENDCONFIG"); + // TODO ##CONFIG: as prefix + while (std::getline(stream, line) && line != prefix::meta_hibf_config_end) + { + assert(line.size() >= 2); + assert(std::string_view{line}.substr(0, 1) == hibf::prefix::meta_header); + config_str << line.substr(1); // remove hibf::prefix::meta_header + } + + assert(line == prefix::meta_hibf_config_end); cereal::JSONInputArchive iarchive(config_str); iarchive(*this); @@ -40,15 +45,15 @@ void config::write_to(std::ostream & stream) const // write json file to temprorary string stream with cereal std::stringstream config_stream{}; cereal::JSONOutputArchive output(config_stream); // stream to cout - output(cereal::make_nvp("config", *this)); + output(cereal::make_nvp("hibf_config", *this)); // write config - stream << prefix::header << prefix::header_config << "CONFIG:\n"; + stream << prefix::meta_hibf_config_start << '\n'; std::string line; while (std::getline(config_stream, line, '\n')) - stream << prefix::header << prefix::header_config << line << '\n'; - stream << prefix::header << prefix::header_config << "}\n" // last closing bracket isn't written by loop above - << prefix::header << prefix::header_config << "ENDCONFIG\n"; + stream << prefix::meta_header << line << '\n'; + stream << prefix::meta_header << "}\n" // last closing bracket isn't written by loop above + << prefix::meta_hibf_config_end << '\n'; } } // namespace hibf diff --git a/src/detail/layout/layout.cpp b/src/detail/layout/layout.cpp index c615c504..00eea9f7 100644 --- a/src/detail/layout/layout.cpp +++ b/src/detail/layout/layout.cpp @@ -88,31 +88,32 @@ void hibf::layout::layout::read_from(std::istream & stream) std::string line; std::getline(stream, line); // get first line that is always the max bin 
index of the top level bin - assert(line.substr(0, hibf::prefix::first_header_line.size()) == hibf::prefix::first_header_line); + assert(line.starts_with(prefix::layout_first_header_line)); // parse High Level max bin index - assert(line.substr(hibf::prefix::high_level.size() + 2, 11) == "max_bin_id:"); - std::string_view const hibf_max_bin_str{line.begin() + 27, line.end()}; + constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size(); + assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size) == prefix::layout_fullest_technical_bin_idx); + std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2 + fullest_tbx_prefix_size, line.end()}; top_level_max_bin_id = parse_first_bin(hibf_max_bin_str); // read and parse header records, in order to sort them before adding them to the graph - while (std::getline(stream, line) && line != hibf::prefix::column_ids) + while (std::getline(stream, line) && line != prefix::layout_column_names) { - assert(line.substr(1, hibf::prefix::merged_bin.size()) == hibf::prefix::merged_bin); + assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level); // parse header line std::string_view const indices_str{ - line.begin() + 1 /*#*/ + hibf::prefix::merged_bin.size() + 1 /*_*/, - std::find(line.begin() + hibf::prefix::merged_bin.size() + 2, line.end(), ' ')}; + line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/, + std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')}; - assert(line.substr(hibf::prefix::merged_bin.size() + indices_str.size() + 3, 11) == "max_bin_id:"); - std::string_view const max_id_str{line.begin() + hibf::prefix::merged_bin.size() + indices_str.size() + 14, + assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size) == prefix::layout_fullest_technical_bin_idx); + std::string_view const max_id_str{line.begin() + 
prefix::layout_lower_level.size() + indices_str.size() + fullest_tbx_prefix_size + 3, line.end()}; max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str)); } - assert(line == hibf::prefix::column_ids); + assert(line == prefix::layout_column_names); // parse the rest of the file while (std::getline(stream, line)) @@ -122,12 +123,13 @@ void hibf::layout::layout::read_from(std::istream & stream) void hibf::layout::layout::write_to(std::ostream & stream) const { // write layout header with max bin ids - stream << prefix::first_header_line << " max_bin_id:" << top_level_max_bin_id << '\n'; + stream << prefix::layout_first_header_line << " " + << prefix::layout_fullest_technical_bin_idx << top_level_max_bin_id << '\n'; for (auto const & max_bin : max_bins) stream << max_bin << '\n'; // write header line - stream << prefix::column_ids << '\n'; + stream << prefix::layout_column_names << '\n'; // write layout entries for (auto const & user_bin : user_bins) diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp index 8b5f10d3..ec2eb853 100644 --- a/test/unit/hibf/config_test.cpp +++ b/test/unit/hibf/config_test.cpp @@ -28,24 +28,103 @@ TEST(config_test, write_to) configuration.write_to(ss); - std::string const expected_file{"##CONFIG:\n" - "##{\n" - "## \"config\": {\n" - "## \"version\": 1,\n" - "## \"number_of_user_bins\": 123456789,\n" - "## \"number_of_hash_functions\": 4,\n" - "## \"maximum_false_positive_rate\": 0.0001,\n" - "## \"threads\": 31,\n" - "## \"sketch_bits\": 8,\n" - "## \"tmax\": 128,\n" - "## \"alpha\": 1.0,\n" - "## \"max_rearrangement_ratio\": 0.333,\n" - "## \"disable_estimate_union\": true,\n" - "## \"disable_rearrangement\": false,\n" - "## \"disable_cutoffs\": false\n" - "## }\n" - "##}\n" - "##ENDCONFIG\n"}; + std::string const expected_file{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" 
+ "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; EXPECT_EQ(ss.str(), expected_file); } + +TEST(config_test, read_from) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_false_positive_rate, 0.0001); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); + EXPECT_EQ(configuration.disable_cutoffs, false); +} + +TEST(config_test, read_from_with_more_meta) +{ + std::stringstream ss{"@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ 
\"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_false_positive_rate, 0.0001); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); + EXPECT_EQ(configuration.disable_cutoffs, false); +} diff --git a/test/unit/hibf/detail/layout/layout_test.cpp b/test/unit/hibf/detail/layout/layout_test.cpp index 9d70160b..3bfa98d6 100644 --- a/test/unit/hibf/detail/layout/layout_test.cpp +++ b/test/unit/hibf/detail/layout/layout_test.cpp @@ -22,9 +22,9 @@ TEST(layout_test, printing_max_bins) for (auto const & mb : layout.max_bins) ss << mb << "\n"; - std::string expected = R"mb(#MERGED_BIN_ max_bin_id:0 -#MERGED_BIN_2 max_bin_id:2 -#MERGED_BIN_1;2;3;4 max_bin_id:22 + std::string expected = R"mb(#LOWER_LEVEL_IBF_ fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 )mb"; EXPECT_EQ(ss.str(), expected); @@ -67,10 +67,10 @@ TEST(layout_test, write_to) layout.write_to(ss); - std::string expected = R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111 -#MERGED_BIN_0 max_bin_id:0 -#MERGED_BIN_2 max_bin_id:2 -#MERGED_BIN_1;2;3;4 max_bin_id:22 + std::string expected = R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 #USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS 7 0 1 4 1;0 1;22 @@ -82,10 +82,10 @@ TEST(layout_test, write_to) TEST(layout_test, read_from) { - std::stringstream ss{R"layout_file(#HIGH_LEVEL_IBF max_bin_id:111 -#MERGED_BIN_0 max_bin_id:0 -#MERGED_BIN_2 max_bin_id:2 -#MERGED_BIN_1;2;3;4 max_bin_id:22 + std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 #USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS 7 0 1 4 1;0 1;22 diff --git a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp index f866f455..2d63dc94 100644 --- a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp @@ -59,25 +59,25 @@ TEST(hibf_test, build_from_layout) { std::ofstream stream{layout_filename}; - stream << "##CONFIG:\n"; - stream << "##{\n"; - stream << "## \"config\": {\n"; - stream << "## \"version\": 1,\n"; - stream << "## \"number_of_user_bins\": 2,\n"; - stream << "## \"number_of_hash_functions\": 2,\n"; - stream << "## \"maximum_false_positive_rate\": 0.05,\n"; - stream << "## \"threads\": 1,\n"; - stream << "## \"sketch_bits\": 12,\n"; - stream << "## \"tmax\": 64,\n"; - stream << "## \"alpha\": 1.2,\n"; - stream << "## \"max_rearrangement_ratio\": 0.5,\n"; - stream << "## \"disable_estimate_union\": false,\n"; - stream << "## \"disable_rearrangement\": true,\n"; - stream << "## \"disable_cutoffs\": false\n"; - stream << "## }\n"; - stream << "##}\n"; - stream << "##ENDCONFIG\n"; - stream << "#HIGH_LEVEL_IBF max_bin_id:0\n"; + stream << "@HIBF_CONFIG\n"; + stream << "@{\n"; + stream << "@ 
\"hibf_config\": {\n"; + stream << "@ \"version\": 1,\n"; + stream << "@ \"number_of_user_bins\": 2,\n"; + stream << "@ \"number_of_hash_functions\": 2,\n"; + stream << "@ \"maximum_false_positive_rate\": 0.05,\n"; + stream << "@ \"threads\": 1,\n"; + stream << "@ \"sketch_bits\": 12,\n"; + stream << "@ \"tmax\": 64,\n"; + stream << "@ \"alpha\": 1.2,\n"; + stream << "@ \"max_rearrangement_ratio\": 0.5,\n"; + stream << "@ \"disable_estimate_union\": false,\n"; + stream << "@ \"disable_rearrangement\": true,\n"; + stream << "@ \"disable_cutoffs\": false\n"; + stream << "@ }\n"; + stream << "@}\n"; + stream << "@HIBF_CONFIG_END\n"; + stream << "#TOP_LEVEL_IBF fullest_technical_bin_idx:0\n"; stream << "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n"; stream << "1\t0\t34\n"; stream << "0\t34\t30\n";