From eb4841cace6f9d1ead00d184d35dbe444b45214c Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Thu, 27 Jul 2023 16:16:46 +0200 Subject: [PATCH] [MISC] Improved fpr calculation. --- src/ibf.cpp | 91 ++++++++++++----------- test/api/estimate_test.cpp | 4 +- test/api/insert_delete_test.cpp | 124 ++++++++++++++++++-------------- 3 files changed, 123 insertions(+), 96 deletions(-) diff --git a/src/ibf.cpp b/src/ibf.cpp index ee0fe73..93b1a5d 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -710,6 +710,8 @@ void ibf_helper(std::vector const & minimiser_files, std::vector> expressions{}; std::vector> sizes{}; sizes.assign(num_files, {}); + std::vector sizes_ibf{}; + std::vector> counts_per_level{}; bool const calculate_cutoffs = cutoffs.empty(); @@ -720,8 +722,12 @@ void ibf_helper(std::vector const & minimiser_files, std::vector zero_vector(ibf_args.number_expression_thresholds); for (unsigned j = 0; j < num_files; j++) expressions.push_back(zero_vector); - } + + std::vector zero_vector2(ibf_args.number_expression_thresholds); + for (unsigned j = 0; j < num_files; j++) + counts_per_level.push_back(zero_vector2); + if constexpr (!minimiser_files_given) { if (minimiser_args.include_file != "") @@ -799,8 +805,6 @@ void ibf_helper(std::vector const & minimiser_files, } } - std::ofstream outfile_fpr; - outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); // File to store actual false positive rates per experiment. // Create IBFs std::vector> ibfs; for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) @@ -819,19 +823,11 @@ void ibf_helper(std::vector const & minimiser_files, } // m = -hn/ln(1-p^(1/h)) size = static_cast((-1.0*num_hash*((1.0*size)/num_files))/(std::log(1.0-std::pow(fprs[j], 1.0/num_hash)))); + sizes_ibf.push_back(size); ibfs.push_back(seqan3::interleaved_bloom_filter( seqan3::bin_count{num_files}, seqan3::bin_size{size}, seqan3::hash_function_count{num_hash})); - - for (unsigned i = 0; i < num_files; i++) - { - double fpr = std::pow(1.0- std::pow(1.0-(1.0/size), num_hash*sizes[i][j]), num_hash); - outfile_fpr << fpr << " "; - } - outfile_fpr << "\n"; } - outfile_fpr << "/\n"; - outfile_fpr.close(); // Add minimisers to ibf #pragma omp parallel for schedule(dynamic, chunk_size) @@ -888,6 +884,7 @@ void ibf_helper(std::vector const & minimiser_files, if (elem.second >= expressions[i][j]) { ibfs[j].emplace(elem.first, seqan3::bin_index{i}); + counts_per_level[i][j]++; break; } } @@ -896,6 +893,7 @@ void ibf_helper(std::vector const & minimiser_files, if (elem.second >= ibf_args.expression_thresholds[j]) { ibfs[j].emplace(elem.first, seqan3::bin_index{i}); + counts_per_level[i][j]++; break; } } @@ -921,7 +919,6 @@ void ibf_helper(std::vector const & minimiser_files, { store_ibf(ibfs[i], filename); } - } // Store all expression thresholds per level. @@ -938,6 +935,21 @@ void ibf_helper(std::vector const & minimiser_files, outfile << "/\n"; outfile.close(); } + + std::ofstream outfile_fpr; + outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + for (unsigned i = 0; i < num_files; i++) + { + // m = -hn/ln(1-p^(1/h)) + double fpr = std::pow(1.0- std::pow(1.0-(1.0/sizes_ibf[j]), num_hash *counts_per_level[i][j]), num_hash); + outfile_fpr << fpr << " "; + } + outfile_fpr << "\n"; + } + outfile_fpr << "/\n"; + outfile_fpr.close(); } // Create ibfs @@ -1072,11 +1084,12 @@ void insert_helper(std::vector const & minimiser_files, std::vector zero_vector(ibf_args.number_expression_thresholds); for (unsigned j = 0; j < num_files; j++) expressions.push_back(zero_vector); - std::vector zero_vector2(ibf_args.number_expression_thresholds); - for (unsigned j = 0; j < num_files; j++) - counts_per_level.push_back(zero_vector2); - } + + std::vector zero_vector2(ibf_args.number_expression_thresholds); + for (unsigned j = 0; j < num_files; j++) + counts_per_level.push_back(zero_vector2); + if constexpr (!minimiser_files_given) { if (minimiser_args.include_file != "") @@ -1221,6 +1234,7 @@ void insert_helper(std::vector const & minimiser_files, if (elem.second >= ibf_args.expression_thresholds[j]) { ibfs[j].emplace(elem.first, seqan3::bin_index{pos_insert[i]}); + counts_per_level[i][j]++; break; } } @@ -1280,35 +1294,32 @@ void insert_helper(std::vector const & minimiser_files, } // Store all fprs per level. - if constexpr(samplewise) - { - std::vector> fprs_prev{}; - read_levels(fprs_prev, path_in.string() + "IBF_FPRs.fprs"); + std::vector> fprs_prev{}; + read_levels(fprs_prev, path_in.string() + "IBF_FPRs.fprs"); - std::ofstream outfile_fpr; - outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); - for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + std::ofstream outfile_fpr; + outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + int exp_i = 0; + for (unsigned i = 0; i < new_bin_number; i++) { - int exp_i = 0; - for (unsigned i = 0; i < new_bin_number; i++) + if (std::find(pos_insert.begin(), pos_insert.end(), i) != pos_insert.end()) { - if (std::find(pos_insert.begin(), pos_insert.end(), i) != pos_insert.end()) - { - // m = -hn/ln(1-p^(1/h)) - double fpr = std::pow(1.0- std::pow(1.0-(1.0/sizes[j]), num_hash_functions *counts_per_level[exp_i][j]), num_hash_functions); - outfile_fpr << fpr << " "; - exp_i++; - } - else - { - outfile_fpr << fprs_prev[j][i] << " "; - } + // m = -hn/ln(1-p^(1/h)) + double fpr = std::pow(1.0- std::pow(1.0-(1.0/sizes[j]), num_hash_functions *counts_per_level[exp_i][j]), num_hash_functions); + outfile_fpr << fpr << " "; + exp_i++; + } + else + { + outfile_fpr << fprs_prev[j][i] << " "; } - outfile_fpr << "\n"; } - outfile_fpr << "/\n"; - outfile_fpr.close(); + outfile_fpr << "\n"; } + outfile_fpr << "/\n"; + outfile_fpr.close(); } // Insert into ibfs diff --git a/test/api/estimate_test.cpp b/test/api/estimate_test.cpp index 3e72b05..2674eb6 100644 --- a/test/api/estimate_test.cpp +++ b/test/api/estimate_test.cpp @@ -378,7 +378,7 @@ TEST(estimate, example_different_expressions_per_level) std::ifstream output_file(tmp_dir/"expression.out"); std::string line; // Count would expect 6 and 34 - std::string expected{"GeneA\t8\t26\t"}; + std::string expected{"GeneA\t7\t26\t"}; if (output_file.is_open()) { while ( std::getline (output_file, line) ) @@ -426,7 +426,7 @@ TEST(estimate, example_different_expressions_per_level_multiple_threads) std::ifstream output_file(tmp_dir/"expression.out"); std::string line; // Count would expect 6 and 34 - std::string expected{"GeneA\t8\t26\t"}; + std::string expected{"GeneA\t7\t26\t"}; if (output_file.is_open()) { while ( std::getline (output_file,line) ) diff --git a/test/api/insert_delete_test.cpp b/test/api/insert_delete_test.cpp index 5529a27..d8ca97c 100644 --- a/test/api/insert_delete_test.cpp +++ b/test/api/insert_delete_test.cpp @@ -60,6 +60,45 @@ TEST(delete, no_given_thresholds) std::filesystem::remove(tmp_dir/"IBF_delete_Exp_IBF_Deleted"); } +// Reads the level file ibf creates +template +void read_levels(std::vector> & expressions, std::filesystem::path filename) +{ + std::ifstream fin; + fin.open(filename); + auto stream_view = seqan3::detail::istreambuf(fin); + auto stream_it = std::ranges::begin(stream_view); + int j{0}; + std::vector empty_vector{}; + + std::string buffer{}; + + // Read line = expression levels + do + { + if (j == expressions.size()) + expressions.push_back(empty_vector); + std::ranges::copy(stream_view | seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), + std::back_inserter(buffer)); + if constexpr(std::same_as) + expressions[j].push_back((uint16_t) std::stoi(buffer)); + else + expressions[j].push_back((double) std::stod(buffer)); + buffer.clear(); + if(*stream_it != '/') + ++stream_it; + + if (*stream_it == '\n') + { + ++stream_it; + j++; + } + } while (*stream_it != '/'); + ++stream_it; + + fin.close(); +} + TEST(insert, ibf) { std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory @@ -101,51 +140,18 @@ TEST(insert, ibf) load_ibf(ibf_insert, tmp_dir/"IBF_Insert_Exp_IBF_2"); EXPECT_TRUE((ibf == ibf_insert)); + std::vector> fpr_ibf{}; + read_levels(fpr_ibf, tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); + std::vector> fpr_insert{}; + read_levels(fpr_insert, tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_1"); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_2"); + std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_1"); std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_2"); -} - - -// Reads the level file ibf creates -template -void read_levels(std::vector> & expressions, std::filesystem::path filename) -{ - std::ifstream fin; - fin.open(filename); - auto stream_view = seqan3::detail::istreambuf(fin); - auto stream_it = std::ranges::begin(stream_view); - int j{0}; - std::vector empty_vector{}; - - std::string buffer{}; - - // Read line = expression levels - do - { - if (j == expressions.size()) - expressions.push_back(empty_vector); - std::ranges::copy(stream_view | seqan3::detail::take_until_or_throw(seqan3::is_char<' '>), - std::back_inserter(buffer)); - if constexpr(std::same_as) - expressions[j].push_back((uint16_t) std::stoi(buffer)); - else - expressions[j].push_back((double) std::stod(buffer)); - buffer.clear(); - if(*stream_it != '/') - ++stream_it; - - if (*stream_it == '\n') - { - ++stream_it; - j++; - } - } while (*stream_it != '/'); - ++stream_it; - - fin.close(); + std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); } TEST(insert, ibf_no_given_thresholds) @@ -194,12 +200,11 @@ TEST(insert, ibf_no_given_thresholds) read_levels(expressions_insert, tmp_dir/"IBF_Insert_Exp_IBF_Levels.levels"); EXPECT_EQ(expressions_ibf,expressions_insert); - /*std::vector> fpr_ibf{}; + std::vector> fpr_ibf{}; read_levels(fpr_ibf, tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert);*/ - + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_Level_0"); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_Level_1"); @@ -249,18 +254,19 @@ TEST(insert, ibf_delete) load_ibf(ibf_insert, tmp_dir/"IBF_Insert_Exp_IBF_2"); EXPECT_TRUE((ibf == ibf_insert)); - /*std::vector> fpr_ibf{}; + std::vector> fpr_ibf{}; read_levels(fpr_ibf, tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert);*/ - + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_1"); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_2"); + std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_1"); std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_2"); std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_Deleted"); + std::filesystem::remove(tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); } @@ -313,11 +319,11 @@ TEST(insert, ibf_delete_no_given_threshold) read_levels(expressions_insert, tmp_dir/"IBF_Insert_Exp_IBF_Levels.levels"); EXPECT_EQ(expressions_ibf,expressions_insert); - /*std::vector> fpr_ibf{}; + std::vector> fpr_ibf{}; read_levels(fpr_ibf, tmp_dir/"IBF_True_Exp_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, tmp_dir/"IBF_Insert_Exp_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert);*/ + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_Level_0"); std::filesystem::remove(tmp_dir/"IBF_True_Exp_IBF_Level_1"); @@ -362,6 +368,12 @@ TEST(insert, ibfmin) load_ibf(ibf_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_2"); EXPECT_TRUE((ibf == ibf_insert)); + std::vector> fpr_ibf{}; + read_levels(fpr_ibf, tmp_dir/"IBFMIN_Test_Given_IBF_FPRs.fprs"); + std::vector> fpr_insert{}; + read_levels(fpr_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_FPRs.fprs"); + EXPECT_EQ(fpr_ibf,fpr_insert); + std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_1"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_2"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Data"); @@ -405,6 +417,12 @@ TEST(insert, ibfmin_delete) load_ibf(ibf_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_2"); EXPECT_TRUE((ibf == ibf_insert)); + std::vector> fpr_ibf{}; + read_levels(fpr_ibf, tmp_dir/"IBFMIN_Test_Given_IBF_FPRs.fprs"); + std::vector> fpr_insert{}; + read_levels(fpr_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_FPRs.fprs"); + EXPECT_EQ(fpr_ibf,fpr_insert); + std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_1"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_2"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Data"); @@ -456,21 +474,19 @@ TEST(insert, ibfmin_no_given_thresholds) read_levels(expressions_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_Levels.levels"); EXPECT_EQ(expressions_ibf,expressions_insert); - /*std::vector> fpr_ibf{}; + std::vector> fpr_ibf{}; read_levels(fpr_ibf, tmp_dir/"IBFMIN_Test_Given_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert);*/ + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Level_0"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Level_1"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Data"); - std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_FPRs.fprs"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_IBF_Levels.levels"); std::filesystem::remove(tmp_dir/"IBFMIN_Insert_Given_IBF_Level_0"); std::filesystem::remove(tmp_dir/"IBFMIN_Insert_Given_IBF_Level_1"); std::filesystem::remove(tmp_dir/"IBFMIN_Insert_Given_IBF_Data"); - std::filesystem::remove(tmp_dir/"IBFMIN_Insert_Given_IBF_FPRs.fprs"); std::filesystem::remove(tmp_dir/"IBFMIN_Insert_Given_IBF_Levels.levels"); } @@ -515,11 +531,11 @@ TEST(insert, delete_ibfmin_no_given_thresholds) read_levels(expressions_insert, tmp_dir/"IBFMIN_Insert_Given_Del_IBF_Levels.levels"); EXPECT_EQ(expressions_ibf,expressions_insert); - /*std::vector> fpr_ibf{}; + std::vector> fpr_ibf{}; read_levels(fpr_ibf, tmp_dir/"IBFMIN_Test_Given_IBF_FPRs.fprs"); std::vector> fpr_insert{}; read_levels(fpr_insert, tmp_dir/"IBFMIN_Insert_Given_IBF_FPRs.fprs"); - EXPECT_EQ(fpr_ibf,fpr_insert);*/ + EXPECT_EQ(fpr_ibf,fpr_insert); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_Del_IBF_Level_0"); std::filesystem::remove(tmp_dir/"IBFMIN_Test_Given_Del_IBF_Level_1");