From 67ff27fa6a47bdc4dd19d3478c47a61b338cc1c1 Mon Sep 17 00:00:00 2001 From: Ivan Khromov Date: Mon, 27 May 2024 20:43:44 +0300 Subject: [PATCH 1/5] Add OD verifying algorithm Introducing a novel algorithm to validate canonical order dependencies. This algorithm takes as input left and right column indices, context, and a flag indicating whether the dependency is ascending or descending. As output, it identifies rows where the dependency is violated through swaps or splits. Additionally, new options such as context and the ascending flag have been introduced. These parameters serve as inputs for the newly added algorithm. Furthermore, access rights in the class algos::fastod::ComplexStrippedPartition have been modified from private to protected. This adjustment was necessary as I developed a new class inheriting functionality from algos::fastod::ComplexStrippedPartition, requiring access to its private fields. --- src/core/algorithms/algorithm_types.h | 6 +- .../partitions/complex_stripped_partition.h | 2 +- src/core/algorithms/od/mining_algorithms.h | 1 + .../algorithms/od/od_verifier/od_verifier.cpp | 89 ++++++++++++++++++ .../algorithms/od/od_verifier/od_verifier.h | 76 ++++++++++++++++ .../algorithms/od/od_verifier/partition.cpp | 57 ++++++++++++ .../algorithms/od/od_verifier/partition.h | 90 +++++++++++++++++++ src/core/config/ascending_od/option.cpp | 9 ++ src/core/config/ascending_od/option.h | 9 ++ src/core/config/ascending_od/type.h | 5 ++ src/core/config/descriptions.h | 2 + src/core/config/indices/od_context.cpp | 9 ++ src/core/config/indices/od_context.h | 8 ++ src/core/config/names.h | 2 + 14 files changed, 362 insertions(+), 3 deletions(-) create mode 100644 src/core/algorithms/od/od_verifier/od_verifier.cpp create mode 100644 src/core/algorithms/od/od_verifier/od_verifier.h create mode 100644 src/core/algorithms/od/od_verifier/partition.cpp create mode 100644 src/core/algorithms/od/od_verifier/partition.h create mode 100644 
src/core/config/ascending_od/option.cpp create mode 100644 src/core/config/ascending_od/option.h create mode 100644 src/core/config/ascending_od/type.h create mode 100644 src/core/config/indices/od_context.cpp create mode 100644 src/core/config/indices/od_context.h diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h index 45836120cb..215c0a8cf0 100644 --- a/src/core/algorithms/algorithm_types.h +++ b/src/core/algorithms/algorithm_types.h @@ -11,7 +11,7 @@ using AlgorithmTypes = Apriori, metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC, PyroUCC, cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, Mind, Fastod, GfdValidation, EGfdValidation, NaiveGfdValidation, order::Order, - dd::Split>; + dd::Split, od_verifier::ODVerifier>; // clang-format off /* Enumeration of all supported non-pipeline algorithms. If you implement a new @@ -76,7 +76,9 @@ BETTER_ENUM(AlgorithmType, char, order, /* Differential dependencies mining algorithm */ - split + split, +/* Canonical OD verifier algorithm */ + od_verifier ) // clang-format on diff --git a/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h b/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h index 881d5a5b94..17e0c3988c 100644 --- a/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h +++ b/src/core/algorithms/od/fastod/partitions/complex_stripped_partition.h @@ -9,7 +9,7 @@ namespace algos::fastod { class ComplexStrippedPartition { -private: +protected: std::shared_ptr> sp_indexes_; std::shared_ptr> sp_begins_; std::shared_ptr> rb_indexes_; diff --git a/src/core/algorithms/od/mining_algorithms.h b/src/core/algorithms/od/mining_algorithms.h index 8091a813cf..72666b343f 100644 --- a/src/core/algorithms/od/mining_algorithms.h +++ b/src/core/algorithms/od/mining_algorithms.h @@ -2,3 +2,4 @@ #include "algorithms/od/fastod/fastod.h" #include "algorithms/od/order/order.h" +#include 
"algorithms/od/od_verifier/od_verifier.h" diff --git a/src/core/algorithms/od/od_verifier/od_verifier.cpp b/src/core/algorithms/od/od_verifier/od_verifier.cpp new file mode 100644 index 0000000000..ad0ea9ac94 --- /dev/null +++ b/src/core/algorithms/od/od_verifier/od_verifier.cpp @@ -0,0 +1,89 @@ +#include "od_verifier.h" + +#include "ascending_od/option.h" +#include "config/equal_nulls/option.h" +#include "config/indices/od_context.h" +#include "config/indices/option.h" +#include "config/tabular_data/input_table/option.h" +#include "partition.h" + +namespace algos::od_verifier { + +ODVerifier::ODVerifier() : Algorithm({}) { + RegisterOptions(); + MakeOptionsAvailable({config::kTableOpt.GetName(), config::kEqualNullsOpt.GetName()}); +} + +void ODVerifier::RegisterOptions() { + auto get_schema_cols = [this]() { return relation_->GetSchema()->GetNumColumns(); }; + + RegisterOption(config::kTableOpt(&input_table_)); + RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_)); + RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols)); + RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols)); + RegisterOption(config::kODContextOpt(&context_indices_)); + RegisterOption(config::kAscendingODOpt(&ascending_)); +} + +void ODVerifier::MakeExecuteOptsAvailable() { + MakeOptionsAvailable({config::kLhsIndicesOpt.GetName(), config::kRhsIndicesOpt.GetName(), + config::kODContextOpt.GetName(), config::kAscendingODOpt.GetName()}); +} + +void ODVerifier::LoadDataInternal() { + relation_ = ColumnLayoutRelationData::CreateFrom(*input_table_, is_null_equal_null_); + + if (relation_->GetColumnData().empty()) { + throw std::runtime_error("Got an empty dataset: OD verifying is meaningless."); + } + input_table_->Reset(); + data_ = std::make_shared(DataFrame::FromInputTable(input_table_)); + if (data_->GetColumnCount() == 0) + throw std::runtime_error("Got an empty dataset: OD verifying is meaningless."); +} + +unsigned long long 
ODVerifier::ExecuteInternal() { + auto start_time = std::chrono::system_clock::now(); + if (ascending_) + VerifyOD(); + else + VerifyOD(); + auto elapsed_milliseconds = std::chrono::duration_cast( + std::chrono::system_clock::now() - start_time); + return elapsed_milliseconds.count(); +} + +template +void ODVerifier::VerifyOD() { + AttributeSet context; + + for (auto column : context_indices_) context.Set(column); + + fastod::ComplexStrippedPartition stripped_partition_swap( + (partition_cache_.GetStrippedPartition(context, data_))); + + if (stripped_partition_swap.Swap(lhs_indices_[0], rhs_indices_[0])) { + Partition part{stripped_partition_swap}; + std::vector> violates( + part.FindViolationsBySwap(lhs_indices_[0], rhs_indices_[0])); + + for (auto position_violate : violates) + row_violate_ods_by_swap_.push_back(position_violate.second + 1); + } + + context.Set(lhs_indices_[0]); + fastod::ComplexStrippedPartition stripped_partition_split( + partition_cache_.GetStrippedPartition(context, data_)); + + if (stripped_partition_split.Split(rhs_indices_[0])) { + Partition part{stripped_partition_split}; + std::vector> violates(part.FindViolationsBySplit(rhs_indices_[0])); + + for (auto position_violate : violates) + row_violate_ods_by_split_.push_back(position_violate.second + 1); + } + std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end()); + std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end()); +} + +} // namespace algos::od_verifier diff --git a/src/core/algorithms/od/od_verifier/od_verifier.h b/src/core/algorithms/od/od_verifier/od_verifier.h new file mode 100644 index 0000000000..54114e7cae --- /dev/null +++ b/src/core/algorithms/od/od_verifier/od_verifier.h @@ -0,0 +1,76 @@ +#pragma once + +#include "algorithms/algorithm.h" +#include "algorithms/od/fastod/model/canonical_od.h" +#include "config/indices/type.h" +#include "model/table/column_layout_relation_data.h" + +namespace algos::od_verifier { + +class ODVerifier : 
public Algorithm { +private: + using IndicesType = config::IndicesType; + using IndexType = config::IndexType; + using DataFrame = fastod::DataFrame; + using PartitionCache = fastod::PartitionCache; + using AscCanonicalOD = fastod::AscCanonicalOD; + using DescCanonicalOD = fastod::DescCanonicalOD; + using SimpleCanonicalOD = fastod::SimpleCanonicalOD; + using AttributeSet = fastod::AttributeSet; + + // input data + config::InputTable input_table_; + config::EqNullsType is_null_equal_null_; + IndicesType lhs_indices_; + IndicesType rhs_indices_; + IndicesType context_indices_; + bool ascending_; + + // auxiliary data + std::shared_ptr relation_; + std::shared_ptr data_; + PartitionCache partition_cache_; + + // rows that violate the OD + std::vector row_violate_ods_by_swap_; + std::vector row_violate_ods_by_split_; + + // load input data + void RegisterOptions(); + void MakeExecuteOptsAvailable() override; + void LoadDataInternal() override; + + // runs the algorithm and measures its time + unsigned long long ExecuteInternal() override; + + // checks whether OD is violated and finds the rows where it is violated + template + void VerifyOD(); + + // reset statistic of violations + void ResetState() override { + row_violate_ods_by_swap_.clear(); + row_violate_ods_by_split_.clear(); + } + +public: + // checks whether the OD has broken + bool ODHolds() const { + return row_violate_ods_by_swap_.empty() && row_violate_ods_by_split_.empty(); + } + + // base constructor + ODVerifier(); + + // Returns the number of rows that violate the OD by split + size_t GetNumRowsViolateBySplit() const { + return row_violate_ods_by_split_.size(); + } + + // Returns the number of rows that violate the OD by swap + size_t GetNumRowsViolateBySwap() const { + return row_violate_ods_by_swap_.size(); + } +}; + +} // namespace algos::od_verifier diff --git a/src/core/algorithms/od/od_verifier/partition.cpp b/src/core/algorithms/od/od_verifier/partition.cpp new file mode 100644 index 
0000000000..ecf49dce61 --- /dev/null +++ b/src/core/algorithms/od/od_verifier/partition.cpp @@ -0,0 +1,57 @@ +#include "partition.h" + +#include +#include +#include + +namespace algos::od_verifier { + +std::vector> Partition::CommonViolationBySplit(model::ColumnIndex right) const { + std::vector> violates; + + for (size_t begin_pointer = 0; begin_pointer < sp_begins_->size() - 1; begin_pointer++) { + size_t const group_begin = (*sp_begins_)[begin_pointer]; + size_t const group_end = (*sp_begins_)[begin_pointer + 1]; + + int const group_value = data_->GetValue((*sp_indexes_)[group_begin], right); + + for (size_t i = group_begin + 1; i < group_end; i++) { + if (data_->GetValue((*sp_indexes_)[i], right) != group_value) { + violates.emplace_back(std::pair(right, (*sp_indexes_)[i])); + } + } + } + + return violates; +} + +std::vector> Partition::RangeBasedViolationBySplit( + model::ColumnIndex right) const { + std::vector> violates; + + for (size_t begin_pointer = 0; begin_pointer < rb_begins_->size() - 1; ++begin_pointer) { + size_t const group_begin = (*rb_begins_)[begin_pointer]; + size_t const group_end = (*rb_begins_)[begin_pointer + 1]; + + int const group_value = data_->GetValue((*rb_indexes_)[group_begin].first, right); + + for (size_t i = group_begin; i < group_end; ++i) { + algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i]; + + for (size_t j = range.first; j <= range.second; ++j) { + if (data_->GetValue(j, right) != group_value) { + violates.emplace_back(std::pair(right, j)); + } + } + } + } + + return violates; +} + +std::vector> Partition::FindViolationsBySplit(model::ColumnIndex right) const { + return is_stripped_partition_ ? 
CommonViolationBySplit(right) + : RangeBasedViolationBySplit(right); +} + +} // namespace algos::od_verifier diff --git a/src/core/algorithms/od/od_verifier/partition.h b/src/core/algorithms/od/od_verifier/partition.h new file mode 100644 index 0000000000..16750b0ec7 --- /dev/null +++ b/src/core/algorithms/od/od_verifier/partition.h @@ -0,0 +1,90 @@ +#pragma once + +#include "algorithms/od/fastod/partitions/complex_stripped_partition.h" + +namespace algos::od_verifier { + +class Partition : protected algos::fastod::ComplexStrippedPartition { +private: + std::vector> CommonViolationBySplit(model::ColumnIndex right) const; + + std::vector> RangeBasedViolationBySplit(model::ColumnIndex right) const; + +public: + Partition() : algos::fastod::ComplexStrippedPartition() {} + + Partition(algos::fastod::ComplexStrippedPartition const& daddy) + : algos::fastod::ComplexStrippedPartition(daddy) {} + + std::vector> FindViolationsBySplit(model::ColumnIndex right) const; + + template + std::vector> FindViolationsBySwap(model::ColumnIndex left, + model::ColumnIndex right) const { + size_t const group_count = is_stripped_partition_ ? sp_begins_->size() : rb_begins_->size(); + std::vector> violates; + + for (size_t begin_pointer = 0; begin_pointer < group_count - 1; begin_pointer++) { + size_t const group_begin = is_stripped_partition_ ? (*sp_begins_)[begin_pointer] + : (*rb_begins_)[begin_pointer]; + + size_t const group_end = is_stripped_partition_ ? 
(*sp_begins_)[begin_pointer + 1] + : (*rb_begins_)[begin_pointer + 1]; + + std::vector> values; + std::vector row_pos; + + if (is_stripped_partition_) { + values.reserve(group_end - group_begin); + + for (size_t i = group_begin; i < group_end; ++i) { + size_t const index = (*sp_indexes_)[i]; + + values.emplace_back(data_->GetValue(index, left), + data_->GetValue(index, right)); + row_pos.emplace_back(index); + } + } else { + for (size_t i = group_begin; i < group_end; ++i) { + algos::fastod::DataFrame::Range const range = (*rb_indexes_)[i]; + + for (size_t j = range.first; j <= range.second; ++j) { + values.emplace_back(data_->GetValue(j, left), data_->GetValue(j, right)); + } + } + } + + if constexpr (Ascending) { + std::sort(values.begin(), values.end(), + [](auto const& p1, auto const& p2) { return p1.first < p2.first; }); + } else { + std::sort(values.begin(), values.end(), + [](auto const& p1, auto const& p2) { return p2.first < p1.first; }); + } + + size_t prev_group_max_index = 0; + size_t current_group_max_index = 0; + bool is_first_group = true; + + for (size_t i = 0; i < values.size(); i++) { + auto const& [first, second] = values[i]; + + if (i != 0 && values[i - 1].first != first) { + is_first_group = false; + prev_group_max_index = current_group_max_index; + current_group_max_index = i; + } else if (values[current_group_max_index].second <= second) { + current_group_max_index = i; + } + + if (!is_first_group && values[prev_group_max_index].second > second) { + violates.push_back(std::pair(right, row_pos[i])); + } + } + } + + return violates; + } +}; + +} // namespace algos::od_verifier diff --git a/src/core/config/ascending_od/option.cpp b/src/core/config/ascending_od/option.cpp new file mode 100644 index 0000000000..a6074dade8 --- /dev/null +++ b/src/core/config/ascending_od/option.cpp @@ -0,0 +1,9 @@ +#include "ascending_od/option.h" + +#include "ascending_od/type.h" +#include "config/names_and_descriptions.h" + +namespace config { +extern 
CommonOption const kAscendingODOpt{names::kAscendingOD, + descriptions::kDAscendingOD, true}; +} // namespace config diff --git a/src/core/config/ascending_od/option.h b/src/core/config/ascending_od/option.h new file mode 100644 index 0000000000..5ee7737a63 --- /dev/null +++ b/src/core/config/ascending_od/option.h @@ -0,0 +1,9 @@ +#pragma once + +#include "config/ascending_od/type.h" +#include "config/common_option.h" + +namespace config { +extern CommonOption const kAscendingODOpt; + +} // namespace config diff --git a/src/core/config/ascending_od/type.h b/src/core/config/ascending_od/type.h new file mode 100644 index 0000000000..f5b6ad9150 --- /dev/null +++ b/src/core/config/ascending_od/type.h @@ -0,0 +1,5 @@ +#pragma once + +namespace config { +using AscendingODFlagType = bool; +} // namespace config diff --git a/src/core/config/descriptions.h b/src/core/config/descriptions.h index 4e8b85c3ea..b5b541668d 100644 --- a/src/core/config/descriptions.h +++ b/src/core/config/descriptions.h @@ -44,6 +44,8 @@ constexpr auto kDItemColumnIndex = "index of the column where an item name is st constexpr auto kDFirstColumnTId = "indicates that the first column contains the transaction IDs"; auto const kDMetric = details::kDMetricString.c_str(); constexpr auto kDLhsIndices = "LHS column indices"; +constexpr auto kDODContext = "context columns indices"; +constexpr auto kDAscendingOD = "flag shows whether the dependence is ascending or descending"; constexpr auto kDRhsIndices = "RHS column indices"; constexpr auto kDRhsIndex = "RHS column index"; constexpr auto kDUCCIndices = "column indices for UCC verification"; diff --git a/src/core/config/indices/od_context.cpp b/src/core/config/indices/od_context.cpp new file mode 100644 index 0000000000..3c476ee7fa --- /dev/null +++ b/src/core/config/indices/od_context.cpp @@ -0,0 +1,9 @@ +#include "config/indices/od_context.h" + +#include "config/names_and_descriptions.h" +#include "indices/type.h" + +namespace config { +extern 
CommonOption const kODContextOpt{names::kODContext, descriptions::kDODContext, + IndicesType({})}; +} // namespace config diff --git a/src/core/config/indices/od_context.h b/src/core/config/indices/od_context.h new file mode 100644 index 0000000000..7930af1ee7 --- /dev/null +++ b/src/core/config/indices/od_context.h @@ -0,0 +1,8 @@ +#pragma once + +#include "config/common_option.h" +#include "config/indices/type.h" + +namespace config { +extern CommonOption const kODContextOpt; +} // namespace config diff --git a/src/core/config/names.h b/src/core/config/names.h index 74f532cc95..5ecd9eadf3 100644 --- a/src/core/config/names.h +++ b/src/core/config/names.h @@ -29,6 +29,8 @@ constexpr auto kRhsIndex = "rhs_index"; constexpr auto kUCCIndices = "ucc_indices"; constexpr auto kParameter = "parameter"; constexpr auto kDistFromNullIsInfinity = "dist_from_null_is_infinity"; +constexpr auto kODContext = "od_context"; +constexpr auto kAscendingOD = "ascending"; constexpr auto kQGramLength = "q"; constexpr auto kMetricAlgorithm = "metric_algorithm"; constexpr auto kRadius = "radius"; From ac434f33ccb8ab66b71fda4fe18130537acb8031 Mon Sep 17 00:00:00 2001 From: Ivan Khromov Date: Mon, 27 May 2024 23:21:27 +0300 Subject: [PATCH 2/5] Add test for OD verification --- src/tests/all_csv_configs.cpp | 1 + src/tests/all_csv_configs.h | 1 + src/tests/test_ind_util.h | 1 + src/tests/test_od_verifier.cpp | 52 ++++++++++++++++++++++++++ test_input_data/ODVerificationData.csv | 7 ++++ 5 files changed, 62 insertions(+) create mode 100644 src/tests/test_od_verifier.cpp create mode 100644 test_input_data/ODVerificationData.csv diff --git a/src/tests/all_csv_configs.cpp b/src/tests/all_csv_configs.cpp index e1bdf6bcd2..cc42aebb0c 100644 --- a/src/tests/all_csv_configs.cpp +++ b/src/tests/all_csv_configs.cpp @@ -46,6 +46,7 @@ CSVConfig const kTestEmpty = CreateCsvConfig("TestEmpty.csv", ',', true); CSVConfig const kTestSingleColumn = CreateCsvConfig("TestSingleColumn.csv", ',', true); CSVConfig 
const kTestLong = CreateCsvConfig("TestLong.csv", ',', true); CSVConfig const kTestFD = CreateCsvConfig("TestFD.csv", ',', true); +CSVConfig const kTestODVerifier = CreateCsvConfig("ODVerificationData.csv", ',', true); CSVConfig const kOdTestNormOd = CreateCsvConfig("od_norm_data/OD_norm.csv", ',', true); CSVConfig const kOdTestNormSmall2x3 = CreateCsvConfig("od_norm_data/small_2x3.csv", ',', true); CSVConfig const kOdTestNormSmall3x3 = CreateCsvConfig("od_norm_data/small_3x3.csv", ',', true); diff --git a/src/tests/all_csv_configs.h b/src/tests/all_csv_configs.h index 9564a1f747..a8e874cbdb 100644 --- a/src/tests/all_csv_configs.h +++ b/src/tests/all_csv_configs.h @@ -36,6 +36,7 @@ extern CSVConfig const kTestEmpty; extern CSVConfig const kTestSingleColumn; extern CSVConfig const kTestLong; extern CSVConfig const kTestFD; +extern CSVConfig const kTestODVerifier; extern CSVConfig const kOdTestNormOd; extern CSVConfig const kOdTestNormSmall2x3; extern CSVConfig const kOdTestNormSmall3x3; diff --git a/src/tests/test_ind_util.h b/src/tests/test_ind_util.h index 63209a5006..6b81d1160b 100644 --- a/src/tests/test_ind_util.h +++ b/src/tests/test_ind_util.h @@ -3,6 +3,7 @@ #include #include #include +#include #include "algorithms/ind/ind.h" #include "algorithms/ind/ind_algorithm.h" diff --git a/src/tests/test_od_verifier.cpp b/src/tests/test_od_verifier.cpp new file mode 100644 index 0000000000..179b485bca --- /dev/null +++ b/src/tests/test_od_verifier.cpp @@ -0,0 +1,52 @@ +#include + +#include "algo_factory.h" +#include "all_csv_configs.h" +#include "config/names.h" +#include "od/od_verifier/od_verifier.h" + +namespace tests { + +struct ODVerifyingParams { + algos::StdParamsMap params; + size_t const row_violate_ods_by_split = 0; + size_t const row_violate_ods_by_swap = 0; + + ODVerifyingParams(config::IndicesType lhs_indices, config::IndicesType rhs_indices, + config::IndicesType context, bool ascending, size_t const row_error_split = 0, + size_t const row_error_swap = 
0, + CSVConfig const& csv_config = kTestODVerifier) + : params({{config::names::kCsvConfig, csv_config}, + {config::names::kLhsIndices, std::move(lhs_indices)}, + {config::names::kRhsIndices, std::move(rhs_indices)}, + {config::names::kODContext, std::move(context)}, + {config::names::kAscendingOD, ascending}}), + row_violate_ods_by_split(row_error_split), + row_violate_ods_by_swap(row_error_swap) {} +}; + +class TestODVerifying : public ::testing::TestWithParam {}; + +TEST_P(TestODVerifying, DefaultTest) { + auto const& p = GetParam(); + auto mp = algos::StdParamsMap(p.params); + auto verifier = algos::CreateAndLoadAlgorithm(mp); + verifier->Execute(); + EXPECT_EQ(verifier->GetNumRowsViolateBySwap(), p.row_violate_ods_by_swap); + EXPECT_EQ(verifier->GetNumRowsViolateBySplit(), p.row_violate_ods_by_split); +} + +// clang-format off +INSTANTIATE_TEST_SUITE_P( + ODVerifierTestSuite, TestODVerifying, + ::testing::Values( + ODVerifyingParams({1}, {2}, {0}, true, 0, 0), + ODVerifyingParams({1}, {2}, {}, true, 1, 2), + ODVerifyingParams({3}, {4}, {0}, true, 0, 1), + ODVerifyingParams({1}, {2}, {0}, false, 0, 3), + ODVerifyingParams({3}, {4}, {0}, false, 0, 2), + ODVerifyingParams({5}, {6}, {0}, true, 1, 0) + )); +// clang-format on + +} // namespace tests diff --git a/test_input_data/ODVerificationData.csv b/test_input_data/ODVerificationData.csv new file mode 100644 index 0000000000..05eaa71bcf --- /dev/null +++ b/test_input_data/ODVerificationData.csv @@ -0,0 +1,7 @@ +1,2,3,4,5,6,7 +2020,10,1000,10,1000,10,1000 +2020,20,2000,20,2000,10,1000 +2020,30,3000,30,10,10,1001 +2021,10,1000,40,1000,10,1002 +2021,20,1500,50,2000,10,1002 +2022,5,10000,60,1000,10,1003 From eff9615f000488f0d2e8f24542b34548335f774c Mon Sep 17 00:00:00 2001 From: Ivan Khromov Date: Tue, 28 May 2024 00:30:07 +0300 Subject: [PATCH 3/5] Fix clang-format --- src/tests/test_ind_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/test_ind_util.h b/src/tests/test_ind_util.h 
index 6b81d1160b..64f838db2b 100644 --- a/src/tests/test_ind_util.h +++ b/src/tests/test_ind_util.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include #include -#include #include "algorithms/ind/ind.h" #include "algorithms/ind/ind_algorithm.h" From 33202dfe3b4094b9b21d26a6b9abd54d372fd9be Mon Sep 17 00:00:00 2001 From: Ivan Khromov Date: Tue, 28 May 2024 00:41:12 +0300 Subject: [PATCH 4/5] Fix clang-format 2 --- src/core/algorithms/od/mining_algorithms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/algorithms/od/mining_algorithms.h b/src/core/algorithms/od/mining_algorithms.h index 72666b343f..fa01a291c7 100644 --- a/src/core/algorithms/od/mining_algorithms.h +++ b/src/core/algorithms/od/mining_algorithms.h @@ -1,5 +1,5 @@ #pragma once #include "algorithms/od/fastod/fastod.h" -#include "algorithms/od/order/order.h" #include "algorithms/od/od_verifier/od_verifier.h" +#include "algorithms/od/order/order.h" From d0221d45dbbe071e27d028af20504297c69cc300 Mon Sep 17 00:00:00 2001 From: vano105 Date: Mon, 7 Oct 2024 23:25:16 +0300 Subject: [PATCH 5/5] Fix issues in pull request --- src/core/algorithms/algorithm_types.h | 15 +++--- src/core/algorithms/algorithms.h | 1 + src/core/algorithms/od/mining_algorithms.h | 1 - .../algorithms/od/od_verifier/od_verifier.cpp | 52 +++++++----------- .../algorithms/od/od_verifier/od_verifier.h | 53 ++++++++++++++----- .../algorithms/od/od_verifier/partition.cpp | 19 +++---- .../algorithms/od/od_verifier/partition.h | 22 ++++---- .../algorithms/od/verification_algorithms.h | 3 ++ src/tests/test_od_verifier.cpp | 12 ++--- 9 files changed, 97 insertions(+), 81 deletions(-) create mode 100644 src/core/algorithms/od/verification_algorithms.h diff --git a/src/core/algorithms/algorithm_types.h b/src/core/algorithms/algorithm_types.h index 215c0a8cf0..25f078b92c 100644 --- a/src/core/algorithms/algorithm_types.h +++ b/src/core/algorithms/algorithm_types.h @@ -10,8 +10,8 @@ using AlgorithmTypes = 
std::tuple; + Fastod, order::Order, od_verifier::ODVerifier, GfdValidation, EGfdValidation, + NaiveGfdValidation, dd::Split>; // clang-format off /* Enumeration of all supported non-pipeline algorithms. If you implement a new @@ -66,19 +66,18 @@ BETTER_ENUM(AlgorithmType, char, /* Order dependency mining algorithms */ fastod, + order, + +/* Canonical OD verifier algorithm */ + od_verifier, /* Graph functional dependency mining algorithms */ gfdvalid, egfdvalid, naivegfdvalid, -/* Order dependency mining algorithms */ - order, - /* Differential dependencies mining algorithm */ - split, -/* Canonical OD verifier algorithm */ - od_verifier + split ) // clang-format on diff --git a/src/core/algorithms/algorithms.h b/src/core/algorithms/algorithms.h index 0076f7706e..feed9a9f3e 100644 --- a/src/core/algorithms/algorithms.h +++ b/src/core/algorithms/algorithms.h @@ -10,6 +10,7 @@ #include "algorithms/ind/mining_algorithms.h" #include "algorithms/metric/verification_algorithms.h" #include "algorithms/od/mining_algorithms.h" +#include "algorithms/od/verification_algorithms.h" #include "algorithms/statistics/algorithms.h" #include "algorithms/ucc/mining_algorithms.h" #include "algorithms/ucc/verification_algorithms.h" diff --git a/src/core/algorithms/od/mining_algorithms.h b/src/core/algorithms/od/mining_algorithms.h index fa01a291c7..8091a813cf 100644 --- a/src/core/algorithms/od/mining_algorithms.h +++ b/src/core/algorithms/od/mining_algorithms.h @@ -1,5 +1,4 @@ #pragma once #include "algorithms/od/fastod/fastod.h" -#include "algorithms/od/od_verifier/od_verifier.h" #include "algorithms/od/order/order.h" diff --git a/src/core/algorithms/od/od_verifier/od_verifier.cpp b/src/core/algorithms/od/od_verifier/od_verifier.cpp index ad0ea9ac94..c3fcb7b4a6 100644 --- a/src/core/algorithms/od/od_verifier/od_verifier.cpp +++ b/src/core/algorithms/od/od_verifier/od_verifier.cpp @@ -5,7 +5,6 @@ #include "config/indices/od_context.h" #include "config/indices/option.h" #include 
"config/tabular_data/input_table/option.h" -#include "partition.h" namespace algos::od_verifier { @@ -17,12 +16,15 @@ ODVerifier::ODVerifier() : Algorithm({}) { void ODVerifier::RegisterOptions() { auto get_schema_cols = [this]() { return relation_->GetSchema()->GetNumColumns(); }; + IndicesType lhs_indices_, rhs_indices_; RegisterOption(config::kTableOpt(&input_table_)); RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_)); RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols)); RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols)); RegisterOption(config::kODContextOpt(&context_indices_)); RegisterOption(config::kAscendingODOpt(&ascending_)); + lhs_indicex_ = lhs_indices_[0]; + rhs_indicex_ = rhs_indices_[0]; } void ODVerifier::MakeExecuteOptsAvailable() { @@ -38,52 +40,36 @@ void ODVerifier::LoadDataInternal() { } input_table_->Reset(); data_ = std::make_shared(DataFrame::FromInputTable(input_table_)); - if (data_->GetColumnCount() == 0) + if (data_->GetColumnCount() == 0) { throw std::runtime_error("Got an empty dataset: OD verifying is meaningless."); + } } unsigned long long ODVerifier::ExecuteInternal() { auto start_time = std::chrono::system_clock::now(); - if (ascending_) + if (ascending_) { VerifyOD(); - else + } else { VerifyOD(); + } auto elapsed_milliseconds = std::chrono::duration_cast( std::chrono::system_clock::now() - start_time); return elapsed_milliseconds.count(); } -template -void ODVerifier::VerifyOD() { - AttributeSet context; - - for (auto column : context_indices_) context.Set(column); - - fastod::ComplexStrippedPartition stripped_partition_swap( - (partition_cache_.GetStrippedPartition(context, data_))); - - if (stripped_partition_swap.Swap(lhs_indices_[0], rhs_indices_[0])) { - Partition part{stripped_partition_swap}; - std::vector> violates( - part.FindViolationsBySwap(lhs_indices_[0], rhs_indices_[0])); - - for (auto position_violate : violates) - 
row_violate_ods_by_swap_.push_back(position_violate.second + 1); - } - - context.Set(lhs_indices_[0]); - fastod::ComplexStrippedPartition stripped_partition_split( - partition_cache_.GetStrippedPartition(context, data_)); +// checks whether the OD has broken +bool ODVerifier::ODHolds() const { + return row_violate_ods_by_swap_.empty() && row_violate_ods_by_split_.empty(); +} - if (stripped_partition_split.Split(rhs_indices_[0])) { - Partition part{stripped_partition_split}; - std::vector> violates(part.FindViolationsBySplit(rhs_indices_[0])); +// Returns the number of rows that violate the OD by split +size_t ODVerifier::GetNumRowsViolateBySplit() const { + return row_violate_ods_by_split_.size(); +} - for (auto position_violate : violates) - row_violate_ods_by_split_.push_back(position_violate.second + 1); - } - std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end()); - std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end()); +// Returns the number of rows that violate the OD by swap +size_t ODVerifier::GetNumRowsViolateBySwap() const { + return row_violate_ods_by_swap_.size(); } } // namespace algos::od_verifier diff --git a/src/core/algorithms/od/od_verifier/od_verifier.h b/src/core/algorithms/od/od_verifier/od_verifier.h index 54114e7cae..4f2008629e 100644 --- a/src/core/algorithms/od/od_verifier/od_verifier.h +++ b/src/core/algorithms/od/od_verifier/od_verifier.h @@ -4,6 +4,7 @@ #include "algorithms/od/fastod/model/canonical_od.h" #include "config/indices/type.h" #include "model/table/column_layout_relation_data.h" +#include "partition.h" namespace algos::od_verifier { @@ -21,8 +22,8 @@ class ODVerifier : public Algorithm { // input data config::InputTable input_table_; config::EqNullsType is_null_equal_null_; - IndicesType lhs_indices_; - IndicesType rhs_indices_; + IndexType lhs_indicex_; + IndexType rhs_indicex_; IndicesType context_indices_; bool ascending_; @@ -45,7 +46,37 @@ class ODVerifier : public Algorithm { 
// checks whether OD is violated and finds the rows where it is violated template - void VerifyOD(); + void VerifyOD() { + AttributeSet context; + + for (auto column : context_indices_) context.Set(column); + + fastod::ComplexStrippedPartition stripped_partition_swap( + (partition_cache_.GetStrippedPartition(context, data_))); + + if (stripped_partition_swap.Swap(lhs_indicex_, rhs_indicex_)) { + ComplaxStrippedPartition part{stripped_partition_swap}; + std::vector> violates( + part.FindViolationsBySwap(lhs_indicex_, rhs_indicex_)); + + for (auto position_violate : violates) + row_violate_ods_by_swap_.push_back(position_violate.second + 1); + } + + context.Set(lhs_indicex_); + fastod::ComplexStrippedPartition stripped_partition_split( + partition_cache_.GetStrippedPartition(context, data_)); + + if (stripped_partition_split.Split(rhs_indicex_)) { + ComplaxStrippedPartition part{stripped_partition_split}; + std::vector> violates(part.FindViolationsBySplit(rhs_indicex_)); + + for (auto position_violate : violates) + row_violate_ods_by_split_.push_back(position_violate.second + 1); + } + std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end()); + std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end()); + } // reset statistic of violations void ResetState() override { @@ -54,23 +85,17 @@ class ODVerifier : public Algorithm { } public: - // checks whether the OD has broken - bool ODHolds() const { - return row_violate_ods_by_swap_.empty() && row_violate_ods_by_split_.empty(); - } - // base constructor ODVerifier(); + // checks whether the OD has broken + bool ODHolds() const; + // Returns the number of rows that violate the OD by split - size_t GetNumRowsViolateBySplit() const { - return row_violate_ods_by_split_.size(); - } + size_t GetNumRowsViolateBySplit() const; // Returns the number of rows that violate the OD by swap - size_t GetNumRowsViolateBySwap() const { - return row_violate_ods_by_swap_.size(); - } + size_t 
GetNumRowsViolateBySwap() const; }; } // namespace algos::od_verifier diff --git a/src/core/algorithms/od/od_verifier/partition.cpp b/src/core/algorithms/od/od_verifier/partition.cpp index ecf49dce61..1e1ae9d933 100644 --- a/src/core/algorithms/od/od_verifier/partition.cpp +++ b/src/core/algorithms/od/od_verifier/partition.cpp @@ -1,13 +1,13 @@ #include "partition.h" -#include #include #include namespace algos::od_verifier { -std::vector> Partition::CommonViolationBySplit(model::ColumnIndex right) const { - std::vector> violates; +std::vector +ComplaxStrippedPartition::CommonViolationBySplit(model::ColumnIndex right) const { + std::vector violates; for (size_t begin_pointer = 0; begin_pointer < sp_begins_->size() - 1; begin_pointer++) { size_t const group_begin = (*sp_begins_)[begin_pointer]; @@ -17,7 +17,7 @@ std::vector> Partition::CommonViolationBySplit(model::Column for (size_t i = group_begin + 1; i < group_end; i++) { if (data_->GetValue((*sp_indexes_)[i], right) != group_value) { - violates.emplace_back(std::pair(right, (*sp_indexes_)[i])); + violates.emplace_back(right, (*sp_indexes_)[i]); } } } @@ -25,9 +25,9 @@ std::vector> Partition::CommonViolationBySplit(model::Column return violates; } -std::vector> Partition::RangeBasedViolationBySplit( - model::ColumnIndex right) const { - std::vector> violates; +std::vector +ComplaxStrippedPartition::RangeBasedViolationBySplit(model::ColumnIndex right) const { + std::vector violates; for (size_t begin_pointer = 0; begin_pointer < rb_begins_->size() - 1; ++begin_pointer) { size_t const group_begin = (*rb_begins_)[begin_pointer]; @@ -40,7 +40,7 @@ std::vector> Partition::RangeBasedViolationBySplit( for (size_t j = range.first; j <= range.second; ++j) { if (data_->GetValue(j, right) != group_value) { - violates.emplace_back(std::pair(right, j)); + violates.emplace_back(right, j); } } } @@ -49,7 +49,8 @@ std::vector> Partition::RangeBasedViolationBySplit( return violates; } -std::vector> 
Partition::FindViolationsBySplit(model::ColumnIndex right) const { +std::vector +ComplaxStrippedPartition::FindViolationsBySplit(model::ColumnIndex right) const { return is_stripped_partition_ ? CommonViolationBySplit(right) : RangeBasedViolationBySplit(right); } diff --git a/src/core/algorithms/od/od_verifier/partition.h b/src/core/algorithms/od/od_verifier/partition.h index 16750b0ec7..64244d231d 100644 --- a/src/core/algorithms/od/od_verifier/partition.h +++ b/src/core/algorithms/od/od_verifier/partition.h @@ -4,25 +4,27 @@ namespace algos::od_verifier { -class Partition : protected algos::fastod::ComplexStrippedPartition { +class ComplaxStrippedPartition : protected algos::fastod::ComplexStrippedPartition { private: - std::vector> CommonViolationBySplit(model::ColumnIndex right) const; + using ViolationDescription = std::pair; - std::vector> RangeBasedViolationBySplit(model::ColumnIndex right) const; + std::vector CommonViolationBySplit(model::ColumnIndex right) const; + + std::vector RangeBasedViolationBySplit(model::ColumnIndex right) const; public: - Partition() : algos::fastod::ComplexStrippedPartition() {} + ComplaxStrippedPartition() : algos::fastod::ComplexStrippedPartition() {} - Partition(algos::fastod::ComplexStrippedPartition const& daddy) + ComplaxStrippedPartition(algos::fastod::ComplexStrippedPartition const& daddy) : algos::fastod::ComplexStrippedPartition(daddy) {} - std::vector> FindViolationsBySplit(model::ColumnIndex right) const; + std::vector FindViolationsBySplit(model::ColumnIndex right) const; template - std::vector> FindViolationsBySwap(model::ColumnIndex left, - model::ColumnIndex right) const { + std::vector FindViolationsBySwap(model::ColumnIndex left, + model::ColumnIndex right) const { size_t const group_count = is_stripped_partition_ ? 
sp_begins_->size() : rb_begins_->size(); - std::vector> violates; + std::vector violates; for (size_t begin_pointer = 0; begin_pointer < group_count - 1; begin_pointer++) { size_t const group_begin = is_stripped_partition_ ? (*sp_begins_)[begin_pointer] @@ -78,7 +80,7 @@ class Partition : protected algos::fastod::ComplexStrippedPartition { } if (!is_first_group && values[prev_group_max_index].second > second) { - violates.push_back(std::pair(right, row_pos[i])); + violates.emplace_back(right, row_pos[i]); } } } diff --git a/src/core/algorithms/od/verification_algorithms.h b/src/core/algorithms/od/verification_algorithms.h new file mode 100644 index 0000000000..d3028d84ab --- /dev/null +++ b/src/core/algorithms/od/verification_algorithms.h @@ -0,0 +1,3 @@ +#pragma once + +#include "algorithms/od/od_verifier/od_verifier.h" diff --git a/src/tests/test_od_verifier.cpp b/src/tests/test_od_verifier.cpp index 179b485bca..2edfec2cb7 100644 --- a/src/tests/test_od_verifier.cpp +++ b/src/tests/test_od_verifier.cpp @@ -9,8 +9,8 @@ namespace tests { struct ODVerifyingParams { algos::StdParamsMap params; - size_t const row_violate_ods_by_split = 0; - size_t const row_violate_ods_by_swap = 0; + size_t const number_of_rows_violate_by_split = 0; + size_t const number_of_rows_violate_by_swap = 0; ODVerifyingParams(config::IndicesType lhs_indices, config::IndicesType rhs_indices, config::IndicesType context, bool ascending, size_t const row_error_split = 0, @@ -21,8 +21,8 @@ struct ODVerifyingParams { {config::names::kRhsIndices, std::move(rhs_indices)}, {config::names::kODContext, std::move(context)}, {config::names::kAscendingOD, ascending}}), - row_violate_ods_by_split(row_error_split), - row_violate_ods_by_swap(row_error_swap) {} + number_of_rows_violate_by_split(row_error_split), + number_of_rows_violate_by_swap(row_error_swap) {} }; class TestODVerifying : public ::testing::TestWithParam {}; @@ -32,8 +32,8 @@ TEST_P(TestODVerifying, DefaultTest) { auto mp = 
algos::StdParamsMap(p.params); auto verifier = algos::CreateAndLoadAlgorithm(mp); verifier->Execute(); - EXPECT_EQ(verifier->GetNumRowsViolateBySwap(), p.row_violate_ods_by_swap); - EXPECT_EQ(verifier->GetNumRowsViolateBySplit(), p.row_violate_ods_by_split); + EXPECT_EQ(verifier->GetNumRowsViolateBySwap(), p.number_of_rows_violate_by_swap); + EXPECT_EQ(verifier->GetNumRowsViolateBySplit(), p.number_of_rows_violate_by_split); } // clang-format off