From 9b3495c1476f32bb56a81123117974cda13ee335 Mon Sep 17 00:00:00 2001 From: Ilia Date: Mon, 18 Mar 2024 19:40:40 +0300 Subject: [PATCH] Refactor TANE-based algorithms Generalize Tane and PFDTane algorithms, remove AUCC discovery and perform additional pruning. --- src/core/algorithms/fd/pfdtane/pfdtane.cpp | 265 ++----------------- src/core/algorithms/fd/pfdtane/pfdtane.h | 25 +- src/core/algorithms/fd/tane/tane.cpp | 277 +------------------- src/core/algorithms/fd/tane/tane.h | 38 +-- src/core/algorithms/fd/tane/tane_common.cpp | 269 +++++++++++++++++++ src/core/algorithms/fd/tane/tane_common.h | 35 +++ src/tests/test_pfdtane.cpp | 4 +- 7 files changed, 352 insertions(+), 561 deletions(-) create mode 100644 src/core/algorithms/fd/tane/tane_common.cpp create mode 100644 src/core/algorithms/fd/tane/tane_common.h diff --git a/src/core/algorithms/fd/pfdtane/pfdtane.cpp b/src/core/algorithms/fd/pfdtane/pfdtane.cpp index f66d02e53c..c12223d05a 100644 --- a/src/core/algorithms/fd/pfdtane/pfdtane.cpp +++ b/src/core/algorithms/fd/pfdtane/pfdtane.cpp @@ -24,7 +24,12 @@ namespace algos { using boost::dynamic_bitset; using Cluster = model::PositionListIndex::Cluster; -config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) { +PFDTane::PFDTane() : TaneCommon() { + RegisterOptions(); +} + +config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs, + ColumnLayoutRelationData const*) { std::size_t max = 1; model::PositionListIndex const* x_pli = rhs->GetPositionListIndex(); for (Cluster const& x_cluster : x_pli->GetIndex()) { @@ -35,7 +40,7 @@ config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) { config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* x_pli, model::PositionListIndex const* xa_pli, - ErrorMeasure measure) { + ErrorMeasure measure, ColumnLayoutRelationData const*) { std::deque xa_index = xa_pli->GetIndex(); std::shared_ptr probing_table = x_pli->CalculateAndGetProbingTable(); std::sort(xa_index.begin(), xa_index.end(), @@ -70,6 +75,15 @@ config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* x_pl return 1.0 - probability; } +config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) { + return CalculateZeroAryFdError(rhs, relation_.get()); +} + +config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) { + return CalculateFdError(lhs_pli, joint_pli, error_measure_, relation_.get()); +} + void PFDTane::RegisterOptions() { RegisterOption(config::ErrorOpt(&max_ucc_error_)); RegisterOption(config::ErrorMeasureOpt(&error_measure_)); @@ -81,251 +95,4 @@ void PFDTane::MakeExecuteOptsAvailable() { config::MaxLhsOpt.GetName()}); } -void PFDTane::ResetStateFd() {} - -PFDTane::PFDTane() : PliBasedFDAlgorithm({kDefaultPhaseName}) { - RegisterOptions(); -} - -double PFDTane::CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data) { - return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); -} - -void PFDTane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, - [[maybe_unused]] config::ErrorType error, - [[maybe_unused]] RelationalSchema const* schema) { - dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); - PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); -} - -void PFDTane::Prune(model::LatticeLevel* level) { - RelationalSchema const* schema = relation_->GetSchema(); - std::list key_vertices; - for (auto& [map_key, vertex] : level->GetVertices()) { - Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination - - if (vertex->GetIsKeyCandidate()) { - double ucc_error = CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC - // TODO: do smth with UCC - - vertex->SetKeyCandidate(false); - if (ucc_error == 0) { - for (std::size_t rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index != boost::dynamic_bitset<>::npos; - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - Vertical rhs = static_cast(*schema->GetColumn((int)rhs_index)); - if (!columns.Contains(rhs)) { - bool is_rhs_candidate = true; - for (auto const& column : columns.GetColumns()) { - Vertical sibling = - columns.Without(static_cast(*column)).Union(rhs); - auto sibling_vertex = - level->GetLatticeVertex(sibling.GetColumnIndices()); - if (sibling_vertex == nullptr || - !sibling_vertex->GetConstRhsCandidates() - [rhs.GetColumnIndices().find_first()]) { - is_rhs_candidate = false; - break; - } - // for each outer rhs: if there is a sibling s.t. it doesn't - // have this rhs, there is no FD: vertex->rhs - } - // Found fd: vertex->rhs => register it - if (is_rhs_candidate) { - RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, - schema); - } - } - } - key_vertices.push_back(vertex.get()); - // cout << "--------------------------" << endl << "KeyVert: " << *vertex; - } - } - } - // if we seek for exact FDs then SetInvalid - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - for (auto key_vertex : key_vertices) { - key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); - key_vertex->SetInvalid(true); - } - } - } -} - -void PFDTane::ComputeDependencies(model::LatticeLevel* level) { - RelationalSchema const* schema = relation_->GetSchema(); - for (auto& [key_map, xa_vertex] : level->GetVertices()) { - if (xa_vertex->GetIsInvalid()) { - continue; - } - Vertical xa = xa_vertex->GetVertical(); - // Calculate XA PLI - if (xa_vertex->GetPositionListIndex() == nullptr) { - auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); - auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); - xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); - } - - dynamic_bitset<> xa_indices = xa.GetColumnIndices(); - dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); - auto xa_pli = xa_vertex->GetPositionListIndex(); - for (auto const& x_vertex : xa_vertex->GetParents()) { - Vertical const& lhs = x_vertex->GetVertical(); - - // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it - // easier?? - // like "a_index = xa_indices - x_indices;" - int a_index = xa_indices.find_first(); - dynamic_bitset<> x_indices = lhs.GetColumnIndices(); - while (a_index >= 0 && x_indices[a_index]) { - a_index = xa_indices.find_next(a_index); - } - if (!a_candidates[a_index]) { - continue; - } - auto x_pli = x_vertex->GetPositionListIndex(); - - // Check X -> A - config::ErrorType error = CalculateFdError(x_pli, xa_pli, error_measure_); - if (error <= max_fd_error_) { - Column const* rhs = schema->GetColumns()[a_index].get(); - - RegisterAndCountFd(lhs, rhs, error, schema); - xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); - if (error == 0) { - xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); - } - } - } - } -} - -unsigned long long PFDTane::ExecuteInternal() { - long apriori_millis_ = 0; - max_fd_error_ = max_ucc_error_; - RelationalSchema const* schema = relation_->GetSchema(); - - LOG(DEBUG) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " - << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) - << relation_->GetMaximumNip() << "."; - - for (auto& column : schema->GetColumns()) { - double avg_partners = relation_->GetColumnData(column->GetIndex()) - .GetPositionListIndex() - ->GetNepAsLong() * - 2.0 / relation_->GetNumRows(); - LOG(DEBUG) << "* " << column->ToString() << ": every tuple has " << std::setw(2) - << avg_partners << " partners on average."; - } - auto start_time = std::chrono::system_clock::now(); - double progress_step = 100.0 / (schema->GetNumColumns() + 1); - - // Initialize level 0 - std::vector> levels; - auto level0 = std::make_unique(0); - // TODO: через указатели кажется надо переделать - level0->Add(std::make_unique(*(schema->empty_vertical_))); - model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); - levels.push_back(std::move(level0)); - AddProgress(progress_step); - - // Initialize level1 - dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); - auto level1 = std::make_unique(1); - for (auto& column : schema->GetColumns()) { - // for each attribute set vertex - ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); - auto vertex = std::make_unique(static_cast(*column)); - - vertex->AddRhsCandidates(schema->GetColumns()); - vertex->GetParents().push_back(empty_vertex); - vertex->SetKeyCandidate(true); - vertex->SetPositionListIndex(column_data.GetPositionListIndex()); - - // check FDs: 0->A - double fd_error = CalculateZeroAryFdError(&column_data); - if (fd_error <= max_fd_error_) { // TODO: max_error - zeroary_fd_rhs.set(column->GetIndex()); - RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); - - vertex->GetRhsCandidates().set(column->GetIndex(), false); - if (fd_error == 0) { - vertex->GetRhsCandidates().reset(); - } - } - - level1->Add(std::move(vertex)); - } - - for (auto& [key_map, vertex] : level1->GetVertices()) { - Vertical column = vertex->GetVertical(); - vertex->GetRhsCandidates() &= - ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs - - // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс - ColumnData const& column_data = - relation_->GetColumnData(column.GetColumnIndices().find_first()); - double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { - vertex->SetKeyCandidate(false); - if (ucc_error == 0 && max_lhs_ != 0) { - for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index < vertex->GetRhsCandidates().size(); - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - if (rhs_index != column.GetColumnIndices().find_first()) { - RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); - } - } - vertex->GetRhsCandidates() &= column.GetColumnIndices(); - // set vertex invalid if we seek for exact dependencies - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - vertex->SetInvalid(true); - } - } - } - } - levels.push_back(std::move(level1)); - AddProgress(progress_step); - - unsigned int max_arity = - max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; - for (unsigned int arity = 2; arity <= max_arity; arity++) { - model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); - model::LatticeLevel::GenerateNextLevel(levels); - - model::LatticeLevel* level = levels[arity].get(); - LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity - << "-ary lattice vertices."; - if (level->GetVertices().empty()) { - break; - } - - ComputeDependencies(level); - - if (arity == max_arity) { - break; - } - - Prune(level); - // TODO: printProfilingData - AddProgress(progress_step); - } - - SetProgress(100); - std::chrono::milliseconds elapsed_milliseconds = - std::chrono::duration_cast(std::chrono::system_clock::now() - - start_time); - apriori_millis_ += elapsed_milliseconds.count(); - - LOG(DEBUG) << "Time: " << apriori_millis_ << " milliseconds"; - LOG(DEBUG) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; - LOG(DEBUG) << "Total intersections: " << model::PositionListIndex::intersection_count_ - << std::endl; - LOG(DEBUG) << "Total FD count: " << fd_collection_.Size(); - LOG(DEBUG) << "HASH: " << Fletcher16(); - return apriori_millis_; -} - } // namespace algos diff --git a/src/core/algorithms/fd/pfdtane/pfdtane.h b/src/core/algorithms/fd/pfdtane/pfdtane.h index b2bbc82dd6..cc2b7d9be2 100644 --- a/src/core/algorithms/fd/pfdtane/pfdtane.h +++ b/src/core/algorithms/fd/pfdtane/pfdtane.h @@ -2,6 +2,7 @@ #include "algorithms/fd/pli_based_fd_algorithm.h" #include "algorithms/fd/tane/lattice_level.h" +#include "algorithms/fd/tane/tane_common.h" #include "config/error/type.h" #include "config/error_measure/type.h" #include "config/max_lhs/type.h" @@ -11,31 +12,23 @@ namespace algos { -class PFDTane : public PliBasedFDAlgorithm { +class PFDTane : public TaneCommon { private: - config::ErrorType max_fd_error_; - config::ErrorType max_ucc_error_; - config::MaxLhsType max_lhs_; ErrorMeasure error_measure_ = +ErrorMeasure::per_tuple; - - void ResetStateFd() final; void RegisterOptions(); void MakeExecuteOptsAvailable() final; - void Prune(model::LatticeLevel* level); - void ComputeDependencies(model::LatticeLevel* level); - unsigned long long ExecuteInternal() final; + config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override; + config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) override; public: PFDTane(); - static double CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data); - - void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, - RelationalSchema const* schema); - static config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs); + static config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs, + ColumnLayoutRelationData const* relation_data); static config::ErrorType CalculateFdError(model::PositionListIndex const* x_pli, model::PositionListIndex const* xa_pli, - ErrorMeasure error_measure); + ErrorMeasure error_measure, + ColumnLayoutRelationData const* relation_data); }; } // namespace algos diff --git a/src/core/algorithms/fd/tane/tane.cpp b/src/core/algorithms/fd/tane/tane.cpp index 4aee504185..686beaf9ed 100644 --- a/src/core/algorithms/fd/tane/tane.cpp +++ b/src/core/algorithms/fd/tane/tane.cpp @@ -19,7 +19,7 @@ namespace algos { using boost::dynamic_bitset; -Tane::Tane() : PliBasedFDAlgorithm({kDefaultPhaseName}) { +Tane::Tane() : TaneCommon() { RegisterOptions(); } @@ -32,281 +32,26 @@ void Tane::MakeExecuteOptsAvailable() { MakeOptionsAvailable({config::MaxLhsOpt.GetName(), config::ErrorOpt.GetName()}); } -void Tane::ResetStateFd() { - count_of_fd_ = 0; - count_of_ucc_ = 0; - apriori_millis_ = 0; -} - -double Tane::CalculateZeroAryFdError(ColumnData const* rhs, - ColumnLayoutRelationData const* relation_data) { +config::ErrorType Tane::CalculateZeroAryFdError(ColumnData const* rhs, + ColumnLayoutRelationData const* relation_data) { return 1 - rhs->GetPositionListIndex()->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); } -double Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, - model::PositionListIndex const* joint_pli, - ColumnLayoutRelationData const* relation_data) { +config::ErrorType Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli, + ColumnLayoutRelationData const* relation_data) { return (double)(lhs_pli->GetNepAsLong() - joint_pli->GetNepAsLong()) / static_cast(relation_data->GetNumTuplePairs()); } -double Tane::CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data) { - return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); +config::ErrorType Tane::CalculateZeroAryFdError(ColumnData const* rhs) { + return CalculateZeroAryFdError(rhs, relation_.get()); } -void Tane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); - PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); - count_of_fd_++; -} - -void Tane::RegisterUcc([[maybe_unused]] Vertical const& key, [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - /*dynamic_bitset<> key_bitset = key.getColumnIndices(); - LOG(INFO) << "Discovered UCC: "; - for (int i = key_bitset.find_first(); i != -1; i = key_bitset.find_next(i)) { - LOG(INFO) << schema->GetColumn(i)->GetName() << " "; - } - LOG(INFO) << "- error equals " << error << std::endl;*/ - count_of_ucc_++; -} - -unsigned long long Tane::ExecuteInternal() { - max_fd_error_ = max_ucc_error_; - RelationalSchema const* schema = relation_->GetSchema(); - - LOG(INFO) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " - << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) - << relation_->GetMaximumNip() << "."; - - for (auto& column : schema->GetColumns()) { - double avg_partners = relation_->GetColumnData(column->GetIndex()) - .GetPositionListIndex() - ->GetNepAsLong() * - 2.0 / relation_->GetNumRows(); - LOG(INFO) << "* " << column->ToString() << ": every tuple has " << std::setw(2) - << avg_partners << " partners on average."; - } - auto start_time = std::chrono::system_clock::now(); - double progress_step = 100.0 / (schema->GetNumColumns() + 1); - - // Initialize level 0 - std::vector> levels; - auto level0 = std::make_unique(0); - // TODO: через указатели кажется надо переделать - level0->Add(std::make_unique(*(schema->empty_vertical_))); - model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); - levels.push_back(std::move(level0)); - AddProgress(progress_step); - - // Initialize level1 - dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); - auto level1 = std::make_unique(1); - for (auto& column : schema->GetColumns()) { - // for each attribute set vertex - ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); - auto vertex = std::make_unique(static_cast(*column)); - - vertex->AddRhsCandidates(schema->GetColumns()); - vertex->GetParents().push_back(empty_vertex); - vertex->SetKeyCandidate(true); - vertex->SetPositionListIndex(column_data.GetPositionListIndex()); - - // check FDs: 0->A - double fd_error = CalculateZeroAryFdError(&column_data, relation_.get()); - if (fd_error <= max_fd_error_) { // TODO: max_error - zeroary_fd_rhs.set(column->GetIndex()); - RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); - - vertex->GetRhsCandidates().set(column->GetIndex(), false); - if (fd_error == 0) { - vertex->GetRhsCandidates().reset(); - } - } - - level1->Add(std::move(vertex)); - } - - for (auto& [key_map, vertex] : level1->GetVertices()) { - Vertical column = vertex->GetVertical(); - vertex->GetRhsCandidates() &= - ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs - - // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс - ColumnData const& column_data = - relation_->GetColumnData(column.GetColumnIndices().find_first()); - double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { - RegisterUcc(column, ucc_error, schema); - vertex->SetKeyCandidate(false); - if (ucc_error == 0 && max_lhs_ != 0) { - for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index < vertex->GetRhsCandidates().size(); - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - if (rhs_index != column.GetColumnIndices().find_first()) { - RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); - } - } - vertex->GetRhsCandidates() &= column.GetColumnIndices(); - // set vertex invalid if we seek for exact dependencies - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - vertex->SetInvalid(true); - } - } - } - } - levels.push_back(std::move(level1)); - AddProgress(progress_step); - - unsigned int max_arity = - max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; - for (unsigned int arity = 2; arity <= max_arity; arity++) { - // auto start_time = std::chrono::system_clock::now(); - model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); - model::LatticeLevel::GenerateNextLevel(levels); - // std::chrono::duration elapsed_milliseconds = - // std::chrono::duration_cast(std::chrono::system_clock::now() - - // start_time); apriori_millis_ += elapsed_milliseconds.count(); - - model::LatticeLevel* level = levels[arity].get(); - LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity - << "-ary lattice vertices."; - if (level->GetVertices().empty()) { - break; - } - - for (auto& [key_map, xa_vertex] : level->GetVertices()) { - if (xa_vertex->GetIsInvalid()) { - continue; - } - - Vertical xa = xa_vertex->GetVertical(); - // Calculate XA PLI - if (xa_vertex->GetPositionListIndex() == nullptr) { - auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); - auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); - xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); - } - - dynamic_bitset<> xa_indices = xa.GetColumnIndices(); - dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); - - for (auto const& x_vertex : xa_vertex->GetParents()) { - Vertical const& lhs = x_vertex->GetVertical(); - - // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it - // easier?? - // like "a_index = xa_indices - x_indices;" - int a_index = xa_indices.find_first(); - dynamic_bitset<> x_indices = lhs.GetColumnIndices(); - while (a_index >= 0 && x_indices[a_index]) { - a_index = xa_indices.find_next(a_index); - } - if (!a_candidates[a_index]) { - continue; - } - - // Check X -> A - double error = CalculateFdError(x_vertex->GetPositionListIndex(), - xa_vertex->GetPositionListIndex(), relation_.get()); - if (error <= max_fd_error_) { - Column const* rhs = schema->GetColumns()[a_index].get(); - - // TODO: register FD to a file or something - RegisterAndCountFd(lhs, rhs, error, schema); - xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); - if (error == 0) { - xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); - } - } - } - } - - if (arity == max_arity) { - break; - } - - // Prune - // cout << "Pruning level: " << level->GetArity() << ". " << level->GetVertices().size() << - // " vertices_" << endl; - std::list key_vertices; - for (auto& [map_key, vertex] : level->GetVertices()) { - Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination - - if (vertex->GetIsKeyCandidate()) { - double ucc_error = - CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC - // TODO: do smth with UCC - - RegisterUcc(columns, ucc_error, schema); - vertex->SetKeyCandidate(false); - if (ucc_error == 0) { - for (size_t rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index != boost::dynamic_bitset<>::npos; - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - Vertical rhs = - static_cast(*schema->GetColumn((int)rhs_index)); - if (!columns.Contains(rhs)) { - bool is_rhs_candidate = true; - for (auto const& column : columns.GetColumns()) { - Vertical sibling = - columns.Without(static_cast(*column)) - .Union(rhs); - auto sibling_vertex = - level->GetLatticeVertex(sibling.GetColumnIndices()); - if (sibling_vertex == nullptr || - !sibling_vertex->GetConstRhsCandidates() - [rhs.GetColumnIndices().find_first()]) { - is_rhs_candidate = false; - break; - } - // for each outer rhs: if there is a sibling s.t. it doesn't - // have this rhs, there is no FD: vertex->rhs - } - // Found fd: vertex->rhs => register it - if (is_rhs_candidate) { - RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, - schema); - } - } - } - key_vertices.push_back(vertex.get()); - // cout << "--------------------------" << endl << "KeyVert: " << *vertex; - } - } - } - // if we seek for exact FDs then SetInvalid - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - for (auto key_vertex : key_vertices) { - key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); - key_vertex->SetInvalid(true); - } - } - } - - // TODO: printProfilingData - AddProgress(progress_step); - } - - SetProgress(100); - std::chrono::milliseconds elapsed_milliseconds = - std::chrono::duration_cast(std::chrono::system_clock::now() - - start_time); - apriori_millis_ += elapsed_milliseconds.count(); - - LOG(INFO) << "Time: " << apriori_millis_ << " milliseconds"; - LOG(INFO) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; - LOG(INFO) << "Total intersections: " << model::PositionListIndex::intersection_count_ - << std::endl; - LOG(INFO) << "Total FD count: " << count_of_fd_; - LOG(INFO) << "Total UCC count: " << count_of_ucc_; - LOG(INFO) << "HASH: " << Fletcher16(); - - return apriori_millis_; +config::ErrorType Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) { + return CalculateFdError(lhs_pli, joint_pli, relation_.get()); } } // namespace algos diff --git a/src/core/algorithms/fd/tane/tane.h b/src/core/algorithms/fd/tane/tane.h index 099ec31af3..9be7f184a6 100644 --- a/src/core/algorithms/fd/tane/tane.h +++ b/src/core/algorithms/fd/tane/tane.h @@ -7,43 +7,25 @@ #include "config/max_lhs/type.h" #include "model/table/position_list_index.h" #include "model/table/relation_data.h" +#include "tane_common.h" namespace algos { -class Tane : public PliBasedFDAlgorithm { +class Tane : public TaneCommon { private: void RegisterOptions(); void MakeExecuteOptsAvailable() final; - - void ResetStateFd() final; - unsigned long long ExecuteInternal() final; + config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override; + config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli); public: - config::ErrorType max_fd_error_; - config::ErrorType max_ucc_error_; - config::MaxLhsType max_lhs_; - - int count_of_fd_ = 0; - int count_of_ucc_ = 0; - long apriori_millis_ = 0; - Tane(); - - static double CalculateZeroAryFdError(ColumnData const* rhs, - ColumnLayoutRelationData const* relation_data); - static double CalculateFdError(model::PositionListIndex const* lhs_pli, - model::PositionListIndex const* joint_pli, - ColumnLayoutRelationData const* relation_data); - static double CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data); - - // static double round(double error) { return ((int)(error * 32768) + 1)/ 32768.0; } - - void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, - RelationalSchema const* schema); - // void RegisterFd(Vertical const* lhs, Column const* rhs, double error, RelationalSchema const* - // schema); - void RegisterUcc(Vertical const& key, double error, RelationalSchema const* schema); + static config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs, + ColumnLayoutRelationData const* relation_data); + static config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli, + ColumnLayoutRelationData const* relation_data); }; } // namespace algos diff --git a/src/core/algorithms/fd/tane/tane_common.cpp b/src/core/algorithms/fd/tane/tane_common.cpp new file mode 100644 index 0000000000..7bc44902f7 --- /dev/null +++ b/src/core/algorithms/fd/tane/tane_common.cpp @@ -0,0 +1,269 @@ +#include "tane_common.h" + +#include +#include +#include +#include + +#include + +#include "config/error/option.h" +#include "config/error_measure/option.h" +#include "config/error_measure/type.h" +#include "config/max_lhs/option.h" +#include "config/names_and_descriptions.h" +#include "config/option.h" +#include "fd/tane/lattice_level.h" +#include "fd/tane/lattice_vertex.h" +#include "model/table/column_data.h" +#include "model/table/column_layout_relation_data.h" +#include "model/table/relational_schema.h" + +namespace algos { +using boost::dynamic_bitset; +using Cluster = model::PositionListIndex::Cluster; + +void TaneCommon::ResetStateFd() {} + +TaneCommon::TaneCommon() : PliBasedFDAlgorithm({kDefaultPhaseName}) {} + +double TaneCommon::CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data) { + return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); +} + +void TaneCommon::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, + [[maybe_unused]] config::ErrorType error, + [[maybe_unused]] RelationalSchema const* schema) { + dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); + PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); +} + +void TaneCommon::Prune(model::LatticeLevel* level) { + RelationalSchema const* schema = relation_->GetSchema(); + std::list key_vertices; + for (auto& [map_key, vertex] : level->GetVertices()) { + Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination + + if (vertex->GetIsKeyCandidate()) { + double ucc_error = CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); + if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC + + vertex->SetKeyCandidate(false); + if (ucc_error == 0) { + for (std::size_t rhs_index = vertex->GetRhsCandidates().find_first(); + rhs_index != boost::dynamic_bitset<>::npos; + rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { + Vertical rhs = static_cast(*schema->GetColumn((int)rhs_index)); + if (!columns.Contains(rhs)) { + bool is_rhs_candidate = true; + for (auto const& column : columns.GetColumns()) { + Vertical sibling = + columns.Without(static_cast(*column)).Union(rhs); + auto sibling_vertex = + level->GetLatticeVertex(sibling.GetColumnIndices()); + if (sibling_vertex == nullptr || + !sibling_vertex->GetConstRhsCandidates() + [rhs.GetColumnIndices().find_first()]) { + is_rhs_candidate = false; + break; + } + // for each outer rhs: if there is a sibling s.t. it doesn't + // have this rhs, there is no FD: vertex->rhs + } + // Found fd: vertex->rhs => register it + if (is_rhs_candidate) { + RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, + schema); + } + } + } + key_vertices.push_back(vertex.get()); + } + } + } + // if we seek for exact FDs then SetInvalid + if (max_fd_error_ == 0 && max_ucc_error_ == 0) { + for (auto key_vertex : key_vertices) { + key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); + key_vertex->SetInvalid(true); + } + } + } +} + +void TaneCommon::ComputeDependencies(model::LatticeLevel* level) { + RelationalSchema const* schema = relation_->GetSchema(); + for (auto& [key_map, xa_vertex] : level->GetVertices()) { + if (xa_vertex->GetIsInvalid()) { + continue; + } + Vertical xa = xa_vertex->GetVertical(); + // Calculate XA PLI + if (xa_vertex->GetPositionListIndex() == nullptr) { + auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); + auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); + xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); + } + + dynamic_bitset<> xa_indices = xa.GetColumnIndices(); + dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); + auto xa_pli = xa_vertex->GetPositionListIndex(); + for (auto const& x_vertex : xa_vertex->GetParents()) { + Vertical const& lhs = x_vertex->GetVertical(); + + // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it + // easier?? + // like "a_index = xa_indices - x_indices;" + int a_index = xa_indices.find_first(); + dynamic_bitset<> x_indices = lhs.GetColumnIndices(); + while (a_index >= 0 && x_indices[a_index]) { + a_index = xa_indices.find_next(a_index); + } + if (!a_candidates[a_index]) { + continue; + } + auto x_pli = x_vertex->GetPositionListIndex(); + + // Check X -> A + config::ErrorType error = CalculateFdError(x_pli, xa_pli); + if (error <= max_fd_error_) { + Column const* rhs = schema->GetColumns()[a_index].get(); + + RegisterAndCountFd(lhs, rhs, error, schema); + xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); + if (error == 0) { + xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); + } + } + } + } +} + +unsigned long long TaneCommon::ExecuteInternal() { + long apriori_millis_ = 0; + max_fd_error_ = max_ucc_error_; + RelationalSchema const* schema = relation_->GetSchema(); + + LOG(DEBUG) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " + << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) + << relation_->GetMaximumNip() << "."; + + for (auto& column : schema->GetColumns()) { + double avg_partners = relation_->GetColumnData(column->GetIndex()) + .GetPositionListIndex() + ->GetNepAsLong() * + 2.0 / relation_->GetNumRows(); + LOG(DEBUG) << "* " << column->ToString() << ": every tuple has " << std::setw(2) + << avg_partners << " partners on average."; + } + auto start_time = std::chrono::system_clock::now(); + double progress_step = 100.0 / (schema->GetNumColumns() + 1); + + // Initialize level 0 + std::vector> levels; + auto level0 = std::make_unique(0); + // TODO: через указатели кажется надо переделать + level0->Add(std::make_unique(*(schema->empty_vertical_))); + model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); + levels.push_back(std::move(level0)); + AddProgress(progress_step); + + // Initialize level1 + dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); + auto level1 = std::make_unique(1); + for (auto& column : schema->GetColumns()) { + // for each attribute set vertex + ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); + auto vertex = std::make_unique(static_cast(*column)); + + vertex->AddRhsCandidates(schema->GetColumns()); + vertex->GetParents().push_back(empty_vertex); + vertex->SetKeyCandidate(true); + vertex->SetPositionListIndex(column_data.GetPositionListIndex()); + + // check FDs: 0->A + double fd_error = CalculateZeroAryFdError(&column_data); + if (fd_error <= max_fd_error_) { // TODO: max_error + zeroary_fd_rhs.set(column->GetIndex()); + RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); + + vertex->GetRhsCandidates().set(column->GetIndex(), false); + if (fd_error == 0) { + vertex->GetRhsCandidates().reset(); + } + } + + level1->Add(std::move(vertex)); + } + + for (auto& [key_map, vertex] : level1->GetVertices()) { + Vertical column = vertex->GetVertical(); + vertex->GetRhsCandidates() &= + ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs + + // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс + ColumnData const& column_data = + relation_->GetColumnData(column.GetColumnIndices().find_first()); + double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); + if (ucc_error <= max_ucc_error_) { + vertex->SetKeyCandidate(false); + if (ucc_error == 0 && max_lhs_ != 0) { + for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); + rhs_index < vertex->GetRhsCandidates().size(); + rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { + if (rhs_index != column.GetColumnIndices().find_first()) { + RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); + } + } + vertex->GetRhsCandidates() &= column.GetColumnIndices(); + // set vertex invalid if we seek for exact dependencies + if (max_fd_error_ == 0 && max_ucc_error_ == 0) { + vertex->SetInvalid(true); + } + } + } + } + levels.push_back(std::move(level1)); + AddProgress(progress_step); + + unsigned int max_arity = + max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; + for (unsigned int arity = 2; arity <= max_arity; arity++) { + model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); + model::LatticeLevel::GenerateNextLevel(levels); + + model::LatticeLevel* level = levels[arity].get(); + LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity + << "-ary lattice vertices."; + if (level->GetVertices().empty()) { + break; + } + + ComputeDependencies(level); + + if (arity == max_arity) { + break; + } + + Prune(level); + // TODO: printProfilingData + AddProgress(progress_step); + } + + SetProgress(100); + std::chrono::milliseconds elapsed_milliseconds = + std::chrono::duration_cast(std::chrono::system_clock::now() - + start_time); + apriori_millis_ += elapsed_milliseconds.count(); + + LOG(DEBUG) << "Time: " << apriori_millis_ << " milliseconds"; + LOG(DEBUG) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; + LOG(DEBUG) << "Total intersections: " << model::PositionListIndex::intersection_count_ + << std::endl; + LOG(DEBUG) << "Total FD count: " << fd_collection_.Size(); + LOG(DEBUG) << "HASH: " << Fletcher16(); + return apriori_millis_; +} + +} // namespace algos diff --git a/src/core/algorithms/fd/tane/tane_common.h b/src/core/algorithms/fd/tane/tane_common.h new file mode 100644 index 0000000000..360ff07413 --- /dev/null +++ b/src/core/algorithms/fd/tane/tane_common.h @@ -0,0 +1,35 @@ +#pragma once + +#include "algorithms/fd/pli_based_fd_algorithm.h" +#include "algorithms/fd/tane/lattice_level.h" +#include "config/error/type.h" +#include "config/max_lhs/type.h" +#include "model/table/position_list_index.h" +#include "model/table/relation_data.h" + +namespace algos { + +class TaneCommon : public PliBasedFDAlgorithm { +protected: + config::ErrorType max_fd_error_; + config::ErrorType max_ucc_error_; + config::MaxLhsType max_lhs_; + +private: + void ResetStateFd() final; + void Prune(model::LatticeLevel* level); + void ComputeDependencies(model::LatticeLevel* level); + unsigned long long ExecuteInternal() final; + virtual config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) = 0; + virtual config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) = 0; + +public: + TaneCommon(); + static double CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data); + void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, + RelationalSchema const* schema); +}; + +} // namespace algos diff --git a/src/tests/test_pfdtane.cpp b/src/tests/test_pfdtane.cpp index ac15775bf9..751d4dbe90 100644 --- a/src/tests/test_pfdtane.cpp +++ b/src/tests/test_pfdtane.cpp @@ -55,8 +55,8 @@ TEST_P(TestPFDTaneValidation, ErrorCalculationTest) { for (auto const &[lhs_id, rhs_id, expected_error] : p.fds) { auto lhs = relation->GetColumnData(lhs_id).GetPositionListIndex(); auto rhs = relation->GetColumnData(rhs_id).GetPositionListIndex(); - config::ErrorType error = - algos::PFDTane::CalculateFdError(lhs, lhs->Intersect(rhs).get(), p.error_measure); + config::ErrorType error = algos::PFDTane::CalculateFdError(lhs, lhs->Intersect(rhs).get(), + p.error_measure, relation.get()); EXPECT_NEAR(error, expected_error, eps); } }