From 421391434edaa88675b0c8c84d195e281388cee9 Mon Sep 17 00:00:00 2001 From: Ayili Nikturab Date: Wed, 20 Dec 2023 17:28:18 +0300 Subject: [PATCH] fix tane based algos --- src/core/algorithms/fd/pfdtane/pfdtane.h | 3 +- src/core/algorithms/fd/tane/common_tane.cpp | 299 +++++++++++++++++++ src/core/algorithms/fd/tane/common_tane.h | 47 +++ src/core/algorithms/fd/tane/tane.cpp | 292 ------------------ src/core/algorithms/fd/tane/tane.h | 36 +-- src/core/model/table/position_list_index.cpp | 46 +-- 6 files changed, 376 insertions(+), 347 deletions(-) create mode 100644 src/core/algorithms/fd/tane/common_tane.cpp create mode 100644 src/core/algorithms/fd/tane/common_tane.h diff --git a/src/core/algorithms/fd/pfdtane/pfdtane.h b/src/core/algorithms/fd/pfdtane/pfdtane.h index 0ec6bd573c..deb10ada5d 100644 --- a/src/core/algorithms/fd/pfdtane/pfdtane.h +++ b/src/core/algorithms/fd/pfdtane/pfdtane.h @@ -2,13 +2,14 @@ #include +#include "algorithms/fd/tane/common_tane.h" #include "algorithms/fd/tane/tane.h" #include "model/table/position_list_index.h" #include "model/table/relation_data.h" namespace algos { -class PFDTane : public Tane { +class PFDTane : public tane::CommonTane { public: double CalculateZeroAryFdError(ColumnData const* rhs) override; double CalculateFdError(model::PositionListIndex const* x_pli, diff --git a/src/core/algorithms/fd/tane/common_tane.cpp b/src/core/algorithms/fd/tane/common_tane.cpp new file mode 100644 index 0000000000..8170af8d57 --- /dev/null +++ b/src/core/algorithms/fd/tane/common_tane.cpp @@ -0,0 +1,299 @@ +#include +#include +#include +#include + +#include + +#include "config/error/option.h" +#include "config/max_lhs/option.h" +#include "lattice_level.h" +#include "lattice_vertex.h" +#include "model/table/column_data.h" +#include "model/table/column_layout_relation_data.h" +#include "model/table/relational_schema.h" +#include "tane.h" + +namespace tane { + +using boost::dynamic_bitset; + +CommonTane::CommonTane() : algos::PliBasedFDAlgorithm({kDefaultPhaseName}) { + RegisterOptions(); +} + +void CommonTane::RegisterOptions() { + RegisterOption(config::ErrorOpt(&max_ucc_error_)); + RegisterOption(config::MaxLhsOpt(&max_lhs_)); +} + +void CommonTane::MakeExecuteOptsAvailable() { + MakeOptionsAvailable({config::MaxLhsOpt.GetName(), config::ErrorOpt.GetName()}); +} + +void CommonTane::ResetStateFd() { + count_of_fd_ = 0; + count_of_ucc_ = 0; + apriori_millis_ = 0; +} + +double CommonTane::CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data) { + return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); +} + +void CommonTane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, + [[maybe_unused]] double error, + [[maybe_unused]] RelationalSchema const* schema) { + dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); + PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); + count_of_fd_++; +} + +void CommonTane::RegisterUcc([[maybe_unused]] Vertical const& key, [[maybe_unused]] double error, + [[maybe_unused]] RelationalSchema const* schema) { + /*dynamic_bitset<> key_bitset = key.getColumnIndices(); + LOG(INFO) << "Discovered UCC: "; + for (int i = key_bitset.find_first(); i != -1; i = key_bitset.find_next(i)) { + LOG(INFO) << schema->GetColumn(i)->GetName() << " "; + } + LOG(INFO) << "- error equals " << error << std::endl;*/ + count_of_ucc_++; +} + +unsigned long long CommonTane::ExecuteInternal() { + max_fd_error_ = max_ucc_error_; + RelationalSchema const* schema = relation_->GetSchema(); + + LOG(INFO) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " + << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) + << relation_->GetMaximumNip() << "."; + + for (auto& column : schema->GetColumns()) { + double avg_partners = relation_->GetColumnData(column->GetIndex()) + .GetPositionListIndex() + ->GetNepAsLong() * + 2.0 / relation_->GetNumRows(); + LOG(INFO) << "* " << column->ToString() << ": every tuple has " << std::setw(2) + << avg_partners << " partners on average."; + } + auto start_time = std::chrono::system_clock::now(); + double progress_step = 100.0 / (schema->GetNumColumns() + 1); + + // Initialize level 0 + std::vector> levels; + auto level0 = std::make_unique(0); + // TODO: через указатели кажется надо переделать + level0->Add(std::make_unique(*(schema->empty_vertical_))); + model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); + levels.push_back(std::move(level0)); + AddProgress(progress_step); + + // Initialize level1 + dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); + auto level1 = std::make_unique(1); + for (auto& column : schema->GetColumns()) { + // for each attribute set vertex + ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); + auto vertex = std::make_unique(static_cast(*column)); + + vertex->AddRhsCandidates(schema->GetColumns()); + vertex->GetParents().push_back(empty_vertex); + vertex->SetKeyCandidate(true); + vertex->SetPositionListIndex(column_data.GetPositionListIndex()); + + // check FDs: 0->A + double fd_error = CalculateZeroAryFdError(&column_data); + if (fd_error <= max_fd_error_) { // TODO: max_error + zeroary_fd_rhs.set(column->GetIndex()); + RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); + + vertex->GetRhsCandidates().set(column->GetIndex(), false); + if (fd_error == 0) { + vertex->GetRhsCandidates().reset(); + } + } + + level1->Add(std::move(vertex)); + } + + for (auto& [key_map, vertex] : level1->GetVertices()) { + Vertical column = vertex->GetVertical(); + vertex->GetRhsCandidates() &= + ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs + + // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс + ColumnData const& column_data = + relation_->GetColumnData(column.GetColumnIndices().find_first()); + double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); + if (ucc_error <= max_ucc_error_) { + RegisterUcc(column, ucc_error, schema); + vertex->SetKeyCandidate(false); + if (ucc_error == 0 && max_lhs_ != 0) { + for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); + rhs_index < vertex->GetRhsCandidates().size(); + rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { + if (rhs_index != column.GetColumnIndices().find_first()) { + RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); + } + } + vertex->GetRhsCandidates() &= column.GetColumnIndices(); + // set vertex invalid if we seek for exact dependencies + if (max_fd_error_ == 0 && max_ucc_error_ == 0) { + vertex->SetInvalid(true); + } + } + } + } + levels.push_back(std::move(level1)); + AddProgress(progress_step); + + unsigned int max_arity = + max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; + for (unsigned int arity = 2; arity <= max_arity; arity++) { + // auto start_time = std::chrono::system_clock::now(); + model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); + model::LatticeLevel::GenerateNextLevel(levels); + // std::chrono::duration elapsed_milliseconds = + // std::chrono::duration_cast(std::chrono::system_clock::now() - + // start_time); apriori_millis_ += elapsed_milliseconds.count(); + + model::LatticeLevel* level = levels[arity].get(); + LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity + << "-ary lattice vertices."; + if (level->GetVertices().empty()) { + break; + } + + for (auto& [key_map, xa_vertex] : level->GetVertices()) { + if (xa_vertex->GetIsInvalid()) { + continue; + } + + Vertical xa = xa_vertex->GetVertical(); + // Calculate XA PLI + if (xa_vertex->GetPositionListIndex() == nullptr) { + auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); + auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); + xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); + } + + dynamic_bitset<> xa_indices = xa.GetColumnIndices(); + dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); + + for (const auto& x_vertex : xa_vertex->GetParents()) { + Vertical const& lhs = x_vertex->GetVertical(); + + // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it + // easier?? + // like "a_index = xa_indices - x_indices;" + int a_index = xa_indices.find_first(); + dynamic_bitset<> x_indices = lhs.GetColumnIndices(); + while (a_index >= 0 && x_indices[a_index]) { + a_index = xa_indices.find_next(a_index); + } + if (!a_candidates[a_index]) { + continue; + } + + // Check X -> A + double error = CalculateFdError(x_vertex->GetPositionListIndex(), + xa_vertex->GetPositionListIndex()); + if (error <= max_fd_error_) { + Column const* rhs = schema->GetColumns()[a_index].get(); + + // TODO: register FD to a file or something + RegisterAndCountFd(lhs, rhs, error, schema); + xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); + if (error == 0) { + xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); + } + } + } + } + + if (arity == max_arity) { + break; + } + + // Prune + // cout << "Pruning level: " << level->GetArity() << ". " << level->GetVertices().size() << + // " vertices_" << endl; + std::list key_vertices; + for (auto& [map_key, vertex] : level->GetVertices()) { + Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination + + if (vertex->GetIsKeyCandidate()) { + double ucc_error = + CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); + if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC + // TODO: do smth with UCC + + RegisterUcc(columns, ucc_error, schema); + vertex->SetKeyCandidate(false); + if (ucc_error == 0) { + for (size_t rhs_index = vertex->GetRhsCandidates().find_first(); + rhs_index != boost::dynamic_bitset<>::npos; + rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { + Vertical rhs = + static_cast(*schema->GetColumn((int)rhs_index)); + if (!columns.Contains(rhs)) { + bool is_rhs_candidate = true; + for (const auto& column : columns.GetColumns()) { + Vertical sibling = + columns.Without(static_cast(*column)) + .Union(rhs); + auto sibling_vertex = + level->GetLatticeVertex(sibling.GetColumnIndices()); + if (sibling_vertex == nullptr || + !sibling_vertex->GetConstRhsCandidates() + [rhs.GetColumnIndices().find_first()]) { + is_rhs_candidate = false; + break; + } + // for each outer rhs: if there is a sibling s.t. it doesn't + // have this rhs, there is no FD: vertex->rhs + } + // Found fd: vertex->rhs => register it + if (is_rhs_candidate) { + RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, + schema); + } + } + } + key_vertices.push_back(vertex.get()); + // cout << "--------------------------" << endl << "KeyVert: " << *vertex; + } + } + } + // if we seek for exact FDs then SetInvalid + if (max_fd_error_ == 0 && max_ucc_error_ == 0) { + for (auto key_vertex : key_vertices) { + key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); + key_vertex->SetInvalid(true); + } + } + } + + // TODO: printProfilingData + AddProgress(progress_step); + } + + SetProgress(100); + std::chrono::milliseconds elapsed_milliseconds = + std::chrono::duration_cast(std::chrono::system_clock::now() - + start_time); + apriori_millis_ += elapsed_milliseconds.count(); + + LOG(INFO) << "Time: " << apriori_millis_ << " milliseconds"; + LOG(INFO) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; + LOG(INFO) << "Total intersections: " << model::PositionListIndex::intersection_count_ + << std::endl; + LOG(INFO) << "Total FD count: " << count_of_fd_; + LOG(INFO) << "Total UCC count: " << count_of_ucc_; + LOG(INFO) << "HASH: " << Fletcher16(); + + return apriori_millis_; +} + +} // namespace tane diff --git a/src/core/algorithms/fd/tane/common_tane.h b/src/core/algorithms/fd/tane/common_tane.h new file mode 100644 index 0000000000..9994dde564 --- /dev/null +++ b/src/core/algorithms/fd/tane/common_tane.h @@ -0,0 +1,47 @@ +#pragma once + +#include + +#include "algorithms/fd/pli_based_fd_algorithm.h" +#include "config/error/type.h" +#include "config/max_lhs/type.h" +#include "model/table/position_list_index.h" +#include "model/table/relation_data.h" + +namespace tane { + +class CommonTane : public algos::PliBasedFDAlgorithm { +private: + void RegisterOptions(); + void MakeExecuteOptsAvailable() final; + + void ResetStateFd() final; + unsigned long long ExecuteInternal() final; + +public: + config::ErrorType max_fd_error_; + config::ErrorType max_ucc_error_; + config::MaxLhsType max_lhs_; + + int count_of_fd_ = 0; + int count_of_ucc_ = 0; + long apriori_millis_ = 0; + + CommonTane(); + + virtual double CalculateZeroAryFdError(ColumnData const* rhs) = 0; + virtual double CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) = 0; + static double CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data); + + // static double round(double error) { return ((int)(error * 32768) + 1)/ 32768.0; } + + void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, + RelationalSchema const* schema); + // void RegisterFd(Vertical const* lhs, Column const* rhs, double error, RelationalSchema const* + // schema); + void RegisterUcc(Vertical const& key, double error, RelationalSchema const* schema); +}; + +} // namespace tane diff --git a/src/core/algorithms/fd/tane/tane.cpp b/src/core/algorithms/fd/tane/tane.cpp index 105eef6541..de5187811c 100644 --- a/src/core/algorithms/fd/tane/tane.cpp +++ b/src/core/algorithms/fd/tane/tane.cpp @@ -1,43 +1,7 @@ #include "tane.h" -#include -#include -#include -#include - -#include - -#include "config/error/option.h" -#include "config/max_lhs/option.h" -#include "lattice_level.h" -#include "lattice_vertex.h" -#include "model/table/column_data.h" -#include "model/table/column_layout_relation_data.h" -#include "model/table/relational_schema.h" - namespace algos { -using boost::dynamic_bitset; - -Tane::Tane() : PliBasedFDAlgorithm({kDefaultPhaseName}) { - RegisterOptions(); -} - -void Tane::RegisterOptions() { - RegisterOption(config::ErrorOpt(&max_ucc_error_)); - RegisterOption(config::MaxLhsOpt(&max_lhs_)); -} - -void Tane::MakeExecuteOptsAvailable() { - MakeOptionsAvailable({config::MaxLhsOpt.GetName(), config::ErrorOpt.GetName()}); -} - -void Tane::ResetStateFd() { - count_of_fd_ = 0; - count_of_ucc_ = 0; - apriori_millis_ = 0; -} - double Tane::CalculateZeroAryFdError(ColumnData const* rhs) { return 1 - rhs->GetPositionListIndex()->GetNepAsLong() / static_cast(GetRelation().GetNumTuplePairs()); @@ -49,260 +13,4 @@ double Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, static_cast(GetRelation().GetNumTuplePairs()); } -double Tane::CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data) { - return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); -} - -void Tane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); - PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); - count_of_fd_++; -} - -void Tane::RegisterUcc([[maybe_unused]] Vertical const& key, - [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - /*dynamic_bitset<> key_bitset = key.getColumnIndices(); - LOG(INFO) << "Discovered UCC: "; - for (int i = key_bitset.find_first(); i != -1; i = key_bitset.find_next(i)) { - LOG(INFO) << schema->GetColumn(i)->GetName() << " "; - } - LOG(INFO) << "- error equals " << error << std::endl;*/ - count_of_ucc_++; -} - -unsigned long long Tane::ExecuteInternal() { - max_fd_error_ = max_ucc_error_; - RelationalSchema const* schema = relation_->GetSchema(); - - LOG(INFO) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " - << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) - << relation_->GetMaximumNip() << "."; - - for (auto& column : schema->GetColumns()) { - double avg_partners = relation_->GetColumnData(column->GetIndex()) - .GetPositionListIndex() - ->GetNepAsLong() * - 2.0 / relation_->GetNumRows(); - LOG(INFO) << "* " << column->ToString() << ": every tuple has " << std::setw(2) - << avg_partners << " partners on average."; - } - auto start_time = std::chrono::system_clock::now(); - double progress_step = 100.0 / (schema->GetNumColumns() + 1); - - // Initialize level 0 - std::vector> levels; - auto level0 = std::make_unique(0); - // TODO: через указатели кажется надо переделать - level0->Add(std::make_unique(*(schema->empty_vertical_))); - model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); - levels.push_back(std::move(level0)); - AddProgress(progress_step); - - // Initialize level1 - dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); - auto level1 = std::make_unique(1); - for (auto& column : schema->GetColumns()) { - // for each attribute set vertex - ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); - auto vertex = std::make_unique(static_cast(*column)); - - vertex->AddRhsCandidates(schema->GetColumns()); - vertex->GetParents().push_back(empty_vertex); - vertex->SetKeyCandidate(true); - vertex->SetPositionListIndex(column_data.GetPositionListIndex()); - - //check FDs: 0->A - double fd_error = CalculateZeroAryFdError(&column_data); - if (fd_error <= max_fd_error_) { // TODO: max_error - zeroary_fd_rhs.set(column->GetIndex()); - RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); - - vertex->GetRhsCandidates().set(column->GetIndex(), false); - if (fd_error == 0) { - vertex->GetRhsCandidates().reset(); - } - } - - level1->Add(std::move(vertex)); - } - - for (auto& [key_map, vertex] : level1->GetVertices()) { - Vertical column = vertex->GetVertical(); - vertex->GetRhsCandidates() &= - ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs - - // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс - ColumnData const& column_data = - relation_->GetColumnData(column.GetColumnIndices().find_first()); - double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { - RegisterUcc(column, ucc_error, schema); - vertex->SetKeyCandidate(false); - if (ucc_error == 0 && max_lhs_ != 0) { - for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index < vertex->GetRhsCandidates().size(); - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - if (rhs_index != column.GetColumnIndices().find_first()) { - RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); - } - } - vertex->GetRhsCandidates() &= column.GetColumnIndices(); - //set vertex invalid if we seek for exact dependencies - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - vertex->SetInvalid(true); - } - } - } - } - levels.push_back(std::move(level1)); - AddProgress(progress_step); - - unsigned int max_arity = max_lhs_ == std::numeric_limits::max() - ? max_lhs_ - : max_lhs_ + 1; - for (unsigned int arity = 2; arity <= max_arity; arity++) { - // auto start_time = std::chrono::system_clock::now(); - model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); - model::LatticeLevel::GenerateNextLevel(levels); - // std::chrono::duration elapsed_milliseconds = - // std::chrono::duration_cast(std::chrono::system_clock::now() - - // start_time); apriori_millis_ += elapsed_milliseconds.count(); - - model::LatticeLevel* level = levels[arity].get(); - LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity - << "-ary lattice vertices."; - if (level->GetVertices().empty()) { - break; - } - - for (auto& [key_map, xa_vertex] : level->GetVertices()) { - if (xa_vertex->GetIsInvalid()) { - continue; - } - - Vertical xa = xa_vertex->GetVertical(); - //Calculate XA PLI - if (xa_vertex->GetPositionListIndex() == nullptr) { - auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); - auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); - xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); - } - - dynamic_bitset<> xa_indices = xa.GetColumnIndices(); - dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); - - for (const auto& x_vertex : xa_vertex->GetParents()) { - Vertical const& lhs = x_vertex->GetVertical(); - - // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it easier?? - //like "a_index = xa_indices - x_indices;" - int a_index = xa_indices.find_first(); - dynamic_bitset<> x_indices = lhs.GetColumnIndices(); - while (a_index >= 0 && x_indices[a_index]) { - a_index = xa_indices.find_next(a_index); - } - if (!a_candidates[a_index]) { - continue; - } - - // Check X -> A - double error = CalculateFdError(x_vertex->GetPositionListIndex(), - xa_vertex->GetPositionListIndex()); - if (error <= max_fd_error_) { - Column const* rhs = schema->GetColumns()[a_index].get(); - - //TODO: register FD to a file or something - RegisterAndCountFd(lhs, rhs, error, schema); - xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); - if (error == 0) { - xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); - } - } - } - } - - if (arity == max_arity) { - break; - } - - //Prune - //cout << "Pruning level: " << level->GetArity() << ". " << level->GetVertices().size() << " vertices_" << endl; - std::list key_vertices; - for (auto& [map_key, vertex] : level->GetVertices()) { - Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination - - if (vertex->GetIsKeyCandidate()) { - double ucc_error = - CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { //If a key candidate is an approx UCC - //TODO: do smth with UCC - - RegisterUcc(columns, ucc_error, schema); - vertex->SetKeyCandidate(false); - if (ucc_error == 0) { - for (size_t rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index != boost::dynamic_bitset<>::npos; - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - Vertical rhs = - static_cast(*schema->GetColumn((int)rhs_index)); - if (!columns.Contains(rhs)) { - bool is_rhs_candidate = true; - for (const auto& column : columns.GetColumns()) { - Vertical sibling = - columns.Without(static_cast(*column)).Union(rhs); - auto sibling_vertex = - level->GetLatticeVertex(sibling.GetColumnIndices()); - if (sibling_vertex == nullptr || - !sibling_vertex->GetConstRhsCandidates() - [rhs.GetColumnIndices().find_first()]) { - is_rhs_candidate = false; - break; - } - // for each outer rhs: if there is a sibling s.t. it doesn't have this rhs, there is no FD: vertex->rhs - } - //Found fd: vertex->rhs => register it - if (is_rhs_candidate) { - RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, - schema); - } - } - } - key_vertices.push_back(vertex.get()); - //cout << "--------------------------" << endl << "KeyVert: " << *vertex; - } - } - } - //if we seek for exact FDs then SetInvalid - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - for (auto key_vertex : key_vertices) { - key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); - key_vertex->SetInvalid(true); - } - } - } - - //TODO: printProfilingData - AddProgress(progress_step); - } - - SetProgress(100); - std::chrono::milliseconds elapsed_milliseconds = - std::chrono::duration_cast(std::chrono::system_clock::now() - - start_time); - apriori_millis_ += elapsed_milliseconds.count(); - - LOG(INFO) << "Time: " << apriori_millis_ << " milliseconds"; - LOG(INFO) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; - LOG(INFO) << "Total intersections: " << model::PositionListIndex::intersection_count_ - << std::endl; - LOG(INFO) << "Total FD count: " << count_of_fd_; - LOG(INFO) << "Total UCC count: " << count_of_ucc_; - LOG(INFO) << "HASH: " << Fletcher16(); - - return apriori_millis_; -} - } // namespace algos diff --git a/src/core/algorithms/fd/tane/tane.h b/src/core/algorithms/fd/tane/tane.h index e5e44144c5..afd64cfb9d 100644 --- a/src/core/algorithms/fd/tane/tane.h +++ b/src/core/algorithms/fd/tane/tane.h @@ -3,6 +3,7 @@ #include #include "algorithms/fd/pli_based_fd_algorithm.h" +#include "common_tane.h" #include "config/error/type.h" #include "config/max_lhs/type.h" #include "model/table/position_list_index.h" @@ -10,38 +11,11 @@ namespace algos { -class Tane : public PliBasedFDAlgorithm { -private: - void RegisterOptions(); - void MakeExecuteOptsAvailable() final; - - void ResetStateFd() final; - unsigned long long ExecuteInternal() final; - +class Tane : public tane::CommonTane { public: - config::ErrorType max_fd_error_; - config::ErrorType max_ucc_error_; - config::MaxLhsType max_lhs_; - - int count_of_fd_ = 0; - int count_of_ucc_ = 0; - long apriori_millis_ = 0; - - Tane(); - - virtual double CalculateZeroAryFdError(ColumnData const* rhs); - virtual double CalculateFdError(model::PositionListIndex const* lhs_pli, - model::PositionListIndex const* joint_pli); - static double CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data); - - // static double round(double error) { return ((int)(error * 32768) + 1)/ 32768.0; } - - void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, - RelationalSchema const* schema); - // void RegisterFd(Vertical const* lhs, Column const* rhs, double error, RelationalSchema const* - // schema); - void RegisterUcc(Vertical const& key, double error, RelationalSchema const* schema); + double CalculateZeroAryFdError(ColumnData const* rhs) override; + double CalculateFdError(model::PositionListIndex const* x_pli, + model::PositionListIndex const* xa_pli) override; }; } // namespace algos diff --git a/src/core/model/table/position_list_index.cpp b/src/core/model/table/position_list_index.cpp index ef98ed7672..c3662c65b0 100644 --- a/src/core/model/table/position_list_index.cpp +++ b/src/core/model/table/position_list_index.cpp @@ -55,7 +55,7 @@ std::unique_ptr PositionListIndex::CreateFor(std::vector null_cluster = index[ColumnLayoutRelationData::kNullValueId]; } if (!is_null_eq_null) { - index.erase(ColumnLayoutRelationData::kNullValueId); // move? + index.erase(ColumnLayoutRelationData::kNullValueId); // move? } double key_gap = 0.0; @@ -107,9 +107,9 @@ std::unordered_map PositionListIndex::CreateFrequencies( return frequencies; } -//unsigned long long PositionListIndex::CalculateNep(unsigned int numElements) { +// unsigned long long PositionListIndex::CalculateNep(unsigned int numElements) { // -//} +// } void PositionListIndex::SortClusters(std::deque>& clusters) { sort(clusters.begin(), clusters.end(), @@ -132,10 +132,8 @@ std::shared_ptr> PositionListIndex::CalculateAndGetProbin return std::make_shared>(probing_table); } - - -// интересное место: true --> надо передать поле без копирования, false --> надо сконструировать и выдать наружу -// кажется, самым лёгким способом будет навернуть shared_ptr +// интересное место: true --> надо передать поле без копирования, false --> надо сконструировать и +// выдать наружу кажется, самым лёгким способом будет навернуть shared_ptr /*std::shared_ptr> PositionListIndex::getProbingTable(bool isCaching) { auto probingTable = GetProbingTable(); if (isCaching) { @@ -145,19 +143,20 @@ std::shared_ptr> PositionListIndex::CalculateAndGetProbin return probingTable; }*/ -//std::deque> const & PositionListIndex::getIndex() const { -// return index; -//} +// std::deque> const & PositionListIndex::getIndex() const { +// return index; +// } -std::unique_ptr PositionListIndex::Intersect(PositionListIndex const* that) const { +std::unique_ptr PositionListIndex::Intersect( + PositionListIndex const* that) const { assert(this->relation_size_ == that->relation_size_); - return this->size_ > that->size_ ? - that->Probe(this->CalculateAndGetProbingTable()) : - this->Probe(that->CalculateAndGetProbingTable()); + return this->size_ > that->size_ ? that->Probe(this->CalculateAndGetProbingTable()) + : this->Probe(that->CalculateAndGetProbingTable()); } -//TODO: null_cluster_ некорректен -std::unique_ptr PositionListIndex::Probe(std::shared_ptr> probing_table) const { +// TODO: null_cluster_ некорректен +std::unique_ptr PositionListIndex::Probe( + std::shared_ptr> probing_table) const { assert(this->relation_size_ == probing_table->size()); std::deque> new_index; unsigned int new_size = 0; @@ -172,14 +171,13 @@ std::unique_ptr PositionListIndex::Probe(std::shared_ptr(position) >= probing_table->size()) { LOG(DEBUG) << "position: " + std::to_string(position) + - ", size: " + std::to_string(probing_table->size()); + ", size: " + std::to_string(probing_table->size()); for (size_t i = 0; i < positions.size(); ++i) { LOG(DEBUG) << "Position " + std::to_string(positions[i]); } } int probing_table_value_id = (*probing_table)[position]; - if (probing_table_value_id == singleton_value_id_) - continue; + if (probing_table_value_id == singleton_value_id_) continue; intersection_count_++; partial_index[probing_table_value_id].push_back(position); } @@ -198,16 +196,18 @@ std::unique_ptr PositionListIndex::Probe(std::shared_ptr const& a, std::vector const& b) { + return (*probing_table)[a[0]] < (*probing_table)[b[0]]; + }); return std::make_unique(std::move(new_index), std::move(null_cluster), new_size, new_entropy, new_nep, relation_size_, relation_size_); } -//TODO: null_cluster_ не поддерживается +// TODO: null_cluster_ не поддерживается std::unique_ptr PositionListIndex::ProbeAll( - Vertical const& probing_columns, ColumnLayoutRelationData& relation_data) { + Vertical const& probing_columns, ColumnLayoutRelationData& relation_data) { assert(this->relation_size_ == relation_data.GetNumRows()); std::deque> new_index; unsigned int new_size = 0;