From 554d08cf654a4bd1ec0b1824cca59bc8e88b4308 Mon Sep 17 00:00:00 2001 From: Ilia Date: Wed, 17 Apr 2024 17:50:46 +0300 Subject: [PATCH] Generalize TANE and PFDTANE algorithms Generalize TANE-based algorithms, add additional pFD mining tests. --- src/core/algorithms/fd/mining_algorithms.h | 1 - src/core/algorithms/fd/pfdtane/pfdtane.cpp | 327 -------------- src/core/algorithms/fd/pfdtane/pfdtane.h | 40 -- .../algorithms/fd/{pfdtane => tane}/enums.h | 0 src/core/algorithms/fd/tane/tane.cpp | 404 ++++++++++-------- src/core/algorithms/fd/tane/tane.h | 78 ++-- src/tests/test_fd_algorithm.cpp | 1 - src/tests/test_pfdtane.cpp | 14 +- 8 files changed, 294 insertions(+), 571 deletions(-) delete mode 100644 src/core/algorithms/fd/pfdtane/pfdtane.cpp delete mode 100644 src/core/algorithms/fd/pfdtane/pfdtane.h rename src/core/algorithms/fd/{pfdtane => tane}/enums.h (100%) diff --git a/src/core/algorithms/fd/mining_algorithms.h b/src/core/algorithms/fd/mining_algorithms.h index 0660e95735..ce2bcedfd7 100644 --- a/src/core/algorithms/fd/mining_algorithms.h +++ b/src/core/algorithms/fd/mining_algorithms.h @@ -8,6 +8,5 @@ #include "algorithms/fd/fdep/fdep.h" #include "algorithms/fd/fun/fun.h" #include "algorithms/fd/hyfd/hyfd.h" -#include "algorithms/fd/pfdtane/pfdtane.h" #include "algorithms/fd/pyro/pyro.h" #include "algorithms/fd/tane/tane.h" diff --git a/src/core/algorithms/fd/pfdtane/pfdtane.cpp b/src/core/algorithms/fd/pfdtane/pfdtane.cpp deleted file mode 100644 index 32e8727bed..0000000000 --- a/src/core/algorithms/fd/pfdtane/pfdtane.cpp +++ /dev/null @@ -1,327 +0,0 @@ -#include "pfdtane.h" - -#include -#include -#include -#include - -#include - -#include "config/error/option.h" -#include "config/error_measure/option.h" -#include "config/max_lhs/option.h" -#include "enums.h" -#include "fd/tane/lattice_level.h" -#include "fd/tane/lattice_vertex.h" -#include "model/table/column_data.h" -#include "model/table/column_layout_relation_data.h" -#include "model/table/relational_schema.h" - -namespace algos { -using boost::dynamic_bitset; -using Cluster = model::PositionListIndex::Cluster; - -config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) { - std::size_t max = 1; - model::PositionListIndex const* x_pli = rhs->GetPositionListIndex(); - for (Cluster const& x_cluster : x_pli->GetIndex()) { - max = std::max(max, x_cluster.size()); - } - return 1.0 - static_cast(max) / x_pli->GetRelationSize(); -} - -config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* x_pli, - model::PositionListIndex const* xa_pli, - ErrorMeasure measure) { - std::deque xa_index = xa_pli->GetIndex(); - std::shared_ptr probing_table = x_pli->CalculateAndGetProbingTable(); - std::sort(xa_index.begin(), xa_index.end(), - [&probing_table](Cluster const& a, Cluster const& b) { - return probing_table->at(a.front()) < probing_table->at(b.front()); - }); - double sum = 0.0; - std::size_t cluster_rows_count = 0; - std::deque const& x_index = x_pli->GetIndex(); - auto xa_cluster_it = xa_index.begin(); - - for (Cluster const& x_cluster : x_index) { - std::size_t max = 1; - for (int x_row : x_cluster) { - if (xa_cluster_it == xa_index.end()) { - break; - } - if (x_row == xa_cluster_it->at(0)) { - max = std::max(max, xa_cluster_it->size()); - xa_cluster_it++; - } - } - sum += measure == +ErrorMeasure::per_tuple ? static_cast(max) - : static_cast(max) / x_cluster.size(); - cluster_rows_count += x_cluster.size(); - } - unsigned int unique_rows = - static_cast(x_pli->GetRelationSize() - cluster_rows_count); - double probability = static_cast(sum + unique_rows) / - (measure == +ErrorMeasure::per_tuple ? x_pli->GetRelationSize() - : x_index.size() + unique_rows); - return 1.0 - probability; -} - -void PFDTane::RegisterOptions() { - RegisterOption(config::kErrorOpt(&max_ucc_error_)); - RegisterOption(config::kErrorMeasureOpt(&error_measure_)); -} - -void PFDTane::MakeExecuteOptsAvailableFDInternal() { - MakeOptionsAvailable({config::kErrorOpt.GetName(), config::kErrorMeasureOpt.GetName()}); -} - -void PFDTane::ResetStateFd() {} - -PFDTane::PFDTane(std::optional relation_manager) - : PliBasedFDAlgorithm({kDefaultPhaseName}, relation_manager) { - RegisterOptions(); -} - -double PFDTane::CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data) { - return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); -} - -void PFDTane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, - [[maybe_unused]] config::ErrorType error, - [[maybe_unused]] RelationalSchema const* schema) { - dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); - PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); -} - -void PFDTane::Prune(model::LatticeLevel* level) { - RelationalSchema const* schema = relation_->GetSchema(); - std::list key_vertices; - for (auto& [map_key, vertex] : level->GetVertices()) { - Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination - - if (vertex->GetIsKeyCandidate()) { - double ucc_error = CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC - // TODO: do smth with UCC - - vertex->SetKeyCandidate(false); - if (ucc_error == 0) { - for (std::size_t rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index != boost::dynamic_bitset<>::npos; - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - Vertical rhs = static_cast(*schema->GetColumn((int)rhs_index)); - if (!columns.Contains(rhs)) { - bool is_rhs_candidate = true; - for (auto const& column : columns.GetColumns()) { - Vertical sibling = - columns.Without(static_cast(*column)).Union(rhs); - auto sibling_vertex = - level->GetLatticeVertex(sibling.GetColumnIndices()); - if (sibling_vertex == nullptr || - !sibling_vertex->GetConstRhsCandidates() - [rhs.GetColumnIndices().find_first()]) { - is_rhs_candidate = false; - break; - } - // for each outer rhs: if there is a sibling s.t. it doesn't - // have this rhs, there is no FD: vertex->rhs - } - // Found fd: vertex->rhs => register it - if (is_rhs_candidate) { - RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, - schema); - } - } - } - key_vertices.push_back(vertex.get()); - // cout << "--------------------------" << endl << "KeyVert: " << *vertex; - } - } - } - // if we seek for exact FDs then SetInvalid - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - for (auto key_vertex : key_vertices) { - key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); - key_vertex->SetInvalid(true); - } - } - } -} - -void PFDTane::ComputeDependencies(model::LatticeLevel* level) { - RelationalSchema const* schema = relation_->GetSchema(); - for (auto& [key_map, xa_vertex] : level->GetVertices()) { - if (xa_vertex->GetIsInvalid()) { - continue; - } - Vertical xa = xa_vertex->GetVertical(); - // Calculate XA PLI - if (xa_vertex->GetPositionListIndex() == nullptr) { - auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); - auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); - xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); - } - - dynamic_bitset<> xa_indices = xa.GetColumnIndices(); - dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); - auto xa_pli = xa_vertex->GetPositionListIndex(); - for (auto const& x_vertex : xa_vertex->GetParents()) { - Vertical const& lhs = x_vertex->GetVertical(); - - // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it - // easier?? - // like "a_index = xa_indices - x_indices;" - int a_index = xa_indices.find_first(); - dynamic_bitset<> x_indices = lhs.GetColumnIndices(); - while (a_index >= 0 && x_indices[a_index]) { - a_index = xa_indices.find_next(a_index); - } - if (!a_candidates[a_index]) { - continue; - } - auto x_pli = x_vertex->GetPositionListIndex(); - - // Check X -> A - config::ErrorType error = CalculateFdError(x_pli, xa_pli, error_measure_); - if (error <= max_fd_error_) { - Column const* rhs = schema->GetColumns()[a_index].get(); - - RegisterAndCountFd(lhs, rhs, error, schema); - xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); - if (error == 0) { - xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); - } - } - } - } -} - -unsigned long long PFDTane::ExecuteInternal() { - long apriori_millis = 0; - max_fd_error_ = max_ucc_error_; - RelationalSchema const* schema = relation_->GetSchema(); - - LOG(DEBUG) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " - << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) - << relation_->GetMaximumNip() << "."; - - for (auto& column : schema->GetColumns()) { - double avg_partners = relation_->GetColumnData(column->GetIndex()) - .GetPositionListIndex() - ->GetNepAsLong() * - 2.0 / relation_->GetNumRows(); - LOG(DEBUG) << "* " << column->ToString() << ": every tuple has " << std::setw(2) - << avg_partners << " partners on average."; - } - auto start_time = std::chrono::system_clock::now(); - double progress_step = 100.0 / (schema->GetNumColumns() + 1); - - // Initialize level 0 - std::vector> levels; - auto level0 = std::make_unique(0); - // TODO: через указатели кажется надо переделать - level0->Add(std::make_unique(*(schema->empty_vertical_))); - model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get(); - levels.push_back(std::move(level0)); - AddProgress(progress_step); - - // Initialize level1 - dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns()); - auto level1 = std::make_unique(1); - for (auto& column : schema->GetColumns()) { - // for each attribute set vertex - ColumnData const& column_data = relation_->GetColumnData(column->GetIndex()); - auto vertex = std::make_unique(static_cast(*column)); - - vertex->AddRhsCandidates(schema->GetColumns()); - vertex->GetParents().push_back(empty_vertex); - vertex->SetKeyCandidate(true); - vertex->SetPositionListIndex(column_data.GetPositionListIndex()); - - // check FDs: 0->A - double fd_error = CalculateZeroAryFdError(&column_data); - if (fd_error <= max_fd_error_) { // TODO: max_error - zeroary_fd_rhs.set(column->GetIndex()); - RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); - - vertex->GetRhsCandidates().set(column->GetIndex(), false); - if (fd_error == 0) { - vertex->GetRhsCandidates().reset(); - } - } - - level1->Add(std::move(vertex)); - } - - for (auto& [key_map, vertex] : level1->GetVertices()) { - Vertical column = vertex->GetVertical(); - vertex->GetRhsCandidates() &= - ~zeroary_fd_rhs; //~ returns flipped copy <- removed already discovered zeroary FDs - - // вот тут костыль, чтобы вытянуть индекс колонки из вершины, в которой только один индекс - ColumnData const& column_data = - relation_->GetColumnData(column.GetColumnIndices().find_first()); - double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { - vertex->SetKeyCandidate(false); - if (ucc_error == 0 && max_lhs_ != 0) { - for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index < vertex->GetRhsCandidates().size(); - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - if (rhs_index != column.GetColumnIndices().find_first()) { - RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema); - } - } - vertex->GetRhsCandidates() &= column.GetColumnIndices(); - // set vertex invalid if we seek for exact dependencies - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - vertex->SetInvalid(true); - } - } - } - } - levels.push_back(std::move(level1)); - AddProgress(progress_step); - - unsigned int max_arity = - max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; - for (unsigned int arity = 2; arity <= max_arity; arity++) { - model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); - model::LatticeLevel::GenerateNextLevel(levels); - - model::LatticeLevel* level = levels[arity].get(); - LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity - << "-ary lattice vertices."; - if (level->GetVertices().empty()) { - break; - } - - ComputeDependencies(level); - - if (arity == max_arity) { - break; - } - - Prune(level); - // TODO: printProfilingData - AddProgress(progress_step); - } - - SetProgress(100); - std::chrono::milliseconds elapsed_milliseconds = - std::chrono::duration_cast(std::chrono::system_clock::now() - - start_time); - apriori_millis += elapsed_milliseconds.count(); - - LOG(DEBUG) << "Time: " << apriori_millis << " milliseconds"; - LOG(DEBUG) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; - LOG(DEBUG) << "Total intersections: " << model::PositionListIndex::intersection_count_ - << std::endl; - LOG(DEBUG) << "Total FD count: " << fd_collection_.Size(); - LOG(DEBUG) << "HASH: " << Fletcher16(); - return apriori_millis; -} - -} // namespace algos diff --git a/src/core/algorithms/fd/pfdtane/pfdtane.h b/src/core/algorithms/fd/pfdtane/pfdtane.h deleted file mode 100644 index b80951639e..0000000000 --- a/src/core/algorithms/fd/pfdtane/pfdtane.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include "algorithms/fd/pli_based_fd_algorithm.h" -#include "algorithms/fd/tane/lattice_level.h" -#include "config/error/type.h" -#include "config/error_measure/type.h" -#include "config/max_lhs/type.h" -#include "enums.h" -#include "model/table/position_list_index.h" -#include "model/table/relation_data.h" - -namespace algos { - -class PFDTane : public PliBasedFDAlgorithm { -private: - config::ErrorType max_fd_error_; - config::ErrorType max_ucc_error_; - ErrorMeasure error_measure_ = +ErrorMeasure::per_tuple; - - void ResetStateFd() final; - void RegisterOptions(); - void MakeExecuteOptsAvailableFDInternal() final; - void Prune(model::LatticeLevel* level); - void ComputeDependencies(model::LatticeLevel* level); - unsigned long long ExecuteInternal() final; - -public: - PFDTane(std::optional relation_manager = std::nullopt); - static double CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data); - - void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, - RelationalSchema const* schema); - static config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs); - static config::ErrorType CalculateFdError(model::PositionListIndex const* x_pli, - model::PositionListIndex const* xa_pli, - ErrorMeasure error_measure); -}; - -} // namespace algos diff --git a/src/core/algorithms/fd/pfdtane/enums.h b/src/core/algorithms/fd/tane/enums.h similarity index 100% rename from src/core/algorithms/fd/pfdtane/enums.h rename to src/core/algorithms/fd/tane/enums.h diff --git a/src/core/algorithms/fd/tane/tane.cpp b/src/core/algorithms/fd/tane/tane.cpp index 0316b1149e..7e407c7ebc 100644 --- a/src/core/algorithms/fd/tane/tane.cpp +++ b/src/core/algorithms/fd/tane/tane.cpp @@ -8,87 +8,153 @@ #include #include "config/error/option.h" -#include "config/max_lhs/option.h" -#include "lattice_level.h" -#include "lattice_vertex.h" +#include "config/error_measure/option.h" +#include "enums.h" +#include "fd/pli_based_fd_algorithm.h" +#include "fd/tane/lattice_level.h" +#include "fd/tane/lattice_vertex.h" #include "model/table/column_data.h" #include "model/table/column_layout_relation_data.h" #include "model/table/relational_schema.h" namespace algos { - using boost::dynamic_bitset; +using Cluster = model::PositionListIndex::Cluster; -Tane::Tane(std::optional relation_manager) - : PliBasedFDAlgorithm({kDefaultPhaseName}, relation_manager) { - RegisterOptions(); -} +void tane::TaneCommon::ResetStateFd() {} -void Tane::RegisterOptions() { - RegisterOption(config::kErrorOpt(&max_ucc_error_)); -} +tane::TaneCommon::TaneCommon(std::optional relation_manager) + : PliBasedFDAlgorithm({kDefaultPhaseName}, relation_manager) {} -void Tane::MakeExecuteOptsAvailableFDInternal() { - MakeOptionsAvailable({config::kErrorOpt.GetName()}); +double tane::TaneCommon::CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data) { + return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); } -void Tane::ResetStateFd() { - count_of_fd_ = 0; - count_of_ucc_ = 0; - apriori_millis_ = 0; +void tane::TaneCommon::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, + [[maybe_unused]] config::ErrorType error, + [[maybe_unused]] RelationalSchema const* schema) { + dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); + PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); } -double Tane::CalculateZeroAryFdError(ColumnData const* rhs, - ColumnLayoutRelationData const* relation_data) { - return 1 - rhs->GetPositionListIndex()->GetNepAsLong() / - static_cast(relation_data->GetNumTuplePairs()); +void tane::TaneCommon::Prune(model::LatticeLevel* level) { + RelationalSchema const* schema = relation_->GetSchema(); + std::list key_vertices; + for (auto& [map_key, vertex] : level->GetVertices()) { + Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination + + if (vertex->GetIsKeyCandidate()) { + double ucc_error = CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); + if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC + + vertex->SetKeyCandidate(false); + if (ucc_error == 0) { + for (std::size_t rhs_index = vertex->GetRhsCandidates().find_first(); + rhs_index != boost::dynamic_bitset<>::npos; + rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { + Vertical rhs = static_cast(*schema->GetColumn((int)rhs_index)); + if (!columns.Contains(rhs)) { + bool is_rhs_candidate = true; + for (auto const& column : columns.GetColumns()) { + Vertical sibling = + columns.Without(static_cast(*column)).Union(rhs); + auto sibling_vertex = + level->GetLatticeVertex(sibling.GetColumnIndices()); + if (sibling_vertex == nullptr || + !sibling_vertex->GetConstRhsCandidates() + [rhs.GetColumnIndices().find_first()]) { + is_rhs_candidate = false; + break; + } + // for each outer rhs: if there is a sibling s.t. it doesn't + // have this rhs, there is no FD: vertex->rhs + } + // Found fd: vertex->rhs => register it + if (is_rhs_candidate) { + RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, + schema); + } + } + } + key_vertices.push_back(vertex.get()); + } + } + } + // if we seek for exact FDs then SetInvalid + if (max_fd_error_ == 0 && max_ucc_error_ == 0) { + for (auto key_vertex : key_vertices) { + key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); + key_vertex->SetInvalid(true); + } + } + } } -double Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, - model::PositionListIndex const* joint_pli, - ColumnLayoutRelationData const* relation_data) { - return (double)(lhs_pli->GetNepAsLong() - joint_pli->GetNepAsLong()) / - static_cast(relation_data->GetNumTuplePairs()); -} +void tane::TaneCommon::ComputeDependencies(model::LatticeLevel* level) { + RelationalSchema const* schema = relation_->GetSchema(); + for (auto& [key_map, xa_vertex] : level->GetVertices()) { + if (xa_vertex->GetIsInvalid()) { + continue; + } + Vertical xa = xa_vertex->GetVertical(); + // Calculate XA PLI + if (xa_vertex->GetPositionListIndex() == nullptr) { + auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); + auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); + xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); + } -double Tane::CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data) { - return pli->GetNepAsLong() / static_cast(relation_data->GetNumTuplePairs()); -} + dynamic_bitset<> xa_indices = xa.GetColumnIndices(); + dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); + auto xa_pli = xa_vertex->GetPositionListIndex(); + for (auto const& x_vertex : xa_vertex->GetParents()) { + Vertical const& lhs = x_vertex->GetVertical(); + + // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it + // easier?? + // like "a_index = xa_indices - x_indices;" + int a_index = xa_indices.find_first(); + dynamic_bitset<> x_indices = lhs.GetColumnIndices(); + while (a_index >= 0 && x_indices[a_index]) { + a_index = xa_indices.find_next(a_index); + } + if (!a_candidates[a_index]) { + continue; + } + auto x_pli = x_vertex->GetPositionListIndex(); -void Tane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs, [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - dynamic_bitset<> lhs_bitset = lhs.GetColumnIndices(); - PliBasedFDAlgorithm::RegisterFd(lhs, *rhs); - count_of_fd_++; -} + // Check X -> A + config::ErrorType error = CalculateFdError(x_pli, xa_pli); + if (error <= max_fd_error_) { + Column const* rhs = schema->GetColumns()[a_index].get(); -void Tane::RegisterUcc([[maybe_unused]] Vertical const& key, [[maybe_unused]] double error, - [[maybe_unused]] RelationalSchema const* schema) { - /*dynamic_bitset<> key_bitset = key.getColumnIndices(); - LOG(INFO) << "Discovered UCC: "; - for (int i = key_bitset.find_first(); i != -1; i = key_bitset.find_next(i)) { - LOG(INFO) << schema->GetColumn(i)->GetName() << " "; + RegisterAndCountFd(lhs, rhs, error, schema); + xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); + if (error == 0) { + xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); + } + } + } } - LOG(INFO) << "- error equals " << error << std::endl;*/ - count_of_ucc_++; } -unsigned long long Tane::ExecuteInternal() { +unsigned long long tane::TaneCommon::ExecuteInternal() { + long apriori_millis = 0; max_fd_error_ = max_ucc_error_; RelationalSchema const* schema = relation_->GetSchema(); - LOG(INFO) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " - << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) - << relation_->GetMaximumNip() << "."; + LOG(DEBUG) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, " + << relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2) + << relation_->GetMaximumNip() << "."; for (auto& column : schema->GetColumns()) { double avg_partners = relation_->GetColumnData(column->GetIndex()) .GetPositionListIndex() ->GetNepAsLong() * 2.0 / relation_->GetNumRows(); - LOG(INFO) << "* " << column->ToString() << ": every tuple has " << std::setw(2) - << avg_partners << " partners on average."; + LOG(DEBUG) << "* " << column->ToString() << ": every tuple has " << std::setw(2) + << avg_partners << " partners on average."; } auto start_time = std::chrono::system_clock::now(); double progress_step = 100.0 / (schema->GetNumColumns() + 1); @@ -116,7 +182,7 @@ unsigned long long Tane::ExecuteInternal() { vertex->SetPositionListIndex(column_data.GetPositionListIndex()); // check FDs: 0->A - double fd_error = CalculateZeroAryFdError(&column_data, relation_.get()); + double fd_error = CalculateZeroAryFdError(&column_data); if (fd_error <= max_fd_error_) { // TODO: max_error zeroary_fd_rhs.set(column->GetIndex()); RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema); @@ -140,7 +206,6 @@ unsigned long long Tane::ExecuteInternal() { relation_->GetColumnData(column.GetColumnIndices().find_first()); double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get()); if (ucc_error <= max_ucc_error_) { - RegisterUcc(column, ucc_error, schema); vertex->SetKeyCandidate(false); if (ucc_error == 0 && max_lhs_ != 0) { for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first(); @@ -164,12 +229,8 @@ unsigned long long Tane::ExecuteInternal() { unsigned int max_arity = max_lhs_ == std::numeric_limits::max() ? max_lhs_ : max_lhs_ + 1; for (unsigned int arity = 2; arity <= max_arity; arity++) { - // auto start_time = std::chrono::system_clock::now(); model::LatticeLevel::ClearLevelsBelow(levels, arity - 1); model::LatticeLevel::GenerateNextLevel(levels); - // std::chrono::duration elapsed_milliseconds = - // std::chrono::duration_cast(std::chrono::system_clock::now() - - // start_time); apriori_millis_ += elapsed_milliseconds.count(); model::LatticeLevel* level = levels[arity].get(); LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity @@ -178,116 +239,13 @@ unsigned long long Tane::ExecuteInternal() { break; } - for (auto& [key_map, xa_vertex] : level->GetVertices()) { - if (xa_vertex->GetIsInvalid()) { - continue; - } - - Vertical xa = xa_vertex->GetVertical(); - // Calculate XA PLI - if (xa_vertex->GetPositionListIndex() == nullptr) { - auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex(); - auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex(); - xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2)); - } - - dynamic_bitset<> xa_indices = xa.GetColumnIndices(); - dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates(); - - for (auto const& x_vertex : xa_vertex->GetParents()) { - Vertical const& lhs = x_vertex->GetVertical(); - - // Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it - // easier?? - // like "a_index = xa_indices - x_indices;" - int a_index = xa_indices.find_first(); - dynamic_bitset<> x_indices = lhs.GetColumnIndices(); - while (a_index >= 0 && x_indices[a_index]) { - a_index = xa_indices.find_next(a_index); - } - if (!a_candidates[a_index]) { - continue; - } - - // Check X -> A - double error = CalculateFdError(x_vertex->GetPositionListIndex(), - xa_vertex->GetPositionListIndex(), relation_.get()); - if (error <= max_fd_error_) { - Column const* rhs = schema->GetColumns()[a_index].get(); - - // TODO: register FD to a file or something - RegisterAndCountFd(lhs, rhs, error, schema); - xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false); - if (error == 0) { - xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices(); - } - } - } - } + ComputeDependencies(level); if (arity == max_arity) { break; } - // Prune - // cout << "Pruning level: " << level->GetArity() << ". " << level->GetVertices().size() << - // " vertices_" << endl; - std::list key_vertices; - for (auto& [map_key, vertex] : level->GetVertices()) { - Vertical columns = vertex->GetVertical(); // Originally it's a ColumnCombination - - if (vertex->GetIsKeyCandidate()) { - double ucc_error = - CalculateUccError(vertex->GetPositionListIndex(), relation_.get()); - if (ucc_error <= max_ucc_error_) { // If a key candidate is an approx UCC - // TODO: do smth with UCC - - RegisterUcc(columns, ucc_error, schema); - vertex->SetKeyCandidate(false); - if (ucc_error == 0) { - for (size_t rhs_index = vertex->GetRhsCandidates().find_first(); - rhs_index != boost::dynamic_bitset<>::npos; - rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) { - Vertical rhs = - static_cast(*schema->GetColumn((int)rhs_index)); - if (!columns.Contains(rhs)) { - bool is_rhs_candidate = true; - for (auto const& column : columns.GetColumns()) { - Vertical sibling = - columns.Without(static_cast(*column)) - .Union(rhs); - auto sibling_vertex = - level->GetLatticeVertex(sibling.GetColumnIndices()); - if (sibling_vertex == nullptr || - !sibling_vertex->GetConstRhsCandidates() - [rhs.GetColumnIndices().find_first()]) { - is_rhs_candidate = false; - break; - } - // for each outer rhs: if there is a sibling s.t. it doesn't - // have this rhs, there is no FD: vertex->rhs - } - // Found fd: vertex->rhs => register it - if (is_rhs_candidate) { - RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0, - schema); - } - } - } - key_vertices.push_back(vertex.get()); - // cout << "--------------------------" << endl << "KeyVert: " << *vertex; - } - } - } - // if we seek for exact FDs then SetInvalid - if (max_fd_error_ == 0 && max_ucc_error_ == 0) { - for (auto key_vertex : key_vertices) { - key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices(); - key_vertex->SetInvalid(true); - } - } - } - + Prune(level); // TODO: printProfilingData AddProgress(progress_step); } @@ -296,17 +254,121 @@ unsigned long long Tane::ExecuteInternal() { std::chrono::milliseconds elapsed_milliseconds = std::chrono::duration_cast(std::chrono::system_clock::now() - start_time); - apriori_millis_ += elapsed_milliseconds.count(); + apriori_millis += elapsed_milliseconds.count(); + + LOG(DEBUG) << "Time: " << apriori_millis << " milliseconds"; + LOG(DEBUG) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; + LOG(DEBUG) << "Total intersections: " << model::PositionListIndex::intersection_count_ + << std::endl; + LOG(DEBUG) << "Total FD count: " << fd_collection_.Size(); + LOG(DEBUG) << "HASH: " << Fletcher16(); + return apriori_millis; +} + +Tane::Tane(std::optional relation_manager) + : tane::TaneCommon(relation_manager) { + RegisterOptions(); +} + +void Tane::RegisterOptions() { + RegisterOption(config::kErrorOpt(&max_ucc_error_)); +} - LOG(INFO) << "Time: " << apriori_millis_ << " milliseconds"; - LOG(INFO) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms"; - LOG(INFO) << "Total intersections: " << model::PositionListIndex::intersection_count_ - << std::endl; - LOG(INFO) << "Total FD count: " << count_of_fd_; - LOG(INFO) << "Total UCC count: " << count_of_ucc_; - LOG(INFO) << "HASH: " << Fletcher16(); +void Tane::MakeExecuteOptsAvailableFDInternal() { + MakeOptionsAvailable({config::kErrorOpt.GetName()}); +} + +config::ErrorType Tane::CalculateZeroAryAFDError(ColumnData const* rhs, + ColumnLayoutRelationData const* relation_data) { + return 1 - rhs->GetPositionListIndex()->GetNepAsLong() / + static_cast(relation_data->GetNumTuplePairs()); +} + +config::ErrorType Tane::CalculateAFDError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli, + ColumnLayoutRelationData const* relation_data) { + return (double)(lhs_pli->GetNepAsLong() - joint_pli->GetNepAsLong()) / + static_cast(relation_data->GetNumTuplePairs()); +} + +config::ErrorType Tane::CalculateZeroAryFdError(ColumnData const* rhs) { + return CalculateZeroAryAFDError(rhs, relation_.get()); +} + +config::ErrorType Tane::CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) { + return CalculateAFDError(lhs_pli, joint_pli, relation_.get()); +} + +PFDTane::PFDTane(std::optional relation_manager) + : tane::TaneCommon(relation_manager) { + RegisterOptions(); +} + +config::ErrorType PFDTane::CalculateZeroAryPFDError(ColumnData const* rhs, + ColumnLayoutRelationData const*) { + std::size_t max = 1; + model::PositionListIndex const* x_pli = rhs->GetPositionListIndex(); + for (Cluster const& x_cluster : x_pli->GetIndex()) { + max = std::max(max, x_cluster.size()); + } + return 1.0 - static_cast(max) / x_pli->GetRelationSize(); +} + +config::ErrorType PFDTane::CalculatePFDError(model::PositionListIndex const* x_pli, + model::PositionListIndex const* xa_pli, + ErrorMeasure measure, + ColumnLayoutRelationData const*) { + std::deque xa_index = xa_pli->GetIndex(); + std::shared_ptr probing_table = x_pli->CalculateAndGetProbingTable(); + std::sort(xa_index.begin(), xa_index.end(), + [&probing_table](Cluster const& a, Cluster const& b) { + return probing_table->at(a.front()) < probing_table->at(b.front()); + }); + double sum = 0.0; + std::size_t cluster_rows_count = 0; + std::deque const& x_index = x_pli->GetIndex(); + auto xa_cluster_it = xa_index.begin(); + + for (Cluster const& x_cluster : x_index) { + std::size_t max = 1; + for (int x_row : x_cluster) { + if (xa_cluster_it == xa_index.end()) { + break; + } + if (x_row == xa_cluster_it->at(0)) { + max = std::max(max, xa_cluster_it->size()); + xa_cluster_it++; + } + } + sum += measure == +ErrorMeasure::per_tuple ? static_cast(max) + : static_cast(max) / x_cluster.size(); + cluster_rows_count += x_cluster.size(); + } + unsigned int unique_rows = + static_cast(x_pli->GetRelationSize() - cluster_rows_count); + double probability = static_cast(sum + unique_rows) / + (measure == +ErrorMeasure::per_tuple ? x_pli->GetRelationSize() + : x_index.size() + unique_rows); + return 1.0 - probability; +} + +config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) { + return CalculateZeroAryPFDError(rhs, relation_.get()); +} + +config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) { + return CalculatePFDError(lhs_pli, joint_pli, error_measure_, relation_.get()); +} + +void PFDTane::RegisterOptions() { + RegisterOption(config::kErrorOpt(&max_ucc_error_)); + RegisterOption(config::kErrorMeasureOpt(&error_measure_)); +} - return apriori_millis_; +void PFDTane::MakeExecuteOptsAvailableFDInternal() { + MakeOptionsAvailable({config::kErrorOpt.GetName(), config::kErrorMeasureOpt.GetName()}); } } // namespace algos diff --git a/src/core/algorithms/fd/tane/tane.h b/src/core/algorithms/fd/tane/tane.h index 4de4cf05c4..b514e7d510 100644 --- a/src/core/algorithms/fd/tane/tane.h +++ b/src/core/algorithms/fd/tane/tane.h @@ -1,47 +1,73 @@ #pragma once -#include - #include "algorithms/fd/pli_based_fd_algorithm.h" +#include "algorithms/fd/tane/lattice_level.h" #include "config/error/type.h" +#include "model/table/column_data.h" +#include "model/table/column_layout_relation_data.h" #include "model/table/position_list_index.h" -#include "model/table/relation_data.h" namespace algos { +namespace tane { -class Tane : public PliBasedFDAlgorithm { -private: - void RegisterOptions(); - void MakeExecuteOptsAvailableFDInternal() final; +class TaneCommon : public PliBasedFDAlgorithm { +protected: + config::ErrorType max_fd_error_; + config::ErrorType max_ucc_error_; +private: void ResetStateFd() final; + void Prune(model::LatticeLevel* level); + void ComputeDependencies(model::LatticeLevel* level); unsigned long long ExecuteInternal() final; + virtual config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) = 0; + virtual config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) = 0; public: - config::ErrorType max_fd_error_; - config::ErrorType max_ucc_error_; + TaneCommon(std::optional relation_manager = std::nullopt); + static double CalculateUccError(model::PositionListIndex const* pli, + ColumnLayoutRelationData const* relation_data); + void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, + RelationalSchema const* schema); +}; - int count_of_fd_ = 0; - int count_of_ucc_ = 0; - long apriori_millis_ = 0; +} // namespace tane - Tane(std::optional relation_manager = std::nullopt); +class Tane : public tane::TaneCommon { +private: + void RegisterOptions(); + void MakeExecuteOptsAvailableFDInternal() override final; + config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override; + config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) override; - static double CalculateZeroAryFdError(ColumnData const* rhs, - ColumnLayoutRelationData const* relation_data); - static double CalculateFdError(model::PositionListIndex const* lhs_pli, - model::PositionListIndex const* joint_pli, - ColumnLayoutRelationData const* relation_data); - static double CalculateUccError(model::PositionListIndex const* pli, - ColumnLayoutRelationData const* relation_data); +public: + Tane(std::optional relation_manager = std::nullopt); + static config::ErrorType CalculateZeroAryAFDError( + ColumnData const* rhs, ColumnLayoutRelationData const* relation_data); + static config::ErrorType CalculateAFDError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli, + ColumnLayoutRelationData const* relation_data); +}; - // static double round(double error) { return ((int)(error * 32768) + 1)/ 32768.0; } +class PFDTane : public tane::TaneCommon { +private: + ErrorMeasure error_measure_ = +ErrorMeasure::per_tuple; + void RegisterOptions(); + void MakeExecuteOptsAvailableFDInternal() final; + config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override; + config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli, + model::PositionListIndex const* joint_pli) override; - void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error, - RelationalSchema const* schema); - // void RegisterFd(Vertical const* lhs, Column const* rhs, double error, RelationalSchema const* - // schema); - void RegisterUcc(Vertical const& key, double error, RelationalSchema const* schema); +public: + PFDTane(std::optional relation_manager = std::nullopt); + static config::ErrorType CalculateZeroAryPFDError( + ColumnData const* rhs, ColumnLayoutRelationData const* relation_data); + static config::ErrorType CalculatePFDError(model::PositionListIndex const* x_pli, + model::PositionListIndex const* xa_pli, + ErrorMeasure error_measure, + ColumnLayoutRelationData const* relation_data); }; } // namespace algos diff --git a/src/tests/test_fd_algorithm.cpp b/src/tests/test_fd_algorithm.cpp index 60abfc143e..4337fd3d72 100644 --- a/src/tests/test_fd_algorithm.cpp +++ b/src/tests/test_fd_algorithm.cpp @@ -9,7 +9,6 @@ #include "algorithms/fd/fdep/fdep.h" #include "algorithms/fd/fun/fun.h" #include "algorithms/fd/hyfd/hyfd.h" -#include "algorithms/fd/pfdtane/pfdtane.h" #include "algorithms/fd/pyro/pyro.h" #include "algorithms/fd/tane/tane.h" #include "model/table/relational_schema.h" diff --git a/src/tests/test_pfdtane.cpp b/src/tests/test_pfdtane.cpp index 2deeb702fb..0850b80f34 100644 --- a/src/tests/test_pfdtane.cpp +++ b/src/tests/test_pfdtane.cpp @@ -3,8 +3,7 @@ #include "algo_factory.h" #include "all_csv_configs.h" #include "config/names.h" -#include "fd/pfdtane/enums.h" -#include "fd/pfdtane/pfdtane.h" +#include "fd/tane/tane.h" #include "model/table/column_layout_relation_data.h" #include "parser/csv_parser/csv_parser.h" @@ -51,11 +50,12 @@ TEST_P(TestPFDTaneValidation, ErrorCalculationTest) { double eps = 0.00001; auto table = std::make_shared(p.csv_config); auto relation = ColumnLayoutRelationData::CreateFrom(*table, true); + for (auto const& [lhs_id, rhs_id, expected_error] : p.fds) { auto const& lhs = relation->GetColumnData(lhs_id).GetPositionListIndex(); auto const& rhs = relation->GetColumnData(rhs_id).GetPositionListIndex(); - config::ErrorType error = - algos::PFDTane::CalculateFdError(lhs, lhs->Intersect(rhs).get(), p.error_measure); + config::ErrorType error = algos::PFDTane::CalculatePFDError( + lhs, lhs->Intersect(rhs).get(), p.error_measure, relation.get()); EXPECT_NEAR(error, expected_error, eps); } } @@ -64,7 +64,11 @@ TEST_P(TestPFDTaneValidation, ErrorCalculationTest) { INSTANTIATE_TEST_SUITE_P( PFDTaneTestMiningSuite, TestPFDTaneMining, ::testing::Values( - PFDTaneMiningParams(44381, 0.3, +algos::ErrorMeasure::per_value, kTestFD) + PFDTaneMiningParams(44381, 0.3, +algos::ErrorMeasure::per_value, kTestFD), + PFDTaneMiningParams(39491, 0.1, +algos::ErrorMeasure::per_value, kIris), + PFDTaneMiningParams(10695, 0.01, +algos::ErrorMeasure::per_value, kIris), + PFDTaneMiningParams(7893, 0.1, +algos::ErrorMeasure::per_value, kNeighbors10k), + PFDTaneMiningParams(41837, 0.01, +algos::ErrorMeasure::per_value, kNeighbors10k) )); INSTANTIATE_TEST_SUITE_P(