Skip to content

Commit

Permalink
Merge branch 'probatane-wip' into pfdtane-nongeneralized
Browse files Browse the repository at this point in the history
  • Loading branch information
iliya-b authored Jan 3, 2024
2 parents 60ea86c + 4213914 commit e03af21
Show file tree
Hide file tree
Showing 6 changed files with 349 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/core/algorithms/fd/pfdtane/pfdtane.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <string>

#include "algorithms/fd/tane/common_tane.h"
#include "algorithms/fd/tane/tane.h"
#include "model/table/position_list_index.h"
#include "model/table/relation_data.h"
Expand Down
299 changes: 299 additions & 0 deletions src/core/algorithms/fd/tane/common_tane.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
#include <chrono>
#include <iomanip>
#include <list>
#include <memory>

#include <easylogging++.h>

#include "config/error/option.h"
#include "config/max_lhs/option.h"
#include "lattice_level.h"
#include "lattice_vertex.h"
#include "model/table/column_data.h"
#include "model/table/column_layout_relation_data.h"
#include "model/table/relational_schema.h"
#include "tane.h"

namespace tane {

using boost::dynamic_bitset;

// Constructs the algorithm with a single execution phase (kDefaultPhaseName) and
// registers the configuration options it understands.
CommonTane::CommonTane() : algos::PliBasedFDAlgorithm({kDefaultPhaseName}) {
    this->RegisterOptions();
}

// Registers the options handled by this algorithm:
//   * the error option, bound to max_ucc_error_;
//   * the maximum LHS option, bound to max_lhs_.
void CommonTane::RegisterOptions() {
    // Error threshold first, then the LHS size limit.
    RegisterOption(config::ErrorOpt(&max_ucc_error_));

    RegisterOption(config::MaxLhsOpt(&max_lhs_));
}

// Makes the execution-time options (maximum LHS and error) available for setting.
void CommonTane::MakeExecuteOptsAvailable() {
    auto const max_lhs_name = config::MaxLhsOpt.GetName();
    auto const error_name = config::ErrorOpt.GetName();
    MakeOptionsAvailable({max_lhs_name, error_name});
}

// Clears the per-run statistics: the accumulated time and the FD / UCC counters.
void CommonTane::ResetStateFd() {
    apriori_millis_ = 0;
    count_of_ucc_ = 0;
    count_of_fd_ = 0;
}

// Computes the UCC error of a column combination as the ratio of the value returned by
// pli->GetNepAsLong() to the number of tuple pairs reported by relation_data.
//
// pli           - position list index of the column combination being checked.
// relation_data - relation supplying the total number of tuple pairs.
double CommonTane::CalculateUccError(model::PositionListIndex const* pli,
                                     ColumnLayoutRelationData const* relation_data) {
    auto const nep = pli->GetNepAsLong();
    auto const tuple_pairs = static_cast<double>(relation_data->GetNumTuplePairs());
    return nep / tuple_pairs;
}

// Registers the functional dependency lhs -> rhs with the base algorithm and bumps the
// counter of discovered FDs.
//
// lhs    - left-hand side of the dependency.
// rhs    - right-hand side column; dereferenced, so it must not be null.
// error  - error of the dependency; currently unused.
// schema - schema the columns belong to; currently unused.
void CommonTane::RegisterAndCountFd(Vertical const& lhs, Column const* rhs,
                                    [[maybe_unused]] double error,
                                    [[maybe_unused]] RelationalSchema const* schema) {
    // The former local copy of lhs.GetColumnIndices() was never read, so it only cost a
    // bitset copy per registered FD; it has been dropped.
    PliBasedFDAlgorithm::RegisterFd(lhs, *rhs);
    ++count_of_fd_;
}

// Records that a unique column combination was discovered. At the moment this only
// increments the UCC counter; all three parameters are unused.
//
// key    - the column combination forming the UCC.
// error  - error of the UCC.
// schema - schema the columns belong to.
void CommonTane::RegisterUcc([[maybe_unused]] Vertical const& key, [[maybe_unused]] double error,
                             [[maybe_unused]] RelationalSchema const* schema) {
    // Disabled logging of the discovered UCC, kept for reference:
    /*dynamic_bitset<> key_bitset = key.getColumnIndices();
    LOG(INFO) << "Discovered UCC: ";
    for (int i = key_bitset.find_first(); i != -1; i = key_bitset.find_next(i)) {
        LOG(INFO) << schema->GetColumn(i)->GetName() << " ";
    }
    LOG(INFO) << "- error equals " << error << std::endl;*/
    ++count_of_ucc_;
}

// Runs the level-wise lattice traversal shared by the TANE-style algorithms.
//
// Outline:
//   1. Copy max_ucc_error_ into max_fd_error_, so one configured threshold bounds both
//      FD and UCC errors.
//   2. Build level 0 (the empty vertical) and level 1 (one vertex per column), checking
//      zero-ary FDs and single-column UCCs.
//   3. For arity = 2 .. max_arity generate the next level, check candidate FDs X -> A via
//      the virtual CalculateFdError, and then prune using discovered (approximate) keys.
//   4. Log statistics.
//
// Returns apriori_millis_, to which the elapsed time of this run (in milliseconds) is added.
unsigned long long CommonTane::ExecuteInternal() {
// A single threshold is used for both kinds of error.
max_fd_error_ = max_ucc_error_;
RelationalSchema const* schema = relation_->GetSchema();

LOG(INFO) << schema->GetName() << " has " << relation_->GetNumColumns() << " columns, "
<< relation_->GetNumRows() << " rows, and a maximum NIP of " << std::setw(2)
<< relation_->GetMaximumNip() << ".";

// Informational only: per-column statistic derived from the column PLI.
for (auto& column : schema->GetColumns()) {
double avg_partners = relation_->GetColumnData(column->GetIndex())
.GetPositionListIndex()
->GetNepAsLong() *
2.0 / relation_->GetNumRows();
LOG(INFO) << "* " << column->ToString() << ": every tuple has " << std::setw(2)
<< avg_partners << " partners on average.";
}
auto start_time = std::chrono::system_clock::now();
// One progress step per column plus one extra; steps are reported after level 0, after
// level 1 and after every further arity that completes its pruning phase.
double progress_step = 100.0 / (schema->GetNumColumns() + 1);

// Initialize level 0
std::vector<std::unique_ptr<model::LatticeLevel>> levels;
auto level0 = std::make_unique<model::LatticeLevel>(0);
// TODO: this probably should be reworked to use pointers
level0->Add(std::make_unique<model::LatticeVertex>(*(schema->empty_vertical_)));
// Level 0 holds exactly one vertex, the empty vertical; it is the parent of every
// level-1 vertex.
model::LatticeVertex const* empty_vertex = level0->GetVertices().begin()->second.get();
levels.push_back(std::move(level0));
AddProgress(progress_step);

// Initialize level1
// Bit i is set when the zero-ary FD {} -> column i holds within max_fd_error_.
dynamic_bitset<> zeroary_fd_rhs(schema->GetNumColumns());
auto level1 = std::make_unique<model::LatticeLevel>(1);
for (auto& column : schema->GetColumns()) {
// for each attribute set vertex
ColumnData const& column_data = relation_->GetColumnData(column->GetIndex());
auto vertex = std::make_unique<model::LatticeVertex>(static_cast<Vertical>(*column));

vertex->AddRhsCandidates(schema->GetColumns());
vertex->GetParents().push_back(empty_vertex);
vertex->SetKeyCandidate(true);
vertex->SetPositionListIndex(column_data.GetPositionListIndex());

// check FDs: 0->A
double fd_error = CalculateZeroAryFdError(&column_data);
if (fd_error <= max_fd_error_) {  // TODO: max_error
zeroary_fd_rhs.set(column->GetIndex());
RegisterAndCountFd(*schema->empty_vertical_, column.get(), fd_error, schema);

// The column itself is no longer an RHS candidate of its own vertex; for an exact
// zero-ary FD all RHS candidates of the vertex are cleared.
vertex->GetRhsCandidates().set(column->GetIndex(), false);
if (fd_error == 0) {
vertex->GetRhsCandidates().reset();
}
}

level1->Add(std::move(vertex));
}

// Second pass over level 1: drop RHS candidates covered by zero-ary FDs and check
// single-column UCCs.
for (auto& [key_map, vertex] : level1->GetVertices()) {
Vertical column = vertex->GetVertical();
// operator~ returns a flipped copy, so this removes the already discovered zero-ary FDs.
vertex->GetRhsCandidates() &=
~zeroary_fd_rhs;

// This is a hack to extract the column index from a vertex that contains only one index.
ColumnData const& column_data =
relation_->GetColumnData(column.GetColumnIndices().find_first());
double ucc_error = CalculateUccError(column_data.GetPositionListIndex(), relation_.get());
if (ucc_error <= max_ucc_error_) {
RegisterUcc(column, ucc_error, schema);
vertex->SetKeyCandidate(false);
// An exact single-column key determines every remaining RHS candidate, unless LHSs
// are disallowed altogether (max_lhs_ == 0).
if (ucc_error == 0 && max_lhs_ != 0) {
// NOTE(review): this loop terminates on `rhs_index < size()`, whereas the analogous
// loop in the pruning phase below compares against npos - presumably equivalent.
for (unsigned long rhs_index = vertex->GetRhsCandidates().find_first();
rhs_index < vertex->GetRhsCandidates().size();
rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) {
if (rhs_index != column.GetColumnIndices().find_first()) {
RegisterAndCountFd(column, schema->GetColumn(rhs_index), 0, schema);
}
}
vertex->GetRhsCandidates() &= column.GetColumnIndices();
// set vertex invalid if we seek for exact dependencies
if (max_fd_error_ == 0 && max_ucc_error_ == 0) {
vertex->SetInvalid(true);
}
}
}
}
levels.push_back(std::move(level1));
AddProgress(progress_step);

// A vertex of arity k yields FDs with an LHS of size k - 1, hence the "+ 1". The maximum
// unsigned value is left as is, which also avoids wrapping around on the increment.
unsigned int max_arity =
max_lhs_ == std::numeric_limits<unsigned int>::max() ? max_lhs_ : max_lhs_ + 1;
for (unsigned int arity = 2; arity <= max_arity; arity++) {
// auto start_time = std::chrono::system_clock::now();
// NOTE(review): presumably releases levels that are no longer needed and appends level
// `arity` to `levels` - confirm against LatticeLevel.
model::LatticeLevel::ClearLevelsBelow(levels, arity - 1);
model::LatticeLevel::GenerateNextLevel(levels);
// std::chrono::duration<double> elapsed_milliseconds =
// std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() -
// start_time); apriori_millis_ += elapsed_milliseconds.count();

model::LatticeLevel* level = levels[arity].get();
LOG(TRACE) << "Checking " << level->GetVertices().size() << " " << arity
<< "-ary lattice vertices.";
// Nothing was generated for this arity: the traversal is finished.
if (level->GetVertices().empty()) {
break;
}

// Phase 1: check the candidate dependencies X -> A for every vertex XA of this level.
for (auto& [key_map, xa_vertex] : level->GetVertices()) {
if (xa_vertex->GetIsInvalid()) {
continue;
}

Vertical xa = xa_vertex->GetVertical();
// Calculate XA PLI
// Only computed when the vertex has no PLI yet; built by intersecting the PLIs of the
// first two parents.
// NOTE(review): assumes the vertex has at least two parents - confirm.
if (xa_vertex->GetPositionListIndex() == nullptr) {
auto parent_pli_1 = xa_vertex->GetParents()[0]->GetPositionListIndex();
auto parent_pli_2 = xa_vertex->GetParents()[1]->GetPositionListIndex();
xa_vertex->AcquirePositionListIndex(parent_pli_1->Intersect(parent_pli_2));
}

dynamic_bitset<> xa_indices = xa.GetColumnIndices();
// Snapshot of the RHS candidates taken before the parent loop; updates made to the
// vertex inside the loop do not affect which parents are tested.
dynamic_bitset<> a_candidates = xa_vertex->GetRhsCandidates();

for (const auto& x_vertex : xa_vertex->GetParents()) {
Vertical const& lhs = x_vertex->GetVertical();

// Find index of A in XA. If a is not a candidate, continue. TODO: possible to do it
// easier??
// like "a_index = xa_indices - x_indices;"
// NOTE(review): a_index is a signed int while find_first/find_next return an unsigned
// position; the `a_index >= 0` test presumably relies on the "not found" value
// converting to a negative int. a_index is then used as an index without a check, so
// this assumes XA always has a column that is absent from X - confirm.
int a_index = xa_indices.find_first();
dynamic_bitset<> x_indices = lhs.GetColumnIndices();
while (a_index >= 0 && x_indices[a_index]) {
a_index = xa_indices.find_next(a_index);
}
if (!a_candidates[a_index]) {
continue;
}

// Check X -> A
double error = CalculateFdError(x_vertex->GetPositionListIndex(),
xa_vertex->GetPositionListIndex());
if (error <= max_fd_error_) {
Column const* rhs = schema->GetColumns()[a_index].get();

// TODO: register FD to a file or something
RegisterAndCountFd(lhs, rhs, error, schema);
// A is no longer an RHS candidate of XA; for an exact FD the candidates are
// additionally restricted to the columns of X.
xa_vertex->GetRhsCandidates().set(rhs->GetIndex(), false);
if (error == 0) {
xa_vertex->GetRhsCandidates() &= lhs.GetColumnIndices();
}
}
}
}

// The last level is not pruned (and reports no progress step of its own).
if (arity == max_arity) {
break;
}

// Prune
// cout << "Pruning level: " << level->GetArity() << ". " << level->GetVertices().size() <<
// " vertices_" << endl;
// Phase 2: detect (approximate) UCCs among the key candidates of this level.
std::list<model::LatticeVertex*> key_vertices;
for (auto& [map_key, vertex] : level->GetVertices()) {
Vertical columns = vertex->GetVertical();  // Originally it's a ColumnCombination

if (vertex->GetIsKeyCandidate()) {
double ucc_error =
CalculateUccError(vertex->GetPositionListIndex(), relation_.get());
if (ucc_error <= max_ucc_error_) {  // If a key candidate is an approx UCC
// TODO: do smth with UCC

RegisterUcc(columns, ucc_error, schema);
vertex->SetKeyCandidate(false);
// For an exact key, register vertex -> rhs for every remaining RHS candidate
// outside the key, provided every sibling still lists rhs as a candidate.
if (ucc_error == 0) {
for (size_t rhs_index = vertex->GetRhsCandidates().find_first();
rhs_index != boost::dynamic_bitset<>::npos;
rhs_index = vertex->GetRhsCandidates().find_next(rhs_index)) {
Vertical rhs =
static_cast<Vertical>(*schema->GetColumn((int)rhs_index));
if (!columns.Contains(rhs)) {
bool is_rhs_candidate = true;
// A sibling is the key with one of its columns replaced by rhs.
for (const auto& column : columns.GetColumns()) {
Vertical sibling =
columns.Without(static_cast<Vertical>(*column))
.Union(rhs);
auto sibling_vertex =
level->GetLatticeVertex(sibling.GetColumnIndices());
if (sibling_vertex == nullptr ||
!sibling_vertex->GetConstRhsCandidates()
[rhs.GetColumnIndices().find_first()]) {
is_rhs_candidate = false;
break;
}
// for each outer rhs: if there is a sibling s.t. it doesn't
// have this rhs, there is no FD: vertex->rhs
}
// Found fd: vertex->rhs => register it
if (is_rhs_candidate) {
RegisterAndCountFd(columns, schema->GetColumn(rhs_index), 0,
schema);
}
}
}
// Only exact keys are collected for invalidation below.
key_vertices.push_back(vertex.get());
// cout << "--------------------------" << endl << "KeyVert: " << *vertex;
}
}
}
// if we seek for exact FDs then SetInvalid
// NOTE(review): this check does not depend on `vertex` but sits inside the per-vertex
// loop, so key vertices collected so far are re-processed on every iteration; the
// operations look idempotent, but it presumably belongs after the loop - confirm.
if (max_fd_error_ == 0 && max_ucc_error_ == 0) {
for (auto key_vertex : key_vertices) {
key_vertex->GetRhsCandidates() &= key_vertex->GetVertical().GetColumnIndices();
key_vertex->SetInvalid(true);
}
}
}

// TODO: printProfilingData
AddProgress(progress_step);
}

SetProgress(100);
// Add the time spent since start_time to the accumulated total.
std::chrono::milliseconds elapsed_milliseconds =
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() -
start_time);
apriori_millis_ += elapsed_milliseconds.count();

LOG(INFO) << "Time: " << apriori_millis_ << " milliseconds";
LOG(INFO) << "Intersection time: " << model::PositionListIndex::micros_ / 1000 << "ms";
LOG(INFO) << "Total intersections: " << model::PositionListIndex::intersection_count_
<< std::endl;
LOG(INFO) << "Total FD count: " << count_of_fd_;
LOG(INFO) << "Total UCC count: " << count_of_ucc_;
LOG(INFO) << "HASH: " << Fletcher16();

return apriori_millis_;
}

} // namespace tane
47 changes: 47 additions & 0 deletions src/core/algorithms/fd/tane/common_tane.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#pragma once

#include <string>

#include "algorithms/fd/pli_based_fd_algorithm.h"
#include "config/error/type.h"
#include "config/max_lhs/type.h"
#include "model/table/position_list_index.h"
#include "model/table/relation_data.h"

namespace tane {

// Common base of the TANE-style FD discovery algorithms. It implements the lattice
// traversal (ExecuteInternal) once and leaves the error measures to derived classes
// through the pure virtual CalculateZeroAryFdError and CalculateFdError.
class CommonTane : public algos::PliBasedFDAlgorithm {
private:
// Registers the error and maximum-LHS options; called from the constructor.
void RegisterOptions();
// Makes the maximum-LHS and error options available.
void MakeExecuteOptsAvailable() final;

// Resets the FD / UCC counters and the accumulated time to zero.
void ResetStateFd() final;
// Runs the lattice traversal and returns apriori_millis_.
unsigned long long ExecuteInternal() final;

public:
// Threshold for FD errors; overwritten with max_ucc_error_ at the start of ExecuteInternal.
config::ErrorType max_fd_error_;
// Threshold for UCC errors; bound to the error option.
config::ErrorType max_ucc_error_;
// Bound to the maximum-LHS option.
config::MaxLhsType max_lhs_;

// Number of FDs registered through RegisterAndCountFd.
int count_of_fd_ = 0;
// Number of UCCs registered through RegisterUcc.
int count_of_ucc_ = 0;
// Accumulated execution time in milliseconds; also the return value of ExecuteInternal.
long apriori_millis_ = 0;

CommonTane();

// Error of the zero-ary dependency {} -> rhs; supplied by the derived algorithm.
virtual double CalculateZeroAryFdError(ColumnData const* rhs) = 0;
// Error of the dependency X -> A given the PLI of X (lhs_pli) and the PLI of XA
// (joint_pli); supplied by the derived algorithm.
virtual double CalculateFdError(model::PositionListIndex const* lhs_pli,
model::PositionListIndex const* joint_pli) = 0;
// Returns pli->GetNepAsLong() divided by the number of tuple pairs in relation_data.
static double CalculateUccError(model::PositionListIndex const* pli,
ColumnLayoutRelationData const* relation_data);

// static double round(double error) { return ((int)(error * 32768) + 1)/ 32768.0; }

// Registers lhs -> rhs with the base algorithm and increments count_of_fd_;
// error and schema are currently unused.
void RegisterAndCountFd(Vertical const& lhs, Column const* rhs, double error,
RelationalSchema const* schema);
// void RegisterFd(Vertical const* lhs, Column const* rhs, double error, RelationalSchema const*
// schema);
// Increments count_of_ucc_; all parameters are currently unused.
void RegisterUcc(Vertical const& key, double error, RelationalSchema const* schema);
};

} // namespace tane
1 change: 0 additions & 1 deletion src/core/algorithms/fd/tane/tane.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,5 +308,4 @@ unsigned long long Tane::ExecuteInternal() {

return apriori_millis_;
}

} // namespace algos
1 change: 1 addition & 0 deletions src/core/algorithms/fd/tane/tane.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <string>

#include "algorithms/fd/pli_based_fd_algorithm.h"
#include "common_tane.h"
#include "config/error/type.h"
#include "config/max_lhs/type.h"
#include "model/table/position_list_index.h"
Expand Down
1 change: 1 addition & 0 deletions src/core/model/table/position_list_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ std::unique_ptr<PositionListIndex> PositionListIndex::IntersectPFD(
assert(this->relation_size_ == that->relation_size_);
return this->size_ > that->size_ ? that->ProbePFD(this->CalculateAndGetProbingTable())
: this->ProbePFD(that->CalculateAndGetProbingTable());

}

// TODO: null_cluster_ is incorrect
Expand Down

0 comments on commit e03af21

Please sign in to comment.