Skip to content

Commit

Permalink
Generalize TANE and PFDTANE algorithms
Browse files Browse the repository at this point in the history
Generalize TANE-based algorithms,
add additional pFD mining tests.
  • Loading branch information
iliya-b committed Sep 14, 2024
1 parent 94027b1 commit 57905a7
Show file tree
Hide file tree
Showing 19 changed files with 207 additions and 457 deletions.
2 changes: 1 addition & 1 deletion src/core/algorithms/fd/mining_algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
#include "algorithms/fd/fdep/fdep.h"
#include "algorithms/fd/fun/fun.h"
#include "algorithms/fd/hyfd/hyfd.h"
#include "algorithms/fd/pfdtane/pfdtane.h"
#include "algorithms/fd/pyro/pyro.h"
#include "algorithms/fd/tane/pfdtane.h"
#include "algorithms/fd/tane/tane.h"
40 changes: 0 additions & 40 deletions src/core/algorithms/fd/pfdtane/pfdtane.h

This file was deleted.

File renamed without changes.
File renamed without changes.
88 changes: 88 additions & 0 deletions src/core/algorithms/fd/tane/pfdtane.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "pfdtane.h"

#include <memory>

#include <easylogging++.h>

#include "config/error/option.h"
#include "config/error_measure/option.h"
#include "enums.h"
#include "fd/pli_based_fd_algorithm.h"
#include "model/table/column_data.h"
#include "model/table/column_layout_relation_data.h"

namespace algos {
using boost::dynamic_bitset;
using Cluster = model::PositionListIndex::Cluster;

/// Builds the algorithm object.
///
/// The optional relation manager is handed to the tane::TaneCommon base, after
/// which the option owned by this class is registered via RegisterOptions().
PFDTane::PFDTane(std::optional<ColumnLayoutRelationDataManager> relation_manager)
    : tane::TaneCommon(relation_manager)
{
    RegisterOptions();
}

/// Error computed from the right-hand side column alone.
///
/// Finds the largest cluster in the position list index of `rhs` (a floor of 1
/// is used when the index holds no clusters) and returns
/// 1 - largest_cluster_size / relation_size.
///
/// @param rhs column whose position list index is inspected.
/// @return the error value described above. The second parameter is unused.
config::ErrorType PFDTane::CalculateZeroAryPFDError(ColumnData const* rhs,
                                                    ColumnLayoutRelationData const*) {
    model::PositionListIndex const* rhs_pli = rhs->GetPositionListIndex();

    std::size_t largest_cluster = 1;
    for (Cluster const& cluster : rhs_pli->GetIndex()) {
        if (cluster.size() > largest_cluster) {
            largest_cluster = cluster.size();
        }
    }

    double const covered_share =
            static_cast<double>(largest_cluster) / rhs_pli->GetRelationSize();
    return 1.0 - covered_share;
}

/// Computes the error of a dependency from the PLI of its left-hand side
/// (`x_pli`) and the PLI of the left-hand side joined with the right-hand side
/// (`xa_pli`).
///
/// For every cluster of `x_pli` the size of the largest `xa_pli` cluster lying
/// inside it is determined (1 when no `xa_pli` cluster falls into it). Rows that
/// belong to no cluster of `x_pli` are counted as one satisfied unit each.
///   - `per_tuple`: probability = (sum of maxima + uncovered rows) / relation size
///   - otherwise:   probability = (sum of maximum / cluster size + uncovered rows)
///                                / (number of `x_pli` clusters + uncovered rows)
///
/// @param x_pli   position list index of the left-hand side.
/// @param xa_pli  position list index of left-hand side plus right-hand side.
/// @param measure selects which of the two formulas above is used.
/// @return 1 - probability. The last parameter is unused.
config::ErrorType PFDTane::CalculatePFDError(model::PositionListIndex const* x_pli,
                                             model::PositionListIndex const* xa_pli,
                                             ErrorMeasure measure,
                                             ColumnLayoutRelationData const*) {
    std::shared_ptr<Cluster const> probing_table = x_pli->CalculateAndGetProbingTable();

    // Value the probing table of x_pli stores for the first row of `cluster`.
    // NOTE(review): assumed to be identical for all rows of one x_pli cluster and
    // distinct between clusters — the original grouping-by-sort relied on the same.
    auto const group_of = [&probing_table](Cluster const& cluster) {
        return probing_table->at(cluster.front());
    };

    // Work on a copy so the caller's index is not reordered. After sorting, all
    // xa clusters that share a group value are adjacent.
    std::deque<Cluster> xa_index = xa_pli->GetIndex();
    std::sort(xa_index.begin(), xa_index.end(),
              [&group_of](Cluster const& a, Cluster const& b) {
                  return group_of(a) < group_of(b);
              });

    double sum = 0.0;
    std::size_t cluster_rows_count = 0;
    std::deque<Cluster> const& x_index = x_pli->GetIndex();

    for (Cluster const& x_cluster : x_index) {
        auto const x_group = group_of(x_cluster);

        // std::sort leaves clusters with equal group values in unspecified
        // relative order, so a lock-step walk over the rows of x_cluster could
        // stall on an already-passed cluster. Locate the group by binary search
        // and scan it completely instead; this is independent of that order.
        auto xa_cluster_it = std::lower_bound(
                xa_index.begin(), xa_index.end(), x_group,
                [&group_of](Cluster const& xa_cluster, auto const& wanted_group) {
                    return group_of(xa_cluster) < wanted_group;
                });

        std::size_t max = 1;
        for (; xa_cluster_it != xa_index.end() && group_of(*xa_cluster_it) == x_group;
             ++xa_cluster_it) {
            max = std::max(max, xa_cluster_it->size());
        }

        sum += measure == +ErrorMeasure::per_tuple
                       ? static_cast<double>(max)
                       : static_cast<double>(max) / x_cluster.size();
        cluster_rows_count += x_cluster.size();
    }

    // Rows of the relation that appear in no cluster of x_pli.
    unsigned int unique_rows =
            static_cast<unsigned int>(x_pli->GetRelationSize() - cluster_rows_count);
    double probability = static_cast<double>(sum + unique_rows) /
                         (measure == +ErrorMeasure::per_tuple ? x_pli->GetRelationSize()
                                                              : x_index.size() + unique_rows);
    return 1.0 - probability;
}

/// Override that delegates to the static CalculateZeroAryPFDError(), supplying
/// the relation held by this object as the second argument.
config::ErrorType PFDTane::CalculateZeroAryFdError(ColumnData const* rhs) {
    auto const* relation = relation_.get();
    return CalculateZeroAryPFDError(rhs, relation);
}

/// Override that delegates to the static CalculatePFDError(), using the
/// configured error measure and the relation held by this object.
config::ErrorType PFDTane::CalculateFdError(model::PositionListIndex const* lhs_pli,
                                            model::PositionListIndex const* joint_pli) {
    auto const* relation = relation_.get();
    return CalculatePFDError(lhs_pli, joint_pli, error_measure_, relation);
}

void PFDTane::RegisterOptions() {
RegisterOption(config::kErrorMeasureOpt(&error_measure_));
}

/// Makes the error option and the error-measure option available, identified
/// by the names their option descriptors report.
void PFDTane::MakeExecuteOptsAvailableFDInternal() {
    auto const& error_option_name = config::kErrorOpt.GetName();
    auto const& measure_option_name = config::kErrorMeasureOpt.GetName();
    MakeOptionsAvailable({error_option_name, measure_option_name});
}

} // namespace algos
32 changes: 32 additions & 0 deletions src/core/algorithms/fd/tane/pfdtane.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#pragma once

#include "algorithms/fd/pli_based_fd_algorithm.h"
#include "config/error/type.h"
#include "enums.h"
#include "model/table/column_data.h"
#include "model/table/column_layout_relation_data.h"
#include "model/table/position_list_index.h"
#include "tane_common.h"

namespace algos {

class PFDTane : public tane::TaneCommon {
private:
ErrorMeasure error_measure_ = +ErrorMeasure::per_tuple;
void RegisterOptions();
void MakeExecuteOptsAvailableFDInternal() final;
config::ErrorType CalculateZeroAryFdError(ColumnData const* rhs) override;
config::ErrorType CalculateFdError(model::PositionListIndex const* lhs_pli,
model::PositionListIndex const* joint_pli) override;

public:
PFDTane(std::optional<ColumnLayoutRelationDataManager> relation_manager = std::nullopt);
static config::ErrorType CalculateZeroAryPFDError(
ColumnData const* rhs, ColumnLayoutRelationData const* relation_data);
static config::ErrorType CalculatePFDError(model::PositionListIndex const* x_pli,
model::PositionListIndex const* xa_pli,
ErrorMeasure error_measure,
ColumnLayoutRelationData const* relation_data);
};

} // namespace algos
Loading

0 comments on commit 57905a7

Please sign in to comment.