Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement OD verifier algorithm #420

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions src/core/algorithms/algorithm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ using AlgorithmTypes =
std::tuple<Depminer, DFD, FastFDs, FDep, FdMine, Pyro, Tane, PFDTane, FUN, hyfd::HyFD, Aid,
Apriori, metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC,
PyroUCC, cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, Mind,
Fastod, GfdValidation, EGfdValidation, NaiveGfdValidation, order::Order,
dd::Split>;
Fastod, order::Order, od_verifier::ODVerifier, GfdValidation, EGfdValidation,
NaiveGfdValidation, dd::Split>;

// clang-format off
/* Enumeration of all supported non-pipeline algorithms. If you implement a new
Expand Down Expand Up @@ -66,15 +66,16 @@ BETTER_ENUM(AlgorithmType, char,

/* Order dependency mining algorithms */
fastod,
order,

/* Canonical OD verifier algorithm */
od_verifier,

/* Graph functional dependency mining algorithms */
gfdvalid,
egfdvalid,
naivegfdvalid,

/* Order dependency mining algorithms */
order,

/* Differential dependencies mining algorithm */
split
)
Expand Down
1 change: 1 addition & 0 deletions src/core/algorithms/algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "algorithms/ind/mining_algorithms.h"
#include "algorithms/metric/verification_algorithms.h"
#include "algorithms/od/mining_algorithms.h"
#include "algorithms/od/verification_algorithms.h"
#include "algorithms/statistics/algorithms.h"
#include "algorithms/ucc/mining_algorithms.h"
#include "algorithms/ucc/verification_algorithms.h"
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace algos::fastod {

class ComplexStrippedPartition {
private:
protected:
std::shared_ptr<std::vector<size_t>> sp_indexes_;
std::shared_ptr<std::vector<size_t>> sp_begins_;
std::shared_ptr<std::vector<DataFrame::Range>> rb_indexes_;
Expand Down
75 changes: 75 additions & 0 deletions src/core/algorithms/od/od_verifier/od_verifier.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include "od_verifier.h"

#include "ascending_od/option.h"
#include "config/equal_nulls/option.h"
#include "config/indices/od_context.h"
#include "config/indices/option.h"
#include "config/tabular_data/input_table/option.h"

namespace algos::od_verifier {

// Constructs the verifier: registers every option it understands and then makes
// the input-table and equal-nulls options available.
ODVerifier::ODVerifier() : Algorithm({}) {
    using config::kEqualNullsOpt;
    using config::kTableOpt;

    RegisterOptions();
    MakeOptionsAvailable({kTableOpt.GetName(), kEqualNullsOpt.GetName()});
}

// Registers all options of the verifier: input table, equal-nulls flag, LHS and RHS
// indices, OD context indices and the ascending flag.
//
// NOTE(review): `lhs_indices_` and `rhs_indices_` are locals of this function, yet
// their addresses are passed to the LHS/RHS index options. If the option objects
// keep those pointers to store the user-supplied value later (presumably they do —
// confirm against the option implementation), the pointers dangle as soon as this
// function returns.
// NOTE(review): the two `[0]` reads at the end happen right after the locals are
// default-constructed, before any option value could have been set. If IndicesType
// is a std::vector (TODO confirm), this indexes an empty container.
void ODVerifier::RegisterOptions() {
// Callback returning the number of columns in the schema of relation_; it is handed
// to the LHS/RHS index options below.
auto get_schema_cols = [this]() { return relation_->GetSchema()->GetNumColumns(); };

IndicesType lhs_indices_, rhs_indices_;
RegisterOption(config::kTableOpt(&input_table_));
RegisterOption(config::kEqualNullsOpt(&is_null_equal_null_));
RegisterOption(config::kLhsIndicesOpt(&lhs_indices_, get_schema_cols));
RegisterOption(config::kRhsIndicesOpt(&rhs_indices_, get_schema_cols));
RegisterOption(config::kODContextOpt(&context_indices_));
RegisterOption(config::kAscendingODOpt(&ascending_));
// Only the first element of each index list is kept as the single LHS/RHS column.
lhs_indicex_ = lhs_indices_[0];
rhs_indicex_ = rhs_indices_[0];
}

void ODVerifier::MakeExecuteOptsAvailable() {
MakeOptionsAvailable({config::kLhsIndicesOpt.GetName(), config::kRhsIndicesOpt.GetName(),
config::kODContextOpt.GetName(), config::kAscendingODOpt.GetName()});
}

// Loads the input table into two representations: first as a
// ColumnLayoutRelationData (relation_), then, after resetting the table, as a
// DataFrame (data_).
//
// Throws std::runtime_error if either representation contains no columns.
void ODVerifier::LoadDataInternal() {
relation_ = ColumnLayoutRelationData::CreateFrom(*input_table_, is_null_equal_null_);

if (relation_->GetColumnData().empty()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should maintain consistent style of writing braces.

throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
}
// The table has already been consumed once above; reset it before reading it again.
input_table_->Reset();
data_ = std::make_shared<DataFrame>(DataFrame::FromInputTable(input_table_));
if (data_->GetColumnCount() == 0) {
throw std::runtime_error("Got an empty dataset: OD verifying is meaningless.");
}
}

// Runs the verification in the direction selected by ascending_ and returns the
// elapsed time in milliseconds, measured with std::chrono::system_clock.
unsigned long long ODVerifier::ExecuteInternal() {
auto start_time = std::chrono::system_clock::now();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better to use existing methods. Use util::TimedInvoke (example).

// The direction is a template parameter of VerifyOD, so the runtime flag is
// dispatched to one of the two instantiations here.
if (ascending_) {
VerifyOD<true>();
} else {
VerifyOD<false>();
}
auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
return elapsed_milliseconds.count();
}

// Reports whether the OD holds, i.e. neither swap nor split violations were recorded.
bool ODVerifier::ODHolds() const {
    bool const has_swap_violations = !row_violate_ods_by_swap_.empty();
    bool const has_split_violations = !row_violate_ods_by_split_.empty();
    return !(has_swap_violations || has_split_violations);
}

// Number of recorded rows that violate the OD by split.
size_t ODVerifier::GetNumRowsViolateBySplit() const {
    auto const& split_rows = row_violate_ods_by_split_;
    return split_rows.size();
}

// Number of recorded rows that violate the OD by swap.
size_t ODVerifier::GetNumRowsViolateBySwap() const {
    auto const& swap_rows = row_violate_ods_by_swap_;
    return swap_rows.size();
}

} // namespace algos::od_verifier
101 changes: 101 additions & 0 deletions src/core/algorithms/od/od_verifier/od_verifier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#pragma once

#include "algorithms/algorithm.h"
#include "algorithms/od/fastod/model/canonical_od.h"
#include "config/indices/type.h"
#include "model/table/column_layout_relation_data.h"
#include "partition.h"

namespace algos::od_verifier {

// Algorithm that checks a single order dependency (one LHS column, one RHS column,
// a set of context columns, ascending or descending) against the loaded table and
// records the rows that violate it, separately for swap and split violations.
//
// NOTE(review): this header uses std::shared_ptr, std::vector, std::pair and
// std::sort but includes only project headers, so it relies on them pulling in
// <memory>, <vector>, <utility> and <algorithm> transitively.
class ODVerifier : public Algorithm {
private:
using IndicesType = config::IndicesType;
using IndexType = config::IndexType;
using DataFrame = fastod::DataFrame;
using PartitionCache = fastod::PartitionCache;
using AscCanonicalOD = fastod::AscCanonicalOD;
using DescCanonicalOD = fastod::DescCanonicalOD;
using SimpleCanonicalOD = fastod::SimpleCanonicalOD;
using AttributeSet = fastod::AttributeSet;

// Input data: the table, the NULL-equality flag, the LHS and RHS column of the OD,
// its context columns and its direction.
config::InputTable input_table_;
config::EqNullsType is_null_equal_null_;
IndexType lhs_indicex_;
IndexType rhs_indicex_;
IndicesType context_indices_;
bool ascending_;

// Auxiliary data: the two loaded representations of the table and the cache the
// stripped partitions are requested from.
std::shared_ptr<ColumnLayoutRelationData> relation_;
std::shared_ptr<DataFrame> data_;
PartitionCache partition_cache_;

// Rows that violate the OD, kept sorted after VerifyOD() finishes.
std::vector<int> row_violate_ods_by_swap_;
std::vector<int> row_violate_ods_by_split_;

// Option registration and data loading.
void RegisterOptions();
void MakeExecuteOptsAvailable() override;
void LoadDataInternal() override;

// Runs the algorithm and returns the measured execution time.
unsigned long long ExecuteInternal() override;

// Checks whether the OD is violated and collects the rows where it is violated.
// The swap check uses the partition over the context columns; the split check uses
// the partition over the context columns plus the LHS column.
template <bool Ascending>
void VerifyOD() {
AttributeSet context;

for (auto column : context_indices_) context.Set(column);

fastod::ComplexStrippedPartition stripped_partition_swap(
(partition_cache_.GetStrippedPartition(context, data_)));

// Violating rows are only enumerated when the partition reports that a swap exists.
if (stripped_partition_swap.Swap<Ascending>(lhs_indicex_, rhs_indicex_)) {
ComplaxStrippedPartition part{stripped_partition_swap};
std::vector<std::pair<int, int>> violates(
part.FindViolationsBySwap<Ascending>(lhs_indicex_, rhs_indicex_));

// Stored as position + 1 (presumably to report 1-based row numbers — confirm).
for (auto position_violate : violates)
row_violate_ods_by_swap_.push_back(position_violate.second + 1);
}

// Extend the context with the LHS column before requesting the split partition.
context.Set(lhs_indicex_);
fastod::ComplexStrippedPartition stripped_partition_split(
partition_cache_.GetStrippedPartition(context, data_));

// Violating rows are only enumerated when the partition reports that a split exists.
if (stripped_partition_split.Split(rhs_indicex_)) {
ComplaxStrippedPartition part{stripped_partition_split};
std::vector<std::pair<int, int>> violates(part.FindViolationsBySplit(rhs_indicex_));

// Stored as position + 1, same as for swap violations.
for (auto position_violate : violates)
row_violate_ods_by_split_.push_back(position_violate.second + 1);
}
std::sort(row_violate_ods_by_split_.begin(), row_violate_ods_by_split_.end());
std::sort(row_violate_ods_by_swap_.begin(), row_violate_ods_by_swap_.end());
}

// Clears the violations recorded so far.
void ResetState() override {
row_violate_ods_by_swap_.clear();
row_violate_ods_by_split_.clear();
}

public:
// Default constructor; defined in od_verifier.cpp.
ODVerifier();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Constructors should be mentioned before methods.


// Returns true when no swap and no split violations were recorded.
bool ODHolds() const;

// Returns the number of rows that violate the OD by split
size_t GetNumRowsViolateBySplit() const;

// Returns the number of rows that violate the OD by swap
size_t GetNumRowsViolateBySwap() const;
};

} // namespace algos::od_verifier
58 changes: 58 additions & 0 deletions src/core/algorithms/od/od_verifier/partition.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#include "partition.h"

#include <utility>
#include <vector>

namespace algos::od_verifier {

// Collects split violations for a partition stored as explicit indices
// (sp_indexes_ / sp_begins_).
//
// Equivalence class k occupies positions [sp_begins_[k], sp_begins_[k + 1]) of
// sp_indexes_. Within a class, every entry whose value in column `right` differs
// from the value of the class's first entry is reported.
//
// @param right  column whose value is compared within each class
// @return       (right, sp_indexes_ entry) pairs, one per offending entry, in
//               partition order
std::vector<ComplaxStrippedPartition::ViolationDescription>
ComplaxStrippedPartition::CommonViolationBySplit(model::ColumnIndex right) const {
    std::vector<ViolationDescription> violates;

    // `begin_pointer + 1 < size()` rather than `begin_pointer < size() - 1`: the
    // subtraction is unsigned and wraps around when sp_begins_ is empty, which
    // would send the loop far out of bounds.
    for (size_t begin_pointer = 0; begin_pointer + 1 < sp_begins_->size(); ++begin_pointer) {
        size_t const group_begin = (*sp_begins_)[begin_pointer];
        size_t const group_end = (*sp_begins_)[begin_pointer + 1];

        // The first entry of the class is the reference value.
        int const group_value = data_->GetValue((*sp_indexes_)[group_begin], right);

        for (size_t i = group_begin + 1; i < group_end; ++i) {
            if (data_->GetValue((*sp_indexes_)[i], right) != group_value) {
                violates.emplace_back(right, (*sp_indexes_)[i]);
            }
        }
    }

    return violates;
}

// Collects split violations for a partition stored as index ranges
// (rb_indexes_ / rb_begins_).
//
// Equivalence class k consists of the ranges at positions
// [rb_begins_[k], rb_begins_[k + 1]) of rb_indexes_; each range covers the indices
// first..second inclusive. Every index whose value in column `right` differs from
// the value at the first index of the class's first range is reported.
//
// @param right  column whose value is compared within each class
// @return       (right, index) pairs, one per offending index, in partition order
std::vector<ComplaxStrippedPartition::ViolationDescription>
ComplaxStrippedPartition::RangeBasedViolationBySplit(model::ColumnIndex right) const {
    std::vector<ViolationDescription> violates;

    // `begin_pointer + 1 < size()` rather than `begin_pointer < size() - 1`: the
    // subtraction is unsigned and wraps around when rb_begins_ is empty, which
    // would send the loop far out of bounds.
    for (size_t begin_pointer = 0; begin_pointer + 1 < rb_begins_->size(); ++begin_pointer) {
        size_t const group_begin = (*rb_begins_)[begin_pointer];
        size_t const group_end = (*rb_begins_)[begin_pointer + 1];

        // The first index of the first range is the reference value of the class.
        int const group_value = data_->GetValue((*rb_indexes_)[group_begin].first, right);

        for (size_t i = group_begin; i < group_end; ++i) {
            auto const& range = (*rb_indexes_)[i];

            for (size_t j = range.first; j <= range.second; ++j) {
                if (data_->GetValue(j, right) != group_value) {
                    violates.emplace_back(right, j);
                }
            }
        }
    }

    return violates;
}

// Lists split violations in column `right`, delegating to the implementation that
// matches the storage form of this partition (explicit indices or ranges).
std::vector<ComplaxStrippedPartition::ViolationDescription>
ComplaxStrippedPartition::FindViolationsBySplit(model::ColumnIndex right) const {
    if (is_stripped_partition_) {
        return CommonViolationBySplit(right);
    }
    return RangeBasedViolationBySplit(right);
}

} // namespace algos::od_verifier
92 changes: 92 additions & 0 deletions src/core/algorithms/od/od_verifier/partition.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#pragma once

#include "algorithms/od/fastod/partitions/complex_stripped_partition.h"

namespace algos::od_verifier {

// Partition wrapper built on fastod::ComplexStrippedPartition that lists the
// individual entries responsible for split and swap violations.
//
// A violation is described as a pair (column index, row index).
// The spelling "Complax" is kept as is because other code refers to this name.
class ComplaxStrippedPartition : protected algos::fastod::ComplexStrippedPartition {
private:
    using ViolationDescription = std::pair<int, int>;

    // Split violations for the explicit-index storage form.
    std::vector<ViolationDescription> CommonViolationBySplit(model::ColumnIndex right) const;

    // Split violations for the range-based storage form.
    std::vector<ViolationDescription> RangeBasedViolationBySplit(model::ColumnIndex right) const;

public:
    ComplaxStrippedPartition() : algos::fastod::ComplexStrippedPartition() {}

    // Copies the state of an existing fastod partition.
    ComplaxStrippedPartition(algos::fastod::ComplexStrippedPartition const& daddy)
        : algos::fastod::ComplexStrippedPartition(daddy) {}

    // Lists entries whose value in `right` differs from the first entry of their
    // equivalence class.
    std::vector<ViolationDescription> FindViolationsBySplit(model::ColumnIndex right) const;

    // Lists entries that take part in a swap between columns `left` and `right`.
    //
    // For every equivalence class the rows are ordered by their `left` value
    // (ascending or descending, per the template argument) and split into runs of
    // equal `left` value. A row is reported when its `right` value is smaller than
    // the largest `right` value of the immediately preceding run.
    //
    // @tparam Ascending  order direction applied to the `left` column
    // @param left        column that defines the order of the rows
    // @param right       column expected not to decrease along that order
    // @return            (right, row index) pairs, one per offending row
    template <bool Ascending>
    std::vector<ViolationDescription> FindViolationsBySwap(model::ColumnIndex left,
                                                           model::ColumnIndex right) const {
        // One row of the current class. The row index travels together with the
        // values, so sorting cannot detach a value pair from the row it came from.
        struct Entry {
            int lhs;
            int rhs;
            int row;
        };

        size_t const group_count = is_stripped_partition_ ? sp_begins_->size() : rb_begins_->size();
        std::vector<ViolationDescription> violates;
        std::vector<Entry> entries;  // buffer reused for every class

        auto add_row = [&](size_t row) {
            int const lhs = data_->GetValue(row, left);
            int const rhs = data_->GetValue(row, right);
            entries.push_back(Entry{lhs, rhs, static_cast<int>(row)});
        };

        // `begin_pointer + 1 < group_count` avoids the unsigned wrap-around of
        // `group_count - 1` when there are no groups at all.
        for (size_t begin_pointer = 0; begin_pointer + 1 < group_count; ++begin_pointer) {
            size_t const group_begin = is_stripped_partition_ ? (*sp_begins_)[begin_pointer]
                                                              : (*rb_begins_)[begin_pointer];
            size_t const group_end = is_stripped_partition_ ? (*sp_begins_)[begin_pointer + 1]
                                                            : (*rb_begins_)[begin_pointer + 1];

            entries.clear();
            if (is_stripped_partition_) {
                entries.reserve(group_end - group_begin);
                for (size_t i = group_begin; i < group_end; ++i) {
                    add_row((*sp_indexes_)[i]);
                }
            } else {
                // Ranges are inclusive on both ends; every covered row is recorded
                // together with its index.
                for (size_t i = group_begin; i < group_end; ++i) {
                    auto const& range = (*rb_indexes_)[i];
                    for (size_t j = range.first; j <= range.second; ++j) {
                        add_row(j);
                    }
                }
            }

            if constexpr (Ascending) {
                std::sort(entries.begin(), entries.end(),
                          [](Entry const& a, Entry const& b) { return a.lhs < b.lhs; });
            } else {
                std::sort(entries.begin(), entries.end(),
                          [](Entry const& a, Entry const& b) { return b.lhs < a.lhs; });
            }

            // Positions (within `entries`) of the rows holding the largest `right`
            // value in the previous and in the current run of equal `left` values.
            size_t prev_group_max_index = 0;
            size_t current_group_max_index = 0;
            bool is_first_group = true;

            for (size_t i = 0; i < entries.size(); ++i) {
                Entry const& entry = entries[i];

                if (i != 0 && entries[i - 1].lhs != entry.lhs) {
                    // A new run starts: the finished run becomes the reference.
                    is_first_group = false;
                    prev_group_max_index = current_group_max_index;
                    current_group_max_index = i;
                } else if (entries[current_group_max_index].rhs <= entry.rhs) {
                    current_group_max_index = i;
                }

                if (!is_first_group && entries[prev_group_max_index].rhs > entry.rhs) {
                    violates.emplace_back(right, entry.row);
                }
            }
        }

        return violates;
    }
};

} // namespace algos::od_verifier
3 changes: 3 additions & 0 deletions src/core/algorithms/od/verification_algorithms.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once

#include "algorithms/od/od_verifier/od_verifier.h"
9 changes: 9 additions & 0 deletions src/core/config/ascending_od/option.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include "ascending_od/option.h"

#include "ascending_od/type.h"
#include "config/names_and_descriptions.h"

namespace config {
// Definition of the option object for the OD direction flag. It is built from the
// name names::kAscendingOD, the description descriptions::kDAscendingOD and the
// value `true` (presumably the default value — confirm against CommonOption's
// constructor). `extern` on this initialised const object gives it external linkage.
extern CommonOption<AscendingODFlagType> const kAscendingODOpt{names::kAscendingOD,
descriptions::kDAscendingOD, true};
} // namespace config
Loading