Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Spider algorithm #304

Merged
merged 9 commits into from
Feb 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/core/algorithms/algorithm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace algos {
using AlgorithmTypes =
std::tuple<Depminer, DFD, FastFDs, FDep, Fd_mine, Pyro, Tane, FUN, hyfd::HyFD, Aid, Apriori,
metric::MetricVerifier, DataStats, fd_verifier::FDVerifier, HyUCC, PyroUCC,
cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, GfdValidation,
cfd::FDFirstAlgorithm, ACAlgorithm, UCCVerifier, Faida, Spider, GfdValidation,
EGfdValidation, NaiveGfdValidation>;

// clang-format off
Expand Down Expand Up @@ -59,6 +59,7 @@ BETTER_ENUM(AlgorithmType, char,

/* Inclusion dependency mining algorithms */
faida,
spider,

/* Graph functional dependency mining algorithms */
gfdvalid,
Expand Down
1 change: 0 additions & 1 deletion src/core/algorithms/ind/ind_algorithm.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#pragma once

#include <memory>
#include <string>
#include <string_view>
#include <vector>

Expand Down
1 change: 1 addition & 0 deletions src/core/algorithms/ind/mining_algorithms.h
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
#include "algorithms/ind/faida/faida.h"
#include "algorithms/ind/spider/spider.h"
38 changes: 38 additions & 0 deletions src/core/algorithms/ind/spider/attribute.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/** \file
* \brief Spider attribute
*
* Attribute class methods definition
*/
#include "attribute.h"

namespace algos::spider {

/// Collect the ids of the attributes this attribute is approximately included in.
///
/// A candidate `ref` is accepted when `1 - occurrences_[ref] / occurrences_[id_]`
/// does not exceed `max_error`; the attribute itself is never reported.
///
/// \param max_error inclusive upper bound on the error of a reported candidate
/// \return ids of the accepted attributes, in ascending order
std::vector<AttributeIndex> AINDAttribute::GetRefIds(config::ErrorType max_error) const {
    // Converted once so that the division below is carried out in floating point.
    double const own_occurrences = static_cast<double>(occurrences_[id_]);
    std::vector<AttributeIndex> accepted;
    size_t const candidate_count = occurrences_.size();
    for (size_t candidate = 0; candidate < candidate_count; ++candidate) {
        if (candidate != id_) {
            config::ErrorType const error = 1 - occurrences_[candidate] / own_occurrences;
            if (error <= max_error) {
                accepted.push_back(candidate);
            }
        }
    }
    return accepted;
}

/// Build a bitset of `attr_count` bits in which every bit except `attr_id` is set.
boost::dynamic_bitset<> INDAttribute::GetBitset(AttributeIndex attr_id, AttributeIndex attr_count) {
    boost::dynamic_bitset<> all_but_self(attr_count);
    all_but_self.set();           // turn every bit on ...
    all_but_self.reset(attr_id);  // ... then clear the attribute's own bit
    return all_but_self;
}

/// Restrict the referenced candidates of this attribute to those present in `bitset`.
///
/// Every candidate that is dropped by the restriction is notified, through
/// `attrs`, that this attribute is no longer one of its dependents.
void INDAttribute::IntersectRefs(boost::dynamic_bitset<> const& bitset,
                                 std::vector<INDAttribute>& attrs) {
    // Candidates currently referenced but absent from `bitset`.
    boost::dynamic_bitset<> const dropped = refs_ - bitset;
    for (auto dropped_id : util::BitsetToIndices<AttributeIndex>(dropped)) {
        attrs[dropped_id].RemoveDependent(id_);
    }
    refs_ &= bitset;
}

} // namespace algos::spider
141 changes: 141 additions & 0 deletions src/core/algorithms/ind/spider/attribute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/** \file
* \brief Spider attribute
*
* Attribute class definition
*/
#pragma once

#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include <boost/dynamic_bitset.hpp>

#include "config/error/type.h"
#include "model/table/column_combination.h"
#include "model/table/column_domain_iterator.h"
#include "model/table/column_index.h"
#include "util/bitset_utils.h"

namespace algos::spider {

using AttributeIndex = model::ColumnIndex;

namespace details {
/// Shared base of the Spider attribute types: pairs an attribute id with an
/// iterator over the values of the corresponding column domain.
class Attribute {
public:
    using Iterator = model::ColumnDomainIterator;

protected:
    AttributeIndex id_;         /* unique identifier of this attribute */
    AttributeIndex attr_count_; /* attribute count supplied at construction */
    Iterator it_;               /* iterator over the attribute's domain */

public:
    Attribute(AttributeIndex attr_id, AttributeIndex attr_count, model::ColumnDomain const& domain)
        : id_(attr_id), attr_count_(attr_count), it_(domain) {}

    /// get the unique id of the attribute
    AttributeIndex GetId() const noexcept {
        return id_;
    }

    /// check whether the domain iterator has no further value to move to
    virtual bool HasFinished() const noexcept {
        bool const more_values = it_.HasNext();
        return !more_values;
    }

    /// get the value the domain iterator currently points at
    std::string const& GetCurrentValue() const noexcept {
        return it_.GetValue();
    }

    /// advance the domain iterator to the next value
    void MoveToNext() {
        it_.MoveToNext();
    }

    /// order attributes by their current value; ties are broken by the id
    bool operator>(Attribute const& rhs) const {
        int const order = GetCurrentValue().compare(rhs.GetCurrentValue());
        if (order != 0) {
            return order > 0;
        }
        return GetId() > rhs.GetId();
    }

    /// describe the attribute as a single-column combination of its table
    model::ColumnCombination ToCC() const {
        model::ColumnDomain const& column_domain = it_.GetDomain();
        return {column_domain.GetTableId(), std::vector{column_domain.GetColumnId()}};
    }
};
} // namespace details

/// attribute for AIND
class AINDAttribute final : public details::Attribute {
std::vector<unsigned int> occurrences_;

public:
template <typename... Args>
explicit AINDAttribute(Args&&... args)
: details::Attribute(std::forward<Args>(args)...), occurrences_(attr_count_) {}

void IntersectRefs(std::vector<AttributeIndex> const& ids) {
for (auto ref_id : ids) {
occurrences_[ref_id]++;
}
}

/// get referenced attribute indices
std::vector<AttributeIndex> GetRefIds(config::ErrorType max_error) const;
};

/// attribute for IND
class INDAttribute final : public details::Attribute {
    boost::dynamic_bitset<> refs_; /* referenced attributes indices */
    boost::dynamic_bitset<> deps_; /* dependent attributes indices */

    static boost::dynamic_bitset<> GetBitset(AttributeIndex attr_id, AttributeIndex attr_count);

public:
    ///
    /// \brief forward the arguments to details::Attribute and set up the candidates
    ///
    /// `refs_` is initialised from GetBitset() and `deps_` starts as a copy of it.\n
    /// The template only participates for two or more arguments. An
    /// unconstrained `Args&&...` constructor is a better match than the implicit
    /// copy constructor when the source is a non-const lvalue; such a "copy"
    /// would copy only the base part and reset both bitsets to their initial state.
    ///
    template <typename... Args, typename = std::enable_if_t<(sizeof...(Args) > 1)>>
    explicit INDAttribute(Args&&... args)
        : details::Attribute(std::forward<Args>(args)...),
          refs_(GetBitset(id_, attr_count_)),
          deps_(refs_) {}

    ///
    /// \brief intersect referenced attributes indices with provided indices
    ///
    /// Iterate through the referenced attributes indices and remove those
    /// that do not exist in the provided indices `bitset`.\n
    /// Additionally, it updates dependent attributes indices.
    ///
    /// \param bitset  indices bitset to intersect with
    /// \param attrs   attribute objects to update
    ///
    void IntersectRefs(boost::dynamic_bitset<> const& bitset, std::vector<INDAttribute>& attrs);

    /// get referenced attribute indices
    std::vector<AttributeIndex> GetRefIds() const {
        return util::BitsetToIndices<AttributeIndex>(refs_);
    }

    /// remove dependent attribute index
    void RemoveDependent(AttributeIndex id) {
        deps_.set(id, false);
    }

    ///
    /// \brief check whether the attribute has been processed
    ///
    /// processing is completed if all values have been processed
    /// or there are no more dependent and referenced candidates
    ///
    bool HasFinished() const noexcept final {
        return !it_.HasNext() || (refs_.none() && deps_.none());
    }

    /// get mutable access to the referenced attributes indices
    boost::dynamic_bitset<>& GetRefsBitset() noexcept {
        return refs_;
    }
};

} // namespace algos::spider
143 changes: 143 additions & 0 deletions src/core/algorithms/ind/spider/spider.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/** \file
* \brief Spider algorithm
*
* Spider algorithm class methods definition
*/
#include "spider.h"

#include <functional>
#include <memory>
#include <queue>
#include <string>
#include <type_traits>
#include <utility>

#include "attribute.h"
#include "config/equal_nulls/option.h"
#include "config/error/option.h"
#include "config/mem_limit/option.h"
#include "config/names_and_descriptions.h"
#include "config/option_using.h"
#include "config/thread_number/option.h"
#include "util/timed_invoke.h"

namespace algos {

using AttributeIndex = spider::AttributeIndex;

/// Construct the algorithm: register every option Spider reads and make the
/// load-stage options available.
Spider::Spider() : INDAlgorithm({}) {
// NOTE(review): macro presumably comes from config/option_using.h and brings
// option-related names into scope — confirm
DESBORDANTE_OPTION_USING;

// Each option is handed the address of the member it is registered with.
RegisterOption(config::EqualNullsOpt(&is_null_equal_null_));
RegisterOption(config::ThreadNumberOpt(&threads_num_));
RegisterOption(config::MemLimitMBOpt(&mem_limit_mb_));
RegisterOption(config::ErrorOpt(&max_ind_error_));
MakeLoadOptsAvailable();
}

/// Make the equal-nulls, thread-number and memory-limit options available;
/// these are the options read by LoadDataInternal().
void Spider::MakeLoadOptsAvailable() {
MakeOptionsAvailable({config::EqualNullsOpt.GetName(), config::ThreadNumberOpt.GetName(),
config::MemLimitMBOpt.GetName()});
}

/// Make the error option available; ExecuteInternal() uses its value to choose
/// between exact and approximate mining.
void Spider::MakeExecuteOptsAvailable() {
MakeOptionsAvailable({config::ErrorOpt.GetName()});
}

void Spider::LoadDataInternal() {
auto const create_domains = [&] {
domains_ = model::ColumnDomain::CreateFrom(input_tables_, mem_limit_mb_, threads_num_);
};
timings_.load = util::TimedInvoke(create_domains);
}

namespace {
/// Create one attribute object per column domain; the attribute id is the
/// position of the domain in `domains`.
template <typename Attribute>
std::vector<Attribute> InitAttributes(std::vector<model::ColumnDomain> const& domains) {
    AttributeIndex const total = domains.size();
    std::vector<Attribute> built;
    built.reserve(total);
    AttributeIndex id = 0;
    while (id != total) {
        built.emplace_back(id, total, domains[id]);
        ++id;
    }
    return built;
}

/// Run the merge phase over all column domains and return the processed attributes.
///
/// Attributes are kept in a min-heap ordered by their current value (ties broken
/// by id). Each round pops the group of attributes that share the smallest
/// value, lets every group member update its candidates via IntersectRefs, and
/// then advances the members that are not finished and puts them back.
///
/// \param domains             column domains, one per attribute
/// \param is_null_equal_null  when false, an empty value is never grouped with
///                            the value of another attribute
template <typename Attribute>
std::vector<Attribute> GetProcessedAttributes(std::vector<model::ColumnDomain> const& domains,
                                              config::EqNullsType is_null_equal_null) {
    using AttributeRef = std::reference_wrapper<Attribute>;
    using MinHeap =
            std::priority_queue<AttributeRef, std::vector<AttributeRef>, std::greater<Attribute>>;

    std::vector attrs = InitAttributes<Attribute>(domains);
    MinHeap heap(attrs.begin(), attrs.end());
    boost::dynamic_bitset<> group(attrs.size());

    while (!heap.empty()) {
        AttributeRef current = heap.top();
        // Stays valid for the whole round: no attribute is advanced before the
        // group has been collected.
        std::string const& group_value = current.get().GetCurrentValue();
        bool const keep_nulls_apart = group_value.empty() && !is_null_equal_null;

        // Collect every attribute whose current value equals `group_value`.
        for (;;) {
            heap.pop();
            group.set(current.get().GetId());
            if (heap.empty()) break;
            current = heap.top();
            if (keep_nulls_apart) break;
            if (current.get().GetCurrentValue() != group_value) break;
        }

        auto members = util::BitsetToIndices<AttributeIndex>(group);
        for (auto member_id : members) {
            if constexpr (std::is_same_v<Attribute, spider::INDAttribute>) {
                attrs[member_id].IntersectRefs(group, attrs);
            } else {
                attrs[member_id].IntersectRefs(members);
            }
        }
        for (auto member_id : members) {
            Attribute& member = attrs[member_id];
            if (member.HasFinished()) continue;
            member.MoveToNext();
            heap.emplace(member);
        }
        group.reset();
    }
    return attrs;
}
}; // namespace

/// Convenience overload: move both column combinations into shared pointers and
/// pass them on to the pointer-taking RegisterIND overload.
void Spider::RegisterIND(model::ColumnCombination lhs, model::ColumnCombination rhs) {
    auto lhs_ptr = std::make_shared<model::ColumnCombination>(std::move(lhs));
    auto rhs_ptr = std::make_shared<model::ColumnCombination>(std::move(rhs));
    RegisterIND(std::move(lhs_ptr), std::move(rhs_ptr));
}

void Spider::MineINDs() {
using spider::INDAttribute;
std::vector const attrs = GetProcessedAttributes<INDAttribute>(domains_, is_null_equal_null_);
for (auto const& dep : attrs) {
for (AttributeIndex ref_id : dep.GetRefIds()) {
RegisterIND(dep.ToCC(), attrs[ref_id].ToCC());
}
}
}

void Spider::MineAINDs() {
using spider::AINDAttribute;
std::vector const attrs = GetProcessedAttributes<AINDAttribute>(domains_, is_null_equal_null_);
for (auto const& dep : attrs) {
for (AttributeIndex ref_id : dep.GetRefIds(max_ind_error_)) {
RegisterIND(dep.ToCC(), attrs[ref_id].ToCC());
}
}
}

unsigned long long Spider::ExecuteInternal() {
auto const mining_func = (max_ind_error_ == 0) ? &Spider::MineINDs : &Spider::MineAINDs;
timings_.compute = util::TimedInvoke(mining_func, this);
timings_.total = timings_.load + timings_.compute;
return timings_.total;
}

/// Clear the timings produced by an execution; the load timing is left untouched.
void Spider::ResetINDAlgorithmState() {
    timings_.total = 0;
    timings_.compute = 0;
}

} // namespace algos
Loading
Loading