Skip to content

Commit

Permalink
Add DC verifier python bindings and basic example
Browse files Browse the repository at this point in the history
  • Loading branch information
xJoskiy committed Nov 28, 2024
1 parent 56e3aed commit 20e6645
Show file tree
Hide file tree
Showing 13 changed files with 122 additions and 4 deletions.
2 changes: 1 addition & 1 deletion examples/basic/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ These scenarios showcase a single pattern by discussing its definition and provi
+ [verifying_nd](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_nd) — a scenario showing how to verify numerical dependencies.
+ [verifying_mfd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_mfd.py) — a scenario showing how to verify metric functional dependencies.
+ [verifying_ucc.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_ucc.py) — a scenario showing how to verify a unique column combination.

+ [verifying_dc.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_dc.py) — a scenario showing how to verify a denial constraint.
65 changes: 65 additions & 0 deletions examples/basic/verifying_dc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import desbordante as db
import pandas as pd

# TODO: Add console interaction with user

# The given denial constraint tells us that if two persons live in the
# same state, the one earning a lower salary has a lower tax rate
DC = "!(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate)"

TABLE_1 = "examples/datasets/taxes.csv"
TABLE_2 = "examples/datasets/taxes_2.csv"

print("""This is an example of how to use Denial Constraint (DC) verification for checking hypotheses on data.
DC verification is perfomed by the Rapidash algorithm: https://arxiv.org/abs/2309.12436
DC φ is a conjunction of predicates of the following form:
∀s, t ∈ R, s ≠ t: ¬(p_1 ∧ . . . ∧ p_m)
DCs involve comparisons between pairs of rows within a dataset.
A typical DC example, derived from a Functional Dependency such as A -> B,
is expressed as: "∀s, t ∈ R, s ≠ t, ¬(t.A == s.A and t.B ≠ s.B).
This denotes that for any pair of rows in the relation, it should not be the case
that while the values in column A are equal, the values in column B are unequal.
""")

print("Consider the following dataset:\n")
data = pd.read_csv(TABLE_1, header=[0])
print(data, end="\n\n")
print("""And the following Denial Constraint: !(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate).
It tells us that for all people in the same state the person with a higher salary has a higher tax rate.
Then we run the algorithm in order to see if the constraint holds.
""")

# Creating a verifier and loading data
verifier = db.dc_verification.algorithms.Default()
verifier.load_data(table=(TABLE_1, ',', True))

# Algorithm execution
verifier.execute(denial_constraint=DC)

# Obtaining the result
result: bool = verifier.dc_holds()

print(f"DC {DC} holds: {result}\n")

print("""Now we will modify the initial dataset to make things a little bit more interesting.
Consider the previous table but with an additional record for Texas:
""")

data = pd.read_csv(TABLE_2, header=[0])
print(data, end="\n\n")

verifier = db.dc_verification.algorithms.Default()
verifier.load_data(table=(TABLE_2, ',', True))
verifier.execute(denial_constraint=DC)

result: bool = verifier.dc_holds()

print("DC " + DC + " holds: " + str(result) + "\n")

print("""Now we can see that the same DC we examined on the previous dataset doesn't hold on the new one.
The thing is that for the last record (Texas, 5000, 0.05) there are people in Texas with a lower salary
but higher tax rate. Pairs of records like this that contradict a DC are called violations.
In this case the following pairs are violations:
(6, 9), (7, 9), (8, 9), where each number is an index of a record.
""")
10 changes: 10 additions & 0 deletions examples/datasets/taxes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
State,Salary,FedTaxRate
NewYork,3000,0.2
NewYork,4000,0.25
NewYork,5000,0.3
Wisconsin,5000,0.15
Wisconsin,6000,0.2
Wisconsin,4000,0.1
Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
11 changes: 11 additions & 0 deletions examples/datasets/taxes_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
State,Salary,FedTaxRate
NewYork,3000,0.2
NewYork,4000,0.25
NewYork,5000,0.3
Wisconsin,5000,0.15
Wisconsin,6000,0.2
Wisconsin,4000,0.1
Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
Texas,5000,0.05
4 changes: 3 additions & 1 deletion src/python_bindings/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "bind_main_classes.h"
#include "cfd/bind_cfd.h"
#include "data/bind_data_types.h"
#include "dc/bind_dc_verification.h"
#include "dd/bind_split.h"
#include "dynamic/bind_dynamic_fd_verification.h"
#include "fd/bind_fd.h"
Expand Down Expand Up @@ -58,7 +59,8 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) {
BindDynamicFdVerification,
BindNdVerification,
BindSFD,
BindMd}) {
BindMd,
BindDCVerification}) {
bind_func(module);
}
}
Expand Down
17 changes: 17 additions & 0 deletions src/python_bindings/dc/bind_dc_verification.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#include "dc/bind_dc_verification.h"

#include "algorithms/dc/verifier/dc_verifier.h"
#include "py_util/bind_primitive.h"

namespace python_bindings {

namespace py = pybind11;

void BindDCVerification(py::module_& main_module) {
auto dc_verification_module = main_module.def_submodule("dc_verification");

BindPrimitiveNoBase<algos::DCVerifier>(dc_verification_module, "DCVerification")
.def("dc_holds", &algos::DCVerifier::DCHolds);
}

} // namespace python_bindings
7 changes: 7 additions & 0 deletions src/python_bindings/dc/bind_dc_verification.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once

#include <pybind11/pybind11.h>

namespace python_bindings {
void BindDCVerification(pybind11::module_& main_module);
} // namespace python_bindings
1 change: 1 addition & 0 deletions src/python_bindings/py_util/get_py_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ py::tuple GetPyType(std::type_index type_index) {
PyTypePair<std::filesystem::path, kPyStr>,
PyTypePair<std::vector<std::filesystem::path>, kPyList, kPyStr>,
PyTypePair<std::unordered_set<size_t>, kPySet, kPyInt>,
PyTypePair<std::string, kPyStr>,
};
return type_map.at(type_index)();
}
Expand Down
1 change: 1 addition & 0 deletions src/python_bindings/py_util/py_to_any.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ std::unordered_map<std::type_index, ConvFunc> const kConverters{
kNormalConvPair<std::filesystem::path>,
kNormalConvPair<std::vector<std::filesystem::path>>,
kNormalConvPair<std::unordered_set<size_t>>,
kNormalConvPair<std::string>,
};

} // namespace
Expand Down
2 changes: 2 additions & 0 deletions src/tests/all_csv_configs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,6 @@ CSVConfig const kLineItem = CreateCsvConfig("LineItem.csv", '|', true);
CSVConfig const kAnimalsBeverages = CreateCsvConfig("animals_beverages.csv", ',', true);
CSVConfig const kTestDC = CreateCsvConfig("TestDC.csv", ',', true);
CSVConfig const kTestDC1 = CreateCsvConfig("TestDC1.csv", ',', true);
CSVConfig const kTestDC2 = CreateCsvConfig("TestDC2.csv", ',', true);
CSVConfig const kTestDC3 = CreateCsvConfig("TestDC3.csv", ',', true);
} // namespace tests
2 changes: 2 additions & 0 deletions src/tests/all_csv_configs.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,6 @@ extern CSVConfig const kLineItem;
extern CSVConfig const kAnimalsBeverages;
extern CSVConfig const kTestDC;
extern CSVConfig const kTestDC1;
extern CSVConfig const kTestDC2;
extern CSVConfig const kTestDC3;
} // namespace tests
2 changes: 1 addition & 1 deletion test_input_data/TestDC.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7
0,1,2,cum deo,7,64.811,2.19,aa
3,4,19,ridiculous,10,15.9,-43.01,bb
1,1,14,crucio,12,17.693,69.012,crm
3,1,38,lumos,31,-0.19,-1.12,bb
3,1,38,lumos,31,-0.19,-1.12,bb
2 changes: 1 addition & 1 deletion test_input_data/TestDC1.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ Wisconsin,4000,0.1
Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
Texas,3000,0.31
Texas,3000,0.31

0 comments on commit 20e6645

Please sign in to comment.