diff --git a/examples/basic/README.md b/examples/basic/README.md index db700411a5..848412fe0f 100644 --- a/examples/basic/README.md +++ b/examples/basic/README.md @@ -26,4 +26,4 @@ These scenarios showcase a single pattern by discussing its definition and provi + [verifying_nd](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_nd) — a scenario showing how to verify numerical dependencies. + [verifying_mfd.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_mfd.py) — a scenario showing how to verify metric functional dependencies. + [verifying_ucc.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_ucc.py) — a scenario showing how to verify a unique column combination. - ++ [verifying_dc.py](https://github.com/Desbordante/desbordante-core/tree/main/examples/basic/verifying_dc.py) — a scenario showing how to verify a denial constraint. diff --git a/examples/basic/verifying_dc.py b/examples/basic/verifying_dc.py new file mode 100644 index 0000000000..467a6a5e49 --- /dev/null +++ b/examples/basic/verifying_dc.py @@ -0,0 +1,65 @@ +import desbordante as db +import pandas as pd + +# TODO: Add console interaction with user + +# The given denial constraint tells us that if two persons live in the +# same state, the one earning a lower salary has a lower tax rate +DC = "!(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate)" + +TABLE_1 = "examples/datasets/taxes.csv" +TABLE_2 = "examples/datasets/taxes_2.csv" + +print("""This is an example of how to use Denial Constraint (DC) verification for checking hypotheses on data. +DC verification is perfomed by the Rapidash algorithm: https://arxiv.org/abs/2309.12436 +DC φ is a conjunction of predicates of the following form: +∀s, t ∈ R, s ≠ t: ¬(p_1 ∧ . . . ∧ p_m) + +DCs involve comparisons between pairs of rows within a dataset. +A typical DC example, derived from a Functional Dependency such as A -> B, +is expressed as: "∀s, t ∈ R, s ≠ t, ¬(t.A == s.A and t.B ≠ s.B). +This denotes that for any pair of rows in the relation, it should not be the case +that while the values in column A are equal, the values in column B are unequal. +""") + +print("Consider the following dataset:\n") +data = pd.read_csv(TABLE_1, header=[0]) +print(data, end="\n\n") +print("""And the following Denial Constraint: !(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate). +It tells us that for all people in the same state the person with a higher salary has a higher tax rate. +Then we run the algorithm in order to see if the constraint holds. +""") + +# Creating a verifier and loading data +verifier = db.dc_verification.algorithms.Default() +verifier.load_data(table=(TABLE_1, ',', True)) + +# Algorithm execution +verifier.execute(denial_constraint=DC) + +# Obtaining the result +result: bool = verifier.dc_holds() + +print(f"DC {DC} holds: {result}\n") + +print("""Now we will modify the initial dataset to make things a little bit more interesting. +Consider the previous table but with an additional record for Texas: +""") + +data = pd.read_csv(TABLE_2, header=[0]) +print(data, end="\n\n") + +verifier = db.dc_verification.algorithms.Default() +verifier.load_data(table=(TABLE_2, ',', True)) +verifier.execute(denial_constraint=DC) + +result: bool = verifier.dc_holds() + +print("DC " + DC + " holds: " + str(result) + "\n") + +print("""Now we can see that the same DC we examined on the previous dataset doesn't hold on the new one. +The thing is that for the last record (Texas, 5000, 0.05) there are people in Texas with a lower salary +but higher tax rate. Pairs of records like this that contradict a DC are called violations. +In this case the following pairs are violations: +(6, 9), (7, 9), (8, 9), where each number is an index of a record. +""") diff --git a/examples/datasets/taxes.csv b/examples/datasets/taxes.csv new file mode 100644 index 0000000000..9cb6af5b97 --- /dev/null +++ b/examples/datasets/taxes.csv @@ -0,0 +1,10 @@ +State,Salary,FedTaxRate +NewYork,3000,0.2 +NewYork,4000,0.25 +NewYork,5000,0.3 +Wisconsin,5000,0.15 +Wisconsin,6000,0.2 +Wisconsin,4000,0.1 +Texas,1000,0.15 +Texas,2000,0.25 +Texas,3000,0.3 diff --git a/examples/datasets/taxes_2.csv b/examples/datasets/taxes_2.csv new file mode 100644 index 0000000000..9f80920fca --- /dev/null +++ b/examples/datasets/taxes_2.csv @@ -0,0 +1,11 @@ +State,Salary,FedTaxRate +NewYork,3000,0.2 +NewYork,4000,0.25 +NewYork,5000,0.3 +Wisconsin,5000,0.15 +Wisconsin,6000,0.2 +Wisconsin,4000,0.1 +Texas,1000,0.15 +Texas,2000,0.25 +Texas,3000,0.3 +Texas,5000,0.05 diff --git a/src/python_bindings/bindings.cpp b/src/python_bindings/bindings.cpp index 36528f132c..c7894bf32f 100644 --- a/src/python_bindings/bindings.cpp +++ b/src/python_bindings/bindings.cpp @@ -8,6 +8,7 @@ #include "bind_main_classes.h" #include "cfd/bind_cfd.h" #include "data/bind_data_types.h" +#include "dc/bind_dc_verification.h" #include "dd/bind_split.h" #include "dynamic/bind_dynamic_fd_verification.h" #include "fd/bind_fd.h" @@ -58,7 +59,8 @@ PYBIND11_MODULE(desbordante, module, pybind11::mod_gil_not_used()) { BindDynamicFdVerification, BindNdVerification, BindSFD, - BindMd}) { + BindMd, + BindDCVerification}) { bind_func(module); } } diff --git a/src/python_bindings/dc/bind_dc_verification.cpp b/src/python_bindings/dc/bind_dc_verification.cpp new file mode 100644 index 0000000000..36abe2718a --- /dev/null +++ b/src/python_bindings/dc/bind_dc_verification.cpp @@ -0,0 +1,17 @@ +#include "dc/bind_dc_verification.h" + +#include "algorithms/dc/verifier/dc_verifier.h" +#include "py_util/bind_primitive.h" + +namespace python_bindings { + +namespace py = pybind11; + +void BindDCVerification(py::module_& main_module) { + auto dc_verification_module = main_module.def_submodule("dc_verification"); + + BindPrimitiveNoBase(dc_verification_module, "DCVerification") + .def("dc_holds", &algos::DCVerifier::DCHolds); +} + +} // namespace python_bindings diff --git a/src/python_bindings/dc/bind_dc_verification.h b/src/python_bindings/dc/bind_dc_verification.h new file mode 100644 index 0000000000..203f648f0c --- /dev/null +++ b/src/python_bindings/dc/bind_dc_verification.h @@ -0,0 +1,7 @@ +#pragma once + +#include + +namespace python_bindings { +void BindDCVerification(pybind11::module_& main_module); +} // namespace python_bindings diff --git a/src/python_bindings/py_util/get_py_type.cpp b/src/python_bindings/py_util/get_py_type.cpp index fc98706b7e..9b5d9c05a3 100644 --- a/src/python_bindings/py_util/get_py_type.cpp +++ b/src/python_bindings/py_util/get_py_type.cpp @@ -98,6 +98,7 @@ py::tuple GetPyType(std::type_index type_index) { PyTypePair, PyTypePair, kPyList, kPyStr>, PyTypePair, kPySet, kPyInt>, + PyTypePair, }; return type_map.at(type_index)(); } diff --git a/src/python_bindings/py_util/py_to_any.cpp b/src/python_bindings/py_util/py_to_any.cpp index 7285031878..a104afc854 100644 --- a/src/python_bindings/py_util/py_to_any.cpp +++ b/src/python_bindings/py_util/py_to_any.cpp @@ -134,6 +134,7 @@ std::unordered_map const kConverters{ kNormalConvPair, kNormalConvPair>, kNormalConvPair>, + kNormalConvPair, }; } // namespace diff --git a/src/tests/all_csv_configs.cpp b/src/tests/all_csv_configs.cpp index f031d19258..dd74bc41fa 100644 --- a/src/tests/all_csv_configs.cpp +++ b/src/tests/all_csv_configs.cpp @@ -123,4 +123,6 @@ CSVConfig const kLineItem = CreateCsvConfig("LineItem.csv", '|', true); CSVConfig const kAnimalsBeverages = CreateCsvConfig("animals_beverages.csv", ',', true); CSVConfig const kTestDC = CreateCsvConfig("TestDC.csv", ',', true); CSVConfig const kTestDC1 = CreateCsvConfig("TestDC1.csv", ',', true); +CSVConfig const kTestDC2 = CreateCsvConfig("TestDC2.csv", ',', true); +CSVConfig const kTestDC3 = CreateCsvConfig("TestDC3.csv", ',', true); } // namespace tests diff --git a/src/tests/all_csv_configs.h b/src/tests/all_csv_configs.h index 8b9791ebf2..c80e40b768 100644 --- a/src/tests/all_csv_configs.h +++ b/src/tests/all_csv_configs.h @@ -103,4 +103,6 @@ extern CSVConfig const kLineItem; extern CSVConfig const kAnimalsBeverages; extern CSVConfig const kTestDC; extern CSVConfig const kTestDC1; +extern CSVConfig const kTestDC2; +extern CSVConfig const kTestDC3; } // namespace tests diff --git a/test_input_data/TestDC.csv b/test_input_data/TestDC.csv index bef7c71131..27692ee205 100644 --- a/test_input_data/TestDC.csv +++ b/test_input_data/TestDC.csv @@ -5,4 +5,4 @@ Col0,Col1,Col2,Col3,Col4,Col5,Col6,Col7 0,1,2,cum deo,7,64.811,2.19,aa 3,4,19,ridiculous,10,15.9,-43.01,bb 1,1,14,crucio,12,17.693,69.012,crm -3,1,38,lumos,31,-0.19,-1.12,bb \ No newline at end of file +3,1,38,lumos,31,-0.19,-1.12,bb diff --git a/test_input_data/TestDC1.csv b/test_input_data/TestDC1.csv index 13677dc0f9..9a343eccdf 100644 --- a/test_input_data/TestDC1.csv +++ b/test_input_data/TestDC1.csv @@ -8,4 +8,4 @@ Wisconsin,4000,0.1 Texas,1000,0.15 Texas,2000,0.25 Texas,3000,0.3 -Texas,3000,0.31 \ No newline at end of file +Texas,3000,0.31