Skip to content

Commit

Permalink
Add ACAlgorithm python example
Browse files Browse the repository at this point in the history
  • Loading branch information
Sched71 authored and polyntsov committed Dec 10, 2023
1 parent 0fd3148 commit 898a798
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ add_custom_command(TARGET copy-python-examples
copy ${CMAKE_SOURCE_DIR}/examples/dedupe.py
${CMAKE_SOURCE_DIR}/examples/mine_typos.py
${CMAKE_SOURCE_DIR}/examples/anomaly_detection.py
${CMAKE_SOURCE_DIR}/examples/algebraic_constraints.py
${CMAKE_BINARY_DIR}/examples
COMMAND ${CMAKE_COMMAND} -E
copy ${CMAKE_SOURCE_DIR}/examples/datasets/duplicates.csv
${CMAKE_SOURCE_DIR}/examples/datasets/Workshop.csv
${CMAKE_SOURCE_DIR}/examples/datasets/cargo_data_1.csv
${CMAKE_SOURCE_DIR}/examples/datasets/cargo_data_2.csv
${CMAKE_SOURCE_DIR}/examples/datasets/cargo_data_3.csv
${CMAKE_SOURCE_DIR}/examples/datasets/cargo_march.csv
${CMAKE_BINARY_DIR}/examples/datasets
)
50 changes: 50 additions & 0 deletions examples/algebraic_constraints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import desbordante
import pandas
import operator

# --- Input table -----------------------------------------------------------
# Location of the CSV file and the options handed to pandas.read_csv.
TABLE = 'datasets/cargo_march.csv'
HEADER = 0
SEPARATOR = ','

# --- Algorithm settings ----------------------------------------------------
# Every value below is forwarded as a keyword argument to
# ACAlgorithm.execute further down in this script.
# NOTE(review): the precise meaning of each knob is defined by desbordante;
# confirm against its documentation before tuning.
P_FUZZ = 0.85
FUZZINESS = 0.2
BUMPS_LIMIT = 0
WEIGHT = 0.1
BIN_OPERATION = '-'
AC_SEED = 11
ITERATIONS_LIMIT = 4

# --- Operation lookup ------------------------------------------------------
# Maps the textual operator accepted by the algorithm to a Python callable
# that performs the same arithmetic, plus a human-readable label used when
# reporting the rows that fall outside of the discovered ranges.
OPERATIONS = {
    '+': (operator.add, 'Sum'),
    '-': (operator.sub, 'Difference'),
    '*': (operator.mul, 'Product'),
    '/': (operator.truediv, 'Ratio'),
}

# Resolve the callable and label that match the configured BIN_OPERATION.
operation, operation_name = OPERATIONS[BIN_OPERATION]

# Create the algebraic-constraints miner.
algo = desbordante.ACAlgorithm()

# Read the whole table; `df` keeps the id column so that rows can be reported
# by id later, while `df_without_id` holds only the two date columns that are
# actually handed to the algorithm.
df = pandas.read_csv(TABLE, sep=SEPARATOR, header=HEADER)
df_without_id = df[['Delivery date', 'Dispatch date']]

algo.load_data(df=df_without_id)

# Run the discovery with the settings configured at the top of the script.
algo.execute(
    p_fuzz=P_FUZZ,
    fuzziness=FUZZINESS,
    bumps_limit=BUMPS_LIMIT,
    weight=WEIGHT,
    bin_operation=BIN_OPERATION,
    ac_seed=AC_SEED,
    iterations_limit=ITERATIONS_LIMIT,
)

# Report the ranges discovered for every pair of columns.
ac_ranges = algo.get_ac_ranges()
for ac_range in ac_ranges:
    # column_indices refer to positions in the frame that was loaded into the
    # algorithm (df_without_id), so translate them back to column names.
    l_col = df_without_id.columns[ac_range.column_indices[0]]
    r_col = df_without_id.columns[ac_range.column_indices[1]]
    print(f'Discovered ranges for ({l_col} {BIN_OPERATION} {r_col}) are:')
    print(ac_range.ranges)

# Report every row whose operation result lies outside the discovered ranges.
ac_exceptions = algo.get_ac_exceptions()
print()
print(
    f'Rows in which the result of the chosen operation ({BIN_OPERATION}) '
    'is outside of discovered ranges:'
)

# Columns to show for each exceptional row, selected by name so that the
# report does not depend on the order or number of columns in the CSV file.
reported_columns = ['id', 'Delivery date', 'Dispatch date']

for ac_exception in ac_exceptions:
    # row_index is positional; look the row up in the full frame (which still
    # has the id column) rather than in df_without_id.
    # `row_id` is used instead of `id` to avoid shadowing the builtin.
    row_id, delivery_date, dispatch_date = df.iloc[ac_exception.row_index][reported_columns]
    print(f'id: {row_id}')
    print(f'Dispatch date: {dispatch_date}')
    print(f'Delivery date: {delivery_date}')
    # Operand order matches the column order given to the algorithm:
    # 'Delivery date' first, 'Dispatch date' second.
    print(f'{operation_name}: {operation(delivery_date, dispatch_date)}')
    print()

45 changes: 45 additions & 0 deletions examples/datasets/cargo_march.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
id,Delivery date,Dispatch date
0,3,1
1,8,1
2,8,2
3,10,4
4,7,4
5,14,10
6,21,17
7,30,1
8,30,27
9,26,22
10,25,20
11,25,20
12,19,14
13,16,11
14,16,1
15,26,4
16,23,7
17,26,10
18,30,9
19,26,6
20,23,3
21,20,1
22,23,4
23,27,8
24,29,11
25,28,10
26,18,7
27,28,10
28,19,1
29,20,3
30,22,11
31,9,2
32,29,12
33,27,5
34,22,5
35,30,14
36,25,9
37,25,9
38,30,14
39,16,11
40,30,15
41,20,1
42,30,9
43,29,12

0 comments on commit 898a798

Please sign in to comment.