-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathconfig.yaml
50 lines (50 loc) · 1.81 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Schema for Dedupe's tables:
schema: dedupe
# The source table to dedupe (written here with a `dedupe.` prefix):
table: dedupe.entries
# The column in the source table that serves as the key:
key: entry_id
# The fields to use as evidence of a duplicate row. Each list entry names a
# column (`field`) and its comparison operator (`type`); every attribute is
# nested under its own `- field:` entry so it is not read as a top-level key.
fields:
  - field: ssn
    type: String
    # NOTE(review): presumably tells the comparison that this column may be
    # empty -- confirm against the consumer.
    has missing: true
  - field: first_name
    type: String
  - field: last_name
    type: String
  - field: dob
    type: String
    # Left disabled in the original configuration:
    # has missing: true
  - field: race
    type: Categorical
    categories: [pacisland, amindian, asian, other, black, white]
  - field: ethnicity
    type: Categorical
    categories: [hispanic, nonhispanic]
  - field: sex
    type: Categorical
    categories: [M, F]
# Groups of columns, each also listed under `fields`, to treat as interactions.
# NOTE(review): presumably combined comparison terms -- confirm against the
# consumer.
interactions:
  - [last_name, dob]
  - [ssn, dob]
# Bare-minimum set of required columns, written as a SQL-style condition:
# last_name must be present, together with either ssn or both first_name and
# dob. The folded scalar below joins its lines with single spaces, so the
# value is one line of text.
filter_condition: >-
  last_name is not null
  AND (ssn is not null OR (first_name is not null AND dob is not null))
# After dedupe, an exact record linkage step can merge the found clusters.
# Each list below is one subset of columns used for that exact match:
merge_exact:
  - [first_name, last_name, dob]
  - [last_name, ssn]
# Turn off interactive labeling; this requires a saved training set
# (see `training_file`). Written as canonical lowercase `false` so every YAML
# parser reads it as a boolean.
prompt_for_labels: false
# Desired recall for the training labels; tune it manually if needed. This
# establishes a blocking pattern such that at least a `recall` fraction of
# the true labeled matches get blocked together.
recall: 0.99
# Location of the saved training dataset.
# NOTE(review): relative path -- presumably resolved from the directory the
# tool is run in; confirm.
training_file: tests/dedup_postgres_training.json
# You may optionally specify a seed for deterministic deduplication. Note that
# in Python 3.2.3 and higher, Dedupe also requires the PYTHONHASHSEED
# environment variable to be set to an integer for its results to be
# deterministic, due to ordering dependencies.
seed: 0