Skip to content

Commit

Permalink
Financial metadata alignment (rusty1s#113)
Browse files Browse the repository at this point in the history
* financial metadata

* update

* update

* update

* update

* update

* fix test

* reset

* update
  • Loading branch information
rusty1s authored Dec 2, 2021
1 parent 21b6b15 commit 30a176e
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 196 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
data/

benchmark/train/results/
results/

__pycache__/
*.egg-info/
Expand Down
138 changes: 68 additions & 70 deletions benchmark/store/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,101 +9,99 @@ Currently supported datasets: `Financial`
### Financial Benchmark

```
Connect to Snowflake: 1.371360s
Connect to Snowflake: 1.514640s
-------------------------------
Read LOAN table: 0.401759s
Set index: 0.000873s
Encode 'DATE': 0.006606s
Encode 'AMOUNT': 0.000488s
Encode 'DURATION': 0.000096s
Encode 'PAYMENTS': 0.000076s
Encode 'STATUS': 0.000342s
Create KUMO table: 0.028216s
Read LOAN table: 0.428332s
Set index: 0.000708s
Encode 'DATE': 0.005177s
Encode 'AMOUNT': 0.000173s
Encode 'DURATION': 0.000361s
Encode 'PAYMENTS': 0.000060s
Create KUMO table: 0.007735s
-------------------------------
Read ORDERS table: 1.504474s
Set index: 0.001064s
Encode 'AMOUNT': 0.000312s
Encode 'K_SYMBOL': 0.001114s
Encode 'BANK_TO': 0.000637s
Create KUMO table: 0.003743s
Read ORDERS table: 2.618956s
Set index: 0.000878s
Encode 'AMOUNT': 0.000128s
Encode 'K_SYMBOL': 0.000655s
Encode 'BANK_TO': 0.000364s
Create KUMO table: 0.002286s
-------------------------------
Read TRANS table: 2.937932s
Set index: 0.171753s
Encode 'DATE': 1.229484s
Encode 'TYPE': 0.051564s
Encode 'OPERATION': 0.082219s
Encode 'AMOUNT': 0.017670s
Encode 'BALANCE': 0.016016s
Encode 'K_SYMBOL': 0.054851s
Encode 'BANK': 0.047701s
Create KUMO table: 1.616459s
Read TRANS table: 11.489222s
Set index: 0.099014s
Encode 'DATE': 0.840119s
Encode 'TYPE': 0.048333s
Encode 'OPERATION': 0.067162s
Encode 'AMOUNT': 0.028083s
Encode 'BALANCE': 0.027553s
Encode 'K_SYMBOL': 0.052182s
Create KUMO table: 1.132723s
-------------------------------
Read ACCOUNT table: 0.431110s
Set index: 0.000495s
Encode 'DATE': 0.011122s
Encode 'FREQUENCY': 0.000712s
Create KUMO table: 0.013038s
Read ACCOUNT table: 1.264477s
Set index: 0.000512s
Encode 'FREQUENCY': 0.000739s
Encode 'DATE': 0.008209s
Create KUMO table: 0.010024s
-------------------------------
Read DISTRICT table: 0.203407s
Set index: 0.000737s
Encode 'A4': 0.000143s
Encode 'A4': 0.000068s
Encode 'A5': 0.000065s
Encode 'A6': 0.000063s
Encode 'A7': 0.000061s
Encode 'A8': 0.000061s
Encode 'A9': 0.000061s
Encode 'A10': 0.000061s
Encode 'A11': 0.000060s
Encode 'A12': 0.000060s
Encode 'A13': 0.000060s
Encode 'A14': 0.000060s
Encode 'A15': 0.000077s
Encode 'A16': 0.000066s
Encode 'A3': 0.000287s
Create KUMO table: 0.003570s
Read DISTRICT table: 0.264152s
Set index: 0.000733s
Encode 'A3': 0.000240s
Encode 'A4': 0.000076s
Encode 'A4': 0.000042s
Encode 'A5': 0.000095s
Encode 'A6': 0.000101s
Encode 'A7': 0.000060s
Encode 'A8': 0.000022s
Encode 'A9': 0.000088s
Encode 'A10': 0.000083s
Encode 'A11': 0.000049s
Encode 'A12': 0.000031s
Encode 'A13': 0.000035s
Encode 'A14': 0.000105s
Encode 'A15': 0.000048s
Encode 'A16': 0.000070s
Create KUMO table: 0.003583s
-------------------------------
Read CLIENT table: 0.456185s
Set index: 0.000459s
Encode 'GENDER': 0.000696s
Encode 'BIRTH_DATE': 0.012299s
Create KUMO table: 0.014269s
Read CLIENT table: 0.685362s
Set index: 0.000408s
Encode 'GENDER': 0.000434s
Encode 'BIRTH_DATE': 0.008498s
Create KUMO table: 0.009834s
-------------------------------
Read DISP table: 0.378486s
Set index: 0.000396s
Encode 'TYPE': 0.000539s
Create KUMO table: 0.001489s
Read DISP table: 0.454303s
Set index: 0.000318s
Encode 'TYPE': 0.000374s
Create KUMO table: 0.001456s
-------------------------------
Read CARD table: 0.554722s
Set index: 0.000414s
Encode 'TYPE': 0.000342s
Encode 'ISSUED': 0.005496s
Create KUMO table: 0.007061s
Read CARD table: 0.235480s
Set index: 0.000317s
Encode 'TYPE': 0.000239s
Encode 'ISSUED': 0.003710s
Create KUMO table: 0.004777s
-------------------------------
Create KUMO store: 0.066861s
Create KUMO store: 0.091569s
-------------------------------
Store(
name='Financial',
LOAN={
feat=[682, 33],
discrete_feat=[682, 1]
feat=[682, 37],
discrete_feat=[682, 0],
label=[682, 1]
},
ORDERS={
feat=[6471, 6],
discrete_feat=[6471, 1]
},
TRANS={
feat=[1056320, 41],
discrete_feat=[1056320, 2]
feat=[1056320, 50],
discrete_feat=[1056320, 0]
},
ACCOUNT={
feat=[4500, 33],
discrete_feat=[4500, 0]
},
DISTRICT={
feat=[77, 14],
discrete_feat=[77, 1]
feat=[77, 22],
discrete_feat=[77, 0]
},
CLIENT={
feat=[5369, 32],
Expand Down
22 changes: 12 additions & 10 deletions benchmark/store/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@
'encoders': [
('DATE', Datetime(format='%Y-%m-%d')),
('AMOUNT', Numerical(scaler='standard')),
('DURATION', Numerical(scaler='standard')),
('DURATION', OneHot(num_categories=5)),
('PAYMENTS', Numerical(scaler='standard')),
('STATUS', Index()),
],
'foreign_key_info': [('ACCOUNT_ID', 'ACCOUNT')],
'label_encoder': ('STATUS', Index()),
},
'ORDERS': {
'index_col': ['ORDERS_ID'],
'encoders': [
('BANK_TO', Index()),
# 'BANK_TO' encodes each row uniquely, so we drop it.
# 'ACCOUNT_TO' encodes each row uniquely, so we drop it.
('AMOUNT', Numerical(scaler='standard')),
('K_SYMBOL', OneHot(num_categories=5)),
],
Expand All @@ -43,35 +43,36 @@
('OPERATION', OneHot(num_categories=6)),
('AMOUNT', Numerical(scaler='standard')),
('BALANCE', Numerical(scaler='standard')),
('K_SYMBOL', Index()),
('BANK', Index()),
('K_SYMBOL', OneHot(num_categories=9)),
# 'BANK' has too many empty values.
# 'ACCOUNT' has too many empty values.
],
'foreign_key_info': [('ACCOUNT_ID', 'ACCOUNT')],
},
'ACCOUNT': {
'index_col': ['ACCOUNT_ID'],
'encoders': [
('DATE', Datetime(format='%Y-%m-%d')),
('FREQUENCY', OneHot(num_categories=3)),
('DATE', Datetime(format='%Y-%m-%d')),
],
'foreign_key_info': [('DISTRICT_ID', 'DISTRICT')],
},
'DISTRICT': {
'index_col': ['DISTRICT_ID'],
'encoders': [
# 'A2' encodes each row uniquely, so we drop it.
('A3', Index()),
('A3', OneHot(num_categories=8)),
('A4', Numerical(scaler='standard')),
('A4', Numerical(scaler='standard')),
('A5', Numerical(scaler='standard')),
('A6', Numerical(scaler='standard')),
('A7', Numerical(scaler='standard')),
('A8', Numerical(scaler='standard')),
('A8', Numerical()),
('A9', Numerical(scaler='standard')),
('A10', Numerical(scaler='standard')),
('A11', Numerical(scaler='standard')),
('A12', Numerical(scaler='standard')),
('A13', Numerical(scaler='standard')),
('A12', Numerical()),
('A13', Numerical()),
('A14', Numerical(scaler='standard')),
('A15', Numerical(scaler='standard')),
('A16', Numerical(scaler='standard')),
Expand Down Expand Up @@ -141,6 +142,7 @@ def wrapper(*args, **kwargs):
df=df,
encoders=kwargs['encoders'],
foreign_key_info=kwargs['foreign_key_info'],
label_encoder=kwargs.get('label_encoder', None),
)
tables.append(table)

Expand Down
22 changes: 12 additions & 10 deletions kumo/scan/data_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@

csv_file_ext = '.csv'

max_categories = 50
max_categories_binary = 1000
one_hot_max_classes = 50
max_one_hot_categories = 10
max_categories = 100
max_range_in_std = 4
# maximum average string length for word embedding (GloVe) to be used
max_len_word_emb = 15
Expand Down Expand Up @@ -84,10 +83,11 @@ def _num_uniques(self, col_data: np.ndarray):
return len(unique_vals)

def decide_integer_encoding(self, col_data):
self.encoder = encoder.Numerical()
if self.num_classes < one_hot_max_classes or self.is_label:
# One-hot encoding is equivalent to shallow in computation,
# hence we exclusively use shallow instead of one-hot for now.
if self.is_label:
self.encoder = encoder.Index()
if self.num_classes < max_one_hot_categories:
self.encoder = encoder.OneHot(self.num_classes)
elif self.num_classes < max_categories:
self.encoder = encoder.Index()
elif (self.max - self.min) / self.std < max_range_in_std:
# Use regression for integer labels unless the number of classes is bounded.
Expand Down Expand Up @@ -120,10 +120,12 @@ def decide_float_encoding(self, col_data):
def decide_str_encoding(self, col_data):
# If the column is a label, it's always treated as categorical.
# TODO: Language (text) columns are not supported as labels yet.
if self.num_classes < max_categories or self.is_label:
if self.is_label:
self.encoder = encoder.Index()
elif self.num_classes < max_one_hot_categories:
self.encoder = encoder.OneHot(self.num_classes)
elif self.num_classes < max_categories:
self.encoder = encoder.Index()
elif self.num_classes < max_categories_binary:
self.encoder = encoder.Binary(self.num_classes)
elif self.max_len < max_len_word_emb:
self.encoder = encoder.GloVe()
else:
Expand Down
Loading

0 comments on commit 30a176e

Please sign in to comment.