-
Notifications
You must be signed in to change notification settings - Fork 13
/
data.py
75 lines (69 loc) · 2.78 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#%%
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
import os, sys
from utils import *
import zipfile
import pandas as pd
import datatable as dt
import numpy as np
# Absolute directory containing this script; the model and data folders are
# resolved relative to it.
HOME = os.path.split(os.path.abspath(__file__))[0]
MODEL_DIR, DATA_DIR = (os.path.join(HOME, sub) for sub in ('models', 'data'))
# Append the script directory to the module search path.
sys.path.append(HOME)
'''
The API token from https://www.kaggle.com/<username>/account needs to be placed in the ~/.kaggle/ folder on macOS/Linux, or in C:\\Users\\<username>\\.kaggle\\ on Windows.
'''
# Column name -> NumPy dtype mapping used when casting the training frame.
# Key order: the four fixed columns, then resp_1..resp_4, then
# feature_0..feature_129. Every feature column (feature_0 included) is
# float32. Built in a single literal so no loop variable is left behind in
# the module namespace.
train_dtypes = {
    'date': np.int32,
    'ts_id': np.int64,
    'resp': np.float64,
    'weight': np.float64,
    **{f'resp_{i}': np.float64 for i in range(1, 5)},
    **{f'feature_{i}': np.float32 for i in range(130)},
}
#%%
def _read_train_csv(csv_path):
    """Read the training CSV and cast it to ``train_dtypes``.

    Only the columns named in ``train_dtypes`` are requested from
    ``dt.fread``; the result is converted to a pandas DataFrame and cast
    with ``astype(train_dtypes)``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The pandas DataFrame produced by the cast.
    """
    frame = dt.fread(csv_path, columns=set(train_dtypes))
    return frame.to_pandas().astype(train_dtypes)


if __name__ == "__main__":
    # Each run performs exactly one step, chosen from what is already present
    # in DATA_DIR: download -> write feather -> write parquet -> load both.
    # NOTE(review): find_files and timer are presumably supplied by
    # `from utils import *`; find_files is used here as returning a
    # (possibly empty) sequence of matching paths -- confirm against utils.
    print(f"Current directory : {HOME}")
    print(f"Current data directory: {DATA_DIR}")

    data_file = find_files('train.csv', DATA_DIR)
    data_parquet = find_files('train.parquet', DATA_DIR)
    data_feather = find_files('train.feather', DATA_DIR)

    if not data_file:
        # No raw CSV yet: download the competition archive and unpack it.
        try:
            api = KaggleApi()
            api.authenticate()
            api.competition_download_files('jane-street-market-prediction',
                                           path=DATA_DIR, quiet=False)
            archives = find_files('zip', DATA_DIR)
            if not archives:
                raise FileNotFoundError(
                    f"No zip archive found in {DATA_DIR} after download.")
            # ZipFile needs a single path, not the whole list of matches.
            with zipfile.ZipFile(archives[0], "r") as archive:
                archive.extractall(DATA_DIR)
        except RuntimeError as err:
            print(f"Needs API token: {err}")
    elif data_parquet and data_feather:
        # Both converted files exist: load each one and show its dtypes.
        print(f"Train parquet at: {data_parquet[0]}.")
        with timer("Loading train"):
            train = pd.read_parquet(data_parquet[0])
        print(train.dtypes[:10])
        print(train.dtypes[-10:])

        print(f"Train feather at: {data_feather[0]}.")
        with timer("Loading train"):
            train = pd.read_feather(data_feather[0])
        print(train.dtypes[:10])
        print(train.dtypes[-10:])
    elif not data_parquet and data_feather:
        # Feather exists but parquet does not: write the parquet copy.
        with timer("Processing train parquet"):
            train = _read_train_csv(data_file[0])
            # 'ts_id' stays a regular column. The previous bare
            # `train.set_index('ts_id')` call discarded its result, so the
            # file written here is unchanged by dropping it.
            train.to_parquet(os.path.join(DATA_DIR, 'train.parquet'))
    else:
        # Feather is missing (parquet may or may not exist): write it.
        with timer("Processing train feather"):
            train = _read_train_csv(data_file[0])
            # Keep the default index: DataFrame.to_feather requires it, and
            # the previous bare `set_index` call never changed it anyway.
            train.to_feather(os.path.join(DATA_DIR, 'train.feather'))