# -*- coding: utf-8 -*-
from pathlib import Path
import time
import logging
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


class DatasetSplit(object):
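    """Split a tabular CSV dataset and save the resulting subsets under data/sample."""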
def __init__(self, args):
self.dataset_path = args.dataset_path
self.target_col = args.target_col
self.task_type = args.task_type
#self.search_sample_num = args.search_sample_num
self.args = args
self.dataset_name = Path(self.dataset_path).stem
self.split_data_path = Path('data/sample') / self.dataset_name
self.search_data_csv_name = 'search_data.csv'
self.keep_data_csv_name = 'keep_data.csv'
#
def split_dataset_with_ratio(self, df, test_rate, random_state=1):
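        """Split df into train/test frames; stratify on the target when task_type is 'classifier'."""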
df_target = df[self.target_col]
df_feature = df.drop(self.target_col, axis=1)
#
if self.task_type == 'classifier':
x_train, x_test, y_train, y_test = train_test_split(
df_feature, df_target, test_size=test_rate,
shuffle=True, stratify=df_target, random_state=random_state)
elif self.task_type == 'regression':
x_train, x_test, y_train, y_test = train_test_split(
df_feature, df_target, test_size=test_rate,
shuffle=True, random_state=random_state)
        else:
            logging.error('task_type error: expected "classifier" or "regression", got %s',
                          self.task_type)
            # return early: merging None frames below would raise AttributeError
            return None, None
# feature target merge
train_data = x_train.merge(pd.DataFrame(y_train), left_index=True,
right_index=True, how='left')
test_data = x_test.merge(pd.DataFrame(y_test), left_index=True,
right_index=True, how='left')
return train_data, test_data
@staticmethod
def fe_simple_pipline(df_fe):
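        """Ordinal-encode every object-dtype column of df_fe in place and return the frame."""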
for header in df_fe.columns.values:
if df_fe[header].dtype == "object":
oe = preprocessing.OrdinalEncoder()
trans_column = oe.fit_transform(df_fe[header].values.reshape(-1, 1))
df_fe[header] = trans_column.reshape(1, -1)[0]
return df_fe
def single_save_to_csv(self, df_data, save_file_name):
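        """Save df_data as a CSV into split_data_path, creating the directory if needed."""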
df_data.reset_index(drop=True, inplace=True)
# self.split_data_path = Path('data') / self.dataset_name
if not os.path.exists(self.split_data_path):
os.makedirs(self.split_data_path)
df_data.to_csv(self.split_data_path / save_file_name, index=False)
# df_data.to_csv(self.split_data_path / save_file_name)
def save_to_csv(self, train_data, test_data):
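        """Write the train/test frames into a timestamped directory under data/sample and return its path."""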
save_path = '{}_{}'.format(
self.dataset_name, time.strftime("%Y%m%d-%H%M%S"))
sampled_path = Path('data/sample') / save_path
if not os.path.exists(sampled_path):
os.makedirs(sampled_path)
train_data.to_csv(sampled_path / 'train_data.csv', index=False)
test_data.to_csv(sampled_path / 'test_data.csv', index=False)
return sampled_path
def load_all_data(self):
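        """Load the full CSV, keep only the configured feature and target columns, and drop duplicate rows."""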
#all_data = reduce_mem_usage(pd.read_csv(self.dataset_path), logging)
all_data = pd.read_csv(self.dataset_path)
all_data = all_data[self.args.continuous_col+self.args.discrete_col +
[self.args.target_col]]
#all_data = all_data[0:1000]
print('all_data shape', all_data.shape)
#
all_data.drop_duplicates(keep='first', inplace=True)
all_data.reset_index(drop=True, inplace=True)
return all_data
def split_load_search_data(self):
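        """Load the CSV, split it into search (80%) and keep (20%) subsets, save both, and return the search subset."""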
#all_data = reduce_mem_usage(pd.read_csv(self.dataset_path), logging)
all_data = pd.read_csv(self.dataset_path)
all_data = all_data[self.args.continuous_col+self.args.discrete_col +
[self.args.target_col]]
print('all_data shape', all_data.shape)
#
search_data, keep_data = self.split_dataset_with_ratio(all_data, 0.2, random_state=1)
#
search_data.drop_duplicates(keep='first', inplace=True)
search_data.reset_index(drop=True, inplace=True)
keep_data.reset_index(drop=True, inplace=True)
#
self.single_save_to_csv(keep_data, self.keep_data_csv_name)
self.single_save_to_csv(search_data, self.search_data_csv_name)
return search_data
if __name__ == '__main__':
pass
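    # Usage sketch (assumption): the project's real entry point builds `args` elsewhere,
    # e.g. via argparse. The attribute names below only mirror what DatasetSplit reads;
    # the file path and column names are illustrative, not taken from the original project.
    #
    #   from types import SimpleNamespace
    #   args = SimpleNamespace(dataset_path='data/example.csv', target_col='label',
    #                          task_type='classifier', continuous_col=['age', 'income'],
    #                          discrete_col=['gender'])
    #   search_data = DatasetSplit(args).split_load_search_data()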