#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
# In[2]:
random_state = 42
np.random.seed(random_state)
# In[3]:
def gen_fake_norm_dataset(column_size=20, instance_size=100000):
    """
    Generate a fake dataset for testing: `column_size` columns drawn from a
    standard normal distribution, split in half into two batches.
    Drift is injected into column 0 of the second batch.
    """
    dataset = {}
    for i in range(column_size):
        dataset['col_{}'.format(i)] = np.random.normal(0, 1, instance_size)
    df = pd.DataFrame(dataset)
    train = df[:instance_size // 2]
    test = df[instance_size // 2:].copy()  # copy to avoid SettingWithCopyWarning
    # add drift to column 0
    test['col_0'] += np.random.normal(0.1, 0.5, len(test))
    return train, test
# In[4]:
batch1, batch2 = gen_fake_norm_dataset()
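# In[ ]:
# A quick sanity check, not in the original notebook: a two-sample
# Kolmogorov-Smirnov test should reject distributional equality for the
# drifted column and accept it for an undrifted one. Requires scipy.
from scipy.stats import ks_2samp

stat_drift, p_drift = ks_2samp(batch1['col_0'], batch2['col_0'])
stat_clean, p_clean = ks_2samp(batch1['col_1'], batch2['col_1'])
print("col_0 (drifted): KS={:.4f}, p={:.3g}".format(stat_drift, p_drift))
print("col_1 (clean):   KS={:.4f}, p={:.3g}".format(stat_clean, p_clean))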
# In[5]:
def train_test_split(X, y, test_size, random_state=2018):
    """
    Stratified train/test split: keeps the class ratio of `y` identical in
    both parts, so the adversarial labels stay balanced.
    """
    sss = list(StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state).split(X, y))
    train_idx, test_idx = sss[0]
    X_train = np.take(X, train_idx, axis=0)
    X_test = np.take(X, test_idx, axis=0)
    y_train = np.take(y, train_idx, axis=0)
    y_test = np.take(y, test_idx, axis=0)
    return [X_train, X_test, y_train, y_test]
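# In[ ]:
# A small illustration, not in the original notebook: stratification keeps
# the label ratio identical in both parts of the split.
_X = np.arange(20).reshape(10, 2)
_y = np.array([0] * 5 + [1] * 5)
_, _, _y_tr, _y_te = train_test_split(_X, _y, test_size=4)
print(_y_tr.mean(), _y_te.mean())  # both 0.5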
# In[13]:
def get_fea_importance(clf, feature_name):
    """
    Get feature importance from a trained LightGBM booster,
    sorted by total gain.
    """
    gain = clf.feature_importance('gain')
    importance_df = pd.DataFrame({
        'feature': clf.feature_name(),
        'split': clf.feature_importance('split'),
        'gain': gain,
        'gain_percent': 100 * gain / gain.sum(),
    }).sort_values('gain', ascending=False)
    return importance_df
# In[14]:
def adversarial_validation(batch1, batch2):
    """
    Label batch1 as 1 and batch2 as 0, train a classifier to tell the two
    batches apart, and return the feature importances plus the validation AUC.
    An AUC near 0.5 means the batches are indistinguishable (no drift);
    an AUC well above 0.5 signals drift, and the top-gain features show
    where it comes from.
    """
    feature_name = list(batch1.columns)
    train_X = batch1
    train_Y = np.ones(train_X.shape[0])
    test_X = batch2
    test_Y = np.zeros(test_X.shape[0])
    X = np.concatenate((train_X.values, test_X.values), axis=0)
    y = np.concatenate((train_Y, test_Y), axis=0)
    test_size = int(len(X) / 5)
    X, X_test, y, y_test = train_test_split(X, y, test_size, random_state=42)
    para = {
        'num_leaves': 6,
        'learning_rate': 0.1,
        'bagging_fraction': 0.2,
        'feature_fraction': 0.5,
        'max_depth': 3,
        'objective': 'binary',
        'metric': 'auc',
        'verbose': -1,
        'seed': 42,
        'num_threads': 8,
    }
    lgb_train = lgb.Dataset(X, y, free_raw_data=True)
    lgb_val = lgb.Dataset(X_test, y_test, free_raw_data=True, reference=lgb_train)
    # recent LightGBM versions configure early stopping and logging via
    # callbacks instead of the removed verbose_eval/early_stopping_rounds args
    lgb_model = lgb.train(para, lgb_train, num_boost_round=50,
                          valid_sets=[lgb_val], valid_names=['eval'],
                          feature_name=feature_name,
                          callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)])
    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration))
    auc = metrics.auc(fpr, tpr)
    print("----Adversarial score is {}------".format(auc))
    fea_importance_adversarial = get_fea_importance(lgb_model, feature_name)
    print(fea_importance_adversarial.head(10))
    return fea_importance_adversarial, auc
# ### Run adversarial validation on the real batch split to get the feature importances and AUC
# In[15]:
fea_imp, auc_true = adversarial_validation(batch1, batch2)
# ### Estimate the no-drift thresholds: shuffle the two batches together so random splits carry no real drift; running more iterations would give a fuller null distribution
# In[17]:
estimate_thres_auc = []
estimate_thres_gain = []
len_batch1 = len(batch1)
for i in range(5):
    # shuffle both batches together so the split carries no real drift signal
    base_df = pd.concat([batch1, batch2]).reset_index(drop=True).sample(frac=1)
    # drop=True matters here: keeping the old index as a column would leak
    # batch membership to the classifier
    fea_base, auc_base = adversarial_validation(base_df[:len_batch1], base_df[len_batch1:])
    estimate_thres_auc.append(auc_base)
    estimate_thres_gain.append(fea_base['gain'].values[0])  # top-1 gain
# In[18]:
# AUC threshold under no drift
print(np.mean(estimate_thres_auc))
# In[19]:
# gain threshold under no drift
print(np.mean(estimate_thres_gain))
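# In[ ]:
# ### Compare the observed score against the estimated thresholds
# A minimal sketch, not part of the original notebook: flag drift when the
# observed adversarial AUC clearly exceeds the no-drift threshold, then list
# the features whose gain exceeds the no-drift gain threshold. The 0.05
# margin is an arbitrary assumption; in practice, use the spread of the
# estimates (e.g. mean plus a few standard deviations) instead.
auc_threshold = np.mean(estimate_thres_auc)
gain_threshold = np.mean(estimate_thres_gain)
if auc_true > auc_threshold + 0.05:
    drifted = fea_imp.loc[fea_imp['gain'] > gain_threshold, 'feature'].tolist()
    print("Drift detected (AUC {:.3f} > threshold {:.3f}); candidate features: {}".format(
        auc_true, auc_threshold, drifted))
else:
    print("No significant drift (AUC {:.3f} vs threshold {:.3f})".format(
        auc_true, auc_threshold))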