import numpy as np
import operator
import joblib  # sklearn.externals.joblib was removed in recent scikit-learn releases
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold  # replaces the deprecated sklearn.cross_validation
from sklearn.preprocessing import MinMaxScaler
"""
Script to build the final ensemble via geometric-mean, rank-weighted averaging,
repeated over several cross-validation schemas (seeds) for a more stable AUC estimate.
"""
# Convert a single 1-dimensional array of scores to ranks: sort ascending and
# assign ranks 1, 2, ..., len(score), with tied scores sharing the same rank.
def ranking(score):
    """Turn a score vector into a rank vector."""
    data = []
    for i in range(len(score)):
        data.append([score[i], i])  # keep the original index next to each score
    data = sorted(data, key=operator.itemgetter(0), reverse=False)
    value = data[0][0]
    data[0][0] = 1
    for i in range(1, len(score)):
        val = data[i][0]
        if val > value:
            value = val
            data[i][0] = i + 1
        else:  # tie with the previous score: reuse its rank
            data[i][0] = data[i - 1][0]
    data = sorted(data, key=operator.itemgetter(1), reverse=False)  # restore original order
    final_rank = []
    for i in range(len(score)):
        final_rank.append(data[i][0])
    return final_rank
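# Worked example (hypothetical values): ranking([0.3, 0.1, 0.3]) returns [2, 1, 2];
# the smallest score gets rank 1 and the two tied scores share rank 2.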
# Retrieve a specific column from a 2-dimensional array as a 1-dimensional array.
def select_column(data, col):
    array = []
    for i in range(len(data)):
        array.append(data[i][col])
    return array

# Put a 1-dimensional array back into column j of a 2-dimensional array, in place.
def putcolumn(data, array, j):
    for i in range(len(data)):
        data[i][j] = array[i]

# Convert every column of the given 2-dimensional array to ranks, in place.
def create_ranklist(data):
    for j in range(len(data[0])):
        putcolumn(data, ranking(select_column(data, j)), j)
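# Worked example (hypothetical values): after create_ranklist(x) on
# x = [[0.9, 0.2], [0.1, 0.8]], x becomes [[2, 1], [1, 2]];
# each column is ranked independently of the others.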
# Load a single column of a CSV file, optionally skipping a header line.
def loadcolumn(filename, col=4, skip=1, floats=True):
    pred = []
    op = open(filename, 'r')
    if skip == 1:
        op.readline()  # skip the header
    for line in op:
        line = line.replace('\n', '')
        sps = line.split(',')
        # load the requested column as float or string
        if floats:
            pred.append(float(sps[col]))
        else:
            pred.append(str(sps[col]))
    op.close()
    return pred

# Dump a numeric array to a plain-text file, one value per line.
def printfilcsve(X, filename):
    np.savetxt(filename, X)

def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))
def main():
    # Names of the meta models whose predictions are blended; the weights below
    # control how much each one contributes to the final submission.
    meta = ["main_xgboost",
            "main_logit_2way",
            "main_logit_3way",
            "main_logit_3way_best",
            "main_xgboos_count",
            "main_xgboos_count_2D",
            "main_xgboos_count_3D"]
    y = np.loadtxt("train.csv", delimiter=',', usecols=[0], skiprows=1)
    print("len of target=%d" % (len(y)))  # reconciliation check

    # One weight per meta model above; a weight of 0 drops that model from the
    # blend. Here only main_logit_3way_best (0.4), main_xgboos_count_2D (0.4)
    # and main_xgboos_count_3D (1.0) contribute.
    weights = [0,
               0,
               0,
               0.4,
               0,
               0.4,
               1]
    number_of_folds = 5        # folds per cross-validation run
    use_scaling_to_0_1 = True  # some submission formats expect probability-like values
    use_geo = True             # False = linear rank average instead of geometric mean
    Load = True                # True = rebuild the meta matrices from the CSVs; False = reuse METADUMP.pkl
    use_rank = True            # convert each model's scores to ranks before blending
    if Load:
        Xmetatrain = None
        Xmetatest = None
        # Stack all model predictions column-wise into one train and one test matrix.
        for modelname in meta:
            mini_xtrain = np.loadtxt(modelname + '.train.csv')
            mini_xtest = np.loadtxt(modelname + '.test.csv')
            mean_train = np.mean(mini_xtrain)
            mean_test = np.mean(mini_xtest)
            print("model %s auc %f mean train/test %f/%f " % (modelname, roc_auc_score(y, mini_xtrain), mean_train, mean_test))
            if Xmetatrain is None:
                Xmetatrain = mini_xtrain
                Xmetatest = mini_xtest
            else:
                Xmetatrain = np.column_stack((Xmetatrain, mini_xtrain))
                Xmetatest = np.column_stack((Xmetatest, mini_xtest))
        X = Xmetatrain
        X_test = Xmetatest
        joblib.dump((X, X_test), "METADUMP.pkl")
    else:
        X, X_test = joblib.load("METADUMP.pkl")
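    # Design note: the joblib dump acts as a cache; after one pass with
    # Load=True you can iterate on the weights with Load=False and skip
    # re-reading every per-model CSV.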
outset="AUC_Geo_Rank_Weighted_Average" # Output base name
seedlist=[87, 111, 1337, 42 , 201628] # many seeds for more accurate results
train_stacker=[0.0 for i in range (0,len(X))]
mean_auc = 0.0
for seeder in seedlist:
print("kfolding seed %d " % (seeder) )
kfolder=StratifiedKFold(y, n_folds=number_of_folds,shuffle=True, random_state=seeder)
#number_of_folds=0
#X,y=shuffle(X,y, random_state=SEED) # Shuffle since the data is ordered by time
i=0 # iterator counter
print ("starting cross validation with %d kfolds " % (number_of_folds))
if number_of_folds>0:
for train_index, test_index in kfolder:
# creaning and validation sets
X_train, X_cv = X[train_index], X[test_index]
y_train, y_cv = np.array(y)[train_index], np.array(y)[test_index]
print (" train size: %d. test size: %d, cols: %d " % ((X_train.shape[0]) ,(X_cv.shape[0]) ,(X_train.shape[1]) ))
minmax=MinMaxScaler(feature_range=(0, 1))
X_cv=X_cv.tolist()
if use_rank:
create_ranklist(X_cv)
#X_cv= minmax.fit_transform((X_cv))
#print X_cv
if use_geo: # use geo mean
preds=[1.0 for s in range (0,len(X_cv))]
for i in range (0,len(X_cv)) :
for j in range (0,len(weights)) :
preds[i]*=X_cv[i][j]**weights[j]
else :
preds=[0.0 for s in range (0,len(X_cv))]
for i in range (0,len(X_cv)) :
for j in range (0,len(weights)) :
preds[i]+=X_cv[i][j]*weights[j]
if usesccaling_to_0_1:
preds= minmax.fit_transform(preds)
# compute Loglikelihood metric for this CV fold
#scalepreds(preds)
AUC = roc_auc_score(y_cv,preds)
print "size train: %d CV : %d AUC (fold %d/%d): %f" % ((X_train.shape[0]), len(X_cv), i + 1, number_of_folds, AUC)
mean_auc += AUC
#save the results
no=0
for real_index in test_index:
train_stacker[real_index]=(preds[no])
no+=1
i+=1
mean_auc/=(len(seedlist)*5.0)
print ("Average AUC: %f" % mean_auc)
    # Apply the same rank transform and weighted blend to the test-set predictions.
    minmax = MinMaxScaler(feature_range=(0, 1))
    X_test = X_test.tolist()
    if use_rank:
        create_ranklist(X_test)
    if use_geo:  # weighted geometric mean of the ranked columns
        preds = [1.0 for s in range(0, len(X_test))]
        for i in range(0, len(X_test)):
            for j in range(0, len(weights)):
                preds[i] *= X_test[i][j] ** weights[j]
    else:  # weighted linear rank average
        preds = [0.0 for s in range(0, len(X_test))]
        for i in range(0, len(X_test)):
            for j in range(0, len(weights)):
                preds[i] += X_test[i][j] * weights[j]
    if use_scaling_to_0_1:
        preds = minmax.fit_transform(np.array(preds).reshape(-1, 1)).ravel()
    # convert to numpy and write the submission file
    preds = np.array(preds)
    save_results(preds, outset + "_submission_" + str(mean_auc) + ".csv")
    print("Done.")

if __name__ == "__main__":
    main()