-
Notifications
You must be signed in to change notification settings - Fork 0
/
Binary_Classification.py
320 lines (274 loc) · 11 KB
/
Binary_Classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 09:23:37 2020
Binary Classification with OneR Classification Algorithm
Authors: Deep Patel and Kenneth Hora
"""
# Imports the necessary libraries
import pandas as pd
#import numpy as np
import os
# NOTE: this path is hard-coded to the original author's machine; every
# workbook below is opened relative to this directory.
os.chdir("C:\\Users\\deepp\\Google Drive")

# Load the training and test workbooks.
training = pd.read_excel('titanic_traning.xlsx')
test = pd.read_excel('titanic_test.xlsx')

# Label each test row with its own ID so later code can look rows up by ID.
# The labels come straight from the ID column rather than being rebuilt as
# range(first ID, last ID + 1): that reconstruction only lined up with the
# rows when the IDs were sorted and gap-free, and indexing the first ID
# failed outright on an empty sheet.
test_index_list = test["ID"].tolist()
test.index = test_index_list
# Gender rule: count survivors and casualties for each gender in the training
# data, then turn each gender's survival rate into a vote.
_is_female = training["gender"] == 'female'
_is_male = training["gender"] == 'male'
_lived = training["survived"] == 1
_died = training["survived"] == 0

female_survive = int((_is_female & _lived).sum())
female_dead = int((_is_female & _died).sum())
male_survive = int((_is_male & _lived).sum())
male_dead = int((_is_male & _died).sum())

# A gender votes +1 (survive) only when more than half of its counted
# training passengers survived; otherwise it votes -1.
_female_rate = female_survive / (female_survive + female_dead)
_male_rate = male_survive / (male_survive + male_dead)
female_vote = 1 if _female_rate > .5 else -1
male_vote = 1 if _male_rate > .5 else -1
# pclass rule: for each passenger class take (survivors - casualties) in the
# training data and keep only the sign of that difference as the class vote:
# +1 when survivors outnumber casualties, -1 when they are outnumbered, and
# 0 on a tie (which includes a class with no counted rows).
_lived = training["survived"] == 1
_died = training["survived"] == 0
_pclass_sign = {}
for _cls in (1, 2, 3):
    _in_class = training["pclass"] == _cls
    _net = int((_in_class & _lived).sum()) - int((_in_class & _died).sum())
    # bool - bool yields the integer sign of the net tally.
    _pclass_sign[_cls] = (_net > 0) - (_net < 0)

class1_vote = _pclass_sign[1]
class2_vote = _pclass_sign[2]
class3_vote = _pclass_sign[3]
# parch rule: tally (survivors - casualties) in the training data for every
# parch value from 0 up to the largest one observed.
max_parch = max(training["parch"])
# One tally per parch value, each starting at zero.  The tallies were
# previously seeded with list(range(0, max_parch + 1)), i.e. [0, 1, 2, ...],
# which added the parch value itself to every net count.
parch_votes = [0] * (max_parch + 1)
for val, _outcome in zip(training["parch"], training["survived"]):
    if _outcome == 1:
        parch_votes[val] += 1
    elif _outcome == 0:
        parch_votes[val] -= 1
# sibsp rule: tally (survivors - casualties) in the training data for every
# sibsp value from 0 up to the largest one observed.
max_sibsp = max(training["sibsp"])
# One tally per sibsp value, each starting at zero.  The tallies were
# previously seeded with list(range(0, max_sibsp + 1)), i.e. [0, 1, 2, ...],
# which added the sibsp value itself to every net count.
sibsp_votes = [0] * (max_sibsp + 1)
for val, _outcome in zip(training["sibsp"], training["survived"]):
    if _outcome == 1:
        sibsp_votes[val] += 1
    elif _outcome == 0:
        sibsp_votes[val] -= 1
# Embarked rule: tally (survivors - casualties) in the training data for each
# port of embarkation, keyed by the port codes 'C', 'Q' and 'S'.
embarked_votes = pd.Series([0, 0, 0], index=['C', 'Q', 'S'])
for val, _outcome in zip(training["embarked"], training["survived"]):
    # +1 for a survivor, -1 for a casualty; any other outcome is not counted.
    _step = 1 if _outcome == 1 else (-1 if _outcome == 0 else 0)
    if _step:
        embarked_votes[val] += _step
# Gender-based prediction: each test passenger receives the vote learned for
# their gender; any other value (a blank cell included) gets a neutral 0.
_gender_vote_for = {'female': female_vote, 'male': male_vote}
gender_prediction = pd.Series(
    [_gender_vote_for.get(_gender, 0) for _gender in test["gender"]],
    index=test_index_list,
)
# pclass-based prediction: each test passenger receives the vote learned for
# their passenger class; a class other than 1, 2 or 3 gets a neutral 0.
_pclass_vote_for = {1: class1_vote, 2: class2_vote, 3: class3_vote}
pclass_prediction = pd.Series(
    [_pclass_vote_for.get(_pclass, 0) for _pclass in test["pclass"]],
    index=test_index_list,
)
# parch-based prediction.  The rule is hard-coded here rather than read from
# parch_votes.
def _parch_vote(parch):
    """Return +1 for parch > 0, -1 for parch == 0, and 0 for anything else."""
    if parch > 0:
        return 1
    if parch == 0:
        return -1
    return 0


parch_prediction = pd.Series(
    [_parch_vote(_parch) for _parch in test["parch"]],
    index=test_index_list,
)
# sibsp-based prediction.  The thresholds are hard-coded here rather than
# read from sibsp_votes; per the original authors' note they reflect a net
# tally of 0 at sibsp == 5, mostly negative tallies below it and mostly
# positive tallies above it.
def _sibsp_vote(sibsp):
    """Return +1 for sibsp == 1 or sibsp > 5, -1 for any other value below 5,
    and 0 otherwise (sibsp == 5 or a value that cannot be compared)."""
    if sibsp == 1 or sibsp > 5:
        return 1
    if sibsp < 5:
        return -1
    return 0


sibsp_prediction = pd.Series(
    [_sibsp_vote(_sibsp) for _sibsp in test["sibsp"]],
    index=test_index_list,
)
# Embarked-based prediction.  The per-port votes are hard-coded here rather
# than read from embarked_votes: 'C' votes +1, 'Q' and 'S' vote -1, and any
# other value (a blank cell included) gets a neutral 0.
_embarked_vote_for = {'C': 1, 'Q': -1, 'S': -1}
embarked_prediction = pd.Series(
    [_embarked_vote_for.get(_port, 0) for _port in test["embarked"]],
    index=test_index_list,
)
# Final prediction: add up the five per-feature votes for every test
# passenger and predict survival (1) only when the total is strictly
# positive; a zero or negative total predicts 0.
_vote_total = (
    gender_prediction
    + pclass_prediction
    + parch_prediction
    + sibsp_prediction
    + embarked_prediction
)
final_prediction = (_vote_total > 0).astype('int64')
# Load the five per-feature sheets of the predictions workbook.
_predictions_path = 'titanic_test_predictions.xlsx'
gender_pred = pd.read_excel(_predictions_path,
                            sheet_name='Gender_Based_Prediction')
pclass_pred = pd.read_excel(_predictions_path,
                            sheet_name='pclass_Based_Prediction')
parch_pred = pd.read_excel(_predictions_path,
                           sheet_name='parch_Based_Prediction')
sibsp_pred = pd.read_excel(_predictions_path,
                           sheet_name='sibsp_Based_Prediction')
embarked_pred = pd.read_excel(_predictions_path,
                              sheet_name='embarked_Based_Prediction')

# Re-label every sheet with the test IDs, then store the matching vote series
# in its "Prediction" column (the assignment aligns on those ID labels).
for _frame, _votes in (
    (gender_pred, gender_prediction),
    (pclass_pred, pclass_prediction),
    (parch_pred, parch_prediction),
    (sibsp_pred, sibsp_prediction),
    (embarked_pred, embarked_prediction),
):
    _frame.index = test_index_list
    _frame["Prediction"] = _votes
# Convert the -1 ("did not survive") votes in each "Prediction" column to 0
# so the column uses 0/1 labels before it is compared with "Ground truth".
# A single .loc assignment is used instead of the chained form
# frame["Prediction"][i] = 0, which writes through an intermediate object and
# is not guaranteed to update the DataFrame itself.
for _frame in (gender_pred, pclass_pred, parch_pred, sibsp_pred,
               embarked_pred):
    _frame.loc[_frame["Prediction"] == -1, "Prediction"] = 0
# Score each feature: count the rows whose prediction matches the ground
# truth, then express that count as a fraction of the test set.
def _count_matches(frame):
    """Return how many rows of frame have "Prediction" == "Ground truth"."""
    return int((frame["Prediction"] == frame["Ground truth"]).sum())


gender_count = _count_matches(gender_pred)
pclass_count = _count_matches(pclass_pred)
sibsp_count = _count_matches(sibsp_pred)
parch_count = _count_matches(parch_pred)
embarked_count = _count_matches(embarked_pred)

# Success rate per feature, rounded to three decimal places.
_test_size = len(test)
gender_success = round(gender_count / _test_size, 3)
pclass_success = round(pclass_count / _test_size, 3)
sibsp_success = round(sibsp_count / _test_size, 3)
parch_success = round(parch_count / _test_size, 3)
embarked_success = round(embarked_count / _test_size, 3)
# Summary table: one row per feature sheet with its success rate.
_rate_rows = [
    ('Gender_Based_Prediction', gender_success),
    ('pclass_Based_Prediction', pclass_success),
    ('sibsp_Based_Prediction', sibsp_success),
    ('parch_Based_Prediction', parch_success),
    ('embarked_Based_Prediction', embarked_success),
]
data = {
    'Feature': [_label for _label, _rate in _rate_rows],
    'Success Rate': [_rate for _label, _rate in _rate_rows],
}
success_rate = pd.DataFrame(data)
# Write the five per-feature sheets and the success-rate summary to a new
# workbook, then widen the columns.  set_column is an XlsxWriter worksheet
# method, so the XlsxWriter engine is requested explicitly instead of being
# left to whichever Excel engine pandas selects by default.
_feature_sheets = (
    ('gender_Based_Prediction', gender_pred),
    ("pclass_Based_Prediction", pclass_pred),
    ("sibsp_Based_Prediction", sibsp_pred),
    ("parch_Based_Prediction", parch_pred),
    ("embarked_Based_Prediction", embarked_pred),
)
with pd.ExcelWriter('titanic_test_predictions_Team4.xlsx',
                    engine='xlsxwriter') as writer:
    for _sheet_name, _frame in _feature_sheets:
        _frame.to_excel(writer, sheet_name=_sheet_name, index=False)
    success_rate.to_excel(writer, sheet_name="Prediction Success Rate",
                          index=False)

    # Column widths are set before the with-block exits, because leaving the
    # block saves and closes the workbook.
    for _sheet_name, _frame in _feature_sheets:
        writer.sheets[_sheet_name].set_column('B:C', 12)
    _summary_sheet = writer.sheets['Prediction Success Rate']
    _summary_sheet.set_column('A:A', 25)
    _summary_sheet.set_column('B:B', 12)