-
Notifications
You must be signed in to change notification settings - Fork 0
/
titanic_clean.py
88 lines (77 loc) · 3.18 KB
/
titanic_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
# ========= Helper: encode names as integer ids
def encode_name(namelist):
    """Replace every entry of *namelist*, in place, with an integer id.

    Ids are handed out in order of first appearance, starting at 0, so equal
    entries always receive the same id.

    Args:
        namelist: Mutable, integer-indexable sequence of hashable values
            (for example a list of name strings). It is modified in place.

    Returns:
        The ``{value: id}`` mapping that was built, so the encoding can be
        reused or inverted. Callers that only need the in-place effect may
        ignore it.
    """
    word2id = {}
    for i, name in enumerate(namelist):
        # setdefault stores the next free id only the first time a value is
        # seen; afterwards it returns the id already assigned.
        namelist[i] = word2id.setdefault(name, len(word2id))
    return word2id
def _ticket_number(ticket):
    """Return the last whitespace-separated token of *ticket* as a float, or NaN."""
    try:
        return float(str(ticket).split()[-1])
    except (ValueError, IndexError):
        # ValueError: the last token is not a number (e.g. "LINE").
        # IndexError: the ticket is empty, so there is no token at all.
        return np.nan


def _first_given_name(full_name):
    """Pick the name token that follows the title in *full_name*.

    Parentheses are stripped and the string is split on whitespace. Token 2 is
    used, unless token 2 is itself "Mr.", "Mrs." or "Miss." (which happens when
    the part before the title spans two tokens), in which case token 3 is used.
    Names too short for that fall back to their last token instead of raising.
    """
    tokens = full_name.replace("(", "").replace(")", "").split()
    if not tokens:
        return ""
    idx = 3 if len(tokens) > 2 and tokens[2] in ("Mr.", "Mrs.", "Miss.") else 2
    return tokens[idx] if idx < len(tokens) else tokens[-1]


def _standardize(column):
    """Fill missing values with the column mean, then z-score the column."""
    mean = column.mean()  # NaN entries are skipped by default
    std = column.std()
    return (column.fillna(mean) - mean) / std


def clean_data(Data, label=False):
    """Turn a raw passenger table into an all-numeric, standardized feature table.

    Steps, in order:
      * drop "PassengerId", "Embarked", "Cabin", "SibSp" and "Parch"
        (and "Survived" when *label* is true);
      * add "Family_size" = "SibSp" + "Parch";
      * ordinal-encode "Sex";
      * mean-fill and z-score "Age" and "Fare";
      * reduce "Ticket" to its trailing number (NaN when there is none),
        then mean-fill and z-score it;
      * reduce "Name" to a single name token, encode it as an integer id
        with ``encode_name``, and z-score the ids;
      * cast the whole table to float64.

    Unlike the element-by-element version this replaces, all column updates are
    whole-column assignments, so they work regardless of the row index and do
    not rely on chained assignment. The input frame is not modified.

    Args:
        Data: Table with the columns listed above; used through the pandas
            DataFrame API (``drop``, column access). "Survived" is only
            required when *label* is true.
        label: True for training data (the "Survived" column is dropped),
            False for test data.

    Returns:
        A new float64 table with the cleaned features. If a column has zero
        or undefined standard deviation (e.g. a single row), its standardized
        values are NaN/inf, as no special handling is applied.
    """
    Data_cleaned = Data.drop(
        columns=["PassengerId", "Embarked", "Cabin", "SibSp", "Parch"]
    )
    if label:  # training data: remove the target column
        Data_cleaned = Data_cleaned.drop(columns=["Survived"])
    # Added after the drops so the caller's frame is left untouched; the
    # new column still ends up last.
    Data_cleaned["Family_size"] = Data["SibSp"] + Data["Parch"]

    # Sex -> ordinal code. fit_transform returns an (n, 1) array; flatten it
    # so it is assigned as a plain column.
    sex_encoded = OrdinalEncoder().fit_transform(Data_cleaned[["Sex"]])
    Data_cleaned["Sex"] = sex_encoded.ravel()

    # Age / Fare: mean-fill missing values, then normalize.
    Data_cleaned["Age"] = _standardize(Data_cleaned["Age"])
    Data_cleaned["Fare"] = _standardize(Data_cleaned["Fare"])

    # Ticket: keep only the trailing number, then treat like Age / Fare.
    ticket_numbers = Data_cleaned["Ticket"].map(_ticket_number).astype("float64")
    Data_cleaned["Ticket"] = _standardize(ticket_numbers)

    # Name: extract one token per passenger and encode it on a plain list, so
    # the in-place encoding never acts on a temporary view of the frame.
    name_ids = Data_cleaned["Name"].map(_first_given_name).tolist()
    encode_name(name_ids)
    Data_cleaned["Name"] = name_ids

    # astype returns a new frame; the result must be kept for the cast to apply.
    Data_cleaned = Data_cleaned.astype("float64")
    Data_cleaned["Name"] = _standardize(Data_cleaned["Name"])
    return Data_cleaned