-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_engineer_support.py
100 lines (83 loc) · 3.23 KB
/
feature_engineer_support.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
from sklearn import preprocessing
# Maps a raw title string to one of six coarser title groups:
# Officer, Royalty, Mr, Mrs, Miss and Master.
# NOTE(review): presumably applied to titles parsed from passenger names;
# confirm against the caller.
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}
def status(feature):
    """Print a one-line confirmation that *feature* has been processed.

    The message is built as a single string and handed to ``print`` in
    parentheses, so the statement is valid on both Python 2 and Python 3
    and produces the same text (``Processing <feature> : ok``) on each.

    Args:
        feature: Name of the processed feature; rendered with ``%s``.
    """
    # One-element tuple so that a tuple-valued *feature* is formatted whole
    # instead of being unpacked as multiple format arguments.
    print('Processing %s : ok' % (feature,))
def one_hot(data, cat_feature, record):
    """Encode *record* as a one-hot list over the values of one column.

    The result has one slot per distinct value of ``data[cat_feature]``,
    in the order returned by ``unique()``. The slot belonging to *record*
    holds ``1.0`` and every other slot holds ``0.0``.

    Args:
        data: Object indexable by *cat_feature* whose result has ``unique()``.
        cat_feature: Key of the categorical column.
        record: The value to encode.

    Returns:
        A list of floats with exactly one ``1.0``.

    Raises:
        ValueError: If *record* is not among the column's distinct values.
    """
    categories = list(data[cat_feature].unique())
    hot_position = categories.index(record)
    return [
        1.0 if position == hot_position else 0.0
        for position in range(len(categories))
    ]
def process_age(combined):
    """Fill missing ``Age`` values in *combined* from a fixed lookup table.

    For each row whose ``Age`` is NaN, a replacement is chosen from the
    row's ``Sex``, ``Pclass`` and ``Title``. Rows that already carry an age
    keep it. When the (Sex, Pclass) pair or the Title is not in the table,
    the lookup yields ``None`` for that row.

    The ``Age`` attribute of *combined* is reassigned; nothing is returned.
    A confirmation line is printed through ``status`` when done.

    Args:
        combined: Table with ``Age``, ``Sex``, ``Pclass`` and ``Title``
            columns that supports ``apply(func, axis=1)``.
    """
    # (Sex, Pclass) -> {Title -> age used when Age is missing}
    replacement_ages = {
        ('female', 1): {'Miss': 30, 'Mrs': 45, 'Officer': 49, 'Royalty': 39},
        ('female', 2): {'Miss': 20, 'Mrs': 30},
        ('female', 3): {'Miss': 18, 'Mrs': 31},
        ('male', 1): {'Master': 6, 'Mr': 41.5, 'Officer': 52, 'Royalty': 40},
        ('male', 2): {'Master': 2, 'Mr': 30, 'Officer': 41.5},
        ('male', 3): {'Master': 6, 'Mr': 26},
    }

    def resolve_age(row):
        # Keep an age that is already present.
        if not np.isnan(row['Age']):
            return row['Age']
        by_title = replacement_ages.get((row['Sex'], row['Pclass']))
        if by_title is None:
            return None
        # Title is only consulted once the (Sex, Pclass) group is known.
        return by_title.get(row['Title'])

    combined.Age = combined.apply(resolve_age, axis=1)
    status('age')
def scale_all_features(combined):
    """Normalize the numeric feature columns of *combined* in place.

    The ``Pclass``, ``Age``, ``SibSp``, ``Parch`` and ``Fare`` columns are
    passed together to ``sklearn.preprocessing.normalize`` with ``axis=0``
    and the result is written back to the same columns. A confirmation
    message is printed afterwards; nothing is returned.

    The final ``print`` uses the parenthesized single-string form so the
    function is valid on both Python 2 and Python 3 with identical output.

    Args:
        combined: Table containing the five feature columns listed above.
    """
    features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    # axis=0 normalizes each feature column independently rather than each
    # row. An earlier variant divided every column by its maximum instead.
    combined[features] = preprocessing.normalize(combined[features], axis=0)
    print('Features scaled successfully !')