-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_eval.py
114 lines (88 loc) · 3.44 KB
/
preprocess_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Import
import pandas as pd
# Input/Output
# The paths are hard-coded. Reading them interactively is left disabled:
# csv_in = input()
# csv_out = input()
# Alternative pair of paths for the training split (disabled):
# csv_in = "Spaceship-Titanic/Data/train.csv"
# csv_out = "Spaceship-Titanic/Data/train_preprocessed.csv"
csv_in = "Spaceship-Titanic/Data/test.csv"
csv_out = "Spaceship-Titanic/Data/eval_preprocessed_full.csv"

# DataFrame: load the raw table from the input csv.
df = pd.read_csv(csv_in)

# The following 5 attributes account for spending.
# A missing amount is replaced by the mean of its own column.
spending_columns = ("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck")
for spending_column in spending_columns:
    column_mean = df[spending_column].mean()
    df[spending_column] = df[spending_column].fillna(column_mean)

# Every NaN that is still left, in any column, becomes 0.
# TODO: Improve NaN replacement?
df.fillna(value=0, inplace=True)
# PassengerId
# Kept for now. Be aware that the positional inserts used in this script
# shift by one if this column is ever deleted.
# del df["PassengerId"]

# HomePlanet
# One-hot encode the home planet. Decision: keep three separate indicator
# columns rather than a single array-valued attribute.
# dtype=int makes the indicators 0/1 on every pandas version; since pandas 2.0
# the default dtype is bool, which would be written to the csv as True/False
# and would not match the other flag columns that are cast to int.
home_dummies = pd.get_dummies(df["HomePlanet"], prefix="HomePlanet", dtype=int)
del df["HomePlanet"]
# The indicators go to positions 1-3, directly after the first column.
for position, planet in enumerate(("Earth", "Europa", "Mars"), start=1):
    df.insert(
        position,
        column=f"Home_{planet}",
        value=home_dummies[f"HomePlanet_{planet}"],
    )
# CryoSleep
# Dropping the rows with a missing value is disabled; the rows are kept:
# df.dropna(subset=["CryoSleep"], inplace=True)
# The flag is stored as an integer column.
cryo_as_int = df["CryoSleep"].astype(int)
df["CryoSleep"] = cryo_as_int

# Cabin
# TODO Ignored for now. Be aware of column indexing when undeleting
df.drop(columns=["Cabin"], inplace=True)
# Destination
# One-hot encode the destination. Categories listed by the original author:
# '55 Cancri e' 'TRAPPIST-1e' 'PSO J318.5-22'
# Dropping rows with a missing destination is disabled:
# df.dropna(subset=["Destination"], inplace=True)
# dtype=int makes the indicators 0/1 on every pandas version; since pandas 2.0
# the default dtype is bool, which would be written to the csv as True/False.
dest_dummies = pd.get_dummies(df["Destination"], prefix="Dest", dtype=int)
del df["Destination"]
# Fix: the PSO and TRAPPIST labels used to be swapped (the PSO indicator was
# stored as "Dest_TRAPPIST" and the TRAPPIST indicator as "Dest_PSO").
# The data stays at the same positions as before (5: Cancri, 6: PSO,
# 7: TRAPPIST), so index-based consumers see identical values; only the two
# column names are corrected.
# NOTE(review): if the training csv is produced by a script with the same
# swap, apply the same rename there so both files agree - confirm.
df.insert(5, column="Dest_Cancri", value=dest_dummies["Dest_55 Cancri e"])
df.insert(6, column="Dest_PSO", value=dest_dummies["Dest_PSO J318.5-22"])
df.insert(7, column="Dest_TRAPPIST", value=dest_dummies["Dest_TRAPPIST-1e"])
# Age
# Scaling of age.
# TODO Is it still better to use min-max scaling? To preserve some range..
# One solution: age above 100 is set to 1, otherwise set to age*0.01.
#df.dropna(subset=["Age"], inplace=True)
# TODO Scaling is now removed here in preprocess. Scaling can later be done when necessary, just like spending.
#df["Age"] = df["Age"].apply(lambda x: 0.01*x if x <= 100 else 1.)
# VIP
# Missing values are treated as False/0 because non-VIP accounts for the vast
# majority; the explicit fill below is disabled (the script's global
# fillna(0) has already replaced them).
# print(df["VIP"].describe())
# df["VIP"] = df["VIP"].fillna(False)
vip_as_int = df["VIP"].astype(int)
df["VIP"] = vip_as_int

# Insert new TotalSpending attribute (disabled)
# TotalSpending = df.loc[:,"RoomService":"VRDeck"].sum(axis=1)
# df.insert(16, column="TotalSpending", value=TotalSpending)

# Name
# Ignore name for now.
# TODO: Make family connection?
df.drop(columns=["Name"], inplace=True)
# Transported (Label)
# The label column is cast to int only when it exists; presumably it is
# absent from the unlabeled evaluation data - confirm.
# An explicit column check replaces the former bare `except: pass`, so a
# genuine conversion error is no longer silently hidden.
if "Transported" in df.columns:
    df["Transported"] = df["Transported"].astype(int)

# Save df to the output csv, without the index column.
df.to_csv(csv_out, index=False)
if __name__ == "__main__":
    # Short sanity report, printed only when the file is run as a script.
    # Disabled inspection helpers:
    # print(df.describe(include="all"))
    # print(df.iloc[:,:8].describe(include="all"))
    # print(df.iloc[:,9:].describe(include="all"))
    # print(df.iloc[:,:5])
    # print(df.iloc[:,5:10])
    # print(df.iloc[:,10:])

    # Per column: does any NaN remain?
    nan_per_column = df.isna().any()
    print(f"\n Isna: \n{nan_per_column}")

    # Per column: resulting dtype.
    column_dtypes = df.dtypes
    print(f"\n Dtypes: \n{column_dtypes}")

    # More disabled inspection helpers:
    # print(df["TotalSpending"].where(df["TotalSpending"]==0).count())
    # print(df["VIP"].where(df["VIP"]==1).count())
    # print(df.iloc[0:2,-1].values)
    # print(df.iloc[0:2,:-1].values)
    print(f"Preprocess done")