-
Notifications
You must be signed in to change notification settings - Fork 1
/
project_fun.py
169 lines (133 loc) · 5.41 KB
/
project_fun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
def outcome(time_delay):
    """
    Map a delay value onto a string class label.

    Args:
        time_delay (number): delay value to classify.

    Returns:
        str: '1' if the delay is greater than zero (delayed),
        '0' if it is exactly zero (in time), and 'NaN' for anything
        else (e.g. negative values or a float NaN).
    """
    # Guard clauses: handle the two labelled cases first, fall through otherwise.
    if time_delay > 0:
        return '1'
    if time_delay == 0:
        return '0'
    return 'NaN'
def get_airport_info(data, data_air, location='ARRSTN', cols=None):
    """
    Add airport information to a copy of the flight dataframe and return it.

    Each distinct airport code in ``data[location]`` is matched against
    ``data_air['iata']``. For every entry of ``cols`` a new column named
    ``<col>_arr`` (for 'ARRSTN') or ``<col>_dep`` (for 'DEPSTN') is added.
    The values are the text produced by ``Series.to_string(index=False)`` on
    the matching airport rows, so they are strings, also for numeric columns.

    Rows whose airport code has no match in ``data_air['iata']`` are removed
    and the index of the result is reset. The input ``data`` is not modified.

    Args:
        data (df): the Train.csv dataframe.
        data_air (df): the airports.csv dataframe; must contain an 'iata' column.
        location (str): Either the arrival or departure airport.
            Must be 'ARRSTN' or 'DEPSTN'. Defaults to 'ARRSTN'.
        cols (list): List of strings to specify which information to add.
            Possible are:
            'icao' 'iata' 'name' 'city' 'subd' 'country' 'elevation' 'lat' 'lon' 'tz'
            Defaults to ['country']. Note, that elevation is provided in feet.

    Return:
        New dataframe: the rows of ``data`` with a known airport, plus one
        additional column per entry in ``cols``.

    Raises:
        ValueError: if ``location`` is neither 'ARRSTN' nor 'DEPSTN'.
    """
    # suffix that is appended to the new column names
    if location == 'ARRSTN':
        suffix = '_arr'
    elif location == 'DEPSTN':
        suffix = '_dep'
    else:
        raise ValueError(f"location must either be 'ARRSTN' or 'DEPSTN' but got {location}. ")

    # None instead of a list literal avoids a mutable default argument
    if cols is None:
        cols = ['country']

    # Look up the airport rows once per distinct code; the result is shared
    # by all requested columns.
    airport_rows = {}
    unknown_codes = []
    for code in data[location].unique():
        rows = data_air[data_air['iata'] == code]
        if rows.empty:
            # some iata from Train.csv are unknown to the iata list in airports.csv
            unknown_codes.append(code)
        else:
            airport_rows[code] = rows

    # Boolean indexing returns a new frame, so the caller's dataframe stays untouched.
    known = ~data[location].isin(unknown_codes)
    result = data[known].reset_index(drop=True)

    for col in cols:
        labels = {
            code: rows[col].to_string(index=False)
            for code, rows in airport_rows.items()
        }
        result[col + suffix] = result[location].map(labels)
    return result
# source: ds-predictive-regression, notebook 1
# Calculate metric
def _print_split_metrics(title, y_true, y_pred):
    """Print RMSE and R2 (rounded to 3 decimals) for one data split."""
    print(title)
    # RMSE as the square root of the MSE; this avoids the `squared=False`
    # keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print("RMSE:", round(rmse, 3))
    print("R2:", round(r2, 3))
    print("---"*10)


def calculate_metrics(y_train, y_pred_train, y_test, y_pred_test):
    """Calculate and print out RMSE and R2 for train and test data

    Both splits are reported the same way: a title line, the RMSE, the R2
    and a separator line. Nothing is returned.

    Args:
        y_train (array): true values of y_train
        y_pred_train (array): predicted values of model for y_train
        y_test (array): true values of y_test
        y_pred_test (array): predicted values of model for y_test
    """
    _print_split_metrics("Metrics on training data", y_train, y_pred_train)
    _print_split_metrics("Metrics on test data", y_test, y_pred_test)
# source: ds-predictive-regression, notebook 1
def error_analysis(y_test, y_pred_test):
    """Generate true vs. predicted values and residual scatter plot for models

    Draws a figure with two panels: on the left the true values against the
    predicted values together with the identity line, on the right the
    residuals (true minus predicted) against the predicted values together
    with a horizontal zero line. Nothing is returned.

    Args:
        y_test (array): true values for y_test
        y_pred_test (array): predicted values of model for y_test
    """
    # Work on plain arrays: the subtraction is then positional (no pandas
    # index alignment) and .min()/.max() are available for list inputs too.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(y_pred_test)

    # Calculate residuals
    residuals = y_true - y_pred

    # Axis windows: data range plus a fixed padding
    pred_limits = (y_pred.min() - 10, y_pred.max() + 10)
    true_limits = (y_true.min() - 40, y_true.max() + 40)
    resid_limits = (residuals.min() - 10, residuals.max() + 10)

    # The identity line has to span the visible window on both axes,
    # whatever the range of the data is.
    diagonal = (
        min(pred_limits[0], true_limits[0]),
        max(pred_limits[1], true_limits[1]),
    )

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))
    fig.subplots_adjust(right=1)
    fig.suptitle('Error Analysis')

    # Plot real vs. predicted values
    ax[0].scatter(y_pred, y_true, color="#FF5A36", alpha=0.7)
    ax[0].plot(diagonal, diagonal, color="#193251")
    ax[0].set_title("True vs. predicted values", fontsize=16)
    ax[0].set_xlabel("predicted values")
    ax[0].set_ylabel("true values")
    ax[0].set_xlim(*pred_limits)
    ax[0].set_ylim(*true_limits)

    # Plot residuals vs. predicted values
    ax[1].scatter(y_pred, residuals, color="#FF5A36", alpha=0.7)
    # axhline spans the full width of the axes independent of the data range
    ax[1].axhline(0, color="#193251")
    ax[1].set_title("Residual Scatter Plot", fontsize=16)
    ax[1].set_xlabel("predicted values")
    ax[1].set_ylabel("residuals")
    ax[1].set_xlim(*pred_limits)
    ax[1].set_ylim(*resid_limits)
def feature_combi(columns):
    """
    Creates a list with all possible combinations without repetition, i.e. choosing
    features f1,f2 is the same as choosing f2,f1.

    Combinations are ordered by size (all single columns first, then all
    pairs, ...) and the full set of columns is the last entry.

    Args:
        columns (list of strings): list with column names

    Returns:
        list of lists: every combination as its own new list; the input list
        itself is never placed in the result.
    """
    columns = list(columns)
    col_combis = []
    for size in range(1, len(columns)):
        col_combis.extend(list(combi) for combi in itertools.combinations(columns, size))
    # the full set goes last; `columns` is already a private copy of the input
    col_combis.append(columns)
    return col_combis
# MAIN function
#
if __name__ == '__main__':
    # import data
    df = pd.read_csv('data/Train.csv', parse_dates=['DATOP', 'STD'])
    df_air = pd.read_csv('data/airports.csv')

    # test get_airport_info
    # create test dataframe with additional columns
    df_test = get_airport_info(
        data=df,
        data_air=df_air,
        location='ARRSTN',
        cols=['country', 'elevation', 'lat', 'lon'],
    )

    # test outcome
    # Take the target from df_test itself: get_airport_info drops rows and
    # resets the index, so df.target would be aligned to the wrong rows.
    df_test['outcome'] = df_test.target.apply(outcome)
    print(df_test.head(6))

    # test feature_combi
    test_fc = ['a', 'b', 'c']
    print(feature_combi(test_fc))