-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalysis.py
146 lines (127 loc) · 5.74 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
# import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
# from statsmodels.stats.diagnostic import het_goldfeldquandt
from statsmodels.stats.outliers_influence import variance_inflation_factor
def scatter(df):
    """Show a scatter-plot matrix for the columns of ``df``.

    Args:
        df: DataFrame whose columns are plotted against each other, with
            histograms on the diagonal.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Plot the frame that was passed in. Reading a module-level name instead
    # only works while the calling script happens to define it, and otherwise
    # resolves to the builtin ``vars`` function.
    scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='hist')
    plt.tight_layout()
    plt.show()
    # plt.savefig('images/scatmatrix')
def bar_weather(df):
    """Save a bar chart of mean fuel used per weather type.

    Args:
        df: DataFrame with 'weather_type' and 'fuel_used' columns.

    Returns:
        None. The chart is written to 'images/weather'.
    """
    grouped = df.groupby(['weather_type'])['fuel_used'].mean()

    # Draw on a dedicated figure: a Series plot without an explicit ``ax``
    # reuses the current pyplot axes, so earlier plots would show through.
    fig, ax = plt.subplots()
    grouped.plot.bar(ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=50, horizontalalignment='right')
    ax.set_xlabel('Weather Type', weight='bold')
    ax.set_ylabel('Liters of Fuel Used', weight='bold')
    ax.set_title('Liters of Fuel Used by Weather Type', weight='bold', fontsize=15)
    fig.tight_layout()
    fig.savefig('images/weather')
    # Close the figure so later pyplot calls do not draw on top of this chart.
    plt.close(fig)
def bar_veh(df):
    """Save a bar chart of mean fuel used per vehicle type.

    Args:
        df: DataFrame with 'type' and 'fuel_used' columns.

    Returns:
        None. The chart is written to 'images/veh_type'.
    """
    grouped = df.groupby(['type'])['fuel_used'].mean()

    # Draw on a dedicated figure: a Series plot without an explicit ``ax``
    # reuses the current pyplot axes, so earlier plots would show through.
    fig, ax = plt.subplots()
    grouped.plot.bar(ax=ax)
    plt.setp(ax.get_xticklabels(), rotation=50, horizontalalignment='right')
    ax.set_xlabel('Vehicle Type', weight='bold')
    ax.set_ylabel('Liters of Fuel Used', weight='bold')
    ax.set_title('Liters of Fuel Used by Vehicle Type', weight='bold', fontsize=15)
    fig.tight_layout()
    fig.savefig('images/veh_type')
    # Close the figure so later pyplot calls do not draw on top of this chart.
    plt.close(fig)
def clean(X):
    """Return a numeric copy of the feature frame, ready for regression.

    Boolean cells become 0/1 integers, and the 'weather_type' and 'type'
    columns are replaced by 0/1 indicator columns. The 'Good' weather and
    'Sedan' vehicle indicators are dropped so those levels act as the
    reference groups.

    Args:
        X: DataFrame containing at least 'weather_type' and 'type' columns.

    Returns:
        A new DataFrame; ``X`` itself is not modified.

    Raises:
        KeyError: if 'weather_type_Good' or 'type_Sedan' is not produced,
            i.e. the reference level does not occur in ``X``.
    """
    def _flag_to_int(value):
        # Only genuine booleans are converted; every other cell passes through.
        return int(value) if isinstance(value, (bool, np.bool_)) else value

    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1.
    X = X.apply(lambda column: column.map(_flag_to_int))

    # Request integer indicators explicitly: pandas 2.0+ returns bool columns
    # by default, and a mixed bool/float frame converts to an object array
    # instead of a numeric design matrix.
    X = pd.get_dummies(X, columns=['weather_type', 'type'], dtype=int)

    # Drop one level per variable so it serves as the reference group.
    return X.drop(columns=['weather_type_Good', 'type_Sedan'])
def linear_train(X_train, y_train):
    """Fit an OLS regression with an intercept on the training split.

    Args:
        X_train: feature matrix; a constant column is added before fitting.
        y_train: response values.

    Returns:
        The fitted statsmodels results object. Its summary is printed under
        a 'TRAINING MODEL' heading.
    """
    design = sm.add_constant(X_train)
    fitted = sm.OLS(y_train, design).fit()
    for line in ('TRAINING MODEL', fitted.summary()):
        print(line)
    return fitted
def linear_test(X_test, y_test):
    """Fit an OLS regression with an intercept on the test split.

    Note that this estimates a separate model on the test data; it does not
    score a previously trained model against it.

    Args:
        X_test: feature matrix; a constant column is added before fitting.
        y_test: response values.

    Returns:
        The fitted statsmodels results object, mirroring ``linear_train`` so
        the same diagnostics can be run on it. Its summary is printed under
        a 'TESTING MODEL' heading.
    """
    X_test = sm.add_constant(X_test)
    result_test = sm.OLS(y_test, X_test).fit()
    print('TESTING MODEL')
    print(result_test.summary())
    return result_test
def QQ(result):
    """Display a Q-Q plot of a fitted model's studentized residuals.

    Args:
        result: fitted results object exposing ``outlier_test()``.

    Returns:
        None. The plot is shown with ``plt.show()``.
    """
    studentized = result.outlier_test()['student_resid']
    # The figure returned by qqplot is not needed; pyplot shows the current one.
    sm.graphics.qqplot(studentized, line='45', fit=True)
    plt.title('Q-Q plot for Testing Model', weight='bold', fontsize=15)
    plt.show()
    # plt.savefig('images/QQ_Training')
def heteroscedasticity(result):
    """Print the Goldfeld-Quandt test and save a residuals-vs-fitted plot.

    Args:
        result: fitted results object providing ``resid``, ``fittedvalues``
            and ``model.exog``.

    Returns:
        None. The plot is written to 'images/Resids_Training'.
    """
    # Imported locally because the file's top-level import of this name is
    # commented out, which made the call below raise NameError.
    from statsmodels.stats.diagnostic import het_goldfeldquandt

    print('Goldfeld-quandt test returns: F stat, p-value:')
    print(het_goldfeldquandt(result.resid, result.model.exog))

    # Use a dedicated figure so the scatter is not drawn over a leftover plot.
    fig, ax = plt.subplots()
    ax.scatter(result.fittedvalues, result.resid)
    # Only one title is set; an earlier 'Model 1' title was always overwritten.
    ax.set_title('Scatterplot of Residuals for Training Model', weight='bold', fontsize=15)
    ax.set_xlabel('Predicted Values', weight='bold')
    ax.set_ylabel('Residuals', weight='bold')
    # Lay out after the title is in place so it is included in the spacing.
    fig.tight_layout()
    fig.savefig('images/Resids_Training')
    plt.close(fig)
def VIF(X_train):
    """Print the variance inflation factor of each column in ``X_train``.

    Adapted from https://www.kaggle.com/ffisegydd/sklearn-multicollinearity-class

    Args:
        X_train: DataFrame of explanatory variables; every column must be
            convertible to float.

    Returns:
        None. A list of (column name, VIF) pairs is printed.
    """
    print('VIF:')
    variables = X_train.columns
    # Build the float matrix once instead of once per column; the cast also
    # accepts boolean indicator columns.
    exog = X_train.values.astype(float)
    # NOTE(review): no intercept column is added here, although the
    # regressions in this file add one - confirm whether the VIFs should be
    # computed on the design matrix including the constant.
    vif = [variance_inflation_factor(exog, position) for position in range(exog.shape[1])]
    zipped = list(zip(variables, vif))
    print(zipped)
# create heatmap of correlations between features
# corr_df = X_train.corr(method='pearson')
# mask = np.zeros_like(corr_df)
# mask[np.triu_indices_from(mask)] = True
# sns.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, mask=mask, linewidth=2.5)
# plt.yticks(rotation=0)
# plt.xticks(rotation=90)
# plt.title('Correlations Among Features for Testing Model', weight='bold', fontsize=15)
# # plt.title('Correlations Among Features for Model 2', weight='bold', fontsize=15)
# plt.tight_layout()
# plt.show()
if __name__ == '__main__':
    # Script entry point: load the telemetry CSV, recode weather and vehicle
    # columns, draw the EDA chart and prepare the regression train/test split.

    # df = pd.read_excel('data/ios_telemetry.xlsx', index_col='id')
    # df.to_csv('data/ios_telemetry.csv')
    df = pd.read_csv('data/ios_telemetry.csv', index_col='id')
    pd.options.display.max_columns = 200

    # All values for rfd (ready for deletion) are False, so keep all rows.
    # hard_brake is 100% False.

    # Weather: code 'Not Reported' as missing.
    # The result is assigned back to the column: calling replace(inplace=True)
    # on a selected column acts on a temporary object under pandas
    # copy-on-write and would leave df unchanged. np.nan is used because the
    # np.NaN alias was removed in NumPy 2.0.
    df['weather'] = df['weather'].replace(to_replace=['Not Reported'], value=np.nan)
    weather_dict = {
        'broken clouds': 'Bad',
        'clear sky': 'Good',
        'few clouds': 'Good',
        'mist': 'Bad',
        'overcast clouds': 'Bad',
    }
    # Values that are not keys of weather_dict pass through replace() unchanged.
    df['weather_type'] = df['weather'].replace(weather_dict)

    # kmph: coding 0 as missing (disabled)
    # df['kmph'] = df['kmph'].replace(to_replace=0, value=np.nan)
    # fuel_used: code 0 as missing.
    df['fuel_used'] = df['fuel_used'].replace(to_replace=0, value=np.nan)

    # Recode vehicle_id with vehicle info.
    model_dict = {
        3272: '2002 Mitsubishi Lancer',
        18338: '2015 Chrysler ',
        20478: '2017 BMW 530i',
        14345: '2012 Jeep Grand Cherokee 4WD',
        19599: '2016 Ford ESCAPE FWD',
    }
    type_dict = {3272: 'Sedan', 18338: 'Sedan', 20478: 'Sedan', 14345: 'SUV', 19599: 'SUV'}
    df['vehicle_id_name'] = df['vehicle_id'].replace(model_dict)
    df['type'] = df['vehicle_id'].replace(type_dict)

    # Drop every row that has a missing value in any column.
    df.dropna(inplace=True)

    # --- EDA ---
    vars = df[['fuel_used', 'kmph', 'celsius', 'altitude_delta', 'g_force', 'kml', 'kilometers']]
    # scatter(vars)
    # Called for its side effect only; it returns None, so it is not printed.
    bar_weather(df)
    # bar_veh(df)

    # --- Linear regression ---
    # Frames for linear regression.
    X = df[['kmph', 'celsius', 'altitude_delta', 'g_force', 'kml', 'weather_type', 'type']]
    y = df[['fuel_used']]

    # Train/test split. No random_state is passed, so the split differs
    # between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Testing regression assumptions.
    # VIF(clean(X_train))
    # QQ(linear_train(clean(X_train), y_train))
    # heteroscedasticity(linear_train(clean(X_train), y_train))

    # Run model on test data.
    # linear_test(clean(X_test), y_test)