-
Notifications
You must be signed in to change notification settings - Fork 7
/
code.R
233 lines (177 loc) · 7.08 KB
/
code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#NOTE: the workspace is deliberately NOT cleared with rm(list = ls()); that
#call would wipe every object of whoever sources this script. Run the script
#in a fresh R session instead.
#loading libraries
library(dplyr)
library(ggplot2)
library(magrittr)
#reading the train data; text columns are read as factors because the
#wrangling below inspects them with levels() and summary()
#(since R 4.0 read.csv() would otherwise return plain character columns)
train <- read.csv("train.csv", stringsAsFactors = TRUE)
#removing the unwanted variables; only the columns that are actually present
#are dropped, so a file without them does not raise a warning
train <- train[, !(names(train) %in% c("X", "X.1", "X.2", "X.3")), drop = FALSE]
#reading the test data
test <- read.csv("test.csv", stringsAsFactors = TRUE)
#reading in sample submission file
sample_submission <- read.csv("sample_submission.csv")
#NOTE: the datasets are not attach()ed. attach() puts a snapshot copy of the
#columns on the search path (the test columns masking the train ones), and
#that copy goes stale as soon as train/test are modified. Columns are always
#accessed explicitly as train$<column> / test$<column>.
#quick look at the raw data before any wrangling
#train set: number of rows and columns
dim(train)
#train set: type and first values of every column
str(train)
#train set: first six rows
head(train, n = 6L)
#train set: per-column summary statistics
summary(train)
#test set: number of rows and columns
dim(test)
#test set: type and first values of every column
str(test)
#test set: first six rows
head(test, n = 6L)
#test set: per-column summary statistics
summary(test)
#sample submission: first six rows, to see the expected output format
head(sample_submission, n = 6L)
#Data Wrangling
#Checking out Style variable for both train and test sets
#Style is coerced to a factor first so that levels() and summary() report the
#categories even when read.csv() returned plain character columns
train$Style <- factor(train$Style)
test$Style <- factor(test$Style)
levels(train$Style)
summary(train$Style)
levels(test$Style)
summary(test$Style)
#bar charts of the Style counts. ggplot() + geom_bar() replaces the deprecated
#qplot(); qplot() already draws bars for a discrete x, so the extra
#"+ geom_bar()" used to layer the same bars twice
ggplot(train, aes(x = Style)) +
  geom_bar() +
  ggtitle("Style frequency distribution in train data")
ggplot(test, aes(x = Style)) +
  geom_bar() +
  ggtitle("Style frequency distribution in test data")
#As we see, the "fashion" factor level is present in the data description that
#was given to us; it is in the test set but DOES NOT figure anywhere in the
#train set
#counts, then row-wise percentages, of Recommended within each Style
style_by_recommended <- table(train$Style, train$Recommended)
style_by_recommended
prop.table(style_by_recommended, margin = 1) * 100
#We reduce the style categories by clubbing.
#The recoding is done on a character copy: assigning a value that is not an
#existing level of a factor only raises a warning and stores NA instead
train$Style <- as.character(train$Style)
train$Style[train$Style %in% c("Flare", "Novelty", "OL")] <- "sexy"
train$Style[train$Style %in% "work"] <- "vintage"
#back to a factor, which now only carries the levels still in use
train$Style <- factor(train$Style)
#Performing the same on test set ("fashion" only occurs there)
test$Style <- as.character(test$Style)
test$Style[test$Style %in% c("Flare", "Novelty", "OL", "fashion")] <- "sexy"
test$Style[test$Style %in% "work"] <- "vintage"
test$Style <- factor(test$Style)
#checking out price variable for both test and train sets
#Price is coerced to a factor first so that levels() and summary() report the
#categories even when read.csv() returned plain character columns
train$Price <- factor(train$Price)
test$Price <- factor(test$Price)
#bar charts of the Price counts (ggplot() replaces the deprecated qplot(),
#whose extra "+ geom_bar()" layered the same bars twice)
ggplot(train, aes(x = Price)) +
  geom_bar() +
  ggtitle("Price frequency distribution in train data")
levels(train$Price)
ggplot(test, aes(x = Price)) +
  geom_bar() +
  ggtitle("Price frequency distribution in test data")
levels(test$Price)
summary(train$Price)
summary(test$Price)
#counts, then row-wise percentages, of Recommended within each Price level
price_by_recommended <- table(train$Price, train$Recommended)
price_by_recommended
prop.table(price_by_recommended, margin = 1) * 100
#We see that the price variable has a "" factor level; let us check the number
#of observations before thinking of performing imputation
subset(train, Price == "")
#We find only one observation with a missing price, so we replace it by the
#most frequently occurring level of the price column
summary(train$Price)
#recoding is done on a character copy: assigning a value that is not an
#existing level of a factor only raises a warning and stores NA instead
train$Price <- as.character(train$Price)
train$Price[train$Price %in% ""] <- "Average"
#Likewise, we check the test data and do the needful
subset(test, Price == "")
summary(test$Price)
test$Price <- as.character(test$Price)
test$Price[test$Price %in% ""] <- "Average"
#Next we fold the lower-case spellings "low"/"high" into "Low"/"High"
train$Price[train$Price %in% "low"] <- "Low"
train$Price[train$Price %in% "high"] <- "High"
test$Price[test$Price %in% "low"] <- "Low"
test$Price[test$Price %in% "high"] <- "High"
#refactoring the price variable in both train and test sets, so that only the
#levels still in use remain
train$Price <- factor(train$Price)
test$Price <- factor(test$Price)
summary(train$Price)
summary(test$Price)
#Rating: inspect the raw scores of both sets
str(train$Rating)
str(test$Rating)
summary(train$Rating)
summary(test$Rating)
hist(train$Rating, col = "red", xlab = "Rating", main = "Rating in Train data")
hist(test$Rating, col = "blue", xlab = "Rating", main = "Rating in Test data")
#the train histogram suggests there are no ratings from 1 to 3;
#confirm by looking at the dimensions of that slice
dim(train[which(train$Rating >= 1 & train$Rating <= 3), , drop = FALSE])
#number of train observations with a rating between 0 and 1
dim(train[which(train$Rating >= 0 & train$Rating <= 1), , drop = FALSE])
#So we have all 82 observations which fall in above range
#summary of the rating scores inside that 0-1 range
summary(subset(train, Rating >= 0 & Rating <= 1, select = Rating))
#So, we see all observations in above category have in fact, 0 rating
#sizes of the 3-3.5 and 3.5-4 slices
dim(train[which(train$Rating >= 3 & train$Rating <= 3.5), , drop = FALSE])
dim(train[which(train$Rating >= 3.5 & train$Rating <= 4), , drop = FALSE])
#sizes of the 4-4.5 and 4.5-5 slices
dim(train[which(train$Rating >= 4 & train$Rating <= 4.5), , drop = FALSE])
dim(train[which(train$Rating >= 4.5 & train$Rating <= 5), , drop = FALSE])
#after observing the above results, the rating is binned identically in the
#train and test sets:
#  [0, 1]     -> 0
#  [2.5, 4.5] -> 1
#  (4.5, 5]   -> 2
#scores outside these three ranges are left as they are
bin_rating <- function(scores) {
  in_low <- scores >= 0 & scores <= 1
  in_mid <- scores >= 2.5 & scores <= 4.5
  in_high <- scores > 4.5 & scores <= 5
  scores[in_low] <- 0
  scores[in_mid] <- 1
  scores[in_high] <- 2
  #the binned score is categorical from here on
  factor(scores)
}
train$Rating <- bin_rating(train$Rating)
test$Rating <- bin_rating(test$Rating)
#revisiting rating: counts, then row-wise percentages, of Recommended per bin
table(train$Rating, train$Recommended)
prop.table(table(train$Rating, train$Recommended), margin = 1) * 100
summary(train$Rating)
summary(test$Rating)
#checking the size variable for both test and train sets
#Size is coerced to a factor first so that levels() reports the categories
#even when read.csv() returned plain character columns
train$Size <- factor(train$Size)
test$Size <- factor(test$Size)
str(train$Size)
str(test$Size)
levels(train$Size)
levels(test$Size)
table(train$Size, train$Recommended)
#we can see the levels of both the test and train sets don't match
#bar charts of the raw Size counts (ggplot() replaces the deprecated qplot(),
#whose extra "+ geom_bar()" layered the same bars twice)
ggplot(train, aes(x = Size)) +
  geom_bar() +
  ggtitle("Frequency distribution of Size in train set")
ggplot(test, aes(x = Size)) +
  geom_bar() +
  ggtitle("Frequency distribution of Size in test set")
#fold the alternative spellings of "S" into "S". Both spellings are handled in
#both sets so that train and test are normalised the same way; the recoding is
#done on a character copy because assigning a value that is not an existing
#level of a factor only raises a warning and stores NA instead
train$Size <- as.character(train$Size)
train$Size[train$Size %in% c("small", "s")] <- "S"
test$Size <- as.character(test$Size)
test$Size[test$Size %in% c("small", "s")] <- "S"
#re-factoring the Size variable in both test and train sets
train$Size <- factor(train$Size)
test$Size <- factor(test$Size)
#counts, then row-wise percentages, of Recommended within each cleaned Size
table(train$Size, train$Recommended)
prop.table(table(train$Size, train$Recommended), margin = 1) * 100
#checking out the Season variable for both the test and train sets
#Season is coerced to a factor first so that levels() and summary() report the
#categories even when read.csv() returned plain character columns
train$Season <- factor(train$Season)
test$Season <- factor(test$Season)
str(train$Season)
str(test$Season)
levels(train$Season)
levels(test$Season)
summary(train$Season)
summary(test$Season)
#bar charts of the raw Season counts (ggplot() replaces the deprecated
#qplot(), whose extra "+ geom_bar()" layered the same bars twice)
ggplot(train, aes(x = Season)) +
  geom_bar() +
  ggtitle("Frequency distribution of Season in train data")
ggplot(test, aes(x = Season)) +
  geom_bar() +
  ggtitle("Frequency distribution of Season in test data")
#Cleans one Season column and returns it as a factor:
#  - the misspelled / lower-case values are mapped to their canonical spelling
#  - the "" value is replaced by "Summer" (majority imputation)
#The work is done on a character copy: assigning a value that is not an
#existing level of a factor only raises a warning and stores NA instead.
clean_season <- function(season) {
  canonical <- c(
    Automn = "Autumn",
    spring = "Spring",
    winter = "Winter",
    summer = "Summer"
  )
  season <- as.character(season)
  #doing trivial substitutions
  to_fix <- season %in% names(canonical)
  season[to_fix] <- unname(canonical[season[to_fix]])
  #we replace the "" value by majority imputation
  season[season %in% ""] <- "Summer"
  #re-factoring, so that only the levels still in use remain
  factor(season)
}
#the same cleaning is applied to both train and test sets
train$Season <- clean_season(train$Season)
test$Season <- clean_season(test$Season)
#Checking Neckline variable in both train and test sets
#NeckLine is coerced to a factor first so that levels() reports the categories
#even when read.csv() returned plain character columns
train$NeckLine <- factor(train$NeckLine)
test$NeckLine <- factor(test$NeckLine)
str(train$NeckLine)
str(test$NeckLine)
#we see that the test data differs from the train set by one factor level
levels(train$NeckLine)
levels(test$NeckLine)
#performing trivial substitutions: the lower-case spelling "sweetheart" is
#folded into "Sweetheart". This is applied to both sets so that train and test
#are normalised the same way, and on a character copy because assigning a
#value that is not an existing level of a factor only raises a warning and
#stores NA instead
train$NeckLine <- as.character(train$NeckLine)
train$NeckLine[train$NeckLine %in% "sweetheart"] <- "Sweetheart"
train$NeckLine <- factor(train$NeckLine)
test$NeckLine <- as.character(test$NeckLine)
test$NeckLine[test$NeckLine %in% "sweetheart"] <- "Sweetheart"
test$NeckLine <- factor(test$NeckLine)
#bar charts of the cleaned NeckLine counts (ggplot() replaces the deprecated
#qplot(), whose extra "+ geom_bar()" layered the same bars twice)
ggplot(train, aes(x = NeckLine)) +
  geom_bar()
ggplot(test, aes(x = NeckLine)) +
  geom_bar()
#