-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKMeans_Script.R
259 lines (218 loc) · 8.04 KB
/
KMeans_Script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
######################################
### TEAM 12 K-MEANS CLUSTERING LAB ###
######################################
rm(list=ls())
#install.packages("NbClust")
library(NbClust)
#install.packages("jpeg")
library(jpeg)
#install.packages("ggplot2")
library(ggplot2)
setwd("C:\\Users\\Cole Combs\\Documents\\MSBA\\SEMESTER II\\Machine Learning II\\Team 12 Presentation - Clustering")
#####IRIS DATASET EXAMPLE#####
#Initial Data Discussion and Prep#
head(iris)
iris_2 <- iris[-5] #Remove Categorical Variables
head(iris_2)
iris_3 <- as.data.frame(scale(iris_2)) #Scale Variables
head(iris_3)
sapply(iris_2,mean) #Unscaled Mean and SD
sapply(iris_2,sd)
sapply(iris_3,mean) #Scaled Mean and SD, AKA Mean Zero - S.D. One
sapply(iris_3,sd)
#Create WSS Plot Function and Run Example on Iris_3 Dataset
#Function will calculate “With-In-Sum-Of-Squares (WSS),” a measure
#to explain the homogeneity within a cluster
wssplot <- function(data, nc=0, seed=2015){
#Generate WSS for unclustered dataset
wss <- (nrow(data)-1)*sum(apply(data,2,var))
#Generate WSS for 2 trough k clusters
for (i in 2:nc){
set.seed(seed)
wss[i] <- sum(kmeans(data, centers=i,nstart=1)$withinss)
#nstart should be larger to account for randomness of initial clusters
}
dev.new()
plot(1:nc, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
}
wssplot(iris_3,30)
#WSS decreases drastically until K=3, and flattens out around K=7
#Use K=7 and Find WSS for Each Variable's 7 Clusters
iris_kmeans <- kmeans(iris_3,7)
str(iris_kmeans)
# $cluster identifies which cluster each observation was assigned to
# $centers records the centroids/means for each variable and cluster
iris_kmeans$centers
# $totss is total sum of squares, which is the total distance from data points to global mean
# $withinss is the total distance from a cluster's data points to its centroid/mean
# $tot.withinss is the total of all WSS values
# $betweenss is the total weighted distance of various cluster centroids to the global mean
# $size is the size of each cluster
iris_kmeans$size
# $iter is the number of outer iterations
# $ifault is an indicator for possible errors - 0 meaning none
#Compare to species type to examine for patterns
clusters <- iris_kmeans$cluster
iris_3$clusters <- clusters
table(iris$Species,clusters)
########CLUSTERING WINE EXAMPLE#######
#install.packages('rattle')
#install.packages('cluster')
library(rattle)
library(cluster)
data <- wine
head(data)
data <- scale(data[-1]) #Scale data to Mean Zero-SD One
head(data)
k.means.fit <- kmeans(data,3)
o = order(k.means.fit$cluster)
k.means.fit$centers #Look at each variables cluster means
k.means.fit$cluster #Look at cluster assignment
k.means.fit$size #Examine total size of clusters
wssplot(data,10) #What do we see with looking at 1 to 10 clusters
wssplot(data,50) #Looking at 50 clusters, we see it is important to consider larger picture
#Using clust plot to examine clusters using only the first and second primary components
dev.new()
clusplot(data, k.means.fit$cluster, main = "2D Representation of Cluster Solution", color = TRUE, shade = TRUE, lines = 0)
table(wine[,1],k.means.fit$cluster)
#####IMAGE RECREATION WITH CLUSTERING######
img <- readJPEG("ClusteringImage.jpg") #Read two images into R
#Create a function to pulls RGB colors and coords from JPEG
img_rgb <- function(image){
img_dim <- dim(image)
data.frame(
x_axis = rep(1:img_dim[2], each = img_dim[1]),
y_axis = rep(img_dim[1]:1, img_dim[2]),
Red = as.vector(img[,,1]),
Green = as.vector(img[,,2]),
Blue = as.vector(img[,,3]))
}
img_colors <- img_rgb(img)
#wssplot(img_colors[c(3,4,5)],25) #Examine WSS as more clusters are considered
#Plot Initial Image
dev.new()
ggplot(data = img_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = rgb(img_colors[c("Red", "Green", "Blue")])) +
labs(title = "Original Image") +
xlab("x-axis") +
ylab("y-axis") +
coord_fixed(ratio = 1)
#Recreate Initial Image with K clusters
# Try 5 and 10
ks = 10
k_img_clstr <- kmeans(img_colors[, c("Red", "Green", "Blue")], centers = ks)
k_img_colors <- rgb(k_img_clstr$centers[k_img_clstr$cluster,])
#Plotting the Compressed Image
dev.new()
ggplot(data = img_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = k_img_colors) +
labs(title = paste("k-Means Clustering of", ks, "Colors")) +
xlab("x") +
ylab("y") +
coord_fixed(ratio = 1)
#####MEDICAL APPLICATION######
#We will use an image from a hemotoxylin and eosin staining procedure
#HE stains allow pathologists and researchers to differentiate between
#tissue types and cell structures due to variable absorption of the
#chemicals
img <- readJPEG("ClusteringImage4.jpg") #Medical Image
#Create a function to pulls RGB colors and coords from JPEG
img_rgb <- function(image){
img_dim <- dim(image)
data.frame(
x_axis = rep(1:img_dim[2], each = img_dim[1]),
y_axis = rep(img_dim[1]:1, img_dim[2]),
Red = as.vector(img[,,1]),
Green = as.vector(img[,,2]),
Blue = as.vector(img[,,3]))
}
img_colors <- img_rgb(img)
wssplot(img_colors[c(3,4,5)],30)
#Plot Initial Image
dev.new()
ggplot(data = img_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = rgb(img_colors[c("Red", "Green", "Blue")])) +
labs(title = "Original Image") +
xlab("x-axis") +
ylab("y-axis") +
coord_fixed(ratio = 1)
#Recreate Initial Image with K clusters
ks = 15
k_img_clstr <- kmeans(img_colors[, c("Red", "Green", "Blue")], centers = ks)
k_img_colors <- rgb(k_img_clstr$centers[k_img_clstr$cluster,])
str(k_img_colors) #Examining the structure of this new variable, we see it contains
#the hex codes needed to represent each pixel's new cluster color
#Plotting the Compressed Image
dev.new()
ggplot(data = img_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = k_img_colors) +
labs(title = paste("k-Means Clustering of", ks, "Colors")) +
xlab("x") +
ylab("y") +
coord_fixed(ratio = 1)
#### PLOTTING SELECTED CLUSTERS IN MEDICAL EXAMPLE#####
#Plot Unique Colors
y = 1:ks
x = rep(3,ks)
col = rep(NA,ks)
names = rep(NA,ks)
subset = 1
for (i in 1:ks){
x[i] = i
col[i] = rgb(k_img_clstr$centers[k_img_clstr$cluster[k_img_clstr$cluster == i],])
names[i] = paste("Grp",i)
subset = subset + 1
}
z = as.data.frame(cbind(x,y,names,col))
unq = unique(unlist(k_img_colors))
dev.new()
ggplot(data = z, aes(x = x, y = y, label = names)) +
geom_point(colour = col, size = 6) +
labs(title = paste("Unique Colors From", ks, "Clusters")) +
geom_text(aes(label=names),hjust=.5, vjust=2)+
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.x=element_blank(),
axis.ticks.y=element_blank()) +
coord_fixed(ratio = 1)
#Create Base Image Colors for Subsetting Image Color Selection
img <- readJPEG("ClusteringImage4_Grey.jpg")
#Create a function to pulls RGB colors and coords from JPEG
img_rgb <- function(image){
img_dim <- dim(image)
data.frame(
x_axis = rep(1:img_dim[2], each = img_dim[1]),
y_axis = rep(img_dim[1]:1, img_dim[2]),
Red = as.vector(img[,,1]),
Green = as.vector(img[,,2]),
Blue = as.vector(img[,,3]))
}
base_colors <- img_rgb(img)
#Plot Base Image
dev.new()
ggplot(data = base_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = rgb(base_colors[c("Red", "Green", "Blue")])) +
labs(title = "Base Image") +
xlab("x-axis") +
ylab("y-axis") +
coord_fixed(ratio = 1)
base_clstr <- kmeans(base_colors[, c("Red", "Green", "Blue")], centers = 30)
subset_colors <- rgb(base_clstr$centers[base_clstr$cluster,])
head(subset_colors)
#Generate Subset Image Colors
subset = 9 #SELECT CHOSEN SUBSET COLOR TO HIGHLIGHT
selected_colors <- rgb(k_img_clstr$centers[k_img_clstr$cluster[k_img_clstr$cluster == subset],])
for (i in 1:length(k_img_clstr$cluster)){
if (k_img_clstr$cluster[i] == subset){subset_colors[i] <- selected_colors[1]}
}
#Plotting the Subset Image
dev.new()
ggplot(data = img_colors, aes(x = x_axis, y = y_axis)) +
geom_point(colour = subset_colors) +
labs(title = paste("Cluster Analysis of Group:", subset)) +
xlab("x") +
ylab("y") +
coord_fixed(ratio = 1)