-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconsensus_clustering.R
54 lines (37 loc) · 1.5 KB
/
consensus_clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(diceR)
# Resolve relative paths against this script's folder. The RStudio API is only
# usable inside an RStudio session with a saved document, so keep the current
# working directory when it is unavailable (e.g. Rscript or source()).
in_rstudio <- requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()
if (in_rstudio) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  # An unsaved document reports an empty path; setwd("") would error.
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
# Diverse Cluster Ensemble
# 1. Load data ---------------------------
data <- read.csv("data/interim/preprocessed/toronto_short_winsorized.csv")
# df <- scale(data)
# Prepare the raw table with diceR, requesting scaling (scale = TRUE) and the
# "conventional" preparation type with a minimum variance threshold of 1.
df <- prepare_data(data, scale = TRUE, type = "conventional", min.var = 1)
# 2. Clustering ---------------------------
# Algorithms that make up the ensemble:
#   km      - K-Means
#   hc      - Hierarchical clustering
#   pam     - K-Medoids
#   gmm     - Gaussian Mixture Model
#   hdbscan - HDBSCAN
algorithms <- c("km", "hc", "pam", "gmm", "hdbscan")
# Distance functions handed to the clustering call
distance <- c(
  "euclidean", "maximum", "manhattan", "canberra",
  "binary", "minkowski", "spearman"
)
p.item <- 0.9 # 90% resampling
reps <- 5 # 5 replicates
# Run the ensemble for k = 3, 4 and 5 clusters
CC <- consensus_cluster(
  df,
  nk = 3:5,
  p.item = p.item,
  reps = reps,
  algorithms = algorithms,
  distance = distance,
  scale = TRUE
)
# Capture the structure summary of the result and wrap it to 80 columns
co <- capture.output(str(CC))
strwrap(co, width = 80)
# 3. Evaluation ---------------------------
# Explore a single clustering algorithm with k = 4; drop = FALSE keeps all
# four array dimensions after subsetting.
pam.4 <- CC[, , "PAM_Euclidean", "4", drop = FALSE]
# Create and visualize consensus matrix
cm <- consensus_matrix(pam.4)
dim(cm)
hm <- graph_heatmap(pam.4)
# Combine the ensemble results, once as matrices and once as class labels
ccomb_matrix <- consensus_combine(CC, element = "matrix")
ccomb_class <- consensus_combine(CC, element = "class")
str(ccomb_matrix, max.level = 2)
# Evaluate the performance. Pass the prepared `df` that CC was fitted on (not
# the raw `data`) so the evaluation sees the same rows, columns and scaling as
# the clustering step.
ccomp <- consensus_evaluate(df, CC, trim = TRUE, n = 1)
str(ccomp, max.level = 2)