-
Notifications
You must be signed in to change notification settings - Fork 2
/
hierarichal_clustering.py
118 lines (89 loc) · 2.8 KB
/
hierarichal_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 28 01:05:18 2021
@author: ethan
"""
import pandas as pd
import numpy as np
import sklearn
from pyclustertend import hopkins
from sklearn.preprocessing import scale
import seaborn as sns
import matplotlib.pyplot as plt
# Load the fetal-health table; the CSV is looked up relative to the working directory.
data = pd.read_csv("fetal_health.csv")
# Every column except the last is used as a feature, while the label is read
# by name from 'fetal_health'.
X_data, y = data.iloc[:, :-1], data['fetal_health']
# Standardise the feature columns; X is the scaled array used by the cells below.
X = scale(X_data)
# Swap the numeric class codes for readable names (codes missing from the
# mapping turn into NaN).
y = y.map({1: "Normal", 2: "Suspect", 3: "Pathological"})
# %% Hierarchical clustering on all data
# One clustered heatmap of the scaled feature matrix per linkage method.
for linkage_method in ('ward', 'complete', 'average'):
    sns.clustermap(X, method=linkage_method)
# %% Hierarchical clustering on only pathological
# Keep just the scaled rows whose label is 'Pathological'.
X_p = X[(y == 'Pathological').to_numpy()]
# Same three linkage methods as for the full data set.
for linkage_method in ('ward', 'complete', 'average'):
    sns.clustermap(X_p, method=linkage_method)
# %% t-SNE projection of all data, coloured by class
from sklearn.manifold import TSNE

# Embed the scaled features; the fixed seed keeps the layout reproducible.
X_tsne = TSNE(random_state=101).fit_transform(X)
x = X_tsne[:, 0]
y2 = X_tsne[:, 1]
# Draw on a dedicated figure. Without an explicit Axes seaborn plots on the
# current one, which may belong to the preceding clustermap figure when the
# file is run top to bottom rather than cell by cell.
tsne_fig, tsne_ax = plt.subplots()
sns.scatterplot(x=x, y=y2, hue=y, ax=tsne_ax)
# %% Agglomerative clustering of the pathological cases, shown on a t-SNE embedding
from sklearn.cluster import AgglomerativeClustering

links = ['single', 'complete', 'average', 'ward']

# The embedding does not depend on the linkage and its seed is fixed, so it is
# computed once up front instead of once per linkage.
X_tsne = TSNE(random_state=105).fit_transform(X_p)
x_p = X_tsne[:, 0]
y_p = X_tsne[:, 1]

fig, axs = plt.subplots(ncols=len(links))
# Figure-level heading: plt.title() would only title the current (last)
# subplot, and that text is then replaced by the per-linkage title below.
fig.suptitle("Hierarichal Clustering Projected on Tsne")
for i, link in enumerate(links):
    # Three clusters per linkage, fitted on the pathological rows only.
    agg = AgglomerativeClustering(n_clusters=3, linkage=link)
    agg.fit(X_p)
    labels = agg.labels_
    # String labels make seaborn treat the cluster ids as categories.
    sns.scatterplot(x=x_p, y=y_p, hue=labels.astype(str), ax=axs[i])
    axs[i].title.set_text(link)
plt.show()
# %% Absolute difference of cluster feature means
# Split the pathological cases into two Ward-linkage clusters.
agg = AgglomerativeClustering(n_clusters=2, linkage='ward')
agg.fit(X_p)
labels = agg.labels_
X1 = X_p[labels == 0]
X2 = X_p[labels == 1]
# Per-feature gap between the two cluster means, sorted ascending so the
# most separating features end up last.
diffs = np.abs(X1.mean(axis=0) - X2.mean(axis=0))
diffs = pd.Series(diffs, index=X_data.columns).sort_values()
# Plot on an explicit Axes so the bars never land on a subplot left over from
# an earlier cell.
diffs_fig, diffs_ax = plt.subplots()
diffs.plot.barh(ax=diffs_ax)
# Series plots ignore a `y=` argument, so the value-axis text is set directly.
diffs_ax.set_xlabel('ABS of Means')
diffs_ax.set_title("Abs Diff between Cluster Feature Means")
diffs_fig.tight_layout()
# %% Post clustering selection: t-SNE view
from sklearn.manifold import TSNE

# diffs is sorted ascending, so its last two entries are the features whose
# cluster means differ the most; .iloc makes the positional slice explicit.
feats = list(diffs.iloc[-2:].index)
# Restrict to the pathological rows and re-standardise within that subset.
X_2 = X_data.loc[y == 'Pathological', feats].to_numpy()
X_2 = scale(X_2)
agg = AgglomerativeClustering(n_clusters=2, linkage='ward')
agg.fit(X_2)
labels = agg.labels_
# Project the two-feature data with t-SNE and colour by cluster.
X_2_tsne = TSNE(random_state=101).fit_transform(X_2)
t1 = X_2_tsne[:, 0]
t2 = X_2_tsne[:, 1]
# Dedicated Axes: otherwise the points are drawn on whatever Axes is current,
# e.g. the bar chart from the previous cell.
sel_fig, sel_ax = plt.subplots()
sns.scatterplot(x=t1, y=t2, hue=labels, ax=sel_ax)
# %% Post clustering selection: feature-space view
# diffs is sorted ascending, so its last two entries are the features whose
# cluster means differ the most.
feats = list(diffs.iloc[-2:].index)
# Restrict to the pathological rows and re-standardise within that subset.
X_2 = X_data.loc[y == 'Pathological', feats].to_numpy()
X_2 = scale(X_2)
agg = AgglomerativeClustering(n_clusters=2, linkage='ward')
agg.fit(X_2)
labels = agg.labels_
# The two selected features are plotted directly, so no t-SNE embedding is
# computed in this cell.
t1 = X_2[:, 0]
t2 = X_2[:, 1]
feat_fig, feat_ax = plt.subplots()
sns.scatterplot(x=t1, y=t2, hue=labels, ax=feat_ax)
# Label the axes of this plot explicitly instead of relying on pyplot's
# notion of the current Axes.
feat_ax.set_xlabel(feats[0])
feat_ax.set_ylabel(feats[1])
# Display the figures created since the previous plt.show() when the file is
# run as a plain script.
plt.show()