-
Notifications
You must be signed in to change notification settings - Fork 2
/
label_vectorization.py
executable file
·96 lines (75 loc) · 3.53 KB
/
label_vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import numpy as np
from scipy.stats import entropy
import math
from sklearn.manifold import TSNE
import pdb
# Helper functions to read data, to get answer and probability vectors
def get_ans_pct_vectors(answer_counters):
answer_list = []
answer_pcts = []
for k, v in answer_counters.items():
answer_list.append(v)
answer_pcts.append(answers2pct(v))
ans_vectors = np.asarray(answer_list, dtype=int)
pct_vectors = np.asarray(answer_pcts, dtype=float)
#print(len(answer_list), ans_vectors.shape, len(answer_pcts), pct_vectors.shape)
return pct_vectors
def answers2pct(answers):
try:
s = sum(answers)
return [float(x)/s for x in answers]
except ZeroDivisionError as e:
return answers
# Helper functions to read data, to get answer vectors
def get_ans_vectors(tweetid_answer_counters):
itemid_list, answer_list = zip(*tweetid_answer_counters.items())
ans_vectors = np.asarray(answer_list, dtype=int)
# print(len(itemid_list), len(answer_list), ans_vectors.shape)
return ans_vectors
def tests(true_pct_vectors, prediction_proba_vectors):
# Do similar things as
# https://github.com/kobe2452/subjective_active_learning/blob/master/MultinomialCluster.py#L128
#print(true_pct_vectors.shape, prediction_proba_vectors.shape)
# xentropy = [entropy(x) for x in true_pct_vectors]
xentropy = []
for x in true_pct_vectors:
if sum(x) == 0:
xentropy.append(0)
else:
xentropy.append(entropy(x))
tentropy = [entropy(t) for t in prediction_proba_vectors]
xmax = [max(x) for x in true_pct_vectors]
tmax = [max(t) for t in prediction_proba_vectors]
if (len(xentropy) != len(xmax)) or (len(tentropy) != len(tmax)):
print(len(xentropy), len(xmax), len(tentropy), len(tmax))
return zip(xentropy, tentropy, xmax, tmax)
def get_assignments(test_true_vectors, test_pred_vectors):
dist_by_cluster = [[0.0] * len(test_true_vectors[0]) for i in test_pred_vectors[0]]
assignments_per_cluster = [0.0] * len(test_pred_vectors[0])
cluster_assignments = [np.argmax(tpv) for tpv in test_pred_vectors]
for i in range(len(test_true_vectors)):
dist_by_cluster[cluster_assignments[i]] = [j + k for j,k in zip(dist_by_cluster[cluster_assignments[i]], test_true_vectors[i])]
assignments_per_cluster[cluster_assignments[i]] += 1
return cluster_assignments, dist_by_cluster, assignments_per_cluster
def get_perplexity(test_vectors, cluster_assignments, dist_by_cluster, assignments_per_cluster):
dist_by_cluster = np.asarray([answers2pct(x) for x in dist_by_cluster])
if np.sum(test_vectors, axis=1).shape == (len(test_vectors),):
test_vectors = test_vectors
else:
test_vectors = np.asarray([answers2pct(x) for x in test_vectors])
cluster_assignments = np.asarray(cluster_assignments)
#print(test_vectors.shape, cluster_assignments.shape, dist_by_cluster.shape)
epsilon = 0.00001
tot = 0.0 #K-L Divergence
for tv, ci in zip(test_vectors, cluster_assignments):
for x, y in zip(tv, dist_by_cluster[ci]):
x = x + epsilon
y = y + epsilon
tot += x * math.log(x/y)
#cross_entropy = [sum([-x * math.log(y) ]) ]
return (tot / len(test_vectors))
def use_tSNE_reduce_dimensions(vectors, target_dim):
# http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
tsne = TSNE(n_components=target_dim, init='pca', random_state=5, learning_rate=100.0)
return tsne.fit_transform(vectors)