-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
140 lines (109 loc) · 3.41 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import logging
import numpy as np
import scipy.stats
from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy
from web.datasets.categorization import (
fetch_AP,
fetch_battig,
fetch_BLESS,
fetch_ESSLI_1a,
fetch_ESSLI_2b,
fetch_ESSLI_2c,
)
from web.datasets.similarity import (
fetch_MEN,
fetch_MTurk,
fetch_RG65,
fetch_RW,
fetch_SimLex999,
fetch_WS353,
)
from web.embedding import Embedding
from web.vocabulary import OrderedVocabulary
def pos_direct(vecs):
    """Flip the sign of each axis-0 slice so its skewness is non-negative.

    The skewness of ``vecs`` is computed along axis 0 with
    ``scipy.stats.skew`` and ``vecs`` is multiplied (with broadcasting) by
    the sign of that skewness.

    Args:
        vecs: Array-like of numeric values.
            NOTE(review): presumably a 2-D (n_samples, n_dims) matrix, so
            that each column is oriented independently -- confirm with callers.

    Returns:
        A new array; the input is not modified in place. Because
        ``np.sign(0) == 0``, any slice whose skewness is exactly zero is
        multiplied by zero.
    """
    skewness = scipy.stats.skew(vecs, axis=0)
    orientation = np.sign(skewness)
    return vecs * orientation
def get_logger(log_file=None, log_level=logging.INFO, stream=True):
    """Configure and return this module's logger.

    The logger is the one registered under ``__name__``, so every call
    returns the same ``logging.Logger`` object. Handlers attached by an
    earlier call are removed and closed first; otherwise each call would
    stack another handler on the shared logger and every record would be
    emitted once per call.

    Args:
        log_file: Optional path of a log file. When given, a
            ``logging.FileHandler`` is attached in ``"w"`` mode, which
            truncates an existing file.
        log_level: Level applied to the logger and to every handler
            installed by this call.
        stream: When true, attach a ``logging.StreamHandler``.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(__name__)

    # Make the call idempotent: discard handlers from previous invocations so
    # records are not duplicated and old log files are not left open.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    handlers = []
    if stream:
        handlers.append(logging.StreamHandler())
    if log_file is not None:
        handlers.append(logging.FileHandler(str(log_file), "w"))

    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(log_level)
        logger.addHandler(handler)

    logger.setLevel(log_level)
    return logger
class MyEmbedding(Embedding):
    """``Embedding`` subclass with a factory that returns ``MyEmbedding``."""

    # override
    def __init__(self, vocab, vectors):
        """Pass ``vocab`` and ``vectors`` straight through to ``Embedding``."""
        super().__init__(vocab, vectors)

    @staticmethod
    def from_words_and_vectors(words, vectors):
        """Build a ``MyEmbedding`` from a word sequence and its vectors.

        ``words`` is wrapped in an ``OrderedVocabulary``; ``vectors`` is
        forwarded unchanged.
        NOTE(review): presumably ``vectors[i]`` is the vector of
        ``words[i]`` -- confirm against ``Embedding``.
        """
        return MyEmbedding(OrderedVocabulary(words), vectors)
def split_range(p, dim):
    """Partition ``range(dim)`` into ``p`` contiguous, near-equal pieces.

    The first ``dim % p`` pieces hold one more element than the others.

    Args:
        p: Number of pieces.
        dim: Length of the range being partitioned.

    Returns:
        A list of ``p`` half-open ``(start, stop)`` pairs: ``start`` is
        inclusive, ``stop`` is exclusive, and each ``stop`` is the next
        pair's ``start``.
    """
    base, extra = divmod(dim, p)
    bounds = []
    lo = 0
    for part in range(p):
        # Early pieces absorb the remainder, one extra element each.
        hi = lo + base + (1 if part < extra else 0)
        bounds.append((lo, hi))
        lo = hi
    return bounds
def get_tasks():
    """Load all benchmark datasets, grouped by task family.

    Each dataset is obtained by calling its ``fetch_*`` function from
    ``web.datasets``, in the order listed below.

    Returns:
        A 3-tuple ``(analogy_tasks, similarity_tasks, categorization_tasks)``
        of dicts mapping a dataset name to whatever the matching
        ``fetch_*`` call returned.
    """
    # Each entry is (dataset name, fetch function, keyword arguments).
    analogy_loaders = (
        ("Google", fetch_google_analogy, {}),
        ("MSR", fetch_msr_analogy, {}),
    )
    similarity_loaders = (
        ("MEN", fetch_MEN, {}),
        ("WS353", fetch_WS353, {}),
        ("WS353R", fetch_WS353, {"which": "relatedness"}),
        ("WS353S", fetch_WS353, {"which": "similarity"}),
        ("SimLex999", fetch_SimLex999, {}),
        ("RW", fetch_RW, {}),
        ("RG65", fetch_RG65, {}),
        ("MTurk", fetch_MTurk, {}),
    )
    categorization_loaders = (
        ("AP", fetch_AP, {}),
        ("BLESS", fetch_BLESS, {}),
        ("Battig", fetch_battig, {}),
        ("ESSLI_2c", fetch_ESSLI_2c, {}),
        ("ESSLI_2b", fetch_ESSLI_2b, {}),
        ("ESSLI_1a", fetch_ESSLI_1a, {}),
    )

    def _load(loaders):
        # Dict insertion order follows the table, so fetches run in order.
        return {name: fetch(**kwargs) for name, fetch, kwargs in loaders}

    analogy_tasks = _load(analogy_loaders)
    similarity_tasks = _load(similarity_loaders)
    categorization_tasks = _load(categorization_loaders)
    return analogy_tasks, similarity_tasks, categorization_tasks
def calc_c_I(picked_emb, normed_embed, topk):
    """Average cosine similarity between centroids of adjacent columns.

    For every column of ``picked_emb`` the ``topk`` rows with the largest
    values in that column are selected, and the matching rows of
    ``normed_embed`` are averaged into one centroid. The result is the mean
    cosine similarity over each pair of consecutive centroids (column ``i``
    with column ``i + 1``).

    Args:
        picked_emb: 2-D array; its second axis is iterated column by column
            and its first axis indexes the rows being ranked.
        normed_embed: Array indexed along its first axis by the row ids
            picked from ``picked_emb``.
            NOTE(review): the name suggests row-normalised embeddings, but
            nothing here relies on that -- the cosine is normalised
            explicitly.
        topk: Number of top-ranked rows averaged per column; must be >= 1.

    Returns:
        The mean consecutive-pair cosine similarity as a NumPy scalar. With
        fewer than two columns there are no pairs and ``np.mean`` of the
        empty list yields ``nan``.

    Raises:
        ValueError: If ``topk`` is less than 1. Without this check
            ``topk == 0`` would slice ``[-0:]`` and silently use every row,
            and a negative ``topk`` would drop rows from the front instead.
    """
    if topk < 1:
        raise ValueError("topk must be a positive integer, got %r" % (topk,))

    _, n_columns = picked_emb.shape

    # One centroid per column: mean of the rows ranked highest in it.
    centroids = []
    for col in range(n_columns):
        top_rows = np.argsort(picked_emb[:, col])[-topk:]
        centroids.append(np.mean(normed_embed[top_rows], axis=0))

    # Cosine similarity between each centroid and its right-hand neighbour.
    cossims = [
        np.dot(left, right) / np.linalg.norm(left) / np.linalg.norm(right)
        for left, right in zip(centroids, centroids[1:])
    ]
    return np.mean(cossims)
def test():
    """Print the output of ``split_range`` for a handful of sample inputs."""
    sample_args = ((3, 10), (4, 10), (5, 10), (3, 11), (3, 12))
    for parts, size in sample_args:
        print(split_range(parts, size))
# When executed as a script (not imported), run the split_range print demo.
if __name__ == "__main__":
    test()