wordreps.py
import zipfile

import numpy as np
class WordReps():

    def __init__(self):
        self.vects = None
        self.vocab = None
        self.dim = None

    #------------------------------------------------------------------
    def Read_Embeddings_zip_file(self, Embedding_file, dim):
        """
        Read the word vectors from a zip archive, where the first token on each
        line is the word and the remaining tokens are its vector components.
        Input: word embedding zip filename (string), dimensionality of the
        embeddings (integer).
        """
        self.dim = dim
        vects = {}
        vocab = []
        print("Retrieving word embeddings...")
        zfile = zipfile.ZipFile(Embedding_file)
        for finfo in zfile.infolist():
            F = zfile.open(finfo)
            # Read the vectors line by line.
            line = F.readline()
            # A two-token first line is a word2vec-style header (vocabulary
            # size and dimensionality); skip it.
            if len(line.split()) == 2:
                print("Header exists.")
                line = F.readline()
            while len(line) != 0:
                p = line.split()
                word = p[0].decode('UTF-8')
                v = np.zeros(self.dim, float)
                for i in range(0, self.dim):
                    v[i] = float(p[i + 1])
                vects[word] = v
                vocab.append(word)
                line = F.readline()
            print("Number of words in the vocabulary is:", len(vocab))
            F.close()
            self.vocab = vocab
            self.vects = vects
            # Only the first member of the archive is read.
            break
    #------------------------------------------------------------------
    def get_embedding(self, word):
        """
        If an embedding for the word exists in vects, return it.
        Otherwise, if the lowercase version of the word appears in vects,
        return the embedding for the lowercase version.
        Otherwise, return a zero vector.
        """
        if word in self.vects:
            return self.vects[word]
        elif word.lower() in self.vects:
            return self.vects[word.lower()]
        else:
            return np.zeros(self.dim, dtype=float)
#------------------------------------------------------------
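
# Minimal usage sketch: loads vectors from a zip archive and looks up a word,
# showing the lowercase/zero-vector fallback of get_embedding. The archive
# name "embeddings.zip" and the dimensionality 100 are illustrative
# assumptions; substitute the archive and dimensionality you actually have.
if __name__ == "__main__":
    WR = WordReps()
    # The archive's first member is expected to be a whitespace-separated
    # text file: word followed by its vector components on each line.
    WR.Read_Embeddings_zip_file("embeddings.zip", 100)
    # Falls back to the lowercase form, then to a zero vector.
    vec = WR.get_embedding("Paris")
    print(vec.shape, float(np.linalg.norm(vec)))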