-
Notifications
You must be signed in to change notification settings - Fork 5
/
read_write.py
28 lines (23 loc) · 859 Bytes
/
read_write.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import sys
import gzip
import numpy
import math
from collections import Counter
from operator import itemgetter
def gzopen(f):
    """Open *f* for reading as text, decompressing it if it is gzipped.

    Args:
        f: Path of the file to open. A path ending in '.gz' is opened
            through the gzip module; any other path is opened with the
            built-in open().

    Returns:
        A text-mode file object opened for reading. The caller is
        responsible for closing it.
    """
    if f.endswith('.gz'):
        # gzip.open() defaults to binary mode ('rb') while open() defaults
        # to text mode; ask for 'rt' so both branches yield str lines.
        return gzip.open(f, 'rt')
    return open(f)
def read_word_vectors(filename):
    """Read word vectors from a text file and length-normalize each one.

    Every non-blank line is stripped, lowercased and split on whitespace.
    The first token is the word; the remaining tokens are parsed as floats
    and form its vector. If a word appears on several lines, the last
    occurrence replaces the earlier ones.

    Args:
        filename: Path of the vectors file. A path ending in '.gz' is read
            through the gzip module.

    Returns:
        A dict mapping each lowercased word to a numpy float array that has
        been divided by sqrt(sum of squares + 1e-6).

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    # Both openers are used in text mode so that words are always str;
    # gzip's plain 'r' mode would yield bytes lines and bytes dict keys.
    opener = gzip.open if filename.endswith('.gz') else open
    word_vecs = {}
    with opener(filename, 'rt') as file_object:
        for line in file_object:
            tokens = line.strip().lower().split()
            if not tokens:
                # Blank line: there is no word to index.
                continue
            vector = numpy.array([float(val) for val in tokens[1:]],
                                 dtype=float)
            # Normalize the vector; the 1e-6 term keeps the divisor
            # non-zero when every component is zero.
            vector /= math.sqrt((vector ** 2).sum() + 1e-6)
            word_vecs[tokens[0]] = vector
    return word_vecs