-
Notifications
You must be signed in to change notification settings - Fork 9
/
create-vector.py
39 lines (34 loc) · 1.03 KB
/
create-vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import sys
# Convert per-word feature lists read from stdin into dense binary vectors.
#
# Each input line has the form ``word[.pos] feat1 feat2 ...`` and is
# lower-cased before processing.  Features of a word that occurs on several
# lines are merged.  When the first token is exactly ``word.pos``, the POS is
# appended to every feature on that line (``feat.pos``).
#
# Output (stdout): one line per word, ``word b1 b2 ... bN`` with bi in {0, 1}.
# Diagnostics (stderr): vector length, vocabulary size, and sparsity in percent.


def _parse_line(line):
    """Parse one input line.

    Returns a ``(word, pos, features)`` tuple, where ``pos`` is ``None`` when
    the first token is not of the form ``word.pos``.  Returns ``None`` for a
    blank line so that the caller can skip it.
    """
    things = line.strip().lower().split()
    if not things:
        # A blank line carries no word; the original script crashed here.
        return None
    parts = things[0].split('.')
    if len(parts) == 2:
        word, pos = parts
    else:
        # No dot, or more than one dot: keep the whole token as the word.
        word, pos = things[0], None
    return word, pos, things[1:]


def _build_vectors(lines):
    """Merge the features of every word found in ``lines``.

    Returns ``(vectors, all_feat)``: ``vectors`` maps each word to a dict
    whose keys are its active features, and ``all_feat`` maps each feature to
    its column index, assigned in order of first appearance.
    """
    vectors = {}
    all_feat = {}
    for line in lines:
        parsed = _parse_line(line)
        if parsed is None:
            continue
        word, pos, feats = parsed
        word_feats = vectors.setdefault(word, {})
        for feat in feats:
            if pos is not None:
                feat = feat + '.' + pos
            word_feats[feat] = 1
            if feat not in all_feat:
                all_feat[feat] = len(all_feat)
    return vectors, all_feat


def _sparsity(vectors, all_feat):
    """Return the percentage of zero cells in the word-by-feature matrix.

    An empty matrix (no words or no features) has no cells; 0.0 is reported
    for it instead of dividing by zero.
    """
    cells = len(all_feat) * len(vectors)
    if not cells:
        return 0.0
    active = sum(len(feats) for feats in vectors.values())
    return 100 * (1 - float(active) / cells)


def _format_vector(word, feats, all_feat):
    """Render one output line: the word followed by its 0/1 components."""
    vector = len(all_feat) * ['0']
    for feat in feats:
        vector[all_feat[feat]] = '1'
    return word + " " + " ".join(vector)


def main(in_stream=None, out_stream=None, err_stream=None):
    """Run the conversion.

    The three streams default to ``sys.stdin``, ``sys.stdout`` and
    ``sys.stderr``; they are resolved at call time so that redirected
    standard streams are honoured.
    """
    if in_stream is None:
        in_stream = sys.stdin
    if out_stream is None:
        out_stream = sys.stdout
    if err_stream is None:
        err_stream = sys.stderr

    vectors, all_feat = _build_vectors(in_stream)

    # Plain write() calls produce the same text under Python 2 and Python 3.
    err_stream.write("Vector length: %s\n" % len(all_feat))
    err_stream.write("Vocab length: %s\n" % len(vectors))
    # Average sparsity of the resulting vectors.
    err_stream.write("Sparsity: %s\n" % _sparsity(vectors, all_feat))

    # Convert the per-word feature dictionaries into dense vectors.
    for word in vectors:
        out_stream.write(_format_vector(word, vectors[word], all_feat) + "\n")


if __name__ == "__main__":
    main()