Merge pull request #9 from dt382/master
Added nearest neighbour and inverted softmax translation
nhammerla authored Sep 14, 2017
2 parents 3eb6356 + aadc37d commit 451ee74
Showing 1 changed file with 61 additions and 2 deletions: fasttext.py
@@ -26,7 +26,7 @@ def __init__(self, vector_file='', transform=None):
"""Read in word vectors in fasttext format"""
self.word2id = {}

# Captures word order, only used for export(), so that more frequent words are earlier in the file
# Captures word order, for export() and translate methods
self.id2word = []

print('reading word vectors from %s' % vector_file)
@@ -39,7 +39,10 @@ def __init__(self, vector_file='', transform=None):
                self.word2id[elems[0]] = i
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])


        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)
@@ -77,7 +80,63 @@ def export(self, outpath):

        fout.close()

    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]
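    # Illustrative usage sketch, not part of this commit: paths and words
    # below are hypothetical, and it assumes the transform passed to the
    # source (fr) space aligns it to the target (en) space.
    #
    #   en = FastVector(vector_file='wiki.en.vec')
    #   fr = FastVector(vector_file='wiki.fr.vec', transform='fr_to_en.txt')
    #   print(en.translate_nearest_neighbour(fr.embed[fr.word2id['chat']]))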

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.
        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from a previous call are reused if recalculate=False. This
        saves time when multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)
        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))
                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())
                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size
        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))
        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
            self.softmax_denominators
        # pick highest score as translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]
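    # Illustrative usage sketch, not part of this commit (continuing the
    # hypothetical en/fr example above):
    #
    #   # the first call draws 1000 source vectors to build the denominators
    #   en.translate_inverted_softmax(fr.embed[fr.word2id['chat']], fr, 1000)
    #   # later lookups from the same source language can reuse them
    #   en.translate_inverted_softmax(fr.embed[fr.word2id['chien']], fr, 1000,
    #                                 recalculate=False)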

    def get_samples(self, nsamples):
        """Return a matrix of nsamples randomly sampled vectors from embed"""
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]

    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """Utility function to normalise the rows of a numpy array."""
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm
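    # Quick check, not part of this commit: each row is scaled to unit length,
    # e.g. FastVector.normalised(np.array([[3., 4.]])) returns [[0.6, 0.8]];
    # rows of all zeros pass through unchanged, because their norm is set to 1
    # before dividing.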

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
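The body of cosine_similarity is unchanged by this commit, so the diff view collapses it. For reference, a minimal stand-alone check of the quantity named in its docstring, computed from the textbook formula (an illustration, not necessarily the file's exact implementation):

import numpy as np

vec_a, vec_b = np.array([1., 0., 0.]), np.array([1., 1., 0.])
# cosine similarity = dot(a, b) / (|a| * |b|); here 1/sqrt(2), about 0.7071
print(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))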
