Merge pull request #9 from dt382/master
Added nearest neighbour and inverted softmax translation
nhammerla authored Sep 14, 2017
2 parents 3eb6356 + aadc37d commit 451ee74
Showing 1 changed file with 61 additions and 2 deletions: fasttext.py
@@ -26,7 +26,7 @@ def __init__(self, vector_file='', transform=None):
"""Read in word vectors in fasttext format"""
self.word2id = {}

# Captures word order, only used for export(), so that more frequent words are earlier in the file
# Captures word order, for export() and translate methods
self.id2word = []

print('reading word vectors from %s' % vector_file)
@@ -39,7 +39,10 @@ def __init__(self, vector_file='', transform=None):
                self.word2id[elems[0]] = i
                self.embed[i] = elems[1:self.n_dim+1]
                self.id2word.append(elems[0])


        # Used in translate_inverted_softmax()
        self.softmax_denominators = None

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)
@@ -77,7 +80,63 @@ def export(self, outpath):

        fout.close()

    def translate_nearest_neighbour(self, source_vector):
        """Obtain translation of source_vector using nearest neighbour retrieval"""
        similarity_vector = np.matmul(FastVector.normalised(self.embed), source_vector)
        target_id = np.argmax(similarity_vector)
        return self.id2word[target_id]
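    # Illustrative usage sketch, not part of this commit: paths and words
    # below are hypothetical, and it assumes the transform passed to the
    # source (fr) space aligns it to the target (en) space.
    #
    #   en = FastVector(vector_file='wiki.en.vec')
    #   fr = FastVector(vector_file='wiki.fr.vec', transform='fr_to_en.txt')
    #   print(en.translate_nearest_neighbour(fr.embed[fr.word2id['chat']]))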

    def translate_inverted_softmax(self, source_vector, source_space, nsamples,
                                   beta=10., batch_size=100, recalculate=True):
        """
        Obtain translation of source_vector using sampled inverted softmax retrieval
        with inverse temperature beta.
        nsamples vectors are drawn from source_space in batches of batch_size
        to calculate the inverted softmax denominators.
        Denominators from a previous call are reused if recalculate=False. This
        saves time when multiple words are translated from the same source language.
        """
        embed_normalised = FastVector.normalised(self.embed)
        # calculate contributions to softmax denominators in batches
        # to save memory
        if self.softmax_denominators is None or recalculate is True:
            self.softmax_denominators = np.zeros(self.embed.shape[0])
            while nsamples > 0:
                # get batch of randomly sampled vectors from source space
                sample_vectors = source_space.get_samples(min(nsamples, batch_size))
                # calculate cosine similarities between sampled vectors and
                # all vectors in the target space
                sample_similarities = \
                    np.matmul(embed_normalised,
                              FastVector.normalised(sample_vectors).transpose())
                # accumulate contribution to denominators
                self.softmax_denominators \
                    += np.sum(np.exp(beta * sample_similarities), axis=1)
                nsamples -= batch_size
        # cosine similarities between source_vector and all target vectors
        similarity_vector = np.matmul(embed_normalised,
                                      source_vector/np.linalg.norm(source_vector))
        # exponentiate and normalise with denominators to obtain inverted softmax
        softmax_scores = np.exp(beta * similarity_vector) / \
            self.softmax_denominators
        # pick highest score as translation
        target_id = np.argmax(softmax_scores)
        return self.id2word[target_id]
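    # Illustrative usage sketch, not part of this commit (continuing the
    # hypothetical en/fr example above):
    #
    #   # the first call draws 1000 source vectors to build the denominators
    #   en.translate_inverted_softmax(fr.embed[fr.word2id['chat']], fr, 1000)
    #   # later lookups from the same source language can reuse them
    #   en.translate_inverted_softmax(fr.embed[fr.word2id['chien']], fr, 1000,
    #                                 recalculate=False)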

    def get_samples(self, nsamples):
        """Return a matrix of nsamples randomly sampled vectors from embed"""
        sample_ids = np.random.choice(self.embed.shape[0], nsamples, replace=False)
        return self.embed[sample_ids]

    @classmethod
    def normalised(cls, mat, axis=-1, order=2):
        """Utility function to normalise the rows of a numpy array."""
        norm = np.linalg.norm(
            mat, axis=axis, ord=order, keepdims=True)
        norm[norm == 0] = 1
        return mat / norm
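    # Quick check, not part of this commit: each row is scaled to unit length,
    # e.g. FastVector.normalised(np.array([[3., 4.]])) returns [[0.6, 0.8]];
    # rows of all zeros pass through unchanged, because their norm is set to 1
    # before dividing.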

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b"""
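The body of cosine_similarity is unchanged by this commit, so the diff view collapses it. For reference, a minimal stand-alone check of the quantity named in its docstring, computed from the textbook formula (an illustration, not necessarily the file's exact implementation):

import numpy as np

vec_a, vec_b = np.array([1., 0., 0.]), np.array([1., 1., 0.])
# cosine similarity = dot(a, b) / (|a| * |b|); here 1/sqrt(2), about 0.7071
print(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))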
