ozgurdemir · brentp · Jun 29, 2017
diff --git a/README.md b/README.md
@@ -1,16 +1,16 @@
 A simple go library for approximate nearest neighbours (ANN).
 
-#Background
+# Background
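-In many computational problems such as NLP, Recommendation Systems and Search, items (e.g. words) are represented as vectors in a multidimensional space. Then given a specific item it's nearest neighbours need to be find e.g. given a query find the most similar ones. 
-A naive liner scan over the data set might be too slow for most data sets. 
+In many computational problems such as NLP, Recommendation Systems and Search, items (e.g. words) are represented as vectors in a multidimensional space. Then, given a specific item, its nearest neighbours need to be found, e.g. given a query, find the most similar items.
+A naive linear scan over the data set might be too slow for most data sets.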
 
 Hence, more efficient algorithms are needed. One of the most widely used approaches is Locality Sensitive Hashing ([LSH](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)). This family of algorithms are very fast but might not give the exact solution and are hence called approximate nearest neighbours (ANN). The trade off between accuracy and speed is set via parameters of the algorithm. 
 
-#GoLsh
-is a library that finds approximate nearest neighbours based on the cosine distance between two vectors ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). It is meant to be used as an online algorithm. That is whenever queries are comming in by user requests.
+# GoLsh
+is a library that finds approximate nearest neighbours based on the cosine similarity between two vectors ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). It is meant to be used as an online algorithm, i.e. for answering queries as they come in from user requests.
 
-#<a name="algorithm"></a>Algorithm
-the basic idea of the algorithm is very simple. Find a hash encodind for every vector such that similar vectors have the same hash value. Hence, finding similar items boils down to finding vectors with the same hash value which can be done very efficiently using a hash table. 
+# <a name="algorithm">Algorithm</a>
+The basic idea of the algorithm is very simple. Find a hash encoding for every vector such that similar vectors have the same hash value. Hence, finding similar items boils down to finding vectors with the same hash value, which can be done very efficiently using a hash table.
 
 The hashes are generated by randomly splitting the space using **d** hyperplanes. For every hyperplane it is determined if the current vector lies to the left (0) or to the right (1). Doing so every vector can be represented as a **d** dimensional bit hashes. Vectors lying nearby will most probably fall into the same hash representation but still a single bit flip may lead to neighbours not found. Hence, the above step is repeated **numEmbedding** times to be more accurate.
 
@@ -22,32 +22,40 @@ The algorithm proceeds as follows:
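-	* however if this number is set too high collisions will rarely happen and no less results will be found
-2. in order to increase the recall (number of found neighbours) step 1. is repeated **numEmbedding** times. After this step every vector in the corpus is represented as **numEmedding** bitstrings of length **d**.
+	* however, if this number is set too high, collisions will rarely happen and fewer (or no) results will be found
+2. in order to increase the recall (number of found neighbours) step 1. is repeated **numEmbedding** times. After this step every vector in the corpus is represented as **numEmbedding** bitstrings of length **d**.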
 
-#Usage
+# Usage
 the library is initialized as follows:
 
-	golsh.NewLsh(vectors *map[int][]float32, numEmbeddings int, d int) Lsh
+```Go
+golsh.NewLsh(vectors [][]float64, numEmbeddings int, d int) Lsh
+```
 
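-**vectors** is a simple go map from an user defined int (id) to the input vectors to be searched against. This function will return an golsh.Lsh object which is used for all subsequent operations. The two parameters **numEmbeddings** and **d** controll the trade off between speed and accuracy ([see above](#algorithm)).
+**vectors** is a Go slice containing the input vectors to be searched against. This function returns a golsh.Lsh object which is used for all subsequent operations. The two parameters **numEmbeddings** and **d** control the trade-off between speed and accuracy ([see above](#algorithm)).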
 
 ## Get vector based on id
 
-	lsh.Vector(id int) ([]float32, bool)
+```Go
+lsh.Vector(id int) ([]float64, bool)
+```
 
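-given an id will return the vector stored. This is just a convinience function and may be used if the vector to search with is part of the corpus itself.
+Given an id, this returns the stored vector. It is just a convenience function and may be used if the vector to search with is part of the corpus itself.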
 
 ## Find approximate nearest neighbours
 
-	lsh.Ann(vector []float32, k int, threshold float32) ([]Hit, int, error)
+```Go
+lsh.Ann(vector []float64, k int, threshold float64) ([]Hit, int, error)
+```
 
 **vector** the vector to search nearest neighbours for. **k** max number of neighbours returned. **threshold** min cosine similarity that a neighbour needs to have. This parameter is used to filter false positives.
 
 ## Result
 the result is of type []golsh.Hit where Hit consists of:
 
-	type Hit struct {
-		ID     int
-		Vector *[]float32
-		Cosine float32
-	}
+```Go
+type Hit struct {
+    ID     int
+    Vector *[]float64
+    Cosine float64
+}
+```
 
-where **ID** is the id of the result vector. **Vector** is the result vector itself and **Cosine** is the exact cosine distance between the query and this result vector. The result array is sorted by similarity that is most similar vector is pos 1.
+where **ID** is the id of the result vector, **Vector** is the result vector itself and **Cosine** is the exact cosine similarity between the query and this result vector. The result array is sorted by cosine similarity to the query vector.
diff --git a/embedding.go b/embedding.go
@@ -1,67 +1,53 @@
 package golsh
 
-import (
-	"bytes"
-	"fmt"
-	"math/rand"
-)
+import "math/rand"
 
 type random interface {
-	draw() float32
+	draw() float64
 }
 
-type gauss struct{}
+type gauss struct {
+}
 
-func (g *gauss) draw() float32 {
-	return float32(rand.NormFloat64())
+func (g *gauss) draw() float64 {
+	return rand.NormFloat64()
 }
 
 type embedding struct {
-	normals [][]float32
+	normals [][]float64
 }
 
 func newEmbedding(d int, size int, r random) embedding {
-	normals := make([][]float32, d, d)
+	normals := make([][]float64, d)
 	for i := 0; i < d; i++ {
 		normals[i] = normal(size, r)
 	}
 	return embedding{normals}
 }
 
-func normal(size int, r random) []float32 {
-	result := make([]float32, size, size)
+func normal(size int, r random) []float64 {
+	result := make([]float64, size)
 	for i := 0; i < size; i++ {
 		result[i] = r.draw()
 	}
 	return result
 }
 
 // returns an embedding of size d
-func (e *embedding) embed(id int, vector []float32) string {
-	result := make([]bool, len(e.normals), len(e.normals))
+func (e *embedding) embed(vector []float64) uint64 {
+	var result uint64
 	for i, normal := range e.normals {
-		result[i] = dimension(vector, normal)
-	}
-	return fmt.Sprintf("%d-%s", id, bitToString(result))
-}
-
-func dimension(vecA []float32, vecB []float32) bool {
-	dot := dot(vecA, vecB)
-	if dot > 0 {
-		return true
+		if dot(vector, normal) > 0 {
+			result |= (1 << uint64(i))
+		}
 	}
-	return false
+	return result
 }
 
-func bitToString(bits []bool) string {
-	var buffer bytes.Buffer
-
-	for _, bit := range bits {
-		if bit {
-			buffer.WriteString("1")
-		} else {
-			buffer.WriteString("0")
-		}
+func dot(x, y []float64) float64 {
+	var sum float64
+	for i, v := range x {
+		sum += y[i] * v
 	}
-	return buffer.String()
+	return sum
 }
diff --git a/embedding_test.go b/embedding_test.go
@@ -6,86 +6,65 @@ import (
 )
 
 func TestFakeRandom(t *testing.T) {
-	r := newFakeRandom([]float32{1.0, 2.0, 3.0})
+	r := newFakeRandom([]float64{1.0, 2.0, 3.0})
 
-	if got, expected := r.draw(), float32(1.0); expected != got {
+	if got, expected := r.draw(), float64(1.0); expected != got {
 		t.Fatalf("expected %f but got %f", expected, got)
 	}
 
-	if got, expected := r.draw(), float32(2.0); expected != got {
+	if got, expected := r.draw(), float64(2.0); expected != got {
 		t.Fatalf("expected %f but got %f", expected, got)
 	}
 
-	if got, expected := r.draw(), float32(3.0); expected != got {
+	if got, expected := r.draw(), float64(3.0); expected != got {
 		t.Fatalf("expected %f but got %f", expected, got)
 	}
 }
 
 func TestNewEmbedding(t *testing.T) {
-	r := newFakeRandom([]float32{1, 2, 3, 4})
+	r := newFakeRandom([]float64{1, 2, 3, 4})
 	got := newEmbedding(2, 4, &r)
 
 	if got, expected := len(got.normals), 2; got != expected {
 		t.Fatalf("expected %d but got %d", expected, got)
 	}
 
-	if got, expected := got.normals[0], []float32{1, 2, 3, 4}; !reflect.DeepEqual(got, expected) {
+	if got, expected := got.normals[0], []float64{1, 2, 3, 4}; !reflect.DeepEqual(got, expected) {
 		// t.Fatalf("expected %v but got %v", expected, got)
 	}
 
-	if got, expected := got.normals[1], []float32{1, 2, 3, 4}; !reflect.DeepEqual(got, expected) {
+	if got, expected := got.normals[1], []float64{1, 2, 3, 4}; !reflect.DeepEqual(got, expected) {
 		// t.Fatalf("expected %v but got %v", expected, got)
 	}
 }
 
 func TestNormal(t *testing.T) {
-	r := newFakeRandom([]float32{1.0, 2.0})
-	if got, expected := normal(2, &r), []float32{1.0, 2.0}; !reflect.DeepEqual(expected, got) {
+	r := newFakeRandom([]float64{1.0, 2.0})
+	if got, expected := normal(2, &r), []float64{1.0, 2.0}; !reflect.DeepEqual(expected, got) {
 		// t.Fatalf("expected %f but got %f", expected, got)
 	}
 }
 
 func TestEmbed(t *testing.T) {
-	normalA := []float32{1.0, 0.0}
-	normalB := []float32{0.0, 1.0}
-	embedding := embedding{[][]float32{normalA, normalB}}
-	got := embedding.embed(1, []float32{1.0, 0.0})
-	if expected := "1-10"; got != expected {
-		t.Fatalf("expected %s but got %s", expected, got)
-	}
-}
-
-func TestDimension(t *testing.T) {
-	vecA := []float32{1.0, 1.0}
-	vecB := []float32{1.0, 0.0}
-	vecC := []float32{0.0, 0.0}
-
-	if got, expected := dimension(vecA, vecB), true; !reflect.DeepEqual(expected, got) {
-		t.Fatalf("expected %t but got %t", expected, got)
-	}
-
-	if got, expected := dimension(vecA, vecC), false; !reflect.DeepEqual(expected, got) {
-		t.Fatalf("expected %t but got %t", expected, got)
-	}
-}
-
-func TestBitString(t *testing.T) {
-	got := bitToString([]bool{true, false, true, true})
-	if expected := "1011"; got != expected {
-		t.Fatalf("expected %s but got %s", expected, got)
+	normalA := []float64{1.0, 0.0}
+	normalB := []float64{0.0, 1.0}
+	embedding := embedding{[][]float64{normalA, normalB}}
+	got := embedding.embed([]float64{1.0, 0.0})
+	if expected := uint64(1); !reflect.DeepEqual(got, expected) {
+		t.Fatalf("expected %v but got %v", expected, got)
 	}
 }
 
 type fakeRandom struct {
 	index int
-	set   []float32
+	set   []float64
 }
 
-func newFakeRandom(set []float32) fakeRandom {
+func newFakeRandom(set []float64) fakeRandom {
 	return fakeRandom{-1, set}
 }
 
-func (f *fakeRandom) draw() float32 {
+func (f *fakeRandom) draw() float64 {
 	f.index++
 	if f.index > len(f.set)-1 {
 		f.index = 0