From 94f5fd40ddf8dcacf25093eb0c347022dab9f557 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Mon, 26 Aug 2024 16:04:42 -0400
Subject: [PATCH 1/8] Add C++ tests and overloaded Index methods that accept 2D
 vector of floats instead of NDArray

---
 .gitignore              |   1 +
 cpp/src/Index.h         |  11 ++-
 cpp/src/TypedIndex.h    |  36 ++++++++
 cpp/test/CMakeLists.txt |   3 +
 cpp/test/test_main.cpp  | 182 +++++++++++++++++++++++++++++++++-------
 cpp/test/test_utils.cpp |  70 ++++++++++++++++
 6 files changed, 270 insertions(+), 33 deletions(-)
 create mode 100644 cpp/test/test_utils.cpp
diff --git a/.gitignore b/.gitignore
index f5287bce..f46320cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ java/classpath.txt
 java/linux-build/include/*
 python/voyager-headers
 .asv/
+*.dSYM
 
 # Cmake
 CMakeLists.txt.user
diff --git a/cpp/src/Index.h b/cpp/src/Index.h
index f6fe581c..67d3219c 100644
--- a/cpp/src/Index.h
+++ b/cpp/src/Index.h
@@ -44,7 +44,7 @@
  */
 class Index {
 public:
-  virtual ~Index(){};
+  virtual ~Index() {};
 
   virtual void setEF(size_t ef) = 0;
   virtual int getEF() const = 0;
@@ -71,6 +71,11 @@ class Index {
 
   virtual hnswlib::labeltype addItem(std::vector<float> vector,
                                      std::optional<hnswlib::labeltype> id) = 0;
+
+  virtual std::vector<hnswlib::labeltype>
+  addItems(std::vector<std::vector<float>> input,
+           std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) = 0;
+
   virtual std::vector<hnswlib::labeltype>
   addItems(NDArray<float, 2> input, std::vector<hnswlib::labeltype> ids = {},
            int numThreads = -1) = 0;
@@ -86,6 +91,10 @@ class Index {
   virtual std::tuple<std::vector<hnswlib::labeltype>, std::vector<float>>
   query(std::vector<float> queryVector, int k = 1, long queryEf = -1) = 0;
 
+  virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
+  query(std::vector<std::vector<float>> queryVectors, int k = 1,
+        int numThreads = -1, long queryEf = -1) = 0;
+
   virtual std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<float, 2>>
   query(NDArray<float, 2> queryVectors, int k = 1, int numThreads = -1,
         long queryEf = -1) = 0;
diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h
index 63066a40..bc130214 100644
--- a/cpp/src/TypedIndex.h
+++ b/cpp/src/TypedIndex.h
@@ -290,6 +290,24 @@ class TypedIndex : public Index {
     return addItems(NDArray<float, 2>(vector, {1, (int)vector.size()}), ids)[0];
   }
 
+  std::vector<hnswlib::labeltype>
+  addItems(const std::vector<std::vector<float>> vectors,
+           std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
+    // Convert the 2D array of float to NDArray<float, 2>
+    int numVectors = vectors.size();
+    int dimensions = numVectors > 0 ? vectors[0].size() : 0;
+    std::array<int, 2> shape = {numVectors, dimensions};
+
+    // flatten the 2d array of floats
+    std::vector<float> flatArray;
+    for (const auto &vector : vectors) {
+      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+    }
+    NDArray<float, 2> ndarray(flatArray, shape);
+
+    return addItems(ndarray, ids, numThreads);
+  }
+
   std::vector<hnswlib::labeltype>
   addItems(NDArray<float, 2> floatInput,
            std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
@@ -502,6 +520,24 @@ class TypedIndex : public Index {
     return algorithmImpl->label_lookup_;
   }
 
+  std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
+  query(std::vector<std::vector<float>> floatQueryVectors, int k = 1,
+        int numThreads = -1, long queryEf = -1) {
+    // Convert the 2D array of float to NDArray<float, 2>
+    int numVectors = floatQueryVectors.size();
+    int dimensions = numVectors > 0 ? floatQueryVectors[0].size() : 0;
+    std::array<int, 2> shape = {numVectors, dimensions};
+
+    // flatten the 2d array of floats
+    std::vector<float> flatArray;
+    for (const auto &vector : floatQueryVectors) {
+      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+    }
+    NDArray<float, 2> ndarray(flatArray, shape);
+
+    return query(ndarray, k, numThreads, queryEf);
+  }
+
   std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
   query(NDArray<float, 2> floatQueryVectors, int k = 1, int numThreads = -1,
         long queryEf = -1) {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index a46805c1..1606f1a7 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp)  # Add any test files here
 # Create an executable for the tests
 add_executable(VoyagerTests ${TEST_FILES})
 
+# Add compiler flags
+target_compile_options(VoyagerTests PRIVATE -g)
+
 # Link the test executable with the main project and Doctest
 # target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest)
 target_link_libraries(VoyagerTests
diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
index dbfa5dc1..eba1a4c9 100644
--- a/cpp/test/test_main.cpp
+++ b/cpp/test/test_main.cpp
@@ -1,49 +1,167 @@
 #include "doctest.h"
 
 #include "TypedIndex.h"
+#include "test_utils.cpp"
 #include <tuple>
 #include <type_traits>
 
 template <typename dist_t, typename data_t = dist_t,
           typename scalefactor = std::ratio<1, 1>>
-void testCombination(TypedIndex<dist_t, data_t, scalefactor> &index,
-                     SpaceType spaceType, int numDimensions,
-                     StorageDataType storageType) {
-  CHECK(toString(index.getSpace()) == toString(spaceType));
-  CHECK(index.getNumDimensions() == numDimensions);
-  CHECK(toString(index.getStorageDataType()) == toString(storageType));
+void testIndexProperties(TypedIndex<dist_t, data_t, scalefactor> &index,
+                         SpaceType spaceType, int numDimensions,
+                         StorageDataType storageType) {
+  REQUIRE(toString(index.getSpace()) == toString(spaceType));
+  REQUIRE(index.getNumDimensions() == numDimensions);
+  REQUIRE(toString(index.getStorageDataType()) == toString(storageType));
 }
 
-TEST_CASE("Test combinations of different instantiations and sizes") {
-  std::vector<SpaceType> spaceTypesSet = {SpaceType::Euclidean,
-                                          SpaceType::InnerProduct};
-  std::vector<int> numDimensionsSet = {4, 16, 128, 1024};
-  std::vector<int> numElementsSet = {100, 1000, 100000};
+/**
+ * Test the query method of the index. The index is populated with random
+ * vectors, and then queried with the same vectors. The expected result is that
+ * each vector's nearest neighbor is itself and that the distance is zero
+ * (allowing for some precision error based on the storage type).
+ */
+template <typename dist_t, typename data_t = dist_t,
+          typename scalefactor = std::ratio<1, 1>>
+void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
+               int numDimensions, SpaceType spaceType,
+               StorageDataType storageType, bool testSingleVectorMethod,
+               float precisionTolerance) {
+  // create test data and ids
+  std::vector<std::vector<float>> inputData =
+      randomVectors(numVectors, numDimensions);
+  std::vector<hnswlib::labeltype> ids(numVectors);
+  for (int i = 0; i < numVectors; i++) {
+    ids[i] = i;
+  }
+
+  // add items to index
+  if (testSingleVectorMethod == true) {
+    for (auto id : ids) {
+      index.addItem(inputData[id], id);
+    }
+  } else {
+    index.addItems(inputData, ids, -1);
+  }
+
+  int k = 1;
+  float lowerBound = 0.0f - precisionTolerance;
+  float upperBound = 0.0f + precisionTolerance;
+
+  // Use the single-query interface (query with a single target vector)
+  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
+    for (int i = 0; i < numVectors; i++) {
+
+      /**
+       * Use the raw inputData as target vectors for querying. We don't use the
+       * index data because once data has been added to the index, the model can
+       * change the "ground truth" by changing the data format.
+       */
+      auto targetVector = inputData[i];
+      auto nearestNeighbor = index.query(targetVector, k, queryEf);
+
+      auto labels = std::get<0>(nearestNeighbor);
+      auto distances = std::get<1>(nearestNeighbor);
+      REQUIRE(labels.size() == k);
+      REQUIRE(distances.size() == k);
+
+      /**
+       * E4M3 is too low precision for us to confidently assume that querying
+       * with the unquantized (fp32) vector will return the quantized vector as
+       * its NN InnerProduct will have negative distance to the closest item,
+       * not zero
+       */
+      if (storageType != StorageDataType::E4M3 &&
+          spaceType != SpaceType::InnerProduct) {
+        REQUIRE(i == labels[0]);
+        REQUIRE(distances[0] >= lowerBound);
+        REQUIRE(distances[0] <= upperBound);
+      }
+    }
+  }
+
+  // Use the bulk-query interface  (query with multiple target vectors at once)
+  for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
+    for (int i = 0; i < numVectors; i++) {
+      auto nearestNeighbors = index.query(
+          inputData, /* k= */ 1, /* numThreads= */ -1, /* queryEf= */ queryEf);
+      NDArray<hnswlib::labeltype, 2> labels = std::get<0>(nearestNeighbors);
+      NDArray<dist_t, 2> distances = std::get<1>(nearestNeighbors);
+      REQUIRE(labels.shape[0] == numVectors);
+      REQUIRE(labels.shape[1] == k);
+      REQUIRE(distances.shape[0] == numVectors);
+      REQUIRE(distances.shape[1] == k);
+
+      for (int i = 0; i < numVectors; i++) {
+        auto label = labels.data[i];
+        auto distance = distances.data[i];
+
+        /**
+         * E4M3 is too low precision for us to confidently assume that querying
+         * with the unquantized (fp32) vector will return the quantized vector
+         * as its NN InnerProduct will have negative distance to the closest
+         * item, not zero
+         */
+        if (storageType != StorageDataType::E4M3 &&
+            spaceType != SpaceType::InnerProduct) {
+          REQUIRE(i == label);
+          REQUIRE(distance >= lowerBound);
+          REQUIRE(distance <= upperBound);
+        }
+      }
+    }
+  }
+}
+
+TEST_CASE("Test combinations of different instantiations. Test that each "
+          "vector's NN is itself and distance is approximately zero.") {
+  std::unordered_map<StorageDataType, float> PRECISION_TOLERANCE_PER_DATA_TYPE =
+      {{StorageDataType::Float32, 0.00001f},
+       {StorageDataType::Float8, 0.10f},
+       {StorageDataType::E4M3, 0.20f}};
+  std::vector<SpaceType> spaceTypesSet = {
+      SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine};
+  std::vector<int> numDimensionsSet = {32};
+  std::vector<int> numVectorsSet = {500};
   std::vector<StorageDataType> storageTypesSet = {
       StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3};
-
-  auto count = 0;
+  std::vector<bool> testSingleVectorMethods = {true, false};
 
   for (auto spaceType : spaceTypesSet) {
-    for (auto numDimensions : numDimensionsSet) {
-      for (auto numElements : numElementsSet) {
-        for (auto storageType : storageTypesSet) {
-          SUBCASE("Test instantiation ") {
-            CAPTURE(spaceType);
-            CAPTURE(numDimensions);
-            CAPTURE(numElements);
-            CAPTURE(storageType);
-
-            if (storageType == StorageDataType::Float8) {
-              auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
-                  spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
-            } else if (storageType == StorageDataType::Float32) {
-              auto index = TypedIndex<float>(spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
-            } else if (storageType == StorageDataType::E4M3) {
-              auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
-              testCombination(index, spaceType, numDimensions, storageType);
+    for (auto storageType : storageTypesSet) {
+      for (auto numDimensions : numDimensionsSet) {
+        for (auto numVectors : numVectorsSet) {
+          for (auto testSingleVectorMethod : testSingleVectorMethods) {
+
+            SUBCASE("Test instantiation ") {
+              CAPTURE(spaceType);
+              CAPTURE(numDimensions);
+              CAPTURE(numVectors);
+              CAPTURE(storageType);
+
+              if (storageType == StorageDataType::Float8) {
+                auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(
+                    spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              } else if (storageType == StorageDataType::Float32) {
+                auto index = TypedIndex<float>(spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              } else if (storageType == StorageDataType::E4M3) {
+                auto index = TypedIndex<float, E4M3>(spaceType, numDimensions);
+                testIndexProperties(index, spaceType, numDimensions,
+                                    storageType);
+                testQuery(index, numVectors, numDimensions, spaceType,
+                          storageType, testSingleVectorMethod,
+                          PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]);
+              }
             }
           }
         }
diff --git a/cpp/test/test_utils.cpp b/cpp/test/test_utils.cpp
new file mode 100644
index 00000000..e90d93d9
--- /dev/null
+++ b/cpp/test/test_utils.cpp
@@ -0,0 +1,70 @@
+#include <random>
+#include <vector>
+
+#include "array_utils.h"
+
+NDArray<float, 2> randomQuantizedVectorsNDArray(int numVectors,
+                                                int dimensions) {
+  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  int numElements = numVectors * dimensions;
+  for (int i = 0; i < numElements; ++i) {
+    vectors.data[i] = static_cast<int>(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f;
+  }
+
+  return vectors;
+}
+
+NDArray<float, 2> randomVectorsNDArray(int numVectors, int dimensions) {
+  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  int numElements = numVectors * dimensions;
+  for (int i = 0; i < numElements; ++i) {
+    vectors.data[i] = static_cast<float>(dis(gen)) * 2 - 1;
+  }
+
+  return vectors;
+}
+
+std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
+                                                       int dimensions) {
+  std::vector<std::vector<float>> vectors(numVectors,
+                                          std::vector<float>(dimensions));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  for (int i = 0; i < numVectors; ++i) {
+    for (int j = 0; j < dimensions; ++j) {
+      vectors[i][j] = static_cast<int>(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f;
+    }
+  }
+
+  return vectors;
+}
+
+std::vector<std::vector<float>> randomVectors(int numVectors, int dimensions) {
+  std::vector<std::vector<float>> vectors(numVectors,
+                                          std::vector<float>(dimensions));
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(0, 1.0);
+
+  for (int i = 0; i < numVectors; ++i) {
+    for (int j = 0; j < dimensions; ++j) {
+      vectors[i][j] = static_cast<float>(dis(gen)) * 2 - 1;
+    }
+  }
+
+  return vectors;
+}

From 723e189198047e12df1dd7031fb7b11d51988847 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Mon, 26 Aug 2024 21:53:23 -0400
Subject: [PATCH 2/8] Use most recent version of clang-format

---
 .github/workflows/all.yml | 2 +-
 CONTRIBUTING.md           | 2 +-
 java/JavaOutputStream.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml
index 505e2fc0..7b1e819b 100644
--- a/.github/workflows/all.yml
+++ b/.github/workflows/all.yml
@@ -39,7 +39,7 @@ jobs:
       - name: Check C++ Formatting
         uses: jidicula/clang-format-action@v4.13.0
         with:
-          clang-format-version: 16
+          clang-format-version: 18
 
   run-cpp-tests:
     runs-on: ${{ matrix.os }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2ef187e9..2f05bc4d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -130,7 +130,7 @@ tox -e format
 ```
 
 ### C++
-If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 16) installed, and then use clang-format to handle C++ formatting:
+If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 18) installed, and then use clang-format to handle C++ formatting:
 ```bash
 cd cpp
 cmake .
diff --git a/java/JavaOutputStream.h b/java/JavaOutputStream.h
index 5db295ee..a9ac9418 100644
--- a/java/JavaOutputStream.h
+++ b/java/JavaOutputStream.h
@@ -82,7 +82,7 @@ class JavaOutputStream : public OutputStream {
     return true;
   }
 
-  virtual ~JavaOutputStream(){};
+  virtual ~JavaOutputStream() {};
 
 private:
   JNIEnv *env;

From b61ef1e8dc75009c832eac8da9d553be4d192bd8 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Tue, 27 Aug 2024 00:16:26 -0400
Subject: [PATCH 3/8] Undo clang-format bump. Fix formatting

---
 .github/workflows/all.yml | 2 +-
 CONTRIBUTING.md           | 2 +-
 cpp/src/Index.h           | 2 +-
 java/JavaOutputStream.h   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml
index 7b1e819b..505e2fc0 100644
--- a/.github/workflows/all.yml
+++ b/.github/workflows/all.yml
@@ -39,7 +39,7 @@ jobs:
       - name: Check C++ Formatting
         uses: jidicula/clang-format-action@v4.13.0
         with:
-          clang-format-version: 18
+          clang-format-version: 16
 
   run-cpp-tests:
     runs-on: ${{ matrix.os }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2f05bc4d..2ef187e9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -130,7 +130,7 @@ tox -e format
 ```
 
 ### C++
-If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 18) installed, and then use clang-format to handle C++ formatting:
+If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 16) installed, and then use clang-format to handle C++ formatting:
 ```bash
 cd cpp
 cmake .
diff --git a/cpp/src/Index.h b/cpp/src/Index.h
index 67d3219c..a8b82ee3 100644
--- a/cpp/src/Index.h
+++ b/cpp/src/Index.h
@@ -44,7 +44,7 @@
  */
 class Index {
 public:
-  virtual ~Index() {};
+  virtual ~Index(){};
 
   virtual void setEF(size_t ef) = 0;
   virtual int getEF() const = 0;
diff --git a/java/JavaOutputStream.h b/java/JavaOutputStream.h
index a9ac9418..5db295ee 100644
--- a/java/JavaOutputStream.h
+++ b/java/JavaOutputStream.h
@@ -82,7 +82,7 @@ class JavaOutputStream : public OutputStream {
     return true;
   }
 
-  virtual ~JavaOutputStream() {};
+  virtual ~JavaOutputStream(){};
 
 private:
   JNIEnv *env;

From 8186a1ffa3f767bdf43a6a2e387f7b05d93b3d2a Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Thu, 29 Aug 2024 03:03:36 -0400
Subject: [PATCH 4/8] clean up C++ test, increase number of vectors

---
 cpp/test/test_main.cpp | 51 +++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
index eba1a4c9..d7d39548 100644
--- a/cpp/test/test_main.cpp
+++ b/cpp/test/test_main.cpp
@@ -82,32 +82,30 @@ void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
 
   // Use the bulk-query interface  (query with multiple target vectors at once)
   for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) {
+    auto nearestNeighbors = index.query(
+        inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf);
+    NDArray<hnswlib::labeltype, 2> labels = std::get<0>(nearestNeighbors);
+    NDArray<dist_t, 2> distances = std::get<1>(nearestNeighbors);
+    REQUIRE(labels.shape[0] == numVectors);
+    REQUIRE(labels.shape[1] == k);
+    REQUIRE(distances.shape[0] == numVectors);
+    REQUIRE(distances.shape[1] == k);
+
     for (int i = 0; i < numVectors; i++) {
-      auto nearestNeighbors = index.query(
-          inputData, /* k= */ 1, /* numThreads= */ -1, /* queryEf= */ queryEf);
-      NDArray<hnswlib::labeltype, 2> labels = std::get<0>(nearestNeighbors);
-      NDArray<dist_t, 2> distances = std::get<1>(nearestNeighbors);
-      REQUIRE(labels.shape[0] == numVectors);
-      REQUIRE(labels.shape[1] == k);
-      REQUIRE(distances.shape[0] == numVectors);
-      REQUIRE(distances.shape[1] == k);
-
-      for (int i = 0; i < numVectors; i++) {
-        auto label = labels.data[i];
-        auto distance = distances.data[i];
-
-        /**
-         * E4M3 is too low precision for us to confidently assume that querying
-         * with the unquantized (fp32) vector will return the quantized vector
-         * as its NN InnerProduct will have negative distance to the closest
-         * item, not zero
-         */
-        if (storageType != StorageDataType::E4M3 &&
-            spaceType != SpaceType::InnerProduct) {
-          REQUIRE(i == label);
-          REQUIRE(distance >= lowerBound);
-          REQUIRE(distance <= upperBound);
-        }
+      auto label = labels.data[i];
+      auto distance = distances.data[i];
+
+      /**
+       * E4M3 is too low precision for us to confidently assume that querying
+       * with the unquantized (fp32) vector will return the quantized vector
+       * as its NN InnerProduct will have negative distance to the closest
+       * item, not zero
+       */
+      if (storageType != StorageDataType::E4M3 &&
+          spaceType != SpaceType::InnerProduct) {
+        REQUIRE(i == label);
+        REQUIRE(distance >= lowerBound);
+        REQUIRE(distance <= upperBound);
       }
     }
   }
@@ -122,7 +120,7 @@ TEST_CASE("Test combinations of different instantiations. Test that each "
   std::vector<SpaceType> spaceTypesSet = {
       SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine};
   std::vector<int> numDimensionsSet = {32};
-  std::vector<int> numVectorsSet = {500};
+  std::vector<int> numVectorsSet = {2000};
   std::vector<StorageDataType> storageTypesSet = {
       StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3};
   std::vector<bool> testSingleVectorMethods = {true, false};
@@ -138,6 +136,7 @@ TEST_CASE("Test combinations of different instantiations. Test that each "
               CAPTURE(numDimensions);
               CAPTURE(numVectors);
               CAPTURE(storageType);
+              CAPTURE(testSingleVectorMethod);
 
               if (storageType == StorageDataType::Float8) {
                 auto index = TypedIndex<float, int8_t, std::ratio<1, 127>>(

From 12fd3278ddc1de01636220ad8f8c47431c805588 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Thu, 29 Aug 2024 10:46:14 -0400
Subject: [PATCH 5/8] Fix comment

---
 cpp/test/test_main.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
index d7d39548..c76f5e70 100644
--- a/cpp/test/test_main.cpp
+++ b/cpp/test/test_main.cpp
@@ -68,7 +68,7 @@ void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
       /**
        * E4M3 is too low precision for us to confidently assume that querying
        * with the unquantized (fp32) vector will return the quantized vector as
-       * its NN InnerProduct will have negative distance to the closest item,
+       * its NN. InnerProduct will have negative distance to the closest item,
        * not zero
        */
       if (storageType != StorageDataType::E4M3 &&
@@ -98,7 +98,7 @@ void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
       /**
        * E4M3 is too low precision for us to confidently assume that querying
        * with the unquantized (fp32) vector will return the quantized vector
-       * as its NN InnerProduct will have negative distance to the closest
+       * as its NN. InnerProduct will have negative distance to the closest
        * item, not zero
        */
       if (storageType != StorageDataType::E4M3 &&

From a3d04c8418e7f2e26cd3151ef694f2f04fec1837 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Thu, 29 Aug 2024 23:16:50 -0400
Subject: [PATCH 6/8] Move code into reusable function

---
 cpp/src/TypedIndex.h  | 28 ++--------------------------
 cpp/src/array_utils.h | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h
index bc130214..c30ae97c 100644
--- a/cpp/src/TypedIndex.h
+++ b/cpp/src/TypedIndex.h
@@ -293,19 +293,7 @@ class TypedIndex : public Index {
   std::vector<hnswlib::labeltype>
   addItems(const std::vector<std::vector<float>> vectors,
            std::vector<hnswlib::labeltype> ids = {}, int numThreads = -1) {
-    // Convert the 2D array of float to NDArray<float, 2>
-    int numVectors = vectors.size();
-    int dimensions = numVectors > 0 ? vectors[0].size() : 0;
-    std::array<int, 2> shape = {numVectors, dimensions};
-
-    // flatten the 2d array of floats
-    std::vector<float> flatArray;
-    for (const auto &vector : vectors) {
-      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
-    }
-    NDArray<float, 2> ndarray(flatArray, shape);
-
-    return addItems(ndarray, ids, numThreads);
+    return addItems(vectorsToNDArray(vectors), ids, numThreads);
   }
 
   std::vector<hnswlib::labeltype>
@@ -523,19 +511,7 @@ class TypedIndex : public Index {
   std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
   query(std::vector<std::vector<float>> floatQueryVectors, int k = 1,
         int numThreads = -1, long queryEf = -1) {
-    // Convert the 2D array of float to NDArray<float, 2>
-    int numVectors = floatQueryVectors.size();
-    int dimensions = numVectors > 0 ? floatQueryVectors[0].size() : 0;
-    std::array<int, 2> shape = {numVectors, dimensions};
-
-    // flatten the 2d array of floats
-    std::vector<float> flatArray;
-    for (const auto &vector : floatQueryVectors) {
-      flatArray.insert(flatArray.end(), vector.begin(), vector.end());
-    }
-    NDArray<float, 2> ndarray(flatArray, shape);
-
-    return query(ndarray, k, numThreads, queryEf);
+    return query(vectorsToNDArray(floatQueryVectors), k, numThreads, queryEf);
   }
 
   std::tuple<NDArray<hnswlib::labeltype, 2>, NDArray<dist_t, 2>>
diff --git a/cpp/src/array_utils.h b/cpp/src/array_utils.h
index 7c2a7556..fe4dbd72 100644
--- a/cpp/src/array_utils.h
+++ b/cpp/src/array_utils.h
@@ -309,3 +309,20 @@ std::string toFloatVectorString(std::vector<data_t> vec) {
   return toFloatVectorString<dist_t, data_t, scalefactor>(vec.data(),
                                                           vec.size());
 }
+
+/**
+ * Convert a 2D vector of float to NDArray<float, 2>
+ */
+NDArray<float, 2> vectorsToNDArray(std::vector<std::vector<float>> vectors) {
+  int numVectors = vectors.size();
+  int dimensions = numVectors > 0 ? vectors[0].size() : 0;
+  std::array<int, 2> shape = {numVectors, dimensions};
+
+  // flatten the 2d array into the NDArray's underlying 1D vector
+  std::vector<float> flatArray;
+  for (const auto &vector : vectors) {
+    flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+  }
+
+  return NDArray<float, 2>(flatArray, shape);
+}

From f57b9ae1037208e5b9aa5c7012d70452d9e326e2 Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Wed, 4 Sep 2024 23:33:10 -0400
Subject: [PATCH 7/8] Use quantized random input vectors for Float8 and E4M3
 storage. Remove unused util methods

---
 cpp/test/test_main.cpp  |  9 +++++++--
 cpp/test/test_utils.cpp | 33 ++-------------------------------
 2 files changed, 9 insertions(+), 33 deletions(-)

diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
index c76f5e70..5b7c5c56 100644
--- a/cpp/test/test_main.cpp
+++ b/cpp/test/test_main.cpp
@@ -28,8 +28,13 @@ void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
                StorageDataType storageType, bool testSingleVectorMethod,
                float precisionTolerance) {
   // create test data and ids
-  std::vector<std::vector<float>> inputData =
-      randomVectors(numVectors, numDimensions);
+  std::vector<std::vector<float>> inputData;
+  if (storageType == StorageDataType::Float8 ||
+      storageType == StorageDataType::E4M3) {
+    inputData = randomQuantizedVectors(numVectors, numDimensions);
+  } else if (storageType == StorageDataType::Float32) {
+    inputData = randomVectors(numVectors, numDimensions);
+  }
   std::vector<hnswlib::labeltype> ids(numVectors);
   for (int i = 0; i < numVectors; i++) {
     ids[i] = i;
diff --git a/cpp/test/test_utils.cpp b/cpp/test/test_utils.cpp
index e90d93d9..91fdbb31 100644
--- a/cpp/test/test_utils.cpp
+++ b/cpp/test/test_utils.cpp
@@ -3,37 +3,7 @@
 
 #include "array_utils.h"
 
-NDArray<float, 2> randomQuantizedVectorsNDArray(int numVectors,
-                                                int dimensions) {
-  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
-
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<> dis(0, 1.0);
-
-  int numElements = numVectors * dimensions;
-  for (int i = 0; i < numElements; ++i) {
-    vectors.data[i] = static_cast<int>(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f;
-  }
-
-  return vectors;
-}
-
-NDArray<float, 2> randomVectorsNDArray(int numVectors, int dimensions) {
-  NDArray<float, 2> vectors = NDArray<float, 2>({numVectors, dimensions});
-
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<> dis(0, 1.0);
-
-  int numElements = numVectors * dimensions;
-  for (int i = 0; i < numElements; ++i) {
-    vectors.data[i] = static_cast<float>(dis(gen)) * 2 - 1;
-  }
-
-  return vectors;
-}
-
+// create test data intended for Float8 storage or E4M3 storage
 std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
                                                        int dimensions) {
   std::vector<std::vector<float>> vectors(numVectors,
@@ -52,6 +22,7 @@ std::vector<std::vector<float>> randomQuantizedVectors(int numVectors,
   return vectors;
 }
 
+// create test data intended for Float32 storage
 std::vector<std::vector<float>> randomVectors(int numVectors, int dimensions) {
   std::vector<std::vector<float>> vectors(numVectors,
                                           std::vector<float>(dimensions));

From 2264c0498d5410957251bbe49be8985f2683886f Mon Sep 17 00:00:00 2001
From: Stephen Xie <stephenx@spotify.com>
Date: Fri, 6 Sep 2024 00:10:09 -0400
Subject: [PATCH 8/8] Optimize vectorsToNDArray() and add validation for vector
 sizes, add tests

---
 cpp/src/array_utils.h  | 22 +++++++++++++++------
 cpp/test/test_main.cpp | 45 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/cpp/src/array_utils.h b/cpp/src/array_utils.h
index fe4dbd72..42717eff 100644
--- a/cpp/src/array_utils.h
+++ b/cpp/src/array_utils.h
@@ -310,18 +310,28 @@ std::string toFloatVectorString(std::vector<data_t> vec) {
                                                           vec.size());
 }
 
-/**
- * Convert a 2D vector of float to NDArray<float, 2>
- */
+/** Convert a 2D vector of float to NDArray<float, 2> */
 NDArray<float, 2> vectorsToNDArray(std::vector<std::vector<float>> vectors) {
   int numVectors = vectors.size();
   int dimensions = numVectors > 0 ? vectors[0].size() : 0;
   std::array<int, 2> shape = {numVectors, dimensions};
 
-  // flatten the 2d array into the NDArray's underlying 1D vector
-  std::vector<float> flatArray;
+  // Flatten the 2d array into the NDArray's underlying 1D vector
+  std::vector<float> flatArray(numVectors * dimensions);
+  // Pointer to the beginning of the flat array
+  float *flatArrayPtr = flatArray.data();
   for (const auto &vector : vectors) {
-    flatArray.insert(flatArray.end(), vector.begin(), vector.end());
+    // Check that all provided vectors are the same size, using the 1st vector
+    // as the reference
+    if (vector.size() != dimensions) {
+      throw std::invalid_argument("All vectors must be of the same size, but "
+                                  "received vectors of size: " +
+                                  std::to_string(dimensions) + " and " +
+                                  std::to_string(vector.size()) + ".");
+    }
+    // Use std::memcpy to copy the elements directly into the flat array
+    std::memcpy(flatArrayPtr, vector.data(), vector.size() * sizeof(float));
+    flatArrayPtr += vector.size(); // Increment the pointer
   }
 
   return NDArray<float, 2>(flatArray, shape);
diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp
index 5b7c5c56..e35f81a1 100644
--- a/cpp/test/test_main.cpp
+++ b/cpp/test/test_main.cpp
@@ -27,7 +27,11 @@ void testQuery(TypedIndex<dist_t, data_t, scalefactor> &index, int numVectors,
                int numDimensions, SpaceType spaceType,
                StorageDataType storageType, bool testSingleVectorMethod,
                float precisionTolerance) {
-  // create test data and ids
+  /**
+   * Create test data and ids. If we are using Float8 or E4M3 storage, quantize
+   * the vector values; if we are using Float32 storage, keep the float values
+   * as-is, so that the input data matches the storage type under test.
+   */
   std::vector<std::vector<float>> inputData;
   if (storageType == StorageDataType::Float8 ||
       storageType == StorageDataType::E4M3) {
@@ -173,3 +177,42 @@ TEST_CASE("Test combinations of different instantiations. Test that each "
     }
   }
 }
+
+TEST_CASE("Test vectorsToNDArray converts 2D vector of float to NDArray<float, "
+          "2>") { // Equally sized input vectors are flattened in input order
+  std::vector<std::vector<float>> vectors = {{1.0f, 2.0f, 3.0f, 4.0f},
+                                             {5.0f, 6.0f, 7.0f, 8.0f},
+                                             {9.0f, 10.0f, 11.0f, 12.0f}};
+  NDArray<float, 2> ndArray = vectorsToNDArray(vectors);
+  REQUIRE(ndArray.shape.size() == 2); // two axes
+  REQUIRE(ndArray.shape[0] == 3); // one entry per input vector
+  REQUIRE(ndArray.shape[1] == 4); // length of each input vector
+  REQUIRE(ndArray.data.size() == 12); // 3 * 4 elements in the flat storage
+  REQUIRE(ndArray.data[0] == 1.0f); // data[0..3] holds the first vector
+  REQUIRE(ndArray.data[1] == 2.0f);
+  REQUIRE(ndArray.data[2] == 3.0f);
+  REQUIRE(ndArray.data[3] == 4.0f);
+  REQUIRE(ndArray.data[4] == 5.0f); // data[4..7] holds the second vector
+  REQUIRE(ndArray.data[5] == 6.0f);
+  REQUIRE(ndArray.data[6] == 7.0f);
+  REQUIRE(ndArray.data[7] == 8.0f);
+  REQUIRE(ndArray.data[8] == 9.0f); // data[8..11] holds the third vector
+  REQUIRE(ndArray.data[9] == 10.0f);
+  REQUIRE(ndArray.data[10] == 11.0f);
+  REQUIRE(ndArray.data[11] == 12.0f);
+  REQUIRE(*ndArray[0] == 1.0f); // *ndArray[i] is the first element of vector i
+  REQUIRE(*ndArray[1] == 5.0f);
+  REQUIRE(*ndArray[2] == 9.0f);
+}
+
+TEST_CASE("Test vectorsToNDArray throws error if vectors are not of the same "
+          "size") { // Ragged input must raise std::invalid_argument
+  std::vector<std::vector<float>> vectors1 = {{1.0f, 2.0f, 3.0f, 4.0f},
+                                              {5.0f, 6.0f, 7.0f}, // one short
+                                              {9.0f, 10.0f, 11.0f, 12.0f}};
+  REQUIRE_THROWS_AS(vectorsToNDArray(vectors1), std::invalid_argument);
+
+  std::vector<std::vector<float>> vectors2 = { // first vector is the short one
+      {1.0f}, {5.0f, 6.0f, 7.0f}, {9.0f, 10.0f, 11.0f}};
+  REQUIRE_THROWS_AS(vectorsToNDArray(vectors2), std::invalid_argument);
+}