From 94f5fd40ddf8dcacf25093eb0c347022dab9f557 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Mon, 26 Aug 2024 16:04:42 -0400 Subject: [PATCH 1/8] Add C++ tests and overloaded Index methods that accept 2D vector of floats instead of NDArray --- .gitignore | 1 + cpp/src/Index.h | 11 ++- cpp/src/TypedIndex.h | 36 ++++++++ cpp/test/CMakeLists.txt | 3 + cpp/test/test_main.cpp | 182 +++++++++++++++++++++++++++++++++------- cpp/test/test_utils.cpp | 70 ++++++++++++++++ 6 files changed, 270 insertions(+), 33 deletions(-) create mode 100644 cpp/test/test_utils.cpp diff --git a/.gitignore b/.gitignore index f5287bce..f46320cf 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ java/classpath.txt java/linux-build/include/* python/voyager-headers .asv/ +*.dSYM # Cmake CMakeLists.txt.user diff --git a/cpp/src/Index.h b/cpp/src/Index.h index f6fe581c..67d3219c 100644 --- a/cpp/src/Index.h +++ b/cpp/src/Index.h @@ -44,7 +44,7 @@ */ class Index { public: - virtual ~Index(){}; + virtual ~Index() {}; virtual void setEF(size_t ef) = 0; virtual int getEF() const = 0; @@ -71,6 +71,11 @@ class Index { virtual hnswlib::labeltype addItem(std::vector vector, std::optional id) = 0; + + virtual std::vector + addItems(std::vector> input, + std::vector ids = {}, int numThreads = -1) = 0; + virtual std::vector addItems(NDArray input, std::vector ids = {}, int numThreads = -1) = 0; @@ -86,6 +91,10 @@ class Index { virtual std::tuple, std::vector> query(std::vector queryVector, int k = 1, long queryEf = -1) = 0; + virtual std::tuple, NDArray> + query(std::vector> queryVectors, int k = 1, + int numThreads = -1, long queryEf = -1) = 0; + virtual std::tuple, NDArray> query(NDArray queryVectors, int k = 1, int numThreads = -1, long queryEf = -1) = 0; diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h index 63066a40..bc130214 100644 --- a/cpp/src/TypedIndex.h +++ b/cpp/src/TypedIndex.h @@ -290,6 +290,24 @@ class TypedIndex : public Index { return addItems(NDArray(vector, {1, (int)vector.size()}), ids)[0]; } + std::vector + addItems(const std::vector> vectors, + std::vector ids = {}, int numThreads = -1) { + // Convert the 2D array of float to NDArray + int numVectors = vectors.size(); + int dimensions = numVectors > 0 ? vectors[0].size() : 0; + std::array shape = {numVectors, dimensions}; + + // flatten the 2d array of floats + std::vector flatArray; + for (const auto &vector : vectors) { + flatArray.insert(flatArray.end(), vector.begin(), vector.end()); + } + NDArray ndarray(flatArray, shape); + + return addItems(ndarray, ids, numThreads); + } + std::vector addItems(NDArray floatInput, std::vector ids = {}, int numThreads = -1) { @@ -502,6 +520,24 @@ class TypedIndex : public Index { return algorithmImpl->label_lookup_; } + std::tuple, NDArray> + query(std::vector> floatQueryVectors, int k = 1, + int numThreads = -1, long queryEf = -1) { + // Convert the 2D array of float to NDArray + int numVectors = floatQueryVectors.size(); + int dimensions = numVectors > 0 ? floatQueryVectors[0].size() : 0; + std::array shape = {numVectors, dimensions}; + + // flatten the 2d array of floats + std::vector flatArray; + for (const auto &vector : floatQueryVectors) { + flatArray.insert(flatArray.end(), vector.begin(), vector.end()); + } + NDArray ndarray(flatArray, shape); + + return query(ndarray, k, numThreads, queryEf); + } + std::tuple, NDArray> query(NDArray floatQueryVectors, int k = 1, int numThreads = -1, long queryEf = -1) { diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index a46805c1..1606f1a7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp) # Add any test files here # Create an executable for the tests add_executable(VoyagerTests ${TEST_FILES}) +# Add compiler flags +target_compile_options(VoyagerTests PRIVATE -g) + # Link the test executable with the main project and Doctest # target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest) target_link_libraries(VoyagerTests diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index dbfa5dc1..eba1a4c9 100644 --- a/cpp/test/test_main.cpp +++ b/cpp/test/test_main.cpp @@ -1,49 +1,167 @@ #include "doctest.h" #include "TypedIndex.h" +#include "test_utils.cpp" #include #include template > -void testCombination(TypedIndex &index, - SpaceType spaceType, int numDimensions, - StorageDataType storageType) { - CHECK(toString(index.getSpace()) == toString(spaceType)); - CHECK(index.getNumDimensions() == numDimensions); - CHECK(toString(index.getStorageDataType()) == toString(storageType)); +void testIndexProperties(TypedIndex &index, + SpaceType spaceType, int numDimensions, + StorageDataType storageType) { + REQUIRE(toString(index.getSpace()) == toString(spaceType)); + REQUIRE(index.getNumDimensions() == numDimensions); + REQUIRE(toString(index.getStorageDataType()) == toString(storageType)); } -TEST_CASE("Test combinations of different instantiations and sizes") { - std::vector spaceTypesSet = {SpaceType::Euclidean, - SpaceType::InnerProduct}; - std::vector numDimensionsSet = {4, 16, 128, 1024}; - std::vector numElementsSet = {100, 1000, 100000}; +/** + * Test the query method of the index. The index is populated with random + * vectors, and then queried with the same vectors. The expected result is that + * each vector's nearest neighbor is itself and that the distance is zero + * (allowing for some precision error based on the storage type). + */ +template > +void testQuery(TypedIndex &index, int numVectors, + int numDimensions, SpaceType spaceType, + StorageDataType storageType, bool testSingleVectorMethod, + float precisionTolerance) { + // create test data and ids + std::vector> inputData = + randomVectors(numVectors, numDimensions); + std::vector ids(numVectors); + for (int i = 0; i < numVectors; i++) { + ids[i] = i; + } + + // add items to index + if (testSingleVectorMethod == true) { + for (auto id : ids) { + index.addItem(inputData[id], id); + } + } else { + index.addItems(inputData, ids, -1); + } + + int k = 1; + float lowerBound = 0.0f - precisionTolerance; + float upperBound = 0.0f + precisionTolerance; + + // Use the single-query interface (query with a single target vector) + for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) { + for (int i = 0; i < numVectors; i++) { + + /** + * Use the raw inputData as target vectors for querying. We don't use the + * index data because once data has been added to the index, the model can + * change the "ground truth" by changing the data format. + */ + auto targetVector = inputData[i]; + auto nearestNeighbor = index.query(targetVector, k, queryEf); + + auto labels = std::get<0>(nearestNeighbor); + auto distances = std::get<1>(nearestNeighbor); + REQUIRE(labels.size() == k); + REQUIRE(distances.size() == k); + + /** + * E4M3 is too low precision for us to confidently assume that querying + * with the unquantized (fp32) vector will return the quantized vector as + * its NN InnerProduct will have negative distance to the closest item, + * not zero + */ + if (storageType != StorageDataType::E4M3 && + spaceType != SpaceType::InnerProduct) { + REQUIRE(i == labels[0]); + REQUIRE(distances[0] >= lowerBound); + REQUIRE(distances[0] <= upperBound); + } + } + } + + // Use the bulk-query interface (query with multiple target vectors at once) + for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) { + for (int i = 0; i < numVectors; i++) { + auto nearestNeighbors = index.query( + inputData, /* k= */ 1, /* numThreads= */ -1, /* queryEf= */ queryEf); + NDArray labels = std::get<0>(nearestNeighbors); + NDArray distances = std::get<1>(nearestNeighbors); + REQUIRE(labels.shape[0] == numVectors); + REQUIRE(labels.shape[1] == k); + REQUIRE(distances.shape[0] == numVectors); + REQUIRE(distances.shape[1] == k); + + for (int i = 0; i < numVectors; i++) { + auto label = labels.data[i]; + auto distance = distances.data[i]; + + /** + * E4M3 is too low precision for us to confidently assume that querying + * with the unquantized (fp32) vector will return the quantized vector + * as its NN InnerProduct will have negative distance to the closest + * item, not zero + */ + if (storageType != StorageDataType::E4M3 && + spaceType != SpaceType::InnerProduct) { + REQUIRE(i == label); + REQUIRE(distance >= lowerBound); + REQUIRE(distance <= upperBound); + } + } + } + } +} + +TEST_CASE("Test combinations of different instantiations. Test that each " + "vector's NN is itself and distance is approximately zero.") { + std::unordered_map PRECISION_TOLERANCE_PER_DATA_TYPE = + {{StorageDataType::Float32, 0.00001f}, + {StorageDataType::Float8, 0.10f}, + {StorageDataType::E4M3, 0.20f}}; + std::vector spaceTypesSet = { + SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine}; + std::vector numDimensionsSet = {32}; + std::vector numVectorsSet = {500}; std::vector storageTypesSet = { StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3}; - - auto count = 0; + std::vector testSingleVectorMethods = {true, false}; for (auto spaceType : spaceTypesSet) { - for (auto numDimensions : numDimensionsSet) { - for (auto numElements : numElementsSet) { - for (auto storageType : storageTypesSet) { - SUBCASE("Test instantiation ") { - CAPTURE(spaceType); - CAPTURE(numDimensions); - CAPTURE(numElements); - CAPTURE(storageType); - - if (storageType == StorageDataType::Float8) { - auto index = TypedIndex>( - spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); - } else if (storageType == StorageDataType::Float32) { - auto index = TypedIndex(spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); - } else if (storageType == StorageDataType::E4M3) { - auto index = TypedIndex(spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); + for (auto storageType : storageTypesSet) { + for (auto numDimensions : numDimensionsSet) { + for (auto numVectors : numVectorsSet) { + for (auto testSingleVectorMethod : testSingleVectorMethods) { + + SUBCASE("Test instantiation ") { + CAPTURE(spaceType); + CAPTURE(numDimensions); + CAPTURE(numVectors); + CAPTURE(storageType); + + if (storageType == StorageDataType::Float8) { + auto index = TypedIndex>( + spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } else if (storageType == StorageDataType::Float32) { + auto index = TypedIndex(spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } else if (storageType == StorageDataType::E4M3) { + auto index = TypedIndex(spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } } } } diff --git a/cpp/test/test_utils.cpp b/cpp/test/test_utils.cpp new file mode 100644 index 00000000..e90d93d9 --- /dev/null +++ b/cpp/test/test_utils.cpp @@ -0,0 +1,70 @@ +#include +#include + +#include "array_utils.h" + +NDArray randomQuantizedVectorsNDArray(int numVectors, + int dimensions) { + NDArray vectors = NDArray({numVectors, dimensions}); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 1.0); + + int numElements = numVectors * dimensions; + for (int i = 0; i < numElements; ++i) { + vectors.data[i] = static_cast(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f; + } + + return vectors; +} + +NDArray randomVectorsNDArray(int numVectors, int dimensions) { + NDArray vectors = NDArray({numVectors, dimensions}); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 1.0); + + int numElements = numVectors * dimensions; + for (int i = 0; i < numElements; ++i) { + vectors.data[i] = static_cast(dis(gen)) * 2 - 1; + } + + return vectors; +} + +std::vector> randomQuantizedVectors(int numVectors, + int dimensions) { + std::vector> vectors(numVectors, + std::vector(dimensions)); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 1.0); + + for (int i = 0; i < numVectors; ++i) { + for (int j = 0; j < dimensions; ++j) { + vectors[i][j] = static_cast(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f; + } + } + + return vectors; +} + +std::vector> randomVectors(int numVectors, int dimensions) { + std::vector> vectors(numVectors, + std::vector(dimensions)); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0, 1.0); + + for (int i = 0; i < numVectors; ++i) { + for (int j = 0; j < dimensions; ++j) { + vectors[i][j] = static_cast(dis(gen)) * 2 - 1; + } + } + + return vectors; +} From 723e189198047e12df1dd7031fb7b11d51988847 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Mon, 26 Aug 2024 21:53:23 -0400 Subject: [PATCH 2/8] Use most recent version of clang-format --- .github/workflows/all.yml | 2 +- CONTRIBUTING.md | 2 +- java/JavaOutputStream.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index 505e2fc0..7b1e819b 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -39,7 +39,7 @@ jobs: - name: Check C++ Formatting uses: jidicula/clang-format-action@v4.13.0 with: - clang-format-version: 16 + clang-format-version: 18 run-cpp-tests: runs-on: ${{ matrix.os }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2ef187e9..2f05bc4d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,7 +130,7 @@ tox -e format ``` ### C++ -If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 16) installed, and then use clang-format to handle C++ formatting: +If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 18) installed, and then use clang-format to handle C++ formatting: ```bash cd cpp cmake . diff --git a/java/JavaOutputStream.h b/java/JavaOutputStream.h index 5db295ee..a9ac9418 100644 --- a/java/JavaOutputStream.h +++ b/java/JavaOutputStream.h @@ -82,7 +82,7 @@ class JavaOutputStream : public OutputStream { return true; } - virtual ~JavaOutputStream(){}; + virtual ~JavaOutputStream() {}; private: JNIEnv *env; From b61ef1e8dc75009c832eac8da9d553be4d192bd8 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Tue, 27 Aug 2024 00:16:26 -0400 Subject: [PATCH 3/8] Undo clang-format bump. Fix formatting --- .github/workflows/all.yml | 2 +- CONTRIBUTING.md | 2 +- cpp/src/Index.h | 2 +- java/JavaOutputStream.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index 7b1e819b..505e2fc0 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -39,7 +39,7 @@ jobs: - name: Check C++ Formatting uses: jidicula/clang-format-action@v4.13.0 with: - clang-format-version: 18 + clang-format-version: 16 run-cpp-tests: runs-on: ${{ matrix.os }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2f05bc4d..2ef187e9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,7 +130,7 @@ tox -e format ``` ### C++ -If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 18) installed, and then use clang-format to handle C++ formatting: +If you are working on any C++ code throughout the repo, ensure you have `clang-format` (version 16) installed, and then use clang-format to handle C++ formatting: ```bash cd cpp cmake . diff --git a/cpp/src/Index.h b/cpp/src/Index.h index 67d3219c..a8b82ee3 100644 --- a/cpp/src/Index.h +++ b/cpp/src/Index.h @@ -44,7 +44,7 @@ */ class Index { public: - virtual ~Index() {}; + virtual ~Index(){}; virtual void setEF(size_t ef) = 0; virtual int getEF() const = 0; diff --git a/java/JavaOutputStream.h b/java/JavaOutputStream.h index a9ac9418..5db295ee 100644 --- a/java/JavaOutputStream.h +++ b/java/JavaOutputStream.h @@ -82,7 +82,7 @@ class JavaOutputStream : public OutputStream { return true; } - virtual ~JavaOutputStream() {}; + virtual ~JavaOutputStream(){}; private: JNIEnv *env; From 8186a1ffa3f767bdf43a6a2e387f7b05d93b3d2a Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Thu, 29 Aug 2024 03:03:36 -0400 Subject: [PATCH 4/8] clean up C++ test, increase number of vectors --- cpp/test/test_main.cpp | 51 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index eba1a4c9..d7d39548 100644 --- a/cpp/test/test_main.cpp +++ b/cpp/test/test_main.cpp @@ -82,32 +82,30 @@ void testQuery(TypedIndex &index, int numVectors, // Use the bulk-query interface (query with multiple target vectors at once) for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) { + auto nearestNeighbors = index.query( + inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf); + NDArray labels = std::get<0>(nearestNeighbors); + NDArray distances = std::get<1>(nearestNeighbors); + REQUIRE(labels.shape[0] == numVectors); + REQUIRE(labels.shape[1] == k); + REQUIRE(distances.shape[0] == numVectors); + REQUIRE(distances.shape[1] == k); + for (int i = 0; i < numVectors; i++) { - auto nearestNeighbors = index.query( - inputData, /* k= */ 1, /* numThreads= */ -1, /* queryEf= */ queryEf); - NDArray labels = std::get<0>(nearestNeighbors); - NDArray distances = std::get<1>(nearestNeighbors); - REQUIRE(labels.shape[0] == numVectors); - REQUIRE(labels.shape[1] == k); - REQUIRE(distances.shape[0] == numVectors); - REQUIRE(distances.shape[1] == k); - - for (int i = 0; i < numVectors; i++) { - auto label = labels.data[i]; - auto distance = distances.data[i]; - - /** - * E4M3 is too low precision for us to confidently assume that querying - * with the unquantized (fp32) vector will return the quantized vector - * as its NN InnerProduct will have negative distance to the closest - * item, not zero - */ - if (storageType != StorageDataType::E4M3 && - spaceType != SpaceType::InnerProduct) { - REQUIRE(i == label); - REQUIRE(distance >= lowerBound); - REQUIRE(distance <= upperBound); - } + auto label = labels.data[i]; + auto distance = distances.data[i]; + + /** + * E4M3 is too low precision for us to confidently assume that querying + * with the unquantized (fp32) vector will return the quantized vector + * as its NN InnerProduct will have negative distance to the closest + * item, not zero + */ + if (storageType != StorageDataType::E4M3 && + spaceType != SpaceType::InnerProduct) { + REQUIRE(i == label); + REQUIRE(distance >= lowerBound); + REQUIRE(distance <= upperBound); } } } @@ -122,7 +120,7 @@ TEST_CASE("Test combinations of different instantiations. Test that each " std::vector spaceTypesSet = { SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine}; std::vector numDimensionsSet = {32}; - std::vector numVectorsSet = {500}; + std::vector numVectorsSet = {2000}; std::vector storageTypesSet = { StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3}; std::vector testSingleVectorMethods = {true, false}; @@ -138,6 +136,7 @@ TEST_CASE("Test combinations of different instantiations. Test that each " CAPTURE(numDimensions); CAPTURE(numVectors); CAPTURE(storageType); + CAPTURE(testSingleVectorMethod); if (storageType == StorageDataType::Float8) { auto index = TypedIndex>( From 12fd3278ddc1de01636220ad8f8c47431c805588 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Thu, 29 Aug 2024 10:46:14 -0400 Subject: [PATCH 5/8] Fix comment --- cpp/test/test_main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index d7d39548..c76f5e70 100644 --- a/cpp/test/test_main.cpp +++ b/cpp/test/test_main.cpp @@ -68,7 +68,7 @@ void testQuery(TypedIndex &index, int numVectors, /** * E4M3 is too low precision for us to confidently assume that querying * with the unquantized (fp32) vector will return the quantized vector as - * its NN InnerProduct will have negative distance to the closest item, + * its NN. InnerProduct will have negative distance to the closest item, * not zero */ if (storageType != StorageDataType::E4M3 && @@ -98,7 +98,7 @@ void testQuery(TypedIndex &index, int numVectors, /** * E4M3 is too low precision for us to confidently assume that querying * with the unquantized (fp32) vector will return the quantized vector - * as its NN InnerProduct will have negative distance to the closest + * as its NN. InnerProduct will have negative distance to the closest * item, not zero */ if (storageType != StorageDataType::E4M3 && From a3d04c8418e7f2e26cd3151ef694f2f04fec1837 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Thu, 29 Aug 2024 23:16:50 -0400 Subject: [PATCH 6/8] Move code into reusable function --- cpp/src/TypedIndex.h | 28 ++-------------------------- cpp/src/array_utils.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h index bc130214..c30ae97c 100644 --- a/cpp/src/TypedIndex.h +++ b/cpp/src/TypedIndex.h @@ -293,19 +293,7 @@ class TypedIndex : public Index { std::vector addItems(const std::vector> vectors, std::vector ids = {}, int numThreads = -1) { - // Convert the 2D array of float to NDArray - int numVectors = vectors.size(); - int dimensions = numVectors > 0 ? vectors[0].size() : 0; - std::array shape = {numVectors, dimensions}; - - // flatten the 2d array of floats - std::vector flatArray; - for (const auto &vector : vectors) { - flatArray.insert(flatArray.end(), vector.begin(), vector.end()); - } - NDArray ndarray(flatArray, shape); - - return addItems(ndarray, ids, numThreads); + return addItems(vectorsToNDArray(vectors), ids, numThreads); } std::vector @@ -523,19 +511,7 @@ class TypedIndex : public Index { std::tuple, NDArray> query(std::vector> floatQueryVectors, int k = 1, int numThreads = -1, long queryEf = -1) { - // Convert the 2D array of float to NDArray - int numVectors = floatQueryVectors.size(); - int dimensions = numVectors > 0 ? floatQueryVectors[0].size() : 0; - std::array shape = {numVectors, dimensions}; - - // flatten the 2d array of floats - std::vector flatArray; - for (const auto &vector : floatQueryVectors) { - flatArray.insert(flatArray.end(), vector.begin(), vector.end()); - } - NDArray ndarray(flatArray, shape); - - return query(ndarray, k, numThreads, queryEf); + return query(vectorsToNDArray(floatQueryVectors), k, numThreads, queryEf); } std::tuple, NDArray> diff --git a/cpp/src/array_utils.h b/cpp/src/array_utils.h index 7c2a7556..fe4dbd72 100644 --- a/cpp/src/array_utils.h +++ b/cpp/src/array_utils.h @@ -309,3 +309,20 @@ std::string toFloatVectorString(std::vector vec) { return toFloatVectorString(vec.data(), vec.size()); } + +/** + * Convert a 2D vector of float to NDArray + */ +NDArray vectorsToNDArray(std::vector> vectors) { + int numVectors = vectors.size(); + int dimensions = numVectors > 0 ? vectors[0].size() : 0; + std::array shape = {numVectors, dimensions}; + + // flatten the 2d array into the NDArray's underlying 1D vector + std::vector flatArray; + for (const auto &vector : vectors) { + flatArray.insert(flatArray.end(), vector.begin(), vector.end()); + } + + return NDArray(flatArray, shape); +} From f57b9ae1037208e5b9aa5c7012d70452d9e326e2 Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Wed, 4 Sep 2024 23:33:10 -0400 Subject: [PATCH 7/8] Use quantized random input vectors for Float8 and E4M3 storage. Remove unused util methods --- cpp/test/test_main.cpp | 9 +++++++-- cpp/test/test_utils.cpp | 33 ++------------------------------- 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index c76f5e70..5b7c5c56 100644 --- a/cpp/test/test_main.cpp +++ b/cpp/test/test_main.cpp @@ -28,8 +28,13 @@ void testQuery(TypedIndex &index, int numVectors, StorageDataType storageType, bool testSingleVectorMethod, float precisionTolerance) { // create test data and ids - std::vector> inputData = - randomVectors(numVectors, numDimensions); + std::vector> inputData; + if (storageType == StorageDataType::Float8 || + storageType == StorageDataType::E4M3) { + inputData = randomQuantizedVectors(numVectors, numDimensions); + } else if (storageType == StorageDataType::Float32) { + inputData = randomVectors(numVectors, numDimensions); + } std::vector ids(numVectors); for (int i = 0; i < numVectors; i++) { ids[i] = i; diff --git a/cpp/test/test_utils.cpp b/cpp/test/test_utils.cpp index e90d93d9..91fdbb31 100644 --- a/cpp/test/test_utils.cpp +++ b/cpp/test/test_utils.cpp @@ -3,37 +3,7 @@ #include "array_utils.h" -NDArray randomQuantizedVectorsNDArray(int numVectors, - int dimensions) { - NDArray vectors = NDArray({numVectors, dimensions}); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dis(0, 1.0); - - int numElements = numVectors * dimensions; - for (int i = 0; i < numElements; ++i) { - vectors.data[i] = static_cast(((dis(gen) * 2 - 1) * 10.0f)) / 10.0f; - } - - return vectors; -} - -NDArray randomVectorsNDArray(int numVectors, int dimensions) { - NDArray vectors = NDArray({numVectors, dimensions}); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<> dis(0, 1.0); - - int numElements = numVectors * dimensions; - for (int i = 0; i < numElements; ++i) { - vectors.data[i] = static_cast(dis(gen)) * 2 - 1; - } - - return vectors; -} - +// create test data intended for Float8 storage or E4M3 storage std::vector> randomQuantizedVectors(int numVectors, int dimensions) { std::vector> vectors(numVectors, @@ -52,6 +22,7 @@ std::vector> randomQuantizedVectors(int numVectors, return vectors; } +// create test data intended for Float32 storage std::vector> randomVectors(int numVectors, int dimensions) { std::vector> vectors(numVectors, std::vector(dimensions)); From 2264c0498d5410957251bbe49be8985f2683886f Mon Sep 17 00:00:00 2001 From: Stephen Xie Date: Fri, 6 Sep 2024 00:10:09 -0400 Subject: [PATCH 8/8] Optimize vectorsToNDArray() and add validation for vector sizes, add tests --- cpp/src/array_utils.h | 22 +++++++++++++++------ cpp/test/test_main.cpp | 45 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/cpp/src/array_utils.h b/cpp/src/array_utils.h index fe4dbd72..42717eff 100644 --- a/cpp/src/array_utils.h +++ b/cpp/src/array_utils.h @@ -310,18 +310,28 @@ std::string toFloatVectorString(std::vector vec) { vec.size()); } -/** - * Convert a 2D vector of float to NDArray - */ +/** Convert a 2D vector of float to NDArray */ NDArray vectorsToNDArray(std::vector> vectors) { int numVectors = vectors.size(); int dimensions = numVectors > 0 ? vectors[0].size() : 0; std::array shape = {numVectors, dimensions}; - // flatten the 2d array into the NDArray's underlying 1D vector - std::vector flatArray; + // Flatten the 2d array into the NDArray's underlying 1D vector + std::vector flatArray(numVectors * dimensions); + // Pointer to the beginning of the flat array + float *flatArrayPtr = flatArray.data(); for (const auto &vector : vectors) { - flatArray.insert(flatArray.end(), vector.begin(), vector.end()); + // check that all provided vectors are same size, using the 1st vector as + // the reference + if (vector.size() != dimensions) { + throw std::invalid_argument("All vectors must be of the same size, but " + "received vectors of size: " + + std::to_string(dimensions) + " and " + + std::to_string(vector.size()) + "."); + } + // Use std::memcpy to copy the elements directly into the flat array + std::memcpy(flatArrayPtr, vector.data(), vector.size() * sizeof(float)); + flatArrayPtr += vector.size(); // Increment the pointer } return NDArray(flatArray, shape); diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index 5b7c5c56..e35f81a1 100644 --- a/cpp/test/test_main.cpp +++ b/cpp/test/test_main.cpp @@ -27,7 +27,11 @@ void testQuery(TypedIndex &index, int numVectors, int numDimensions, SpaceType spaceType, StorageDataType storageType, bool testSingleVectorMethod, float precisionTolerance) { - // create test data and ids + /** + * Create test data and ids. If we are using Float8 or E4M3 storage, quantize + * the vector values, if we are using Float32 storage, keep the float values + * as-is. We want to match the storage type use case with the input data. + */ std::vector> inputData; if (storageType == StorageDataType::Float8 || storageType == StorageDataType::E4M3) { @@ -173,3 +177,42 @@ TEST_CASE("Test combinations of different instantiations. Test that each " } } } + +TEST_CASE("Test vectorsToNDArray converts 2D vector of float to NDArray") { + std::vector> vectors = {{1.0f, 2.0f, 3.0f, 4.0f}, + {5.0f, 6.0f, 7.0f, 8.0f}, + {9.0f, 10.0f, 11.0f, 12.0f}}; + NDArray ndArray = vectorsToNDArray(vectors); + REQUIRE(ndArray.shape.size() == 2); + REQUIRE(ndArray.shape[0] == 3); + REQUIRE(ndArray.shape[1] == 4); + REQUIRE(ndArray.data.size() == 12); + REQUIRE(ndArray.data[0] == 1.0f); + REQUIRE(ndArray.data[1] == 2.0f); + REQUIRE(ndArray.data[2] == 3.0f); + REQUIRE(ndArray.data[3] == 4.0f); + REQUIRE(ndArray.data[4] == 5.0f); + REQUIRE(ndArray.data[5] == 6.0f); + REQUIRE(ndArray.data[6] == 7.0f); + REQUIRE(ndArray.data[7] == 8.0f); + REQUIRE(ndArray.data[8] == 9.0f); + REQUIRE(ndArray.data[9] == 10.0f); + REQUIRE(ndArray.data[10] == 11.0f); + REQUIRE(ndArray.data[11] == 12.0f); + REQUIRE(*ndArray[0] == 1.0f); + REQUIRE(*ndArray[1] == 5.0f); + REQUIRE(*ndArray[2] == 9.0f); +} + +TEST_CASE("Test vectorsToNDArray throws error if vectors are not of the same " + "size") { + std::vector> vectors1 = {{1.0f, 2.0f, 3.0f, 4.0f}, + {5.0f, 6.0f, 7.0f}, + {9.0f, 10.0f, 11.0f, 12.0f}}; + REQUIRE_THROWS_AS(vectorsToNDArray(vectors1), std::invalid_argument); + + std::vector> vectors2 = { + {1.0f}, {5.0f, 6.0f, 7.0f}, {9.0f, 10.0f, 11.0f}}; + REQUIRE_THROWS_AS(vectorsToNDArray(vectors2), std::invalid_argument); +}