diff --git a/.gitignore b/.gitignore index f5287bc..f46320c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ java/classpath.txt java/linux-build/include/* python/voyager-headers .asv/ +*.dSYM # Cmake CMakeLists.txt.user diff --git a/cpp/src/Index.h b/cpp/src/Index.h index f6fe581..a8b82ee 100644 --- a/cpp/src/Index.h +++ b/cpp/src/Index.h @@ -71,6 +71,11 @@ class Index { virtual hnswlib::labeltype addItem(std::vector vector, std::optional id) = 0; + + virtual std::vector + addItems(std::vector> input, + std::vector ids = {}, int numThreads = -1) = 0; + virtual std::vector addItems(NDArray input, std::vector ids = {}, int numThreads = -1) = 0; @@ -86,6 +91,10 @@ class Index { virtual std::tuple, std::vector> query(std::vector queryVector, int k = 1, long queryEf = -1) = 0; + virtual std::tuple, NDArray> + query(std::vector> queryVectors, int k = 1, + int numThreads = -1, long queryEf = -1) = 0; + virtual std::tuple, NDArray> query(NDArray queryVectors, int k = 1, int numThreads = -1, long queryEf = -1) = 0; diff --git a/cpp/src/TypedIndex.h b/cpp/src/TypedIndex.h index 63066a4..c30ae97 100644 --- a/cpp/src/TypedIndex.h +++ b/cpp/src/TypedIndex.h @@ -290,6 +290,12 @@ class TypedIndex : public Index { return addItems(NDArray(vector, {1, (int)vector.size()}), ids)[0]; } + std::vector + addItems(const std::vector> vectors, + std::vector ids = {}, int numThreads = -1) { + return addItems(vectorsToNDArray(vectors), ids, numThreads); + } + std::vector addItems(NDArray floatInput, std::vector ids = {}, int numThreads = -1) { @@ -502,6 +508,12 @@ class TypedIndex : public Index { return algorithmImpl->label_lookup_; } + std::tuple, NDArray> + query(std::vector> floatQueryVectors, int k = 1, + int numThreads = -1, long queryEf = -1) { + return query(vectorsToNDArray(floatQueryVectors), k, numThreads, queryEf); + } + std::tuple, NDArray> query(NDArray floatQueryVectors, int k = 1, int numThreads = -1, long queryEf = -1) { diff --git a/cpp/src/array_utils.h 
b/cpp/src/array_utils.h index 7c2a755..42717ef 100644 --- a/cpp/src/array_utils.h +++ b/cpp/src/array_utils.h @@ -309,3 +309,30 @@ std::string toFloatVectorString(std::vector vec) { return toFloatVectorString(vec.data(), vec.size()); } + +/** Convert a 2D vector of float to NDArray */ +NDArray vectorsToNDArray(std::vector> vectors) { + int numVectors = vectors.size(); + int dimensions = numVectors > 0 ? vectors[0].size() : 0; + std::array shape = {numVectors, dimensions}; + + // Flatten the 2d array into the NDArray's underlying 1D vector + std::vector flatArray(numVectors * dimensions); + // Pointer to the beginning of the flat array + float *flatArrayPtr = flatArray.data(); + for (const auto &vector : vectors) { + // check that all provided vectors are same size, using the 1st vector as + // the reference + if (vector.size() != dimensions) { + throw std::invalid_argument("All vectors must be of the same size, but " + "received vectors of size: " + + std::to_string(dimensions) + " and " + + std::to_string(vector.size()) + "."); + } + // Use std::memcpy to copy the elements directly into the flat array + std::memcpy(flatArrayPtr, vector.data(), vector.size() * sizeof(float)); + flatArrayPtr += vector.size(); // Increment the pointer + } + + return NDArray(flatArray, shape); +} diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index a46805c..1606f1a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -4,6 +4,9 @@ set(TEST_FILES test_main.cpp doctest_setup.cpp) # Add any test files here # Create an executable for the tests add_executable(VoyagerTests ${TEST_FILES}) +# Add compiler flags +target_compile_options(VoyagerTests PRIVATE -g) + # Link the test executable with the main project and Doctest # target_link_libraries(MyProjectTests PRIVATE MyProject doctest::doctest) target_link_libraries(VoyagerTests diff --git a/cpp/test/test_main.cpp b/cpp/test/test_main.cpp index dbfa5dc..e35f81a 100644 --- a/cpp/test/test_main.cpp +++ 
b/cpp/test/test_main.cpp @@ -1,49 +1,175 @@ #include "doctest.h" #include "TypedIndex.h" +#include "test_utils.cpp" #include #include template > -void testCombination(TypedIndex &index, - SpaceType spaceType, int numDimensions, - StorageDataType storageType) { - CHECK(toString(index.getSpace()) == toString(spaceType)); - CHECK(index.getNumDimensions() == numDimensions); - CHECK(toString(index.getStorageDataType()) == toString(storageType)); +void testIndexProperties(TypedIndex &index, + SpaceType spaceType, int numDimensions, + StorageDataType storageType) { + REQUIRE(toString(index.getSpace()) == toString(spaceType)); + REQUIRE(index.getNumDimensions() == numDimensions); + REQUIRE(toString(index.getStorageDataType()) == toString(storageType)); } -TEST_CASE("Test combinations of different instantiations and sizes") { - std::vector spaceTypesSet = {SpaceType::Euclidean, - SpaceType::InnerProduct}; - std::vector numDimensionsSet = {4, 16, 128, 1024}; - std::vector numElementsSet = {100, 1000, 100000}; +/** + * Test the query method of the index. The index is populated with random + * vectors, and then queried with the same vectors. The expected result is that + * each vector's nearest neighbor is itself and that the distance is zero + * (allowing for some precision error based on the storage type). + */ +template > +void testQuery(TypedIndex &index, int numVectors, + int numDimensions, SpaceType spaceType, + StorageDataType storageType, bool testSingleVectorMethod, + float precisionTolerance) { + /** + * Create test data and ids. If we are using Float8 or E4M3 storage, quantize + * the vector values, if we are using Float32 storage, keep the float values + * as-is. We want to match the storage type use case with the input data. 
+ */ + std::vector> inputData; + if (storageType == StorageDataType::Float8 || + storageType == StorageDataType::E4M3) { + inputData = randomQuantizedVectors(numVectors, numDimensions); + } else if (storageType == StorageDataType::Float32) { + inputData = randomVectors(numVectors, numDimensions); + } + std::vector ids(numVectors); + for (int i = 0; i < numVectors; i++) { + ids[i] = i; + } + + // add items to index + if (testSingleVectorMethod == true) { + for (auto id : ids) { + index.addItem(inputData[id], id); + } + } else { + index.addItems(inputData, ids, -1); + } + + int k = 1; + float lowerBound = 0.0f - precisionTolerance; + float upperBound = 0.0f + precisionTolerance; + + // Use the single-query interface (query with a single target vector) + for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) { + for (int i = 0; i < numVectors; i++) { + + /** + * Use the raw inputData as target vectors for querying. We don't use the + * index data because once data has been added to the index, the model can + * change the "ground truth" by changing the data format. + */ + auto targetVector = inputData[i]; + auto nearestNeighbor = index.query(targetVector, k, queryEf); + + auto labels = std::get<0>(nearestNeighbor); + auto distances = std::get<1>(nearestNeighbor); + REQUIRE(labels.size() == k); + REQUIRE(distances.size() == k); + + /** + * E4M3 is too low precision for us to confidently assume that querying + * with the unquantized (fp32) vector will return the quantized vector as + * its NN. 
InnerProduct will have negative distance to the closest item, + * not zero + */ + if (storageType != StorageDataType::E4M3 && + spaceType != SpaceType::InnerProduct) { + REQUIRE(i == labels[0]); + REQUIRE(distances[0] >= lowerBound); + REQUIRE(distances[0] <= upperBound); + } + } + } + + // Use the bulk-query interface (query with multiple target vectors at once) + for (long queryEf = 100; queryEf <= numVectors; queryEf *= 10) { + auto nearestNeighbors = index.query( + inputData, /* k= */ k, /* numThreads= */ -1, /* queryEf= */ queryEf); + NDArray labels = std::get<0>(nearestNeighbors); + NDArray distances = std::get<1>(nearestNeighbors); + REQUIRE(labels.shape[0] == numVectors); + REQUIRE(labels.shape[1] == k); + REQUIRE(distances.shape[0] == numVectors); + REQUIRE(distances.shape[1] == k); + + for (int i = 0; i < numVectors; i++) { + auto label = labels.data[i]; + auto distance = distances.data[i]; + + /** + * E4M3 is too low precision for us to confidently assume that querying + * with the unquantized (fp32) vector will return the quantized vector + * as its NN. InnerProduct will have negative distance to the closest + * item, not zero + */ + if (storageType != StorageDataType::E4M3 && + spaceType != SpaceType::InnerProduct) { + REQUIRE(i == label); + REQUIRE(distance >= lowerBound); + REQUIRE(distance <= upperBound); + } + } + } +} + +TEST_CASE("Test combinations of different instantiations. 
Test that each " + "vector's NN is itself and distance is approximately zero.") { + std::unordered_map PRECISION_TOLERANCE_PER_DATA_TYPE = + {{StorageDataType::Float32, 0.00001f}, + {StorageDataType::Float8, 0.10f}, + {StorageDataType::E4M3, 0.20f}}; + std::vector spaceTypesSet = { + SpaceType::Euclidean, SpaceType::InnerProduct, SpaceType::Cosine}; + std::vector numDimensionsSet = {32}; + std::vector numVectorsSet = {2000}; std::vector storageTypesSet = { StorageDataType::Float8, StorageDataType::Float32, StorageDataType::E4M3}; - - auto count = 0; + std::vector testSingleVectorMethods = {true, false}; for (auto spaceType : spaceTypesSet) { - for (auto numDimensions : numDimensionsSet) { - for (auto numElements : numElementsSet) { - for (auto storageType : storageTypesSet) { - SUBCASE("Test instantiation ") { - CAPTURE(spaceType); - CAPTURE(numDimensions); - CAPTURE(numElements); - CAPTURE(storageType); - - if (storageType == StorageDataType::Float8) { - auto index = TypedIndex>( - spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); - } else if (storageType == StorageDataType::Float32) { - auto index = TypedIndex(spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); - } else if (storageType == StorageDataType::E4M3) { - auto index = TypedIndex(spaceType, numDimensions); - testCombination(index, spaceType, numDimensions, storageType); + for (auto storageType : storageTypesSet) { + for (auto numDimensions : numDimensionsSet) { + for (auto numVectors : numVectorsSet) { + for (auto testSingleVectorMethod : testSingleVectorMethods) { + + SUBCASE("Test instantiation ") { + CAPTURE(spaceType); + CAPTURE(numDimensions); + CAPTURE(numVectors); + CAPTURE(storageType); + CAPTURE(testSingleVectorMethod); + + if (storageType == StorageDataType::Float8) { + auto index = TypedIndex>( + spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, 
numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } else if (storageType == StorageDataType::Float32) { + auto index = TypedIndex(spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } else if (storageType == StorageDataType::E4M3) { + auto index = TypedIndex(spaceType, numDimensions); + testIndexProperties(index, spaceType, numDimensions, + storageType); + testQuery(index, numVectors, numDimensions, spaceType, + storageType, testSingleVectorMethod, + PRECISION_TOLERANCE_PER_DATA_TYPE[storageType]); + } } } } @@ -51,3 +177,42 @@ TEST_CASE("Test combinations of different instantiations and sizes") { } } } + +TEST_CASE("Test vectorsToNDArray converts 2D vector of float to NDArray") { + std::vector> vectors = {{1.0f, 2.0f, 3.0f, 4.0f}, + {5.0f, 6.0f, 7.0f, 8.0f}, + {9.0f, 10.0f, 11.0f, 12.0f}}; + NDArray ndArray = vectorsToNDArray(vectors); + REQUIRE(ndArray.shape.size() == 2); + REQUIRE(ndArray.shape[0] == 3); + REQUIRE(ndArray.shape[1] == 4); + REQUIRE(ndArray.data.size() == 12); + REQUIRE(ndArray.data[0] == 1.0f); + REQUIRE(ndArray.data[1] == 2.0f); + REQUIRE(ndArray.data[2] == 3.0f); + REQUIRE(ndArray.data[3] == 4.0f); + REQUIRE(ndArray.data[4] == 5.0f); + REQUIRE(ndArray.data[5] == 6.0f); + REQUIRE(ndArray.data[6] == 7.0f); + REQUIRE(ndArray.data[7] == 8.0f); + REQUIRE(ndArray.data[8] == 9.0f); + REQUIRE(ndArray.data[9] == 10.0f); + REQUIRE(ndArray.data[10] == 11.0f); + REQUIRE(ndArray.data[11] == 12.0f); + REQUIRE(*ndArray[0] == 1.0f); + REQUIRE(*ndArray[1] == 5.0f); + REQUIRE(*ndArray[2] == 9.0f); +} + +TEST_CASE("Test vectorsToNDArray throws error if vectors are not of the same " + "size") { + std::vector> vectors1 = {{1.0f, 2.0f, 3.0f, 4.0f}, + {5.0f, 6.0f, 7.0f}, + {9.0f, 10.0f, 11.0f, 
/**
 * Shared generator for the test-data helpers below.
 *
 * Draws one uniform sample from [0, 1) per element, row by row, and stores
 * `transform(sample)` in that element.
 *
 * @param numVectors number of rows to create.
 * @param dimensions number of elements per row.
 * @param seed       seed for the Mersenne Twister engine.
 * @param transform  callable mapping a double sample to the stored float.
 */
template <typename Transform>
std::vector<std::vector<float>> sampleVectors(int numVectors, int dimensions,
                                              unsigned int seed,
                                              Transform transform) {
  std::vector<std::vector<float>> vectors(numVectors,
                                          std::vector<float>(dimensions));

  std::mt19937 gen(seed);
  std::uniform_real_distribution<> dis(0, 1.0);

  for (auto &vector : vectors) {
    for (auto &value : vector) {
      value = transform(dis(gen));
    }
  }

  return vectors;
}

/**
 * Create test data intended for Float8 storage or E4M3 storage.
 *
 * Each sample is mapped to [-1, 1), scaled by 10, truncated to an integer and
 * divided by 10 again, so every value is a multiple of 0.1 in [-0.9, 0.9].
 * NOTE(review): the truncating cast is assumed to be to `int` -- confirm.
 *
 * @param seed optional; defaults to a fresh std::random_device value on every
 *             call (the previous behaviour). Pass a fixed value to reproduce
 *             a failing run.
 */
inline std::vector<std::vector<float>>
randomQuantizedVectors(int numVectors, int dimensions,
                       unsigned int seed = std::random_device{}()) {
  return sampleVectors(numVectors, dimensions, seed, [](double sample) {
    return static_cast<int>((sample * 2 - 1) * 10.0f) / 10.0f;
  });
}

/**
 * Create test data intended for Float32 storage: values in [-1, 1].
 *
 * @param seed optional; defaults to a fresh std::random_device value on every
 *             call (the previous behaviour). Pass a fixed value to reproduce
 *             a failing run.
 */
inline std::vector<std::vector<float>>
randomVectors(int numVectors, int dimensions,
              unsigned int seed = std::random_device{}()) {
  return sampleVectors(numVectors, dimensions, seed, [](double sample) {
    return static_cast<float>(sample) * 2 - 1;
  });
}