diff --git a/C/Cpp_include/c4Database.hh b/C/Cpp_include/c4Database.hh index 0e06b9833..4534e1eee 100644 --- a/C/Cpp_include/c4Database.hh +++ b/C/Cpp_include/c4Database.hh @@ -50,10 +50,12 @@ struct C4Database using Config = C4DatabaseConfig2; - /** Registers a directory path to load extension libraries from, such as Vector Search. - Must be called before opening a database that will use an extension. */ + // Deprecated in favor of enableExtension! static void setExtensionPath(slice path); + /** Attempts to discover and verify the named extension in the provided path */ + static void enableExtension(slice name, slice path); + static bool exists(slice name, slice inDirectory); static void copyNamed(slice sourcePath, slice destinationName, const Config&); static bool deleteNamed(slice name, slice inDirectory); diff --git a/C/Cpp_include/c4Index.hh b/C/Cpp_include/c4Index.hh index c3f25f838..f59e8c0de 100644 --- a/C/Cpp_include/c4Index.hh +++ b/C/Cpp_include/c4Index.hh @@ -33,20 +33,31 @@ struct C4Index slice getName() const noexcept { return _name; } + C4IndexType getType() const noexcept; + C4QueryLanguage getQueryLanguage() const noexcept; + slice getExpression() const noexcept; + + /// Writes the index options to `opts` and returns true. If there are none, returns false. + [[nodiscard]] bool getOptions(C4IndexOptions& opts) const noexcept; + #ifdef COUCHBASE_ENTERPRISE + bool isTrained() const; + /// Finds new or updated documents for which vectors need to be recomputed by the application. /// If there are none, returns NULL. /// @param limit The maximum number of documents/vectors to return. If this is less than /// the total number, the rest will be returned on the next call to `beginUpdate`. /// @warning Do not call `beginUpdate` again until you're done with the returned updater; /// it's not valid to have more than one update in progress at a time. 
- Retained beginUpdate(size_t limit); + Retained beginUpdate(size_t limit); #endif protected: friend class litecore::CollectionImpl; static Retained getIndex(C4Collection*, slice name); + C4Index(C4Collection* coll, std::string name) : _collection(coll), _name(std::move(name)) {} + Retained _collection; std::string _name; }; diff --git a/C/c4.exp b/C/c4.exp index a60ba7ae7..23cd233b4 100644 --- a/C/c4.exp +++ b/C/c4.exp @@ -402,6 +402,12 @@ _c4_dumpInstances _gC4ExpectExceptions _c4_setExtensionPath +_c4_enableExtension + +_c4index_getType +_c4index_getQueryLanguage +_c4index_getExpression +_c4index_getOptions _FLDoc_FromJSON _FLDoc_Retain diff --git a/C/c4CAPI.cc b/C/c4CAPI.cc index adebbc5f2..782d10164 100644 --- a/C/c4CAPI.cc +++ b/C/c4CAPI.cc @@ -374,6 +374,10 @@ C4SliceResult c4coll_getIndexesInfo(C4Collection* coll, C4Error* C4NULLABLE outE void c4_setExtensionPath(C4String path) noexcept { C4Database::setExtensionPath(path); } +bool c4_enableExtension(C4String name, C4String extensionPath, C4Error* outError) noexcept { + return tryCatch(outError, [=] { C4Database::enableExtension(name, extensionPath); }); +} + bool c4db_exists(C4String name, C4String inDirectory) noexcept { return C4Database::exists(name, inDirectory); } bool c4key_setPassword(C4EncryptionKey* outKey, C4String password, C4EncryptionAlgorithm alg) noexcept { @@ -553,7 +557,7 @@ bool c4db_createIndex2(C4Database* database, C4Slice name, C4Slice indexSpec, C4 } bool c4coll_isIndexTrained(C4Collection* collection, C4Slice name, C4Error* outError) noexcept { - memset(outError, 0, sizeof(C4Error)); + if ( outError ) *outError = kC4NoError; return tryCatch(outError, [=] { return collection->isIndexTrained(name); }); } @@ -901,6 +905,22 @@ C4Document* c4enum_getDocument(C4DocEnumerator* e, C4Error* outError) noexcept { }); } +#pragma mark - INDEXES: + +C4IndexType c4index_getType(C4Index* index) C4API { return index->getType(); } + +C4QueryLanguage c4index_getQueryLanguage(C4Index* index) C4API { 
return index->getQueryLanguage(); } + +C4String c4index_getExpression(C4Index* index) C4API { return index->getExpression(); } + +bool c4index_getOptions(C4Index* index, C4IndexOptions* outOpts) C4API { return index->getOptions(*outOpts); } + +#ifdef COUCHBASE_ENTERPRISE +bool c4index_isTrained(C4Index* index, C4Error* C4NULLABLE outError) C4API { + return c4coll_isIndexTrained(index->getCollection(), index->getName(), outError); +} +#endif + #pragma mark - OBSERVERS: C4DatabaseObserver* c4dbobs_createOnCollection(C4Collection* coll, C4CollectionObserverCallback callback, diff --git a/C/c4Database.cc b/C/c4Database.cc index 61c9c76f4..f07294e34 100644 --- a/C/c4Database.cc +++ b/C/c4Database.cc @@ -62,6 +62,10 @@ C4EncryptionKey C4EncryptionKeyFromPasswordSHA1(slice password, C4EncryptionAlgo void C4Database::setExtensionPath(slice path) { SQLiteDataFile::setExtensionPath(string(path)); } +void C4Database::enableExtension(slice name, slice path) { + SQLiteDataFile::enableExtension(string(name), string(path)); +} + #pragma mark - STATIC LIFECYCLE METHODS: static FilePath dbPath(slice name, slice parentDir) { diff --git a/C/c4Index.cc b/C/c4Index.cc index cb55921f3..0356f73da 100644 --- a/C/c4Index.cc +++ b/C/c4Index.cc @@ -28,9 +28,70 @@ using namespace fleece; using namespace litecore; struct C4IndexImpl final : public C4Index { - C4IndexImpl(C4Collection* c, slice name) : _spec(asInternal(c)->keyStore().getIndex(name)) { - _collection = c; - _name = name; + C4IndexImpl(C4Collection* c, IndexSpec spec) : C4Index(c, spec.name), _spec(std::move(spec)) {} + + C4IndexType getType() const noexcept { return C4IndexType(_spec.type); } + + C4QueryLanguage getQueryLanguage() const noexcept { return C4QueryLanguage(_spec.queryLanguage); } + + slice getExpression() const noexcept { return _spec.expression; } + + bool getOptions(C4IndexOptions& opts) const noexcept { + opts = {}; + if ( auto ftsOpts = _spec.ftsOptions() ) { + opts.language = ftsOpts->language; + 
opts.ignoreDiacritics = ftsOpts->ignoreDiacritics; + opts.disableStemming = ftsOpts->disableStemming; + opts.stopWords = ftsOpts->stopWords; + return true; + +#ifdef COUCHBASE_ENTERPRISE + } else if ( auto vecOpts = _spec.vectorOptions() ) { + opts.vector.dimensions = vecOpts->dimensions; + opts.vector.metric = C4VectorMetricType(int(vecOpts->metric) + 1); + opts.vector.clustering.type = C4VectorClusteringType(vecOpts->clusteringType()); + switch ( vecOpts->clusteringType() ) { + case vectorsearch::ClusteringType::Flat: + { + auto flat = std::get(vecOpts->clustering); + opts.vector.clustering.flat_centroids = flat.numCentroids; + break; + } + case vectorsearch::ClusteringType::MultiIndex: + { + auto multi = std::get(vecOpts->clustering); + opts.vector.clustering.multi_bits = multi.bitsPerSub; + opts.vector.clustering.multi_subquantizers = multi.subquantizers; + break; + } + } + opts.vector.encoding.type = C4VectorEncodingType(vecOpts->encodingType()); + switch ( vecOpts->encodingType() ) { + case vectorsearch::EncodingType::None: + break; + case vectorsearch::EncodingType::PQ: + { + auto pq = std::get(vecOpts->encoding); + opts.vector.encoding.pq_subquantizers = pq.subquantizers; + opts.vector.encoding.bits = pq.bitsPerSub; + break; + } + case vectorsearch::EncodingType::SQ: + { + auto sq = std::get(vecOpts->encoding); + opts.vector.encoding.bits = sq.bitsPerDimension; + break; + } + } + if ( vecOpts->probeCount ) opts.vector.numProbes = *vecOpts->probeCount; + if ( vecOpts->minTrainingCount ) opts.vector.minTrainingSize = unsigned(*vecOpts->minTrainingCount); + if ( vecOpts->maxTrainingCount ) opts.vector.maxTrainingSize = unsigned(*vecOpts->maxTrainingCount); + opts.vector.lazy = vecOpts->lazyEmbedding; + return true; +#endif + } else { + return false; + } } #ifdef COUCHBASE_ENTERPRISE @@ -43,21 +104,35 @@ struct C4IndexImpl final : public C4Index { } #endif - optional _spec; + IndexSpec _spec; Retained _lazy; }; -inline C4IndexImpl* asInternal(C4Index* index) { 
return static_cast(index); } +inline C4IndexImpl* asInternal(C4Index* i) { return static_cast(i); } -Retained C4Index::getIndex(C4Collection* c, slice name) { - Retained index = new C4IndexImpl(c, name); - if ( !index->_spec ) index = nullptr; - return index; +inline C4IndexImpl const* asInternal(C4Index const* i) { return static_cast(i); } + +/*static*/ Retained C4Index::getIndex(C4Collection* c, slice name) { + if ( optional spec = asInternal(c)->keyStore().getIndex(name) ) { + return new C4IndexImpl(c, *std::move(spec)); + } else { + return nullptr; + } } +C4IndexType C4Index::getType() const noexcept { return asInternal(this)->getType(); } + +C4QueryLanguage C4Index::getQueryLanguage() const noexcept { return asInternal(this)->getQueryLanguage(); } + +slice C4Index::getExpression() const noexcept { return asInternal(this)->getExpression(); } + +bool C4Index::getOptions(C4IndexOptions& opts) const noexcept { return asInternal(this)->getOptions(opts); } + #ifdef COUCHBASE_ENTERPRISE +bool C4Index::isTrained() const { return _collection->isIndexTrained(_name); } + Retained C4Index::beginUpdate(size_t limit) { return asInternal(this)->beginUpdate(limit); } C4IndexUpdater::C4IndexUpdater(Retained u, C4Collection* c) diff --git a/C/c4_ee.exp b/C/c4_ee.exp index 87b76ca67..10cf320e9 100644 --- a/C/c4_ee.exp +++ b/C/c4_ee.exp @@ -443,6 +443,12 @@ _c4_dumpInstances _gC4ExpectExceptions _c4_setExtensionPath +_c4_enableExtension + +_c4index_getType +_c4index_getQueryLanguage +_c4index_getExpression +_c4index_getOptions _FLDoc_FromJSON _FLDoc_Retain @@ -480,7 +486,9 @@ _c4keypair_privateKeyData _c4keypair_publicKeyData _c4keypair_publicKeyDigest +_c4index_isTrained _c4index_beginUpdate + _c4indexupdater_count _c4indexupdater_valueAt _c4indexupdater_setVectorAt diff --git a/C/include/c4Collection.h b/C/include/c4Collection.h index c10f432b2..1f29364da 100644 --- a/C/include/c4Collection.h +++ b/C/include/c4Collection.h @@ -13,7 +13,7 @@ #pragma once #include 
"c4DatabaseTypes.h" #include "c4DocumentTypes.h" -#include "c4IndexTypes.h" +#include "fleece/Fleece.h" C4_ASSUME_NONNULL_BEGIN C4API_BEGIN_DECLS @@ -26,6 +26,11 @@ C4API_BEGIN_DECLS Observer-related functions are in c4Observer.h: - c4dbobs_createOnCollection - c4docobs_createWithCollection + Index-related functions are in c4Index.h: + - c4coll_createIndex + - c4coll_deleteIndex + - c4coll_getIndex + - c4coll_getIndexesInfo */ @@ -263,53 +268,6 @@ CBL_CORE_API C4Timestamp c4coll_nextDocExpiration(C4Collection*) C4API; NODISCARD CBL_CORE_API int64_t c4coll_purgeExpiredDocs(C4Collection*, C4Error* C4NULLABLE) C4API; -/** @} */ -/** \name Indexes - @{ */ - - -/** Creates a collection index, of the values of specific expressions across all documents. - The name is used to identify the index for later updating or deletion; if an index with the - same name already exists, it will be replaced unless it has the exact same expressions. - - The `indexSpec` argument is an expression, relative to a document, that describes what to index. - It can be in either the JSON query schema, or in N1QL syntax. It usually names a property, - but may also be a computed value based on properties. - - @param collection The collection to index. - @param name The name of the index. Any existing index with the same name will be replaced, - unless it has the identical expressions (in which case this is a no-op.) - @param indexSpec The definition of the index in JSON or N1QL form. (See above.) - @param queryLanguage The language of `indexSpec`, either JSON or N1QL. - @param indexType The type of index (value full-text, etc.) - @param indexOptions Options for the index. If NULL, each option will get a default value. - @param outError On failure, will be set to the error status. - @return True on success, false on failure. 
*/ -NODISCARD CBL_CORE_API bool c4coll_createIndex(C4Collection* collection, C4String name, C4String indexSpec, - C4QueryLanguage queryLanguage, C4IndexType indexType, - const C4IndexOptions* C4NULLABLE indexOptions, - C4Error* C4NULLABLE outError) C4API; - -/** Returns an object representing an existing index. */ -CBL_CORE_API C4Index* C4NULLABLE c4coll_getIndex(C4Collection* collection, C4String name, - C4Error* C4NULLABLE outError) C4API; - -/** Deletes an index that was created by `c4coll_createIndex`. - @param collection The collection to index. - @param name The name of the index to delete - @param outError On failure, will be set to the error status. - @return True on success, false on failure. */ -NODISCARD CBL_CORE_API bool c4coll_deleteIndex(C4Collection* collection, C4String name, - C4Error* C4NULLABLE outError) C4API; - -/** Returns information about all indexes in the collection. - The result is a Fleece-encoded array of dictionaries, one per index. - Each dictionary has keys `"name"`, `"type"` (a `C4IndexType`), and `"expr"` (the source expression). - @param collection The collection to check - @param outError On failure, will be set to the error status. - @return A Fleece-encoded array of dictionaries, or NULL on failure. */ -CBL_CORE_API C4SliceResult c4coll_getIndexesInfo(C4Collection* collection, C4Error* C4NULLABLE outError) C4API; - /** @} */ /** @} */ // end Collections group diff --git a/C/include/c4Database.h b/C/include/c4Database.h index c9f4718c3..48fa8aa8b 100644 --- a/C/include/c4Database.h +++ b/C/include/c4Database.h @@ -45,10 +45,21 @@ NODISCARD CBL_CORE_API bool c4key_setPassword(C4EncryptionKey* encryptionKey, C4 NODISCARD CBL_CORE_API bool c4key_setPasswordSHA1(C4EncryptionKey* encryptionKey, C4String password, C4EncryptionAlgorithm alg) C4API; -/** Registers a directory path to load extension libraries from, such as Vector Search. - Must be called before opening a database that will use an extension. 
*/ +// Deprecated in favor of c4_enableExtension CBL_CORE_API void c4_setExtensionPath(C4String path) C4API; +/** Asks LiteCore to look for and validate the presence of an extension given the name + * of the extension and the path in which it is supposed to reside. It makes an attempt + * to only check things that have the possibility of being corrected by the user (i.e. + * if there is a bug in the extension and it cannot load functionally that won't be caught) + * @param name The name of the extension (corresponds to the lower case of the filename + * without the extension) + * @param extensionPath The path in which the extension should be found + * @param outError On failure, will store the error. + * @return True on success, false on failure + */ +CBL_CORE_API bool c4_enableExtension(C4String name, C4String extensionPath, C4Error* C4NULLABLE outError) C4API; + /** @} */ //////// DATABASE API: diff --git a/C/include/c4Error.h b/C/include/c4Error.h index 27aaf1a6d..daf843d9c 100644 --- a/C/include/c4Error.h +++ b/C/include/c4Error.h @@ -186,6 +186,13 @@ typedef struct C4Error { #endif } C4Error; +/** A C4Error with every field zeroed, i.e. "no error". ({0} rather than {} so it is valid C before C23.) */ +#ifdef __cplusplus +static constexpr C4Error kC4NoError = {}; +#else +# define kC4NoError ((C4Error){0}) +#endif + // C4Error C API: diff --git a/C/include/c4Index.h b/C/include/c4Index.h index f92014902..07e54920d 100644 --- a/C/include/c4Index.h +++ b/C/include/c4Index.h @@ -16,111 +16,120 @@ C4_ASSUME_NONNULL_BEGIN C4API_BEGIN_DECLS -/** \defgroup Indexing Database Indexes +//======== C4Collection Methods: + +/** \defgroup Indexing Indexes @{ */ +/** Creates a collection index, of the values of specific expressions across all documents. + The name is used to identify the index for later updating or deletion; if an index with the + same name already exists, it will be replaced unless it has the exact same expressions. 
+ + Currently five types of indexes are supported: + + * Value indexes speed up queries by making it possible to look up property (or expression) + values without scanning every document. They're just like regular indexes in SQL or N1QL. + Multiple expressions are supported; the first is the primary key, second is secondary. + Expressions must evaluate to scalar types (boolean, number, string). + * Full-Text Search (FTS) indexes enable fast search of natural-language words or phrases + by using the `MATCH` operator in a query. A FTS index is **required** for full-text + search: a query with a `MATCH` operator will fail to compile unless there is already a + FTS index for the property/expression being matched. Only a single expression is + currently allowed, and it must evaluate to a string. + * Array indexes optimize UNNEST queries, by materializing an unnested array property + (across all documents) as a table in the SQLite database, and creating a SQL index on it. + * Predictive indexes optimize queries that use the PREDICTION() function, by materializing + the function's results as a table and creating a SQL index on a result property. + * Vector indexes store high-dimensional vectors/embeddings and support efficient Approximate + Nearest Neighbor (ANN) queries for finding the nearest vectors to a query vector. + + Note: If some documents are missing the values to be indexed, + those documents will just be omitted from the index. It's not an error. + + In an array index, the first expression must evaluate to an array to be unnested; it's + usually a property path but could be some other expression type. If the array items are + nonscalar (dictionaries or arrays), you should add a second expression defining the sub- + property (or computed value) to index, relative to the array item. + + In a predictive index, the expression is a PREDICTION() call in JSON query syntax, + including the optional 3rd parameter that gives the result property to extract (and index.) 
+ + The `indexSpec` argument is an expression, relative to a document, that describes what to index. + It can be in either the JSON query schema, or in N1QL syntax. It usually names a property, + but may also be a computed value based on properties. + + @param collection The collection to index. + @param name The name of the index. Any existing index with the same name will be replaced, + unless it has the identical expressions (in which case this is a no-op.) + @param indexSpec The definition of the index in JSON or N1QL form. (See above.) + @param queryLanguage The language of `indexSpec`, either JSON or N1QL. + @param indexType The type of index (value, full-text, etc.) + @param indexOptions Options for the index. If NULL, each option will get a default value. + @param outError On failure, will be set to the error status. + @return True on success, false on failure. */ +NODISCARD CBL_CORE_API bool c4coll_createIndex(C4Collection* collection, C4String name, C4String indexSpec, + C4QueryLanguage queryLanguage, C4IndexType indexType, + const C4IndexOptions* C4NULLABLE indexOptions, + C4Error* C4NULLABLE outError) C4API; + +/** Returns an object representing an existing index. */ +CBL_CORE_API C4Index* C4NULLABLE c4coll_getIndex(C4Collection* collection, C4String name, + C4Error* C4NULLABLE outError) C4API; + +/** Deletes an index that was created by `c4coll_createIndex`. + @param collection The collection containing the index. + @param name The name of the index to delete + @param outError On failure, will be set to the error status. + @return True on success, false on failure. */ +NODISCARD CBL_CORE_API bool c4coll_deleteIndex(C4Collection* collection, C4String name, + C4Error* C4NULLABLE outError) C4API; -/** Creates a database index, of the values of specific expressions across all documents. - The name is used to identify the index for later updating or deletion; if an index with the - same name already exists, it will be replaced unless it has the exact same expressions. 
- - Currently four types of indexes are supported: - - * Value indexes speed up queries by making it possible to look up property (or expression) - values without scanning every document. They're just like regular indexes in SQL or N1QL. - Multiple expressions are supported; the first is the primary key, second is secondary. - Expressions must evaluate to scalar types (boolean, number, string). - * Full-Text Search (FTS) indexes enable fast search of natural-language words or phrases - by using the `MATCH` operator in a query. A FTS index is **required** for full-text - search: a query with a `MATCH` operator will fail to compile unless there is already a - FTS index for the property/expression being matched. Only a single expression is - currently allowed, and it must evaluate to a string. - * Array indexes optimize UNNEST queries, by materializing an unnested array property - (across all documents) as a table in the SQLite database, and creating a SQL index on it. - * Predictive indexes optimize queries that use the PREDICTION() function, by materializing - the function's results as a table and creating a SQL index on a result property. - - Note: If some documents are missing the values to be indexed, - those documents will just be omitted from the index. It's not an error. - - In an array index, the first expression must evaluate to an array to be unnested; it's - usually a property path but could be some other expression type. If the array items are - nonscalar (dictionaries or arrays), you should add a second expression defining the sub- - property (or computed value) to index, relative to the array item. - - In a predictive index, the expression is a PREDICTION() call in JSON query syntax, - including the optional 3rd parameter that gives the result property to extract (and index.) - - `indexSpecJSON` specifies the index as a JSON object, with properties: - * `WHAT`: An array of expressions in the JSON query syntax. 
(Note that each - expression is already an array, so there are two levels of nesting.) - * `WHERE`: An optional expression. Including this creates a _partial index_: documents - for which this expression returns `false` or `null` will be skipped. - - For backwards compatibility, `indexSpecJSON` may be an array; this is treated as if it were - a dictionary with a `WHAT` key mapping to that array. - - Expressions are defined in JSON, as in a query, and wrapped in a JSON array. For example, - `[[".name.first"]]` will index on the first-name property. Note the two levels of brackets, - since an expression is already an array. - - @param database The database to index. - @param name The name of the index. Any existing index with the same name will be replaced, - unless it has the identical expressions (in which case this is a no-op.) - @param indexSpecJSON The definition of the index in JSON form. (See above.) - @param indexType The type of index (value or full-text.) - @param indexOptions Options for the index. If NULL, each option will get a default value. - @param outError On failure, will be set to the error status. - @return True on success, false on failure. */ -NODISCARD CBL_CORE_API bool c4db_createIndex(C4Database* database, C4String name, C4String indexSpecJSON, - C4IndexType indexType, const C4IndexOptions* C4NULLABLE indexOptions, - C4Error* C4NULLABLE outError) C4API; +/** Returns information about all indexes in the collection. + The result is a Fleece-encoded array of dictionaries, one per index. + Each dictionary has keys `"name"`, `"type"` (a `C4IndexType`), and `"expr"` (the source expression). + @param collection The collection to check + @param outError On failure, will be set to the error status. + @return A Fleece-encoded array of dictionaries, or NULL on failure. */ +CBL_CORE_API C4SliceResult c4coll_getIndexesInfo(C4Collection* collection, C4Error* C4NULLABLE outError) C4API; -/** @param database The database to index. 
- @param name The name of the index. Any existing index with the same name will be replaced, - unless it has the identical expressions (in which case this is a no-op.) - @param indexSpec The definition of the index in JSON or N1QL form. (See above.) - @param queryLanguage The query language (JSON or N1QL) of `indexSpec` is expressed. - @param indexType The type of index (value or full-text.) - @param indexOptions Options for the index. If NULL, each option will get a default value. - @param outError On failure, will be set to the error status. - @return True on success, false on failure. */ -NODISCARD CBL_CORE_API bool c4db_createIndex2(C4Database* database, C4String name, C4String indexSpec, - C4QueryLanguage queryLanguage, C4IndexType indexType, - const C4IndexOptions* C4NULLABLE indexOptions, - C4Error* C4NULLABLE outError) C4API; +/** Returns information about all indexes in the database. + The result is a Fleece-encoded array of dictionaries, one per index. + Each dictionary has keys `"name"`, `"type"` (a `C4IndexType`), and `"expr"` (the source expression). + @param database The database to check + @param outError On failure, will be set to the error status. + @return A Fleece-encoded array of dictionaries, or NULL on failure. */ +CBL_CORE_API C4SliceResult c4db_getIndexesInfo(C4Database* database, C4Error* C4NULLABLE outError) C4API; +//======== C4Index Methods: -/** Deletes an index that was created by `c4db_createIndex`. - @param database The database to index. - @param name The name of the index to delete - @param outError On failure, will be set to the error status. - @return True on success, false on failure. */ -NODISCARD CBL_CORE_API bool c4db_deleteIndex(C4Database* database, C4String name, C4Error* C4NULLABLE outError) C4API; +/** Returns the index's type. */ +CBL_CORE_API C4IndexType c4index_getType(C4Index*) C4API; +/** Returns the index's query language (JSON or N1QL). 
*/ +CBL_CORE_API C4QueryLanguage c4index_getQueryLanguage(C4Index*) C4API; -/** Returns information about all indexes in the database. - The result is a Fleece-encoded array of dictionaries, one per index. - Each dictionary has keys `"name"`, `"type"` (a `C4IndexType`), and `"expr"` (the source expression). - @param database The database to check - @param outError On failure, will be set to the error status. - @return A Fleece-encoded array of dictionaries, or NULL on failure. */ -CBL_CORE_API C4SliceResult c4db_getIndexesInfo(C4Database* database, C4Error* C4NULLABLE outError) C4API; +/** Returns the indexed expression. */ +CBL_CORE_API C4String c4index_getExpression(C4Index*) C4API; -/** Returns whether or not a given vector index is trained - * If the index doesn't exist, or is not a vector index, then this method will - * return false with an appropriate error set. Otherwise, in the absence of errors, - * this method will zero the error and set the return value. - * @param collection The collection to look up the index in - * @param name The name of the index to check - * @param outError On failure, will be set to the error status - * @return true if the index is trained, false if the index was not valid or is not yet trained - */ -CBL_CORE_API bool c4coll_isIndexTrained(C4Collection* collection, C4String name, C4Error* C4NULLABLE outError) C4API; +/** Gets the index's FTS/vector options, if any. + @param index The index. + @param outOpts The options will be written here, if they exist. + @returns True if there are options, false if not. */ +CBL_CORE_API bool c4index_getOptions(C4Index* index, C4IndexOptions* outOpts) C4API; #ifdef COUCHBASE_ENTERPRISE +/** Returns whether a vector index has been trained yet or not. + If the index doesn't exist, or is not a vector index, then this method will + return false with an appropriate error set. Otherwise, in the absence of errors, + this method will zero the error and set the return value. 
*/ +CBL_CORE_API bool c4index_isTrained(C4Index*, C4Error* C4NULLABLE outError) C4API; + + +//======== UPDATING LAZY INDEXES: + + /** Finds new or updated documents for which vectors need to be recomputed by the application. If there are none, returns NULL. If it returns a non-NULL `C4IndexUpdater` object pointer, you should: @@ -130,11 +139,18 @@ CBL_CORE_API bool c4coll_isIndexTrained(C4Collection* collection, C4String name, 2. Call `finish` to apply the updates to the index. 3. Release the `C4IndexUpdater`, of course. + @note The updater is not guaranteed to find all of the unindexed documents at once! It may + return less than the limit, even if more exist. It _is_ guaranteed to make progress, + by returning _some_ unindexed documents if there are any. The intention is that the app + will continue updating the index periodically until this call returns NULL, signaling + that the index is now up-to-date. + @param index The index to update; must be a vector index with the lazy attribute. @param limit The maximum number of out-of-date documents to include. @param outError On failure, will be set to the error status. @return A new `C4IndexUpdater` reference, or NULL if there's nothing to update. */ -CBL_CORE_API C4IndexUpdater* C4NULLABLE c4index_beginUpdate(C4Index* index, size_t limit, C4Error* outError) C4API; +NODISCARD CBL_CORE_API C4IndexUpdater* C4NULLABLE c4index_beginUpdate(C4Index* index, size_t limit, + C4Error* outError) C4API; /** * Return the name of this index. @@ -156,7 +172,7 @@ CBL_CORE_API size_t c4indexupdater_count(C4IndexUpdater* updater) C4API; @returns A Fleece value: the value of the index's query expression evaluated on the i'th document. Internally this value is part of a query result. It remains valid until the index updater is released. If you want to keep it longer, retain it with `FLRetain`. 
*/ -CBL_CORE_API FLValue c4indexupdater_valueAt(C4IndexUpdater* updater, size_t i) C4API; +NODISCARD CBL_CORE_API FLValue c4indexupdater_valueAt(C4IndexUpdater* updater, size_t i) C4API; /** Sets the vector for the i'th value. If you don't call this, it's assumed there is no vector, and any existing vector will be removed upon `finish`. @@ -166,8 +182,9 @@ CBL_CORE_API FLValue c4indexupdater_valueAt(C4IndexUpdater* updater, size_t i) C @param dimension The dimension of `vector`; must be equal to the index's declared dimension. @param outError On failure, will be set to the error status. @return True on success, false on failure. */ -CBL_CORE_API bool c4indexupdater_setVectorAt(C4IndexUpdater* updater, size_t i, const float vector[C4NULLABLE], - size_t dimension, C4Error* outError) C4API; +NODISCARD CBL_CORE_API bool c4indexupdater_setVectorAt(C4IndexUpdater* updater, size_t i, + const float vector[C4NULLABLE], size_t dimension, + C4Error* outError) C4API; /** Explicitly skips updating the i'th vector. No index entry will be created or deleted. The vector still needs to be recomputed, and will be included in the next update request. 
@@ -193,6 +210,21 @@ CBL_CORE_API bool c4indexupdater_finish(C4IndexUpdater* updater, C4Error* outErr #endif + +#ifndef C4_STRICT_COLLECTION_API +//======== SEMI-DEPRECATED DATABASE METHODS: +NODISCARD CBL_CORE_API bool c4db_createIndex(C4Database* database, C4String name, C4String indexSpecJSON, + C4IndexType indexType, const C4IndexOptions* C4NULLABLE indexOptions, + C4Error* C4NULLABLE outError) C4API; +NODISCARD CBL_CORE_API bool c4db_createIndex2(C4Database* database, C4String name, C4String indexSpec, + C4QueryLanguage queryLanguage, C4IndexType indexType, + const C4IndexOptions* C4NULLABLE indexOptions, + C4Error* C4NULLABLE outError) C4API; +NODISCARD CBL_CORE_API bool c4db_deleteIndex(C4Database* database, C4String name, C4Error* C4NULLABLE outError) C4API; +#endif + +CBL_CORE_API bool c4coll_isIndexTrained(C4Collection* collection, C4String name, C4Error* C4NULLABLE outError) C4API; + /** @} */ C4API_END_DECLS diff --git a/C/include/c4IndexTypes.h b/C/include/c4IndexTypes.h index 43f1fb604..e971f7a7b 100644 --- a/C/include/c4IndexTypes.h +++ b/C/include/c4IndexTypes.h @@ -28,7 +28,7 @@ typedef C4_ENUM(uint32_t, C4IndexType){ kC4ArrayIndex, ///< Index of array values, for use with UNNEST kC4PredictiveIndex, ///< Index of prediction() results (Enterprise Edition only) kC4VectorIndex, ///< Index of ML vector similarity (Enterprise Edition only) -}; +}; // Values must match litecore::IndexSpec::Type! #ifdef COUCHBASE_ENTERPRISE @@ -37,7 +37,7 @@ typedef C4_ENUM(uint32_t, C4VectorMetricType){ kC4VectorMetricDefault, ///< Use default metric, Euclidean kC4VectorMetricEuclidean, ///< Euclidean distance (squared) kC4VectorMetricCosine, ///< Cosine distance (1.0 - cosine similarity) -}; // Values must match IndexSpec::VectorOptions::MetricType +}; // Values DO NOT match IndexSpec::VectorOptions::MetricType! /** Types of clustering in vector indexes. 
There is no default type because you must fill in the C4VectorClustering struct with a number of centroids or subquantizers+bits. */ diff --git a/C/scripts/c4.txt b/C/scripts/c4.txt index 95ae9dadb..1e7587c21 100644 --- a/C/scripts/c4.txt +++ b/C/scripts/c4.txt @@ -411,6 +411,11 @@ gC4ExpectExceptions c4_setExtensionPath +c4index_getType +c4index_getQueryLanguage +c4index_getExpression +c4index_getOptions + FLDoc_FromJSON FLDoc_Retain FLDoc_GetAllocedData diff --git a/C/scripts/c4_ee.txt b/C/scripts/c4_ee.txt index 4f468eb96..7540894d1 100644 --- a/C/scripts/c4_ee.txt +++ b/C/scripts/c4_ee.txt @@ -57,7 +57,9 @@ c4keypair_privateKeyData c4keypair_publicKeyData c4keypair_publicKeyDigest +c4index_isTrained c4index_beginUpdate + c4indexupdater_count c4indexupdater_valueAt c4indexupdater_setVectorAt diff --git a/C/tests/c4DatabaseTest.cc b/C/tests/c4DatabaseTest.cc index 350e7b04f..d2248bd3a 100644 --- a/C/tests/c4DatabaseTest.cc +++ b/C/tests/c4DatabaseTest.cc @@ -16,7 +16,7 @@ #include "c4Test.hh" // IWYU pragma: keep #include "c4DocEnumerator.h" #include "c4BlobStore.h" -#include "c4IndexTypes.h" +#include "c4Index.h" #include "c4Query.h" #include "c4Collection.h" #include "Error.hh" diff --git a/CMakeLists.txt b/CMakeLists.txt index b17ca1c3f..9b03a01be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,7 @@ set( vendor/mbedtls/include vendor/mbedtls/crypto/include vendor/sockpp/include + vendor/vector_search ) target_include_directories( diff --git a/LiteCore/Database/CollectionImpl.hh b/LiteCore/Database/CollectionImpl.hh index b6cd7e51d..f97e7a704 100644 --- a/LiteCore/Database/CollectionImpl.hh +++ b/LiteCore/Database/CollectionImpl.hh @@ -416,24 +416,47 @@ namespace litecore { break; case kC4VectorIndex: if ( indexOptions ) { - auto& c4Opt = indexOptions->vector; - auto& vecOpt = options.emplace(c4Opt.dimensions); - vecOpt.metric = IndexSpec::VectorOptions::MetricType(c4Opt.metric); - - vecOpt.clustering.type =
IndexSpec::VectorOptions::ClusteringType(c4Opt.clustering.type); - vecOpt.clustering.flat_centroids = c4Opt.clustering.flat_centroids; - vecOpt.clustering.multi_subquantizers = c4Opt.clustering.multi_subquantizers; - vecOpt.clustering.multi_bits = c4Opt.clustering.multi_bits; - - vecOpt.encoding.type = IndexSpec::VectorOptions::EncodingType(c4Opt.encoding.type); - vecOpt.encoding.pq_subquantizers = c4Opt.encoding.pq_subquantizers; - vecOpt.encoding.bits = c4Opt.encoding.bits; - - vecOpt.minTrainingSize = c4Opt.minTrainingSize; - vecOpt.maxTrainingSize = c4Opt.maxTrainingSize; - vecOpt.numProbes = c4Opt.numProbes; - - vecOpt.lazy = c4Opt.lazy; + auto& c4Opt = indexOptions->vector; + auto& vecOpt = options.emplace(); + vecOpt.dimensions = c4Opt.dimensions; + switch ( c4Opt.metric ) { + case kC4VectorMetricEuclidean: + vecOpt.metric = vectorsearch::Metric::Euclidean2; + break; + case kC4VectorMetricCosine: + vecOpt.metric = vectorsearch::Metric::Cosine; + break; + case kC4VectorMetricDefault: + break; // leave vecOpt.metric as constructed + } + switch ( c4Opt.clustering.type ) { + case kC4VectorClusteringFlat: + vecOpt.clustering = vectorsearch::FlatClustering{c4Opt.clustering.flat_centroids}; + break; + case kC4VectorClusteringMulti: + vecOpt.clustering = vectorsearch::MultiIndexClustering{ + c4Opt.clustering.multi_subquantizers, c4Opt.clustering.multi_bits}; + break; + } + switch ( c4Opt.encoding.type ) { + case kC4VectorEncodingNone: + vecOpt.encoding = vectorsearch::NoEncoding{}; + break; + case kC4VectorEncodingPQ: + vecOpt.encoding = + vectorsearch::PQEncoding{c4Opt.encoding.pq_subquantizers, c4Opt.encoding.bits}; + break; + case kC4VectorEncodingSQ: + vecOpt.encoding = vectorsearch::SQEncoding{c4Opt.encoding.bits}; + break; + case kC4VectorEncodingDefault: + break; + } + vecOpt.minTrainingCount = c4Opt.minTrainingSize; + vecOpt.maxTrainingCount = c4Opt.maxTrainingSize; + vecOpt.probeCount = c4Opt.numProbes; + vecOpt.lazyEmbedding = c4Opt.lazy; + vecOpt.validate(); } else { 
error::_throw(error::InvalidParameter, "Vector index requires options"); } @@ -456,7 +478,7 @@ namespace litecore { FLEncoder_BeginArray(enc, 2); for ( const auto& spec : keyStore().getIndexes() ) { if ( fullInfo ) { - FLEncoder_BeginDict(enc, 3); + FLEncoder_BeginDict(enc, 5); FLEncoder_WriteKey(enc, slice("name")); FLEncoder_WriteString(enc, slice(spec.name)); FLEncoder_WriteKey(enc, slice("type")); @@ -472,6 +494,10 @@ namespace litecore { FLEncoder_WriteString(enc, slice("n1ql")); break; } + if ( auto vecOpts = spec.vectorOptions() ) { + FLEncoder_WriteKey(enc, "vector_options"_sl); + FLEncoder_WriteString(enc, slice(vecOpts->createArgs())); + } FLEncoder_EndDict(enc); } else { FLEncoder_WriteString(enc, slice(spec.name)); diff --git a/LiteCore/Query/IndexSpec.cc b/LiteCore/Query/IndexSpec.cc index 416e811c1..8b5a63487 100644 --- a/LiteCore/Query/IndexSpec.cc +++ b/LiteCore/Query/IndexSpec.cc @@ -31,10 +31,9 @@ namespace litecore { , expression(std::move(expression_)) , queryLanguage(queryLanguage_) , options(std::move(opt)) { - if ( auto whichOpts = options.index() ) { - if ( (type == kFullText && whichOpts != 1) || (type == kVector && whichOpts != 2) ) - error::_throw(error::LiteCoreError::InvalidParameter, "Invalid options type for index"); - } + auto whichOpts = options.index(); + if ( (type == kFullText && whichOpts != 1 && whichOpts != 0) || (type == kVector && whichOpts != 2) ) + error::_throw(error::LiteCoreError::InvalidParameter, "Invalid options type for index"); } IndexSpec::IndexSpec(IndexSpec&&) = default; diff --git a/LiteCore/Query/IndexSpec.hh b/LiteCore/Query/IndexSpec.hh index c05bf7eb8..c536ac776 100644 --- a/LiteCore/Query/IndexSpec.hh +++ b/LiteCore/Query/IndexSpec.hh @@ -12,6 +12,7 @@ #pragma once #include "Base.hh" +#include "VectorIndexSpec.hh" #include #include #include @@ -29,7 +30,7 @@ namespace litecore { }; struct IndexSpec { - /// The types of indexes. + /// The types of indexes. 
(Values MUST match C4IndexType) enum Type { kValue, ///< Regular index of property value kFullText, ///< Full-text index, for MATCH queries. Uses IndexSpec::FTSOptions. @@ -47,52 +48,9 @@ namespace litecore { }; /// Options for a vector index. - struct VectorOptions { - enum MetricType { - DefaultMetric, ///< Use default metric, Euclidean - Euclidean, ///< Euclidean distance (squared) - Cosine, ///< Cosine distance (1.0 - cosine similarity) - }; // Note: values must match C4VectorMetricType in c4IndexTypes.h - - enum ClusteringType { - Flat, - Multi, - }; // Note: values must match C4VectorClusteringType in c4IndexTypes.h - - enum EncodingType { - DefaultEncoding, ///< Use default encoding, which is currently SQ8Bit - NoEncoding, ///< No encoding; 4 bytes per dimension, no data loss - PQ, ///< Product Quantizer - SQ, ///< Scalar Quantizer - }; // Note: values must match C4VectorEncodingType in c4IndexTypes.h - - struct Clustering { - ClusteringType type; - unsigned flat_centroids; - unsigned multi_subquantizers; ///< Number of pieces to split vectors into (for multi) - unsigned multi_bits; ///< log2 of # of centroids per subquantizer (for multi) - }; - - struct Encoding { - EncodingType type; ///< Encoding type: none, PQ, SQ - unsigned pq_subquantizers; ///< Number of subquantizers (for PQ) - unsigned bits; ///< Number of bits (for PQ and SQ) - }; - - unsigned dimensions; ///< Number of dimensions - MetricType metric{DefaultMetric}; ///< Distance metric - Clustering clustering{Flat}; ///< Clustering type & parameters - Encoding encoding{DefaultEncoding}; ///< Vector compression type & parameters - - unsigned minTrainingSize{0}; ///< Minimum # of vectors to train index (>= 25*numCentroids) - unsigned maxTrainingSize{0}; ///< Maximum # of vectors to train index on (<= 256*numCentroids) - unsigned numProbes{0}; ///< Default # of probes when querying - - bool lazy{false}; - - /// Constructor. Number of dimensions is a required parameter. 
- explicit VectorOptions(unsigned d) : dimensions(d) {} - }; + using VectorOptions = vectorsearch::IndexSpec; + + static constexpr vectorsearch::SQEncoding DefaultEncoding{8}; /// Index options. If not empty (the first state), must match the index type. using Options = std::variant; diff --git a/LiteCore/Query/SQLiteDataFile+Indexes.cc b/LiteCore/Query/SQLiteDataFile+Indexes.cc index d6b81e725..6bc2ec9d2 100644 --- a/LiteCore/Query/SQLiteDataFile+Indexes.cc +++ b/LiteCore/Query/SQLiteDataFile+Indexes.cc @@ -372,32 +372,4 @@ namespace litecore { } } - bool SQLiteKeyStore::isIndexTrained(fleece::slice name) const { - auto specs = getIndexes(); - for ( const auto& spec : specs ) { - if ( name == spec.name ) { - if ( spec.type != IndexSpec::kVector ) { - error::_throw(error::InvalidParameter, "Index '%.*s' is not a vector index", SPLAT(name)); - } - - // IMPORTANT: These are implementation details that will break this functionality if changed - // in the mobile-vector-search repo! - static const char* vectorTableNameSuffix = "_vectorsearchImpl"; - static const char* vectorDataTableName = "vectorSearchIndexData"; - // END - - string sql; - if ( !db().getSchema(vectorDataTableName, "table", vectorDataTableName, sql) ) { return false; } - auto vectorTableName = db().auxiliaryTableName(tableName(), KeyStore::kVectorSeparator, (string)name) - + vectorTableNameSuffix; - auto rawResult = db().rawQuery(format("SELECT tableName FROM %s WHERE tableName = '%s'", - vectorDataTableName, vectorTableName.c_str())); - auto result = Value::fromTrustedData(rawResult)->asArray(); - return result->count() == 1; - } - } - - error::_throw(error::NoSuchIndex); - } - } // namespace litecore diff --git a/LiteCore/Query/SQLiteKeyStore+Indexes.cc b/LiteCore/Query/SQLiteKeyStore+Indexes.cc index 66f888968..d9b706c2f 100644 --- a/LiteCore/Query/SQLiteKeyStore+Indexes.cc +++ b/LiteCore/Query/SQLiteKeyStore+Indexes.cc @@ -132,7 +132,7 @@ namespace litecore { _createFlagsIndex("blobs", 
DocumentFlags::kHasAttachments, _createdBlobsIndex); } - optional SQLiteKeyStore::getIndex(slice indexName) { + optional SQLiteKeyStore::getIndex(slice indexName) const { optional spec = db().getIndex(indexName); if ( spec && spec->keyStoreName != name() ) spec = nullopt; return spec; diff --git a/LiteCore/Query/SQLiteKeyStore+VectorIndex.cc b/LiteCore/Query/SQLiteKeyStore+VectorIndex.cc index 6b9fba45a..264b77473 100644 --- a/LiteCore/Query/SQLiteKeyStore+VectorIndex.cc +++ b/LiteCore/Query/SQLiteKeyStore+VectorIndex.cc @@ -10,20 +10,19 @@ // the file licenses/APL2.txt. // -#ifdef COUCHBASE_ENTERPRISE - -# include - -# include "SQLiteKeyStore.hh" -# include "SQLiteDataFile.hh" -# include "QueryParser.hh" -# include "SQLUtil.hh" -# include "SQLite_Internal.hh" -# include "StringUtil.hh" -# include "Array.hh" -# include "Error.hh" -# include "SQLiteCpp/Exception.h" -# include +#include + +#include "SQLiteKeyStore.hh" +#include "SQLiteDataFile.hh" +#include "QueryParser.hh" +#include "SQLUtil.hh" +#include "SQLite_Internal.hh" +#include "StringUtil.hh" +#include "Array.hh" +#include "Error.hh" +#include "SQLiteCpp/Statement.h" +#include "SQLiteCpp/Exception.h" +#include using namespace std; using namespace fleece; @@ -31,73 +30,31 @@ using namespace fleece::impl; namespace litecore { +#ifdef COUCHBASE_ENTERPRISE + // Vector search index for ML / predictive query, using the vectorsearch extension. // https://github.com/couchbaselabs/mobile-vector-search/blob/main/README_Extension.md - static constexpr const char* kMetricNames[] = {nullptr, "euclidean2", "cosine"}; - - /// Returns the SQL expression to create a vectorsearch virtual table. 
- static string createVectorSearchTableSQL(string_view vectorTableName, const IndexSpec& spec) { - stringstream stmt; - stmt << "CREATE VIRTUAL TABLE " << sqlIdentifier(vectorTableName) << " USING vectorsearch("; - Assert(spec.vectorOptions() != nullptr); - IndexSpec::VectorOptions const& options = *spec.vectorOptions(); - stmt << "dimensions=" << options.dimensions << ','; - if ( options.metric != IndexSpec::VectorOptions::DefaultMetric ) { - stmt << "metric=" << kMetricNames[options.metric] << ','; - } - switch ( options.clustering.type ) { - case IndexSpec::VectorOptions::Flat: - stmt << "clustering=flat" << options.clustering.flat_centroids << ','; - break; - case IndexSpec::VectorOptions::Multi: - stmt << "clustering=multi" << options.clustering.multi_subquantizers << 'x' - << options.clustering.multi_bits << ','; - break; - default: - error::_throw(error::InvalidParameter, "invalid vector clustering type"); - } - switch ( options.encoding.type ) { - case IndexSpec::VectorOptions::DefaultEncoding: - break; - case IndexSpec::VectorOptions::NoEncoding: - stmt << "encoding=none,"; - break; - case IndexSpec::VectorOptions::PQ: - stmt << "encoding=PQ" << options.encoding.pq_subquantizers << 'x' << options.encoding.bits << ','; - break; - case IndexSpec::VectorOptions::SQ: - stmt << "encoding=SQ" << options.encoding.bits << ','; - break; - default: - error::_throw(error::InvalidParameter, "invalid vector encoding type"); - } - if ( options.numProbes > 0 ) stmt << "probes=" << options.numProbes << ','; - if ( options.maxTrainingSize > 0 ) stmt << "maxToTrain=" << options.maxTrainingSize << ','; - stmt << "minToTrain=" << options.minTrainingSize; - if ( QueryLog.effectiveLevel() <= LogLevel::Verbose ) - stmt << ",verbose"; // Enable vectorsearch verbose logging (via printf, for now) - stmt << ")"; - return stmt.str(); - } - // Creates a vector-similarity index. 
bool SQLiteKeyStore::createVectorIndex(const IndexSpec& spec) { auto vectorTableName = db().auxiliaryTableName(tableName(), KeyStore::kVectorSeparator, spec.name); + auto vectorOptions = spec.vectorOptions(); + Assert(vectorOptions); // Generate a SQL expression to get the vector: QueryParser qp(db(), collectionName(), tableName()); qp.setBodyColumnName("new.body"); string vectorExpr; if ( auto what = spec.what(); what && what->count() == 1 ) - vectorExpr = qp.vectorToIndexExpressionSQL(what->get(0), spec.vectorOptions()->dimensions); + vectorExpr = qp.vectorToIndexExpressionSQL(what->get(0), vectorOptions->dimensions); else error::_throw(error::Unimplemented, "Vector index doesn't support multiple properties"); // Create the virtual table: try { - if ( !db().createIndex(spec, this, vectorTableName, createVectorSearchTableSQL(vectorTableName, spec)) ) - return false; + string sql = CONCAT("CREATE VIRTUAL TABLE " << sqlIdentifier(vectorTableName) << " USING vectorsearch(" + << *vectorOptions << ")"); + if ( !db().createIndex(spec, this, vectorTableName, sql) ) return false; } catch ( SQLite::Exception const& x ) { string_view what(x.what()); if ( hasPrefix(what, "no such module") ) { @@ -123,7 +80,7 @@ namespace litecore { createTrigger(vectorTableName, "preupdate", "BEFORE UPDATE OF body", whereOldSQL, deleteOldSQL); createTrigger(vectorTableName, "del", "AFTER DELETE", whereOldSQL, deleteOldSQL); - bool lazy = spec.vectorOptions()->lazy; + bool lazy = vectorOptions->lazyEmbedding; if ( lazy ) { // Lazy index: Mark as lazy by initializing lastSeq. Vectors will not be computed // automatically; app updates them via the LazyIndex class. 
@@ -165,61 +122,35 @@ namespace litecore { return ""; // no index found } - static inline unsigned asUInt(string_view sv) { - string str(sv); - return unsigned(strtoul(str.c_str(), nullptr, 10)); - } - // The opposite of createVectorSearchTableSQL optional SQLiteKeyStore::parseVectorSearchTableSQL(string_view sql) { - optional opts; // Find the virtual-table arguments in the CREATE TABLE statement: auto start = sql.find("vectorsearch("); - if ( start == string::npos ) return opts; + if ( start == string::npos ) return nullopt; start += strlen("vectorsearch("); auto end = sql.find(')', start); - if ( end == string::npos ) return opts; + if ( end == string::npos ) return nullopt; // Parse each comma-delimited key-value pair: - string_view args(&sql[start], end - start); - opts.emplace(0); - split(args, ",", [&](string_view key) { - string_view value; - if ( auto eq = key.find('='); eq != string::npos ) { - value = key.substr(eq + 1); - key = key.substr(0, eq); - if ( value.empty() || key.empty() ) return; - } - if ( key == "dimensions" ) { - opts->dimensions = asUInt(value); - } else if ( key == "metric" ) { - if ( value == "euclidean2" ) opts->metric = IndexSpec::VectorOptions::Euclidean; - else if ( value == "cosine" ) - opts->metric = IndexSpec::VectorOptions::Cosine; - } else if ( key == "minToTrain" ) { - opts->minTrainingSize = asUInt(value); - } else if ( key == "maxToTrain" ) { - opts->maxTrainingSize = asUInt(value); - } else if ( key == "probes" ) { - opts->numProbes = asUInt(value); - } else if ( key == "lazyindex" ) { - opts->lazy = (value != "false" && value != "0"); - } else if ( key == "clustering" ) { - if ( hasPrefix(value, "multi") ) opts->clustering = {IndexSpec::VectorOptions::Multi}; - //TODO: Parse centroid count & other params; see vectorsearch::IndexSpec::setParam() - } else if ( key == "encoding" ) { - if ( value == "none" ) opts->encoding = {IndexSpec::VectorOptions::NoEncoding}; - else if ( hasPrefix(value, "PQ") ) { - opts->encoding = 
{IndexSpec::VectorOptions::PQ}; - } else if ( hasPrefix(value, "SQ") ) { - opts->encoding = {IndexSpec::VectorOptions::SQ}; - } - //TODO: Parse encoding params; see vectorsearch::IndexSpec::setParam() - } - }); + string_view args(&sql[start], end - start); + IndexSpec::VectorOptions opts; + split(args, ",", [&](string_view arg) { (void)opts.readArg(arg); }); return opts; } -} // namespace litecore +#endif // COUCHBASE_ENTERPRISE + + bool SQLiteKeyStore::isIndexTrained(fleece::slice name) const { + if ( auto spec = db().getIndex(name); spec && spec->keyStoreName == this->name() ) { + if ( spec->type != IndexSpec::kVector ) { + error::_throw(error::InvalidParameter, "Index '%.*s' is not a vector index", SPLAT(name)); + } + auto q = db().compile( + ("SELECT 1 FROM \""s + spec->indexTableName + "\" WHERE bucket != -1 LIMIT 1").c_str()); + return q->executeStep(); + } -#endif + error::_throw(error::NoSuchIndex); + } + +} // namespace litecore diff --git a/LiteCore/Storage/BothKeyStore.hh b/LiteCore/Storage/BothKeyStore.hh index fcccefb0e..481d44789 100644 --- a/LiteCore/Storage/BothKeyStore.hh +++ b/LiteCore/Storage/BothKeyStore.hh @@ -98,7 +98,7 @@ namespace litecore { void deleteIndex(slice name) override { _liveStore->deleteIndex(name); } - std::optional getIndex(slice name) override { return _liveStore->getIndex(name); } + std::optional getIndex(slice name) const override { return _liveStore->getIndex(name); } [[nodiscard]] std::vector getIndexes() const override { return _liveStore->getIndexes(); } diff --git a/LiteCore/Storage/KeyStore.hh b/LiteCore/Storage/KeyStore.hh index 7117ac865..c5c32b9d5 100644 --- a/LiteCore/Storage/KeyStore.hh +++ b/LiteCore/Storage/KeyStore.hh @@ -201,7 +201,7 @@ namespace litecore { virtual void deleteIndex(slice name) = 0; [[nodiscard]] virtual std::vector getIndexes() const = 0; - [[nodiscard]] virtual std::optional getIndex(slice name) = 0; + [[nodiscard]] virtual std::optional getIndex(slice name) const = 0; [[nodiscard]] virtual 
bool isIndexTrained(slice name) const = 0; // public for complicated reasons; clients should never call it diff --git a/LiteCore/Storage/SQLiteDataFile.cc b/LiteCore/Storage/SQLiteDataFile.cc index fe528f53a..98401f1bb 100644 --- a/LiteCore/Storage/SQLiteDataFile.cc +++ b/LiteCore/Storage/SQLiteDataFile.cc @@ -80,9 +80,11 @@ namespace litecore { // Maximum size WAL journal will be left at after a commit static const int64_t kJournalSize = 5 * MB; + static map kValidExtensionVersions = { #ifdef COUCHBASE_ENTERPRISE - static constexpr int kVectorSearchCompatibleVersion = 1; + {"CouchbaseLiteVectorSearch", 1} #endif + }; // Amount of file to memory-map #if TARGET_OS_OSX || TARGET_OS_SIMULATOR @@ -185,6 +187,34 @@ namespace litecore { void SQLiteDataFile::setExtensionPath(string path) { sExtensionPath = std::move(path); } + void SQLiteDataFile::enableExtension(const string& name, string path) { + auto extensionEntry = kValidExtensionVersions.find(name); + if ( extensionEntry == kValidExtensionVersions.end() ) { + error::_throw(error::LiteCoreError::InvalidParameter, "'%s' is not a known extension", name.c_str()); + } + + // NOTE: This logic will need to be changed later if we have more than one extension + // and they reside in different directories + if ( !sExtensionPath.empty() && sExtensionPath != path ) { + WarnError("Extension path previously set to '%s' but being reset to '%s'. 
This is not advisable!", + sExtensionPath.c_str(), path.c_str()); + } + + sExtensionPath = std::move(path); + +#if defined(__ANDROID__) + string pluginPath = sExtensionPath + FilePath::kSeparator + "lib" + name; +#else + string pluginPath = sExtensionPath + FilePath::kSeparator + name; +#endif + + if ( !extension::check_extension_version(pluginPath, extensionEntry->second) ) { + error::_throw(error::UnsupportedOperation, + "Extension '%s' is not found or not compatible with this version of Couchbase Lite", + name.c_str()); + } + } + SQLiteDataFile::SQLiteDataFile(const FilePath& path, DataFile::Delegate* delegate, const Options* options) : DataFile(path, delegate, options) { reopen(); @@ -217,12 +247,6 @@ namespace litecore { }; string pluginPath = sExtensionPath + FilePath::kSeparator + extensionName; - if ( !litecore::extension::check_extension_version(pluginPath, kVectorSearchCompatibleVersion) ) { - // This function logs the reason for the version match failure, no need to log here. - error::_throw(error::UnsupportedOperation, - "Extension '%s' is not found or not compatible with this version of Couchbase Lite", - extensionName); - } # if defined(_WIN32) && defined(_M_X64) // Flimsy hack to get around the fact that we need to load this dep from a non-standard diff --git a/LiteCore/Storage/SQLiteDataFile.hh b/LiteCore/Storage/SQLiteDataFile.hh index 211938da6..af160e020 100644 --- a/LiteCore/Storage/SQLiteDataFile.hh +++ b/LiteCore/Storage/SQLiteDataFile.hh @@ -102,9 +102,11 @@ namespace litecore { Retained compileQuery(slice expression, QueryLanguage, KeyStore*) override; - /// Sets the directory where SQLite extensions can be found (i.e. VectorSearch) + // Deprecated in favor of enableExtension! 
static void setExtensionPath(string); + static void enableExtension(const string& name, string path); + // QueryParser::delegate: bool tableExists(const std::string& tableName) const override; string collectionTableName(const string& collection, DeletionStatus) const override; diff --git a/LiteCore/Storage/SQLiteKeyStore.hh b/LiteCore/Storage/SQLiteKeyStore.hh index 7d1d54422..698987c44 100644 --- a/LiteCore/Storage/SQLiteKeyStore.hh +++ b/LiteCore/Storage/SQLiteKeyStore.hh @@ -81,7 +81,7 @@ namespace litecore { void deleteIndex(slice name) override; std::vector getIndexes() const override; - std::optional getIndex(slice name) override; + std::optional getIndex(slice name) const override; bool isIndexTrained(slice name) const override; std::vector withDocBodies(const std::vector& docIDs, WithDocBodyCallback callback) override; diff --git a/LiteCore/Support/Extension.cc b/LiteCore/Support/Extension.cc index 247bfe38e..6c76b9bd5 100644 --- a/LiteCore/Support/Extension.cc +++ b/LiteCore/Support/Extension.cc @@ -125,7 +125,7 @@ bool litecore::extension::check_extension_version(const string& extensionPath, i int majorVersion = version_number_f() / 1000000; const char* versionStr = version_f(); if ( majorVersion == expectedVersion ) { - LogToAt(DBLog, Info, "Loaded extension '%s' version %s", extensionName.c_str(), versionStr); + LogToAt(DBLog, Info, "Found extension '%s' version %s", extensionName.c_str(), versionStr); return true; } diff --git a/LiteCore/tests/CMakeLists.txt b/LiteCore/tests/CMakeLists.txt index b22e0fe02..c37d58c00 100644 --- a/LiteCore/tests/CMakeLists.txt +++ b/LiteCore/tests/CMakeLists.txt @@ -151,6 +151,7 @@ target_include_directories( ${TOP}REST/tests ${TOP}vendor/sockpp/include ${TOP}vendor/fleece/vendor/date/include + ${TOP}vendor/vector_search ) target_link_libraries( diff --git a/LiteCore/tests/LazyVectorAPITest.cc b/LiteCore/tests/LazyVectorAPITest.cc index e161c5aff..eea1e579f 100644 --- a/LiteCore/tests/LazyVectorAPITest.cc +++ 
b/LiteCore/tests/LazyVectorAPITest.cc @@ -2,6 +2,7 @@ // Created by Callum Birks on 28/05/2024. // +#include "VectorIndexSpec.hh" #include "c4Base.hh" #include "DatabaseImpl.hh" #include "LazyIndex.hh" @@ -9,6 +10,7 @@ #include "c4Collection.hh" #include "c4Index.h" #include "c4Index.hh" +#include "c4IndexTypes.h" #include "c4Query.h" #include "c4Test.hh" // IWYU pragma: keep #include "LiteCoreTest.hh" @@ -104,7 +106,7 @@ class LazyVectorAPITest : public C4Test { std::call_once(sOnce, [] { if ( const char* path = getenv("LiteCoreExtensionPath") ) { sExtensionPath = path; - litecore::SQLiteDataFile::setExtensionPath(sExtensionPath); + litecore::SQLiteDataFile::enableExtension("CouchbaseLiteVectorSearch", sExtensionPath); Log("Registered LiteCore extension path %s", path); } }); @@ -181,7 +183,7 @@ class LazyVectorAPITest : public C4Test { bool createVectorIndex(bool lazy, slice expression = R"(['.word'])"_sl, slice name = "words_index"_sl, IndexSpec::VectorOptions options = vectorOptions(300, 8), C4Error* err = ERROR_INFO()) const { - options.lazy = lazy; + options.lazyEmbedding = lazy; return createIndex(name, json5(expression), kC4VectorIndex, indexOptions(options), err); } @@ -267,60 +269,71 @@ class LazyVectorAPITest : public C4Test { static C4VectorIndexOptions c4VectorOptions(const IndexSpec::VectorOptions& options) { C4VectorMetricType metric{}; switch ( options.metric ) { - case IndexSpec::VectorOptions::DefaultMetric: - metric = kC4VectorMetricDefault; - break; - case IndexSpec::VectorOptions::Euclidean: + case vectorsearch::Metric::Euclidean2: metric = kC4VectorMetricEuclidean; break; - case IndexSpec::VectorOptions::Cosine: + case vectorsearch::Metric::Cosine: metric = kC4VectorMetricCosine; break; } - C4VectorClusteringType clusteringType{}; - switch ( options.clustering.type ) { - case IndexSpec::VectorOptions::Flat: - clusteringType = kC4VectorClusteringFlat; - break; - case IndexSpec::VectorOptions::Multi: - clusteringType = kC4VectorClusteringMulti; 
- break; + C4VectorClustering clustering{}; + switch ( options.clustering.index() ) { + case 0: + { + clustering.type = kC4VectorClusteringFlat; + auto _clustering = std::get(options.clustering); + clustering.flat_centroids = _clustering.numCentroids; + break; + } + case 1: + { + clustering.type = kC4VectorClusteringMulti; + auto _clustering = std::get(options.clustering); + clustering.multi_bits = _clustering.bitsPerSub; + clustering.multi_subquantizers = _clustering.subquantizers; + break; + } } - C4VectorEncodingType encodingType{}; - switch ( options.encoding.type ) { - case IndexSpec::VectorOptions::DefaultEncoding: - encodingType = kC4VectorEncodingDefault; - break; - case IndexSpec::VectorOptions::NoEncoding: - encodingType = kC4VectorEncodingNone; - break; - case IndexSpec::VectorOptions::PQ: - encodingType = kC4VectorEncodingPQ; - break; - case IndexSpec::VectorOptions::SQ: - encodingType = kC4VectorEncodingSQ; - break; + C4VectorEncoding encoding{}; + switch ( options.encoding.index() ) { + case 0: + { + encoding.type = kC4VectorEncodingNone; + break; + } + case 1: + { + encoding.type = kC4VectorEncodingPQ; + auto _encoding = std::get(options.encoding); + encoding.bits = _encoding.bitsPerSub; + encoding.pq_subquantizers = _encoding.subquantizers; + break; + } + case 2: + { + encoding.type = kC4VectorEncodingSQ; + auto _encoding = std::get(options.encoding); + encoding.bits = _encoding.bitsPerDimension; + break; + } } return C4VectorIndexOptions{ options.dimensions, metric, - C4VectorClustering{clusteringType, options.clustering.flat_centroids, - options.clustering.multi_subquantizers, options.clustering.multi_bits}, - C4VectorEncoding{encodingType, options.encoding.pq_subquantizers, options.encoding.bits}, - options.minTrainingSize, - options.maxTrainingSize, - options.numProbes, - options.lazy, + clustering, + encoding, + static_cast(options.minTrainingCount.value_or(0)), + static_cast(options.maxTrainingCount.value_or(0)), + 
options.probeCount.value_or(0), + options.lazyEmbedding, }; } static IndexSpec::VectorOptions vectorOptions(unsigned dimensions, unsigned centroids) { - IndexSpec::VectorOptions options(dimensions); - options.clustering.type = IndexSpec::VectorOptions::Flat; - options.clustering.flat_centroids = centroids; + IndexSpec::VectorOptions options(dimensions, vectorsearch::FlatClustering{centroids}); return options; } @@ -337,7 +350,7 @@ class LazyVectorAPITest : public C4Test { // 1, 2 TEST_CASE_METHOD(LazyVectorAPITest, "Lazy Vector isLazy Default False", "[API][.VectorSearch]") { auto vectorOpt = vectorOptions(300, 20); - CHECK(vectorOpt.lazy == false); + CHECK(vectorOpt.lazyEmbedding == false); } // 3 @@ -393,7 +406,7 @@ TEST_CASE_METHOD(LazyVectorAPITest, "BeginUpdate on Non-Vector", "[API][.VectorS auto index = REQUIRED(getIndex("value_index"_sl, ERROR_INFO())); C4Error err{}; - c4index_beginUpdate(index, 10, &err); + auto _ = c4index_beginUpdate(index, 10, &err); CHECK(err.code == kC4ErrorUnsupported); c4index_release(index); @@ -406,7 +419,7 @@ TEST_CASE_METHOD(LazyVectorAPITest, "BeginUpdate on Non-Lazy Vector", "[API][.Ve auto index = REQUIRED(getIndex("nonlazyindex"_sl)); C4Error err{}; - c4index_beginUpdate(index, 10, &err); + auto _ = c4index_beginUpdate(index, 10, &err); CHECK(err.code == kC4ErrorUnsupported); c4index_release(index); diff --git a/LiteCore/tests/LazyVectorQueryTest.cc b/LiteCore/tests/LazyVectorQueryTest.cc index f7c01981d..7bb373e0f 100644 --- a/LiteCore/tests/LazyVectorQueryTest.cc +++ b/LiteCore/tests/LazyVectorQueryTest.cc @@ -80,10 +80,8 @@ class LazyVectorQueryTest : public VectorQueryTest { } void createVectorIndex() { - IndexSpec::VectorOptions options(kDimension); - options.clustering.type = IndexSpec::VectorOptions::Flat; - options.clustering.flat_centroids = 16; - options.lazy = true; + IndexSpec::VectorOptions options(kDimension, vectorsearch::FlatClustering{16}, IndexSpec::DefaultEncoding); + options.lazyEmbedding = true; 
VectorQueryTest::createVectorIndex("factorsindex", "[ ['.num'] ]", options); _lazyIndex = make_retained(*store, "factorsindex"); diff --git a/LiteCore/tests/PredictiveVectorQueryTest.cc b/LiteCore/tests/PredictiveVectorQueryTest.cc index a094fbfd4..75c30c318 100644 --- a/LiteCore/tests/PredictiveVectorQueryTest.cc +++ b/LiteCore/tests/PredictiveVectorQueryTest.cc @@ -82,9 +82,7 @@ class PredictiveVectorQueryTest : public VectorQueryTest { } void createVectorIndex(QueryLanguage lang) { - IndexSpec::VectorOptions options(5); - options.clustering.type = IndexSpec::VectorOptions::Flat; - options.clustering.flat_centroids = 16; + IndexSpec::VectorOptions options(5, vectorsearch::FlatClustering{16}, IndexSpec::DefaultEncoding); if ( lang == QueryLanguage::kJSON ) { VectorQueryTest::createVectorIndex( "factorsindex", "[ ['PREDICTION()', 'factors', {number: ['.num']}, '.vec'] ]", options, lang); diff --git a/LiteCore/tests/VectorQueryTest.cc b/LiteCore/tests/VectorQueryTest.cc index 101d27db0..e9400a3eb 100644 --- a/LiteCore/tests/VectorQueryTest.cc +++ b/LiteCore/tests/VectorQueryTest.cc @@ -20,6 +20,7 @@ #include "Base64.hh" #include "c4Database.hh" #include "c4Collection.hh" +#include "c4Database.h" #ifdef COUCHBASE_ENTERPRISE @@ -30,10 +31,7 @@ class SIFTVectorQueryTest : public VectorQueryTest { SIFTVectorQueryTest() : VectorQueryTest(0) {} IndexSpec::VectorOptions vectorIndexOptions() const { - IndexSpec::VectorOptions options(128); - options.clustering.type = IndexSpec::VectorOptions::Flat; - options.clustering.flat_centroids = 256; - return options; + return IndexSpec::VectorOptions(128, vectorsearch::FlatClustering{256}, IndexSpec::DefaultEncoding); } void createVectorIndex() { @@ -146,8 +144,8 @@ N_WAY_TEST_CASE_METHOD(SIFTVectorQueryTest, "Create/Delete Vector Index", "[Quer REQUIRE(vecOptions); auto trueOptions = vectorIndexOptions(); CHECK(vecOptions->dimensions == trueOptions.dimensions); - CHECK(vecOptions->clustering.type == trueOptions.clustering.type); - 
CHECK(vecOptions->encoding.type == trueOptions.encoding.type); + CHECK(vecOptions->clusteringType() == trueOptions.clusteringType()); + CHECK(vecOptions->encodingType() == trueOptions.encodingType()); CHECK(db->allKeyStoreNames() == allKeyStores); // CBL-3824, CBL-5369 // Delete a doc too: @@ -670,6 +668,14 @@ TEST_CASE_METHOD(SIFTVectorQueryTest, "Index isTrained API", "[Query][.VectorSea CHECK(isTrained == expectedTrained); } +TEST_CASE_METHOD(SIFTVectorQueryTest, "enableExtension API", "[.VectorSearch]") { + ExpectingExceptions e; + C4Error err; + auto result = c4_enableExtension("BadName"_sl, FLStr(sExtensionPath.c_str()), &err); + CHECK(!result); + CHECK(err.code == kC4ErrorInvalidParameter); +} + N_WAY_TEST_CASE_METHOD(SIFTVectorQueryTest, "Inspect Vector Index", "[Query][.VectorSearch]") { auto allKeyStores = db->allKeyStoreNames(); readVectorDocs(100); diff --git a/LiteCore/tests/VectorQueryTest.hh b/LiteCore/tests/VectorQueryTest.hh index 12b8b7dbe..2f23b1cd5 100644 --- a/LiteCore/tests/VectorQueryTest.hh +++ b/LiteCore/tests/VectorQueryTest.hh @@ -20,7 +20,7 @@ class VectorQueryTest : public QueryTest { std::call_once(sOnce, [] { if ( const char* path = getenv("LiteCoreExtensionPath") ) { sExtensionPath = path; - litecore::SQLiteDataFile::setExtensionPath(sExtensionPath); + litecore::SQLiteDataFile::enableExtension("CouchbaseLiteVectorSearch", sExtensionPath); Log("Registered LiteCore extension path %s", path); } }); diff --git a/Replicator/tests/ReplicatorSGTest.cc b/Replicator/tests/ReplicatorSGTest.cc index 375a253a5..dac25ff6a 100644 --- a/Replicator/tests/ReplicatorSGTest.cc +++ b/Replicator/tests/ReplicatorSGTest.cc @@ -16,6 +16,7 @@ #include "c4Collection.h" #include "c4Document+Fleece.h" #include "c4DocEnumerator.h" +#include "c4Index.h" #include "Stopwatch.hh" #include "StringUtil.hh" #include "SecureRandomize.hh" diff --git a/Xcode/LiteCore.xcodeproj/project.pbxproj b/Xcode/LiteCore.xcodeproj/project.pbxproj index eb2afdc60..b25b0d261 100644 --- 
a/Xcode/LiteCore.xcodeproj/project.pbxproj +++ b/Xcode/LiteCore.xcodeproj/project.pbxproj @@ -66,6 +66,7 @@ 270C6B8C1EBA2CD600E73415 /* LogEncoder.cc in Sources */ = {isa = PBXBuildFile; fileRef = 270C6B891EBA2CD600E73415 /* LogEncoder.cc */; }; 270C6B981EBA3AD200E73415 /* LogEncoderTest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 270C6B901EBA2D5600E73415 /* LogEncoderTest.cc */; }; 270C7D522022916D00FF86D3 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 270515581D907F6200D62D05 /* CoreFoundation.framework */; }; + 270D5B8A2C110ED800AA91E7 /* VectorIndexSpec.cc in Sources */ = {isa = PBXBuildFile; fileRef = 270D5B892C110ED800AA91E7 /* VectorIndexSpec.cc */; }; 270F2BD52301E8AE00D8DB21 /* TCPSocket.hh in Headers */ = {isa = PBXBuildFile; fileRef = 270F2BD32301E8AE00D8DB21 /* TCPSocket.hh */; }; 2712F5AF25D5A9AB0082D526 /* c4Error.cc in Sources */ = {isa = PBXBuildFile; fileRef = 2712F5AE25D5A9AB0082D526 /* c4Error.cc */; }; 27139B3118F8E9750021A9A3 /* XCTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 275072AB18E4A68E00A80C5A /* XCTest.framework */; }; @@ -901,6 +902,9 @@ 270C6B891EBA2CD600E73415 /* LogEncoder.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogEncoder.cc; sourceTree = ""; }; 270C6B8A1EBA2CD600E73415 /* LogEncoder.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = LogEncoder.hh; sourceTree = ""; }; 270C6B901EBA2D5600E73415 /* LogEncoderTest.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogEncoderTest.cc; sourceTree = ""; }; + 270D5B852C110ED800AA91E7 /* VectorIndexSpec.hh */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = VectorIndexSpec.hh; sourceTree = ""; }; + 270D5B892C110ED800AA91E7 /* VectorIndexSpec.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = 
VectorIndexSpec.cc; sourceTree = ""; }; + 270D5B8C2C122B9500AA91E7 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; 270F2BD32301E8AE00D8DB21 /* TCPSocket.hh */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = TCPSocket.hh; sourceTree = ""; }; 270F2BD42301E8AE00D8DB21 /* TCPSocket.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = TCPSocket.cc; sourceTree = ""; }; 2712F5AE25D5A9AB0082D526 /* c4Error.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = c4Error.cc; sourceTree = ""; }; @@ -1887,6 +1891,16 @@ path = EE; sourceTree = ""; }; + 270D5B8B2C122B5200AA91E7 /* vector_search */ = { + isa = PBXGroup; + children = ( + 270D5B892C110ED800AA91E7 /* VectorIndexSpec.cc */, + 270D5B852C110ED800AA91E7 /* VectorIndexSpec.hh */, + 270D5B8C2C122B9500AA91E7 /* README.md */, + ); + path = vector_search; + sourceTree = ""; + }; 271057C21D3997230018247B /* C++ Tests */ = { isa = PBXGroup; children = ( @@ -2971,6 +2985,7 @@ 27AFF38F23036A7100B4D6C4 /* socketpp */, 27EF7FFA1914296D00A327B9 /* sqlite3-unicodesn */, 27D74A731D4D3F0700D806E0 /* SQLiteCpp */, + 270D5B8B2C122B5200AA91E7 /* vector_search */, 2744B3602418566F005A194D /* zlib */, ); name = vendor; @@ -4077,7 +4092,6 @@ 27431BC7258A8AB0009E3EC5 /* QuietReporter.hh in Sources */, 270C6B981EBA3AD200E73415 /* LogEncoderTest.cc in Sources */, 274D18ED2617DFE40018D39C /* c4DocumentTest_Internal.cc in Sources */, - 27D62A3F2B72D92B004C0787 /* LazyVectorQueryTest.cc in Sources */, 274D17C22615445B0018D39C /* DBAccessTestWrapper.cc in Sources */, 27FA09A01D6FA380005888AA /* DataFileTest.cc in Sources */, 274D165D261250220018D39C /* c4CollectionTest.cc in Sources */, @@ -4093,6 +4107,7 @@ 27098AAA216C2ED6002751DA /* PredictiveQueryTest.cc in Sources */, 27BEEE792A783A17005AD4BF /* VectorQueryTest.cc in Sources */, 27F602FE2A968503006FA1D0 /* PredictiveVectorQueryTest.cc in 
Sources */, + 27D62A3F2B72D92B004C0787 /* LazyVectorQueryTest.cc in Sources */, 27A83D54269E3E69002B7EBA /* PropertyEncryptionTests.cc in Sources */, 272850B51E9BE361009CA22F /* UpgraderTest.cc in Sources */, 2761F3F71EEA00C3006D4BB8 /* CookieStoreTest.cc in Sources */, @@ -4396,6 +4411,7 @@ 272850AB1E9AF53B009CA22F /* Upgrader.cc in Sources */, 27469D08233D719800A1EE1A /* PublicKey+Apple.mm in Sources */, 274B36D225B271F7001FC28D /* Version.cc in Sources */, + 270D5B8A2C110ED800AA91E7 /* VectorIndexSpec.cc in Sources */, 2744B351241854F2005A194D /* WebSocketImpl.cc in Sources */, 2769438C1DCD502A00DB2555 /* c4Observer.cc in Sources */, 2744B354241854F2005A194D /* Actor.cc in Sources */, diff --git a/cmake/platform_base.cmake b/cmake/platform_base.cmake index 86eae019e..00835617c 100644 --- a/cmake/platform_base.cmake +++ b/cmake/platform_base.cmake @@ -92,6 +92,7 @@ function(set_litecore_source_base) vendor/SQLiteCpp/src/Transaction.cpp vendor/SQLiteCpp/sqlite3/ext/carray.cc vendor/SQLiteCpp/sqlite3/ext/carray_bind.cc + vendor/vector_search/VectorIndexSpec.cc Replicator/c4Replicator.cc Replicator/c4Replicator_CAPI.cc Replicator/c4Socket.cc diff --git a/vendor/vector_search/README.md b/vendor/vector_search/README.md new file mode 100644 index 000000000..120352ff5 --- /dev/null +++ b/vendor/vector_search/README.md @@ -0,0 +1,9 @@ +# `vector_search` files + +These source files are copied from the private [vectorsearch][VECTORSEARCH] repo. + +Using the same source code ensures that vector index parameters stay compatible, are validated consistently, and are communicated correctly from CBL to the `vectorsearch` virtual table. + +Any changes made in either repo should be copied to the other! 
+ +[VECTORSEARCH]: https://github.com/couchbaselabs/mobile-vector-search diff --git a/vendor/vector_search/VectorIndexSpec.cc b/vendor/vector_search/VectorIndexSpec.cc new file mode 100644 index 000000000..a8f90053e --- /dev/null +++ b/vendor/vector_search/VectorIndexSpec.cc @@ -0,0 +1,332 @@ +// +// IndexSpec.cc +// +// +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// NOTE: This file appears in both the vectorsearch and couchbase-lite-core repos. +// Any changes made in one should be copied to the other! 
+ +#include "VectorIndexSpec.hh" +#include +#include +#include + +#ifdef SQLITECPP_BUILDING_EXTENSION +#include +SQLITE_EXTENSION_INIT3 +#else +#include // for sqlite3_log +#endif + +namespace vectorsearch { + using namespace std; + + +#pragma mark - VALIDATION: + + + static void check(bool condition, const char* what, const char* problem) { + if (!condition) { + string message = "invalid vector index spec: "; + message += what; + message += problem; + throw std::invalid_argument(message); + } + } + + template + static void check(T value, T minVal, T maxVal, const char* what) { + check(value >= minVal, what, " parameter is too small"); + check(value <= maxVal, what, " parameter is too large"); + } + + + void IndexSpec::validate() const { + check(dimensions, kMinDimensions, kMaxDimensions, "dimension"); + switch (clusteringType()) { + case ClusteringType::Flat: { + auto &c = std::get(clustering); + check(c.numCentroids, + kMinFlatClustering.numCentroids, + kMaxFlatClustering.numCentroids, + "centroids"); + break; + } + case ClusteringType::MultiIndex: { + auto& c = std::get(clustering); + check(c.subquantizers, + kMinMultiIndexClustering.subquantizers, + kMaxMultiIndexClustering.subquantizers, + "clustering subquantizers"); + check(c.bitsPerSub, + kMinMultiIndexClustering.bitsPerSub, + kMaxMultiIndexClustering.bitsPerSub, + "clustering bits"); + check(dimensions % c.subquantizers == 0, + "clustering subquantizers", + " must evenly divide the number of dimensions"); + break; + } + } + + if (probeCount) + check(*probeCount, 1u, numCentroids(), "probe count"); + + switch (encodingType()) { + case EncodingType::None: + break; + case EncodingType::PQ: { + auto& e = std::get(encoding); + check(e.subquantizers, + kMinPQEncoding.subquantizers, + kMaxPQEncoding.subquantizers, + "encoding subquantizers"); + check(e.bitsPerSub, + kMinPQEncoding.bitsPerSub, + kMaxPQEncoding.bitsPerSub, + "encoding bits"); + check(dimensions % e.subquantizers == 0, + "encoding subquantizers", + 
" must evenly divide the number of dimensions"); + break; + } + case EncodingType::SQ: { + auto& e = std::get(encoding); + check(e.bitsPerDimension == 4 || e.bitsPerDimension == 6 || e.bitsPerDimension == 8, + "encoding bits", " must be 4, 6 or 8"); + break; + } + } + } + + + void IndexSpec::resolveTrainingCounts() { + // If maxTrainingCount wasn't given or is zero, set it to a reasonable value: + unsigned nCent = numCentroidsToTrain(); + if (!maxTrainingCount || *maxTrainingCount == 0) + maxTrainingCount = kRecommendedMaxTrainingVectorsPerCentroid * nCent; + + if (!minTrainingCount || *minTrainingCount == 0) { + // If minTrainingCount wasn't given, set a default value. + // (kRecommendedMinTrainingVectorsPerCentroid would be better, + // but would break compatibility.) + minTrainingCount = kMinTrainingVectorsPerCentroid * nCent; + } else if (int64_t m = kMinTrainingVectorsPerCentroid * nCent; *minTrainingCount < m) { + sqlite3_log(SQLITE_WARNING, + "vectorsearch: minTrainingSize of %" PRIi64 " is too small;" + " raising it to %" PRIi64 ", based on %u centroids.", + *minTrainingCount, m, nCent); + minTrainingCount = m; + } + } + + +#pragma mark - PARSING: + + + static bool popPrefix(string_view &str, string_view prefix) { + auto prefixLen = prefix.size(); + if (prefixLen > str.size() || prefix != str.substr(0, prefixLen)) + return false; + str = str.substr(prefixLen); + return true; + } + + static unsigned asUInt(string_view str, string_view forKey) { + try { + return unsigned(std::stoul(string(str))); + } catch (...) 
{ + throw invalid_argument("invalid numeric value '"s + string(str) + "' for " + string(forKey)); + } + } + + static bool asBool(string_view str) { + return str != "false" && str != "0"; + } + + + static pair readPQ(string_view value, string_view forKey) { + if (auto x = value.find('x'); x != string::npos) + return { asUInt(value.substr(0, x), forKey), asUInt(value.substr(x + 1), forKey) }; + else + throw invalid_argument("value of '"s + string(forKey) + + " must be of form x , e.g. 32x8"); + } + + + bool IndexSpec::readArg(std::string_view key, std::string_view value) { + if (key == "dimensions") { + dimensions = asUInt(value, "dimensions"); + } else if (key == "metric") { + if (value == "euclidean2") + metric = Metric::Euclidean2; + else if (value == "cosine") + metric = Metric::Cosine; + else + throw std::invalid_argument("unknown metric"); + } else if (key == "clustering") { + if (popPrefix(value, "flat")) { + clustering = FlatClustering{asUInt(value, key)}; + } else if (popPrefix(value, "multi")) { + auto [sub, bits] = readPQ(value, key); + clustering = MultiIndexClustering{sub, bits}; + } else { + throw std::invalid_argument("unknown clustering"); + } + } else if (key == "centroids") { + clustering = FlatClustering{asUInt(value, "centroid count")}; + } else if (key == "encoding") { + if (value == "none") + encoding = NoEncoding{}; + else if (popPrefix(value, "PQ")) { + auto [sub, bits] = readPQ(value, "PQ encoding"); + encoding = PQEncoding(sub, bits); + } else if (popPrefix(value, "SQ")) { + unsigned v = 8; + if (!value.empty()) + v = asUInt(value, "SQ encoding"); + if (v == 4 || v == 6 || v == 8) + encoding = SQEncoding{v}; + else + throw std::invalid_argument("invalid bits for SQ encoding"); + } else { + throw std::invalid_argument("unknown encoding"); + } + } else if (key == "minToTrain") { + if (value == "never") + minTrainingCount = kNeverTrain; + else + minTrainingCount = asUInt(value, "min training size"); + } else if (key == "maxToTrain") { + 
maxTrainingCount = asUInt(value, "max training size"); + } else if (key == "probes") { + probeCount = asUInt(value, "probe count"); + } else if (key == "lazyindex") { + lazyEncoding = asBool(value); + } else if (key == "lazyembedding") { + lazyEmbedding = asBool(value); + } else { + return false; // unknown key + } + return true; // fall through = success + } + + + bool IndexSpec::readArg(string_view arg) { + if (arg.empty()) + return true; // no-op + string_view value; + if (auto eq = arg.find('='); eq != string::npos) { + if (eq == 0 || eq == arg.size()) + throw std::invalid_argument("invalid virtual-table argument " + string(arg)); + value = arg.substr(eq + 1); + arg = arg.substr(0, eq); + } + return readArg(arg, value); + } + + +#pragma mark - GENERATING TEXT: + + + static constexpr const char* kMetricNames[] = {"euclidean2", "cosine"}; + + std::ostream& IndexSpec::writeArgs(std::ostream& out) const { + out << "dimensions=" << dimensions; + if (metric != Metric::Default) + out << ",metric=" << kMetricNames[int(metric)]; + switch (clusteringType()) { + case ClusteringType::Flat: { + auto& c = std::get(clustering); + out << ",clustering=flat" << c.numCentroids; + break; + } + case ClusteringType::MultiIndex: { + auto& c = std::get(clustering); + out << ",clustering=multi" << c.subquantizers << 'x' << c.bitsPerSub; + break; + } + } + switch (encodingType()) { + case EncodingType::None: + out << ",encoding=none"; + break; + case EncodingType::PQ: { + auto& e = std::get(encoding); + out << ",encoding=PQ" << e.subquantizers << 'x' << e.bitsPerSub; + break; + } + case EncodingType::SQ: { + auto& e = std::get(encoding); + out << ",encoding=SQ" << e.bitsPerDimension; + break; + } + } + if (minTrainingCount) + out << ",minToTrain=" << *minTrainingCount; + if ( maxTrainingCount ) + out << ",maxToTrain=" << *maxTrainingCount; + if ( probeCount ) + out << ",probes=" << *probeCount; + if (lazyEncoding) + out << ",lazyindex=true"; + if (lazyEmbedding) + out << 
",lazyembedding=true"; + return out; + } + + + string IndexSpec::createArgs() const { + stringstream stmt; + writeArgs(stmt); + return stmt.str(); + } + + + std::string IndexSpec::description() const { + stringstream out; + switch (clusteringType()) { + case ClusteringType::Flat: + out << get(clustering).numCentroids << " centroids, "; + break; + case ClusteringType::MultiIndex: { + auto& miq = get(clustering); + out << "multi-index quantizer (" << miq.subquantizers << " subquantizers × " + << miq.bitsPerSub << " bits), "; + break; + } + } + switch(encodingType()) { + case EncodingType::None: + out << " no encoding"; + break; + case EncodingType::PQ: { + auto& pq = get(encoding); + out << "PQ encoding (" << pq.subquantizers << " subquantizers × " + << pq.bitsPerSub << " bits)"; + break; + } + case EncodingType::SQ: { + auto& sq = get(encoding); + out << "SQ encoding (" << sq.bitsPerDimension << " bits)"; + break; + } + } + return out.str(); + } + + +} diff --git a/vendor/vector_search/VectorIndexSpec.hh b/vendor/vector_search/VectorIndexSpec.hh new file mode 100644 index 000000000..ea5ec77fc --- /dev/null +++ b/vendor/vector_search/VectorIndexSpec.hh @@ -0,0 +1,190 @@ +// +// VectorIndexSpec.hh +// +// Copyright 2024-Present Couchbase, Inc. +// +// Use of this software is governed by the Business Source License included +// in the file licenses/BSL-Couchbase.txt. As of the Change Date specified +// in that file, in accordance with the Business Source License, use of this +// software will be governed by the Apache License, Version 2.0, included in +// the file licenses/APL2.txt. +// + +// NOTE: This file appears in both the vectorsearch and couchbase-lite-core repos. +// Any changes made in one should be copied to the other! + +#pragma once +#include +#include +#include +#include +#include + +namespace vectorsearch { + + /// Distance metric; defines the distance between vectors. 
+ enum class Metric { + Euclidean2, ///< Euclidean distance, squared + Cosine, ///< Cosine similarity subtracted from 1, so smaller is closer + Default = Euclidean2 + }; + + struct FlatClustering { + unsigned numCentroids; ///< Number of buckets to assign the vectors to + }; + + struct MultiIndexClustering { + unsigned subquantizers = 2; ///< Number of pieces each vector is split into + unsigned bitsPerSub = 8; ///< Number of bits of centroid count per piece + }; + + enum class ClusteringType {Flat, MultiIndex}; ///< Just identifies type of clustering + + using Clustering = std::variant; + + + struct NoEncoding { }; + + struct PQEncoding { + unsigned subquantizers; ///< Number of pieces each vector is split into + unsigned bitsPerSub = 8; ///< Bits for encoding each piece + + explicit constexpr PQEncoding(unsigned sub, unsigned bits =8) + :subquantizers(sub), bitsPerSub(bits) { } + }; + + struct SQEncoding { + unsigned bitsPerDimension = 8; ///< Bits/dimension; must be 4, 6 or 8 + }; + + enum class EncodingType {None, PQ, SQ}; ///< Just identifies type of encoding + + using Encoding = std::variant; + + + /** The parameters of a VectorDB. */ + struct IndexSpec { + + //---- PROPERTIES: + + unsigned dimensions = 0; ///< Vector dimensions + Metric metric = Metric::Default; ///< Distance metric + Clustering clustering = MultiIndexClustering{};///< Clustering type + Encoding encoding = SQEncoding{}; ///< Encoding type + + std::optional minTrainingCount; ///< Min vectors needed to train + std::optional maxTrainingCount; ///< Max vectors to train with + std::optional probeCount; ///< Number of buckets to probe + + /// If true, inserted vectors are not encoded or mapped to centroids until the next query. + /// @warning This is not the same meaning of "lazy" as in CBL! See \ref lazyEmbedding. + bool lazyEncoding = false; + + /// If true, app will use the CBL IndexUpdater API to compute/request vectors for docs. + /// @note This flag is ignored by vectorsearch! 
It's for the use of LiteCore. + bool lazyEmbedding = false; + + /// Set `minTrainingCount` to this value (or greater) to disable automatic training. + static constexpr int64_t kNeverTrain = 999'999'999; + + //---- CONSTRUCTION: + + IndexSpec() = default; + + IndexSpec(unsigned dim, Clustering q, Encoding e = NoEncoding{}) + :dimensions(dim), clustering(q), encoding(e) { } + + /// Sets an attribute of an IndexSpec from a key/value pair; useful for CLI. + /// See Extension.md for documentation of the supported keys and values. + /// @returns true if it applied the param, false if it didn't recognize the key. + /// @throws std::invalid_argument if the value is invalid. + [[nodiscard]] bool readArg(std::string_view key, std::string_view value); + + /// Same as the other `readArg` but takes a single string of the form `key=value` or `key`. + [[nodiscard]] bool readArg(std::string_view arg); + + //---- VALIDATION: + + /// Throws a std::invalid_argument exception if the parameters are invalid. + /// Also sets reasonable values for training & probe counts, if omitted. + void validate() const; + + /// Ensures `minTrainingCount` and `maxTrainingCount` are set to reasonable values: + /// - If either is `nullopt` or 0, it's set to its default (based on the # of centroids.) + /// - If min is too small, it's raised to the default, and a warning is logged. + void resolveTrainingCounts(); + + //---- ACCESSORS: + + ClusteringType clusteringType() const {return ClusteringType(clustering.index());} + EncodingType encodingType() const {return EncodingType(encoding.index());} + + /// The number of centroid points that need to be identified during training. + /// This depends on both the clustering type and the encoding, because both PQ and SQ + /// encoders have their own internal sets of centroids. + /// @warning FAISS is likely to throw an exception if training is performed with fewer + /// vectors than this number. 
+ unsigned numCentroidsToTrain() const { + unsigned nCent; + if (auto q = std::get_if(&clustering)) + nCent = 1 << q->bitsPerSub; + else + nCent = std::get(clustering).numCentroids; + if (auto pq = std::get_if(&encoding)) { + // PQ encoding has its own centroids that need to be trained: + nCent = std::max(nCent, 1u << pq->bitsPerSub); + } + return nCent; + } + + /// The number of buckets to which vectors will be assigned when indexed. + /// @note This is not the same as `numCentroidsToTrain`, because + /// (a) with multi-index clustering the 'centroids' used as buckets are actually tuples, + /// with one centroid per subquantizer; + /// (b) it only refers to the main IVF index, not centroids used by encoders. + unsigned numCentroids() const { + if (auto q = std::get_if(&clustering)) + return 1 << (q->bitsPerSub * q->subquantizers); + else + return std::get(clustering).numCentroids; + } + + //---- ENCODING: + + /// Writes a series of comma-separated "key=value" pairs describing this spec. + std::ostream& writeArgs(std::ostream&) const; + + /// Returns a string of comma-separated key=value pairs describing this spec. + std::string createArgs() const; + + friend std::ostream& operator<<(std::ostream& out, IndexSpec const& spec) { + return spec.writeArgs(out); + } + + /// Returns a human-readable string describing this spec. 
+ std::string description() const; + + //---- LIMITS: + + static constexpr unsigned kMinDimensions = 2; + static constexpr unsigned kMaxDimensions = 4096; + static constexpr FlatClustering kMinFlatClustering {1}; + static constexpr FlatClustering kMaxFlatClustering {64'000}; + static constexpr MultiIndexClustering kMinMultiIndexClustering { 2, 4}; + static constexpr MultiIndexClustering kMaxMultiIndexClustering {1024, 12}; + static constexpr PQEncoding kMinPQEncoding { 2, 4}; + static constexpr PQEncoding kMaxPQEncoding {1024, 12}; + static constexpr SQEncoding kMinSQEncoding {4}; + static constexpr SQEncoding kMaxSQEncoding {8}; + + /// Absolute minimum number of training vectors needed per centroid. + /// The `train` method will return false instead of training if given fewer. + static constexpr int64_t kMinTrainingVectorsPerCentroid = 25; + + /// Minimum recommended (by FAISS) number of training vectors per centroid for good results. + static constexpr int64_t kRecommendedMinTrainingVectorsPerCentroid = 39; + static constexpr int64_t kRecommendedMaxTrainingVectorsPerCentroid = 100; + + }; + +}