Skip to content

Commit

Permalink
LazyIndex: reject wrong-dimensional vectors [CBL-5814] (#2053)
Browse files Browse the repository at this point in the history
* getIndex() now recovers vector-index options

Added code to parse the vectorsearch options from the CREATE TABLE
statement in the db schema.
It's not complete -- it doesn't fill in the parameters of the encoding
-- but it gets the most important stuff including the vector
dimensions, which LazyIndex wants...

* LazyIndex: reject wrong-dimensional vectors
  • Loading branch information
snej authored Jun 6, 2024
1 parent b7eb2e4 commit 2c1f5f4
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 34 deletions.
23 changes: 16 additions & 7 deletions LiteCore/Query/LazyIndex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
# include "QueryParser.hh"
# include "SequenceSet.hh"
# include "SQLiteDataFile.hh"
# include "SQLite_Internal.hh"
# include "SQLiteKeyStore.hh"
# include "SQLUtil.hh"
# include "SQLite_Internal.hh"
Expand Down Expand Up @@ -77,15 +78,20 @@ namespace litecore {
AssertArg(limit > 0);
Retained<LazyIndexUpdate> update;
do {
unsigned dimension = 0;
SequenceSet indexedSequences;
sequence_t curSeq;
{
// Open a RO transaction so the code sees a consistent snapshot of the database:
ReadOnlyTransaction txn(_db);

alloc_slice json = getSpec().indexedSequences;
if ( !indexedSequences.read_json(json) )
LogError(QueryLog, "Couldn't parse index's indexedSequences: %.*s", FMTSLICE(json));
{
SQLiteIndexSpec spec = getSpec();
if ( auto vecOpts = spec.vectorOptions() ) dimension = vecOpts->dimensions;
if ( !indexedSequences.read_json(spec.indexedSequences) )
LogError(QueryLog, "Couldn't parse index's indexedSequences: %.*s",
FMTSLICE(spec.indexedSequences));
}
curSeq = _sqlKeyStore.lastSequence();
LogTo(QueryLog, "LazyIndex: Indexed sequences of %s are %s ; latest seq is %llu", _indexName.c_str(),
indexedSequences.to_string().c_str(), (long long)curSeq);
Expand All @@ -104,7 +110,7 @@ namespace litecore {
Query::Options options(enc.finish());
Retained<QueryEnumerator> e = _query->createEnumerator(&options);
if ( e->getRowCount() > 0 )
update = new LazyIndexUpdate(this, startSeq, curSeq, indexedSequences, e, limit);
update = new LazyIndexUpdate(this, dimension, startSeq, curSeq, indexedSequences, e, limit);
}

if ( !update ) {
Expand All @@ -130,6 +136,7 @@ namespace litecore {
<< sqlIdentifier(_vectorTableName)
<< " (docid, vector) VALUES (?1, ?2)"));
}
UsingStatement u(_ins);
_ins->bind(1, (long long)rowid);
_ins->bindNoCopy(2, (const void*)vec, int(dimension * sizeof(float)));
_ins->exec();
Expand All @@ -141,6 +148,7 @@ namespace litecore {
_del = make_unique<SQLite::Statement>(
_db, CONCAT("DELETE FROM " << sqlIdentifier(_vectorTableName) << " WHERE docid=?1"));
}
UsingStatement u(_del);
_del->bind(1, (long long)rowid);
_del->exec();
_del->reset();
Expand All @@ -153,13 +161,14 @@ namespace litecore {

# pragma mark - LAZY INDEX UPDATE:

LazyIndexUpdate::LazyIndexUpdate(LazyIndex* manager, sequence_t firstSeq, sequence_t atSeq, SequenceSet indexedSeqs,
Retained<QueryEnumerator> e, size_t limit)
LazyIndexUpdate::LazyIndexUpdate(LazyIndex* manager, unsigned dimension, sequence_t firstSeq, sequence_t atSeq,
SequenceSet indexedSeqs, Retained<QueryEnumerator> e, size_t limit)
: _manager(manager)
, _firstSeq(firstSeq)
, _atSeq(atSeq)
, _indexedSequences(std::move(indexedSeqs))
, _enum(std::move(e)) {
, _enum(std::move(e))
, _dimension(dimension) {
// Find the rows which are not yet indexed:
int64_t row = 0;
for ( ; _enum->next(); ++row ) {
Expand Down
7 changes: 5 additions & 2 deletions LiteCore/Query/LazyIndex.hh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ namespace litecore {
/// The number of vectors to compute.
size_t count() const { return _count; }

/// The dimensions of the vectors.
size_t dimensions() const { return _dimension; }

/// Returns the i'th value to compute a vector from.
/// This is the value of the expression in the index spec.
FLValue valueAt(size_t i) const;
Expand All @@ -84,7 +87,7 @@ namespace litecore {

private:
friend class LazyIndex;
LazyIndexUpdate(LazyIndex*, sequence_t firstSeq, sequence_t curSeq, SequenceSet indexedSeqs,
LazyIndexUpdate(LazyIndex*, unsigned dimension, sequence_t firstSeq, sequence_t curSeq, SequenceSet indexedSeqs,
Retained<QueryEnumerator>, size_t limit);

using VectorPtr = std::unique_ptr<float[]>;
Expand All @@ -103,7 +106,7 @@ namespace litecore {
Retained<QueryEnumerator> _enum; // Results of Query for updated docs
size_t _count = 0; // Number of vectors to update
std::vector<Item> _items; // Vectors to update exposed in the public API
size_t _dimension = 0; // Dimensions of the vectors in _vectors
size_t _dimension; // Dimensions of the vectors in _vectors
bool _incomplete; // True if query did not get all update docs
};

Expand Down
33 changes: 23 additions & 10 deletions LiteCore/Query/SQLiteDataFile+Indexes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ namespace litecore {
string indexName = getIndex.getColumn(0);
string keyStoreName = getIndex.getColumn(1).getString().substr(3);
if ( !store || keyStoreName == store->name() )
indexes.emplace_back(indexName, IndexSpec::kValue, alloc_slice(), QueryLanguage::kJSON, keyStoreName,
"");
indexes.emplace_back(indexName, IndexSpec::kValue, alloc_slice(), QueryLanguage::kJSON,
IndexSpec::Options{}, keyStoreName, "");
}

// FTS indexes:
Expand All @@ -194,8 +194,8 @@ namespace litecore {
string keyStoreName = tableName.substr(delim);
string indexName = tableName.substr(delim + 2);
if ( !store || keyStoreName == store->name() )
indexes.emplace_back(indexName, IndexSpec::kValue, alloc_slice(), QueryLanguage::kJSON, keyStoreName,
tableName);
indexes.emplace_back(indexName, IndexSpec::kValue, alloc_slice(), QueryLanguage::kJSON,
IndexSpec::Options{}, keyStoreName, tableName);
}
return indexes;
}
Expand All @@ -219,19 +219,32 @@ namespace litecore {
stmt.exec();
}

// Recover an IndexSpec from a row of the `indexes` table
SQLiteIndexSpec SQLiteDataFile::specFromStatement(SQLite::Statement& stmt) {
string name = stmt.getColumn(0).getString();
auto type = IndexSpec::Type(stmt.getColumn(1).getInt());
IndexSpec::Options options;
string keyStoreName = stmt.getColumn(3).getString();
string indexTableName = stmt.getColumn(4).getString();

QueryLanguage queryLanguage = QueryLanguage::kJSON;
alloc_slice expression;
if ( string col = stmt.getColumn(2).getString(); !col.empty() ) {
expression = col;
if ( col[0] != '[' && col[0] != '{' ) queryLanguage = QueryLanguage::kN1QL;
}
SQLiteIndexSpec spec{stmt.getColumn(0).getString(),
(IndexSpec::Type)stmt.getColumn(1).getInt(),
expression,
queryLanguage,
stmt.getColumn(3).getString(),
stmt.getColumn(4).getString()};

#ifdef COUCHBASE_ENTERPRISE
if ( type == IndexSpec::kVector ) {
// Recover the vector options from the index schema itself:
string sql;
if ( getSchema(indexTableName, "table", indexTableName, sql) ) {
if ( auto opts = SQLiteKeyStore::parseVectorSearchTableSQL(sql) ) options = std::move(*opts);
}
}
#endif

SQLiteIndexSpec spec{name, type, expression, queryLanguage, options, keyStoreName, indexTableName};
if ( auto col5 = stmt.getColumn(5); col5.isText() ) spec.indexedSequences = col5.getText();
return spec;
}
Expand Down
55 changes: 55 additions & 0 deletions LiteCore/Query/SQLiteKeyStore+VectorIndex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,61 @@ namespace litecore {
return ""; // no index found
}

/// Parses `sv` as a base-10 number with strtoul and narrows the result to `unsigned`.
/// Yields 0 when no number can be parsed (including for an empty view).
static inline unsigned asUInt(string_view sv) {
    // A string_view need not be NUL-terminated, but strtoul requires a C string,
    // so parse from an owned, terminated copy.
    const string terminated(sv);
    const unsigned long parsed = strtoul(terminated.c_str(), nullptr, 10);
    return static_cast<unsigned>(parsed);
}

// The opposite of createVectorSearchTableSQL
optional<IndexSpec::VectorOptions> SQLiteKeyStore::parseVectorSearchTableSQL(string_view sql) {
optional<IndexSpec::VectorOptions> opts;
// Find the virtual-table arguments in the CREATE TABLE statement:
auto start = sql.find("vectorsearch(");
if ( start == string::npos ) return opts;
start += strlen("vectorsearch(");
auto end = sql.find(')', start);
if ( end == string::npos ) return opts;

// Parse each comma-delimited key-value pair:
string_view args(&sql[start], end - start);
opts.emplace(0);
split(args, ",", [&](string_view key) {
string_view value;
if ( auto eq = key.find('='); eq != string::npos ) {
value = key.substr(eq + 1);
key = key.substr(0, eq);
if ( value.empty() || key.empty() ) return;
}
if ( key == "dimensions" ) {
opts->dimensions = asUInt(value);
} else if ( key == "metric" ) {
if ( value == "euclidean2" ) opts->metric = IndexSpec::VectorOptions::Euclidean;
else if ( value == "cosine" )
opts->metric = IndexSpec::VectorOptions::Cosine;
} else if ( key == "minToTrain" ) {
opts->minTrainingSize = asUInt(value);
} else if ( key == "maxToTrain" ) {
opts->maxTrainingSize = asUInt(value);
} else if ( key == "probes" ) {
opts->numProbes = asUInt(value);
} else if ( key == "lazyindex" ) {
opts->lazy = (value != "false" && value != "0");
} else if ( key == "clustering" ) {
if ( hasPrefix(value, "multi") ) opts->clustering = {IndexSpec::VectorOptions::Multi};
//TODO: Parse centroid count & other params; see vectorsearch::IndexSpec::setParam()
} else if ( key == "encoding" ) {
if ( value == "none" ) opts->encoding = {IndexSpec::VectorOptions::NoEncoding};
else if ( hasPrefix(value, "PQ") ) {
opts->encoding = {IndexSpec::VectorOptions::PQ};
} else if ( hasPrefix(value, "SQ") ) {
opts->encoding = {IndexSpec::VectorOptions::SQ};
}
//TODO: Parse encoding params; see vectorsearch::IndexSpec::setParam()
}
});
return opts;
}

} // namespace litecore

#endif
3 changes: 2 additions & 1 deletion LiteCore/Storage/DataFile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,8 @@ namespace litecore {

ExclusiveTransaction::~ExclusiveTransaction() {
if ( _active ) {
_db._logInfo("Transaction exiting scope without explicit commit; aborting");
if ( !std::uncaught_exception() )
_db._logInfo("Transaction exiting scope without explicit commit; aborting");
abort();
}
_db.endTransactionScope(this);
Expand Down
14 changes: 7 additions & 7 deletions LiteCore/Storage/SQLiteDataFile.hh
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ namespace litecore {
static bool tableNameIsCollection(slice tableName);
static bool keyStoreNameIsCollection(slice ksName);

bool getSchema(const std::string& name, const std::string& type, const std::string& tableName,
std::string& outSQL) const;
bool schemaExistsWithSQL(const std::string& name, const std::string& type, const std::string& tableName,
const std::string& sql) const;
[[nodiscard]] bool getSchema(const std::string& name, const std::string& type, const std::string& tableName,
std::string& outSQL) const;
[[nodiscard]] bool schemaExistsWithSQL(const std::string& name, const std::string& type,
const std::string& tableName, const std::string& sql) const;

fleece::alloc_slice rawQuery(const std::string& query) override;

Expand Down Expand Up @@ -179,7 +179,7 @@ namespace litecore {
const std::string& indexTableName);
void unregisterIndex(slice indexName);
void garbageCollectIndexTable(const std::string& tableName);
static SQLiteIndexSpec specFromStatement(SQLite::Statement& stmt);
SQLiteIndexSpec specFromStatement(SQLite::Statement& stmt);
std::vector<SQLiteIndexSpec> getIndexesOldStyle(const KeyStore* store = nullptr);


Expand All @@ -193,8 +193,8 @@ namespace litecore {

struct SQLiteIndexSpec : public IndexSpec {
SQLiteIndexSpec(const std::string& name, IndexSpec::Type type, alloc_slice expressionJSON,
QueryLanguage language, std::string ksName, std::string itName)
: IndexSpec(name, type, std::move(expressionJSON), language)
QueryLanguage language, Options options, std::string ksName, std::string itName)
: IndexSpec(name, type, std::move(expressionJSON), language, std::move(options))
, keyStoreName(std::move(ksName))
, indexTableName(std::move(itName)) {}

Expand Down
4 changes: 2 additions & 2 deletions LiteCore/Storage/SQLiteKeyStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,8 @@ namespace litecore {
if ( !_hasExpirationColumn ) {
string sql;
string tableName = this->tableName();
db().getSchema(tableName, "table", tableName, sql);
if ( sql.find("expiration") != string::npos ) _hasExpirationColumn = true;
if ( db().getSchema(tableName, "table", tableName, sql) && sql.find("expiration") != string::npos )
_hasExpirationColumn = true;
}
return _hasExpirationColumn;
}
Expand Down
3 changes: 2 additions & 1 deletion LiteCore/Storage/SQLiteKeyStore.hh
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ namespace litecore {
bool createArrayIndex(const IndexSpec&);
bool createVectorIndex(const IndexSpec&);
string findVectorIndexNameFor(const string& property);
std::string createUnnestedTable(const fleece::impl::Value* arrayPath);
static std::optional<IndexSpec::VectorOptions> parseVectorSearchTableSQL(string_view sql);
std::string createUnnestedTable(const fleece::impl::Value* arrayPath);

#ifdef COUCHBASE_ENTERPRISE
bool createPredictiveIndex(const IndexSpec&);
Expand Down
19 changes: 17 additions & 2 deletions LiteCore/tests/LazyVectorQueryTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class LazyVectorQueryTest : public VectorQueryTest {
return 0;
}
Log("---- Updating %zu vectors...", update->count());
CHECK(update->dimensions() == kDimension);

size_t count = update->count();
CHECK(count > 0);
Expand Down Expand Up @@ -186,6 +187,20 @@ TEST_CASE_METHOD(LazyVectorQueryTest, "Lazy Vector Index Skipping", "[Query][.Ve
checkQueryReturns({"rec-291", "rec-171", "rec-039", "rec-081", "rec-249"});
}

#endif
// Verifies that LazyIndexUpdate::setVectorAt throws when handed a vector whose dimension
// count differs from the one reported by the update (kDimension).
TEST_CASE_METHOD(LazyVectorQueryTest, "Lazy Vector Update Wrong Dimensions", "[.VectorSearch]") {
// Ask for an update limited to a single vector.
Retained<LazyIndexUpdate> update = _lazyIndex->beginUpdate(1);
REQUIRE(update);
CHECK(update->count() == 1);
// The update must report the dimensions recovered from the index.
CHECK(update->dimensions() == kDimension);

// The value to compute a vector from is expected to be a number in this fixture.
fleece::Value val(update->valueAt(0));
REQUIRE(val.type() == kFLNumber);
float vec[kDimension];
computeVector(0, vec);

// Pass one dimension fewer than the index expects; the updater must reject it by throwing.
// NOTE(review): ExpectingExceptions presumably marks the exception below as intentional for
// the test harness -- confirm against its definition.
ExpectingExceptions x;
Log("---- Calling setVectorAt with wrong dimension...");
CHECK_THROWS_AS(update->setVectorAt(0, vec, kDimension - 1), error);
}

#endif
21 changes: 19 additions & 2 deletions LiteCore/tests/VectorQueryTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@ class SIFTVectorQueryTest : public VectorQueryTest {

SIFTVectorQueryTest() : VectorQueryTest(0) {}

void createVectorIndex() {
IndexSpec::VectorOptions vectorIndexOptions() const {
IndexSpec::VectorOptions options(128);
options.clustering.type = IndexSpec::VectorOptions::Flat;
options.clustering.flat_centroids = 256;
VectorQueryTest::createVectorIndex("vecIndex", "[ ['.vector'] ]", options);
return options;
}

// Creates the index named "vecIndex" over the expression `[ ['.vector'] ]`, using the
// options returned by vectorIndexOptions().
void createVectorIndex() {
VectorQueryTest::createVectorIndex("vecIndex", "[ ['.vector'] ]", vectorIndexOptions());
}

void readVectorDocs(size_t maxLines = 1000000) {
Expand Down Expand Up @@ -80,6 +84,19 @@ N_WAY_TEST_CASE_METHOD(SIFTVectorQueryTest, "Create/Delete Vector Index", "[Quer
auto allKeyStores = db->allKeyStoreNames();
readVectorDocs(1);
createVectorIndex();

// Recover the IndexSpec:
std::optional<IndexSpec> spec = store->getIndex("vecIndex");
REQUIRE(spec);
CHECK(spec->name == "vecIndex");
CHECK(spec->type == IndexSpec::kVector);
auto vecOptions = spec->vectorOptions();
REQUIRE(vecOptions);
auto trueOptions = vectorIndexOptions();
CHECK(vecOptions->dimensions == trueOptions.dimensions);
CHECK(vecOptions->clustering.type == trueOptions.clustering.type);
CHECK(vecOptions->encoding.type == trueOptions.encoding.type);

CHECK(db->allKeyStoreNames() == allKeyStores); // CBL-3824, CBL-5369
// Delete a doc too:
{
Expand Down

0 comments on commit 2c1f5f4

Please sign in to comment.