diff --git a/Tool.cc b/Tool.cc index fca5439..7b8d2ba 100644 --- a/Tool.cc +++ b/Tool.cc @@ -20,6 +20,7 @@ #include "Logging.hh" #include "linenoise.h" #include "utf8.h" +#include #include #include #include @@ -266,3 +267,26 @@ alloc_slice Tool::readFile(const string &path) { in.read((char*)data.buf, size); return data; } + +int Tool::nextIntArg(const char *what, int minVal, int maxVal) { + return parseInt(nextArg(what), minVal, maxVal); +} + + +int Tool::parseInt(string_view str, int minVal, int maxVal) { + int value; + const char* end = str.data() + str.size(); + auto [ptr, ec] = std::from_chars(str.data(), end, value); + const char* err = nullptr; + if (ec == errc::result_out_of_range) + err = " is out of range"; + else if (ec != errc{} || ptr != end) + err = " is not a valid integer"; + else if (value < minVal) + err = " is too small"; + else if (value > maxVal) + err = " is too large"; + if (err) + fail(string(str) + err); + return value; +} diff --git a/Tool.hh b/Tool.hh index d13e5c2..93f0e94 100644 --- a/Tool.hh +++ b/Tool.hh @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CMAKE #include "config.h" @@ -176,6 +177,8 @@ public: std::string it(const char *str) {return ansiItalic() + str + ansiReset();} std::string spaces(int n) {return std::string(std::max(n, 1), ' ');} + + int parseInt(std::string_view, int minVal = INT_MIN, int maxVal = INT_MAX); protected: @@ -206,6 +209,8 @@ protected: return arg; } + int nextIntArg(const char *what, int minVal = INT_MIN, int maxVal = INT_MAX); + /** If the next arg matches the given string, consumes it and returns true. 
*/ bool matchArg(const char *matchArg) { if (_argTokenizer.argument() != matchArg) @@ -255,7 +260,15 @@ protected: if (flag == "--") return; // marks end of flags - if (!processFlag(flag, specs)) { + + bool handled; + try { + handled = processFlag(flag, specs); + } catch (std::exception const& x) { + fail("in flag " + flag + ": " + x.what()); + } + + if (!handled) { // Flags all subcommands accept: if (flag == "--help") { usage(); diff --git a/Xcode/Tools.xcodeproj/project.pbxproj b/Xcode/Tools.xcodeproj/project.pbxproj index 7ec3da9..205f52e 100644 --- a/Xcode/Tools.xcodeproj/project.pbxproj +++ b/Xcode/Tools.xcodeproj/project.pbxproj @@ -7,6 +7,7 @@ objects = { /* Begin PBXBuildFile section */ + 1A02703F2C2645A60025F2B5 /* EnrichCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */; }; 2716F94D2491822700BE21D9 /* CheckCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 2716F94C2491822700BE21D9 /* CheckCommand.cc */; }; 2716F95B2491857E00BE21D9 /* ReindexCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 2716F95A2491857E00BE21D9 /* ReindexCommand.cc */; }; 27175C0F261CE5F40045F3AC /* MkCollCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 27175C0E261CE5F40045F3AC /* MkCollCommand.cc */; }; @@ -30,7 +31,6 @@ 2751C83225D3650A00A9B39B /* linenoise.c in Sources */ = {isa = PBXBuildFile; fileRef = 2751C81C25D3650A00A9B39B /* linenoise.c */; }; 2751C83325D3650A00A9B39B /* utf8.h in Headers */ = {isa = PBXBuildFile; fileRef = 2751C81F25D3650A00A9B39B /* utf8.h */; }; 276CE5C9225FAA1600B681AC /* TokenizerTest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5C8225FAA1600B681AC /* TokenizerTest.cc */; }; - 276CE5CC225FAC8500B681AC /* LibC++Debug.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */; }; 276CE5CF225FACDD00B681AC /* ArgumentTokenizer.cc in Sources */ = {isa = PBXBuildFile; fileRef = 27FC8DF722137C580083B033 /* ArgumentTokenizer.cc */; }; 
276CE5D1225FADB200B681AC /* tests_main.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5D0225FADB200B681AC /* tests_main.cc */; }; 276D4AC527864A5500F61A89 /* MkIndexCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276D4AC427864A5500F61A89 /* MkIndexCommand.cc */; }; @@ -58,7 +58,6 @@ 27FC8E64221381C60083B033 /* liblinenoise.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E5A221381890083B033 /* liblinenoise.a */; }; 27FC8E67221383880083B033 /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E66221383880083B033 /* libz.tbd */; }; 27FC8E6A221383AE0083B033 /* libLiteCoreREST-static.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E1A22137CB60083B033 /* libLiteCoreREST-static.a */; }; - 42030A6E24AC14F900283CE8 /* LibC++Debug.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */; }; 42030A7024AC152000283CE8 /* StringUtil.cc in Sources */ = {isa = PBXBuildFile; fileRef = 42030A6F24AC152000283CE8 /* StringUtil.cc */; }; /* End PBXBuildFile section */ @@ -208,6 +207,7 @@ /* End PBXCopyFilesBuildPhase section */ /* Begin PBXFileReference section */ + 1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EnrichCommand.cc; sourceTree = ""; }; 2716F94C2491822700BE21D9 /* CheckCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CheckCommand.cc; sourceTree = ""; }; 2716F95A2491857E00BE21D9 /* ReindexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = ReindexCommand.cc; sourceTree = ""; }; 2716F9942493E59B00BE21D9 /* BUILDING.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = net.daringfireball.markdown; name = BUILDING.md; path = ../BUILDING.md; sourceTree = ""; }; @@ -237,7 +237,6 @@ 276CE5C0225FA9F400B681AC /* Tests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Tests; 
sourceTree = BUILT_PRODUCTS_DIR; }; 276CE5C8225FAA1600B681AC /* TokenizerTest.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TokenizerTest.cc; sourceTree = ""; }; 276CE5CA225FAA4C00B681AC /* tests.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = tests.xcconfig; sourceTree = ""; }; - 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = "LibC++Debug.cc"; sourceTree = ""; }; 276CE5D0225FADB200B681AC /* tests_main.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tests_main.cc; sourceTree = ""; }; 276D4AC427864A5500F61A89 /* MkIndexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = MkIndexCommand.cc; sourceTree = ""; }; 276D4AD22786502600F61A89 /* RmIndexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = RmIndexCommand.cc; sourceTree = ""; }; @@ -361,7 +360,6 @@ isa = PBXGroup; children = ( 276CE5C8225FAA1600B681AC /* TokenizerTest.cc */, - 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */, 276CE5D0225FADB200B681AC /* tests_main.cc */, ); name = tests; @@ -422,6 +420,7 @@ 27FC8DD322137C330083B033 /* CpCommand.cc */, 273B20D3264B2D6900A14EC4 /* EditCommand.cc */, 271BA4AD227CC54300D49D13 /* EncryptCommand.cc */, + 1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */, 27FC8DD622137C330083B033 /* InfoCommand.cc */, 27FC8DD222137C330083B033 /* ListCommand.cc */, 27E95BAC24083D7C0013711C /* ListCommand.hh */, @@ -760,7 +759,6 @@ files = ( 272F00CF226F7FE500E62F72 /* cbl-log.cc in Sources */, 272F00D7226F956C00E62F72 /* LogDecoder_stub.cpp in Sources */, - 42030A6E24AC14F900283CE8 /* LibC++Debug.cc in Sources */, 272F00D0226F7FFC00E62F72 /* Tool.cc in Sources */, 42030A7024AC152000283CE8 /* StringUtil.cc in Sources */, 272F00D1226F800000E62F72 /* ArgumentTokenizer.cc in Sources */, @@ -772,7 +770,6 @@ buildActionMask = 2147483647; 
files = ( 276CE5D1225FADB200B681AC /* tests_main.cc in Sources */, - 276CE5CC225FAC8500B681AC /* LibC++Debug.cc in Sources */, 276CE5CF225FACDD00B681AC /* ArgumentTokenizer.cc in Sources */, 276CE5C9225FAA1600B681AC /* TokenizerTest.cc in Sources */, ); @@ -800,6 +797,7 @@ 27FC8DDD22137C330083B033 /* ListCommand.cc in Sources */, 27FC8DF222137C490083B033 /* Endpoint.cc in Sources */, 27FC8DE522137C330083B033 /* SQLCommand.cc in Sources */, + 1A02703F2C2645A60025F2B5 /* EnrichCommand.cc in Sources */, 27175C27261D097A0045F3AC /* MvCommand.cc in Sources */, 27E95BAB2408376B0013711C /* CBLiteCommand.cc in Sources */, 27175C21261D00200045F3AC /* CdCommand.cc in Sources */, @@ -855,6 +853,19 @@ isa = XCBuildConfiguration; baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */; buildSettings = { + HEADER_SEARCH_PATHS = ( + "$(inherited)", + "$(FLEECE)/API", + "$(FLEECE)/Fleece/Support", + "$(LITECORE)/C/include", + "$(LITECORE)/C/Cpp_include", + "$(LITECORE)/C", + "$(LITECORE)/Networking", + "$(LITECORE)/Replicator", + "$(LITECORE)/LiteCore/Support", + "$(LITECORE)/REST", + "$(LITECORE)/Networking/HTTP", + ); }; name = Debug_EE; }; @@ -947,6 +958,19 @@ isa = XCBuildConfiguration; baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */; buildSettings = { + HEADER_SEARCH_PATHS = ( + "$(inherited)", + "$(FLEECE)/API", + "$(FLEECE)/Fleece/Support", + "$(LITECORE)/C/include", + "$(LITECORE)/C/Cpp_include", + "$(LITECORE)/C", + "$(LITECORE)/Networking", + "$(LITECORE)/Replicator", + "$(LITECORE)/LiteCore/Support", + "$(LITECORE)/REST", + "$(LITECORE)/Networking/HTTP", + ); }; name = Release_EE; }; @@ -1174,6 +1198,19 @@ isa = XCBuildConfiguration; baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */; buildSettings = { + HEADER_SEARCH_PATHS = ( + "$(inherited)", + "$(FLEECE)/API", + "$(FLEECE)/Fleece/Support", + "$(LITECORE)/C/include", + "$(LITECORE)/C/Cpp_include", + "$(LITECORE)/C", + "$(LITECORE)/Networking", + 
"$(LITECORE)/Replicator", + "$(LITECORE)/LiteCore/Support", + "$(LITECORE)/REST", + "$(LITECORE)/Networking/HTTP", + ); }; name = Debug; }; @@ -1181,6 +1218,19 @@ isa = XCBuildConfiguration; baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */; buildSettings = { + HEADER_SEARCH_PATHS = ( + "$(inherited)", + "$(FLEECE)/API", + "$(FLEECE)/Fleece/Support", + "$(LITECORE)/C/include", + "$(LITECORE)/C/Cpp_include", + "$(LITECORE)/C", + "$(LITECORE)/Networking", + "$(LITECORE)/Replicator", + "$(LITECORE)/LiteCore/Support", + "$(LITECORE)/REST", + "$(LITECORE)/Networking/HTTP", + ); }; name = Release; }; diff --git a/cblite/CBLiteCommand.hh b/cblite/CBLiteCommand.hh index 11f5a22..ece512e 100644 --- a/cblite/CBLiteCommand.hh +++ b/cblite/CBLiteCommand.hh @@ -146,6 +146,7 @@ CBLiteCommand* newCheckCommand(CBLiteTool&); CBLiteCommand* newCompactCommand(CBLiteTool&); CBLiteCommand* newCpCommand(CBLiteTool&); CBLiteCommand* newEditCommand(CBLiteTool&); +CBLiteCommand* newEnrichCommand(CBLiteTool&); CBLiteCommand* newExportCommand(CBLiteTool&); CBLiteCommand* newImportCommand(CBLiteTool&); CBLiteCommand* newInfoCommand(CBLiteTool&); diff --git a/cblite/CBLiteTool.cc b/cblite/CBLiteTool.cc index 12801ee..4a45820 100644 --- a/cblite/CBLiteTool.cc +++ b/cblite/CBLiteTool.cc @@ -208,6 +208,10 @@ void CBLiteTool::openDatabase(string pathStr, bool interactive) { C4Error err; const C4Error kEncryptedDBError = {LiteCoreDomain, kC4ErrorNotADatabaseFile}; + if (const char* extPath = getenv("CBLITE_EXTENSION_PATH")) { + c4_setExtensionPath(slice(extPath)); + } + if (!_dbNeedsPassword) { _db = c4db_openNamed(slice(dbName), &config, &err); } else { @@ -287,6 +291,7 @@ static constexpr struct {const char* name; ToolFactory factory;} kSubcommands[] {"compact", newCompactCommand}, {"cp", newCpCommand}, {"edit", newEditCommand}, + {"enrich", newEnrichCommand}, {"export", newExportCommand}, {"file", newInfoCommand}, {"get", newCatCommand}, diff --git 
a/cblite/CMakeLists.txt b/cblite/CMakeLists.txt index 3641d7f..86a0910 100644 --- a/cblite/CMakeLists.txt +++ b/cblite/CMakeLists.txt @@ -52,6 +52,7 @@ target_include_directories( ${LITECORE}Networking/ ${LITECORE}Networking/HTTP ${LITECORE}Replicator + ${LITECORE}REST/ ${LITECORE}vendor/fleece/API/ ${LITECORE}vendor/fleece/Fleece/Support/ # PlatformCompat.hh ${CMAKE_BINARY_DIR}/generated_headers/ diff --git a/cblite/CompactCommand.cc b/cblite/CompactCommand.cc index 8ff4ebe..126e486 100644 --- a/cblite/CompactCommand.cc +++ b/cblite/CompactCommand.cc @@ -44,7 +44,7 @@ class CompactCommand : public CBLiteCommand { void runSubcommand() override { // Read params: processFlags({ - {"--prune", [&]{_prune = stoi(nextArg("depth for --prune"));}}, + {"--prune", [&]{_prune = nextIntArg("depth for --prune", 1);}}, {"--purgeDeleted", [&]{_purgeDeleted = true;}}, }); @@ -177,14 +177,14 @@ class CompactCommand : public CBLiteCommand { if (c4doc_selectCommonAncestorRevision(doc, doc->selectedRev.revID, currentRevID)) branchPoint = doc->selectedRev.revID; // First count the number of revs on the branch: - c4doc_selectRevision(doc, closedBranch, false, nullptr); + [[maybe_unused]] bool _ = c4doc_selectRevision(doc, closedBranch, false, nullptr); do { ++nPrunedRevs; if (doc->selectedRev.flags & kRevKeepBody) ++nRemovedBodies; } while (c4doc_selectParentRevision(doc) && doc->selectedRev.revID != branchPoint); // Then prune the entire branch: - c4doc_purgeRevision(doc, closedBranch, nullptr); + _ = c4doc_purgeRevision(doc, closedBranch, nullptr); } else { // Walk its ancestor chain, counting how many revs are deeper than maxDepth: unsigned branchDepth = 1, keepBodyDepth = 0; diff --git a/cblite/DocBranchIterator.hh b/cblite/DocBranchIterator.hh index 670f852..6bf81ee 100644 --- a/cblite/DocBranchIterator.hh +++ b/cblite/DocBranchIterator.hh @@ -24,7 +24,7 @@ public: } DocBranchIterator& operator++() { - c4doc_selectRevision(_doc, _branchID, false, nullptr); + [[maybe_unused]] bool _ 
= c4doc_selectRevision(_doc, _branchID, false, nullptr); _branchID = fleece::nullslice; while (c4doc_selectNextRevision(_doc)) { if (_doc->selectedRev.flags & kRevLeaf) { diff --git a/cblite/EnrichCommand.cc b/cblite/EnrichCommand.cc new file mode 100644 index 0000000..e4a23df --- /dev/null +++ b/cblite/EnrichCommand.cc @@ -0,0 +1,151 @@ +// +// EnrichCommand.cc +// +// Copyright (c) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include +#include +#include +#include "EnrichCommand.hh" +#include "fleece/Fleece.hh" +#include "StringUtil.hh" +#include "fleece/Mutable.hh" +#include "Response.hh" + +using namespace std; +using namespace fleece; +using namespace litecore; + +void EnrichCommand::usage() { + writeUsageCommand("enrich", false, "PROP"); + cerr << + "Enriches given JSON with embeddings of selected field\n" + " --offset N : Skip first N docs\n" + " --limit N : Stop after N docs\n" + " " << it("PROPERTY") << " : property for matching docs\n" + " " << it("DESTINATION") << " : destination property\n" + ; +} + +void EnrichCommand::runSubcommand() { + // Read params: + processFlags({ + {"--offset", [&]{offsetFlag();}}, + {"--limit", [&]{limitFlag();}}, + }); + openWriteableDatabaseFromNextArg(); + string srcProp, dstProp; + srcProp = nextArg("source property"); + if (hasArgs()) + dstProp = nextArg("destination property"); + else + dstProp = srcProp + "_vector"; + endOfArgs(); + + enrichDocs(srcProp, 
dstProp); +} + +void EnrichCommand::enrichDocs(const string& srcProp, const string& dstProp) { + EnumerateDocsOptions options{}; + options.flags |= kC4IncludeBodies; + options.bySequence = true; + options.offset = _offset; + options.limit = _limit; + + cout << "\n"; + if (_offset > 0) + cout << "(Skipping first " << _offset << " docs)\n"; + + // Start transaction + C4Error error; + c4::Transaction t(_db); + if (!t.begin(&error)) + fail("Couldn't open database transaction"); + + // Loop through docs and get properties + int64_t nDocs = enumerateDocs(options, [&](const C4DocumentInfo &info, C4Document *doc) { + Dict body = c4doc_getProperties(doc); + if (!body) + fail("Unexpectedly couldn't parse document body!"); + + Value rawSrcPropValue = body.get(srcProp); + if (rawSrcPropValue.type() != kFLString) { + cout << "Property type must be a string" << endl; + return; + } + + string restBody = format("{\"input\":\"%.*s\", \"model\":\"text-embedding-3-small\"}", SPLAT(rawSrcPropValue.asString())); + + // LiteCore Request and Response + Encoder enc; + enc.beginDict(); + enc["Content-Type"_sl] = "application/json"; + enc["Content-Length"_sl] = restBody.length(); + if (getenv("API_KEY") == NULL) + fail("API Key not provided", error); + + enc["Authorization"] = format("Bearer %s", getenv("API_KEY")); + enc.endDict(); + auto headers = enc.finishDoc(); + auto r = std::make_unique("https", "POST", "api.openai.com", 443, "v1/embeddings"); + r->setHeaders(headers).setBody(restBody); + alloc_slice response; + + if (r->run()) { + response = r->body(); + } else { + if ( r->error() == C4Error{NetworkDomain, kC4NetErrTimeout} ) { + C4Warn("REST request timed out. Current timeout is %f seconds", r->getTimeout()); + } + else + { + C4Warn("REST request failed. 
%d/%d", r->error().domain, r->error().code); + } + return; + } + + // Parse response + Doc newDoc = Doc::fromJSON(response); + Value embedding = newDoc.asDict()["data"].asArray()[0].asDict()["embedding"]; + auto mutableBody = body.mutableCopy(kFLDefaultCopy); + mutableBody.set(dstProp, embedding); + auto json = mutableBody.toJSON(); + auto newBody = alloc_slice(c4db_encodeJSON(_db, json, &error)); + if (!newBody) + fail("Couldn't encode body", error); + + // Update doc + doc = c4doc_update(doc, newBody, 0, &error); + if (!doc) + fail("Couldn't save document", error); + }); + + // End transaction (commit) + if (!t.commit(&error)) + fail("Couldn't commit database transaction", error); + + // Output status to user + if (nDocs == 0) { + cout << "(No documents with property matching \"" << srcProp << "\"" << ")"; + } else if (nDocs > _limit && _limit > 0) { + cout << "\n(Stopping after " << _limit << " docs)"; + } + cout << "\n"; +} + +CBLiteCommand* newEnrichCommand(CBLiteTool &parent) { + return new EnrichCommand(parent); +} diff --git a/cblite/EnrichCommand.hh b/cblite/EnrichCommand.hh new file mode 100644 index 0000000..1892477 --- /dev/null +++ b/cblite/EnrichCommand.hh @@ -0,0 +1,32 @@ +// +// EnrichCommand.hh +// +// Copyright (c) 2024 Couchbase, Inc All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
/// Splits `str` at the first occurrence of `sep`.
///
/// @param str  Text to split. The returned views point into this storage, so it must
///             outlive them.
/// @param sep  Separator to look for; may be longer than one character.
/// @return     The text before the separator and the text after it; the separator
///             itself is dropped.
/// @throws std::invalid_argument  if `sep` does not occur in `str`.
///
/// Marked [[maybe_unused]] so the helper compiles without an unused-function warning in
/// builds that do not call it, while still being syntax-checked everywhere.
[[maybe_unused]] static std::pair<std::string_view, std::string_view>
split(std::string_view str, std::string_view sep) {
    const auto pos = str.find(sep);
    if (pos == std::string_view::npos)
        throw std::invalid_argument("Missing separator '" + std::string(sep) + "'");
    // Skip the whole separator, not just one character, so multi-character
    // separators do not leak into the second half.
    return {str.substr(0, pos), str.substr(pos + sep.size())};
}
ignore diacritical (accent) marks\n" - " --noStemming : FTS index should not try to recognize word variations like plurals.\n" - ; + " Creates an index. The EXPRESSION does not need to be quoted.\n" + " --json : Use JSON syntax for " << it("EXPRESSION") << ", instead of N1QL\n" + " --fts : Create a Full-Text Search (FTS) index\n" +#ifdef COUCHBASE_ENTERPRISE + " --vector : Create a vector index\n\n" +#endif + << bold("FTS index flags:\n") << + " --language LANG : (Human) language, by name or ISO-369 code:\n" + " " << ansiDim() << "da/danish, nl/dutch, en/english, fi/finnish, fr/french, de/german,\n" + " hu/hungarian, it/italian, no/norwegian, pt/portuguese,\n" + " ro/romanian, ru/russian, es/spanish, sv/swedish, tr/turkish" << ansiReset() << "\n" + " --ignoreDiacritics : Ignore diacritical (accent) marks\n" + " --noStemming : Don't try to recognize word variations like plurals.\n\n"; +#ifdef COUCHBASE_ENTERPRISE + cerr << bold("Vector index flags:\n") << + " --dim N : Number of dimensions (required)\n" + " --metric M : Distance metric (M = 'euclidean' or 'cosine')\n" + " --centroids N : Flat clustering with N centroids\n" + " --multi NxB : Multi-index clustering with N subquantizers, B bits\n" + " --encoding ENC : Encoding type (ENC = 'none', 'SQ8', 'PQ32x8', etc.)\n"; + if (! 
getenv("CBLITE_EXTENSION_PATH")) { + cerr << + "NOTE: Vector indexes require the CouchbaseLiteVectorSearch extension.\n" + " The environment variable CBLITE_EXTENSION_PATH must be set to its parent directory.\n"; + } +#endif } void runSubcommand() override { C4QueryLanguage language = kC4N1QLQuery; - C4IndexType indexType = kC4ValueIndex; - C4IndexOptions ftsOptions = {}; + bool ftsFlags = false; + bool vectorFlags = false; + C4IndexOptions options = {}; string ftsLanguage; processFlags({ {"--json", [&]{language = kC4JSONQuery;}}, - {"--fts", [&]{indexType = kC4FullTextIndex;}}, + + {"--fts", [&]{ftsFlags = true;}}, {"--language", [&]{ - indexType = kC4FullTextIndex; + ftsFlags = true; ftsLanguage = nextArg("FTS language"); - ftsOptions.language = ftsLanguage.c_str(); + options.language = ftsLanguage.c_str(); }}, {"--ignoreDiacritics", [&]{ - indexType = kC4FullTextIndex; - ftsOptions.ignoreDiacritics = true; + ftsFlags = true; + options.ignoreDiacritics = true; }}, {"--noStemming", [&]{ - indexType = kC4FullTextIndex; - ftsOptions.disableStemming = true; + ftsFlags = true; + options.disableStemming = true; + }}, + +#ifdef COUCHBASE_ENTERPRISE + {"--vector", [&]{vectorFlags = true;}}, + {"--dim", [&]{ + vectorFlags = true; + options.vector.dimensions = nextIntArg("dimensions", 2, 2048); }}, + {"--metric", [&]{ + vectorFlags = true; + if (string m = lowercase(nextArg("metric name")); m == "euclidean") + options.vector.metric = kC4VectorMetricEuclidean; + else if (m == "cosine") + options.vector.metric = kC4VectorMetricCosine; + else + throw invalid_argument("--metric value must be euclidean or cosine"); + }}, + {"--centroids", [&]{ + vectorFlags = true; + options.vector.clustering.type = kC4VectorClusteringFlat; + options.vector.clustering.flat_centroids = nextIntArg("number of centroids", + 2, 65536); + }}, + {"--multi", [&]{ + vectorFlags = true; + options.vector.clustering.type = kC4VectorClusteringMulti; + string arg = nextArg("multi-index parameters"); + auto 
[sub, bits] = split(arg, "x"); + options.vector.clustering.multi_subquantizers = parseInt(sub, 2); + options.vector.clustering.multi_bits = parseInt(bits, 4); + }}, + {"--encoding", [&]{ + vectorFlags = true; + string arg = lowercase(nextArg("encoding type")); + if (hasPrefix(arg, "pq")) { + auto [sub, bits] = split(arg, "x"); + options.vector.encoding.pq_subquantizers = parseInt(sub, 2, 2048); + options.vector.encoding.bits = parseInt(bits, 4); + } else if (hasPrefix(arg, "sq")) { + options.vector.encoding.type = kC4VectorEncodingSQ; + options.vector.encoding.bits = parseInt(arg.substr(2), 4, 8); + } else { + throw invalid_argument("encoding type must start with PQ or SQ"); + } + }}, +#endif }); + +#ifdef COUCHBASE_ENTERPRISE + if (ftsFlags && vectorFlags) + throw invalid_argument("Can't combine FTS and vector options"); + if (vectorFlags && options.vector.dimensions == 0) + throw invalid_argument("Number of dimensions (--dim) is required in a vector index"); +#endif + openWriteableDatabaseFromNextArg(); string name = nextArg("index name"); string expression = restOfInput("expression"); - cout << "Creating index '" << name << "' ..."; + C4IndexType indexType; + const char* message; + if (ftsFlags) { + indexType = kC4FullTextIndex; + message = "Creating FTS index '"; + } else if (vectorFlags) { + indexType = kC4VectorIndex; + message = "Creating vector index '"; + } else { + indexType = kC4ValueIndex; + message = "Creating index '"; + } + cout << message << name << "' ..."; cout.flush(); + C4Error error; bool ok; ok = c4coll_createIndex(collection(), slice(name), slice(expression), language, indexType, - &ftsOptions, &error); + &options, &error); if (!ok) { cout << endl; fail("Couldn't create index", error); diff --git a/cblite/OpenCommand.cc b/cblite/OpenCommand.cc index a10bab5..55bf10c 100644 --- a/cblite/OpenCommand.cc +++ b/cblite/OpenCommand.cc @@ -97,6 +97,7 @@ class OpenCommand : public CBLiteCommand { #endif " edit " << it("[FLAGS] DOCID") << "\n" " export 
" << it("[FLAGS] JSONFILE") << "\n" + " enrich " << it("[FLAGS] PROPERTY [DESTINATION]") << "\n" " get " << it("[FLAGS] DOCID [DOCID...]") << "\n" " help " << it("[SUBCOMMAND]") << "\n" " import " << it("[FLAGS] JSONFILE") << "\n"