Skip to content

Commit

Permalink
Enrich Command (#75)
Browse files Browse the repository at this point in the history
Adds a command that can produce LLM embeddings in a Couchbase Lite database.

Co-authored-by: Alexander Baker <[email protected]>
  • Loading branch information
Alexander-Baker-24 and Alexander Baker authored Jul 16, 2024
1 parent d33c57a commit fa63e1d
Show file tree
Hide file tree
Showing 13 changed files with 405 additions and 33 deletions.
24 changes: 24 additions & 0 deletions Tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "Logging.hh"
#include "linenoise.h"
#include "utf8.h"
#include <charconv>
#include <cstdio>
#include <fstream>
#include <regex>
Expand Down Expand Up @@ -266,3 +267,26 @@ alloc_slice Tool::readFile(const string &path) {
in.read((char*)data.buf, size);
return data;
}

/** Fetches the next argument via nextArg(what) and converts it to an integer
    with parseInt(), which enforces the inclusive bounds [minVal, maxVal]. */
int Tool::nextIntArg(const char *what, int minVal, int maxVal) {
    const auto argText = nextArg(what);
    return parseInt(argText, minVal, maxVal);
}


/** Converts `str` to an `int` using std::from_chars and validates the result.
    The entire string must be consumed by the conversion, and the value must lie
    within the inclusive range [minVal, maxVal]. On any problem, fail() is called
    with the original text followed by a short description of what is wrong.
    @param str     The text to parse.
    @param minVal  Smallest acceptable value.
    @param maxVal  Largest acceptable value.
    @return  The parsed integer. */
int Tool::parseInt(string_view str, int minVal, int maxVal) {
    int parsed;
    const char* const first = str.data();
    const char* const last = first + str.size();
    const auto outcome = std::from_chars(first, last, parsed);

    // Pick the first applicable complaint; `parsed` is only examined once the
    // conversion is known to have succeeded and consumed all of the input.
    const char* const complaint = [&]() -> const char* {
        if (outcome.ec == errc::result_out_of_range)
            return " is out of range";
        if (outcome.ec != errc{} || outcome.ptr != last)
            return " is not a valid integer";
        if (parsed < minVal)
            return " is too small";
        if (parsed > maxVal)
            return " is too large";
        return nullptr;
    }();

    if (complaint != nullptr)
        fail(string(str) + complaint);
    return parsed;
}
15 changes: 14 additions & 1 deletion Tool.hh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <string>
#include <deque>
#include <algorithm>
#include <climits>

#ifdef CMAKE
#include "config.h"
Expand Down Expand Up @@ -176,6 +177,8 @@ public:
/** Returns `str` with the result of ansiItalic() prepended and the result of
    ansiReset() appended. (Presumably these are ANSI escape sequences that turn
    italic styling on and off -- confirm against those helpers.) */
std::string it(const char *str) {return ansiItalic() + str + ansiReset();}

/** Returns a string consisting of `n` space characters. Values of `n` below 1
    are treated as 1, so the result always contains at least one space. */
std::string spaces(int n) {
    const int count = (n < 1) ? 1 : n;
    return std::string(count, ' ');
}

/** Parses the given string as an integer and checks it against the inclusive
    range [minVal, maxVal]. Calls fail() with a descriptive message if the string
    is not entirely a valid integer, does not fit in an `int`, or is outside
    the range. Returns the parsed value. */
int parseInt(std::string_view, int minVal = INT_MIN, int maxVal = INT_MAX);

protected:

Expand Down Expand Up @@ -206,6 +209,8 @@ protected:
return arg;
}

/** Like nextArg(what), but passes the argument through parseInt() with the
    given bounds and returns the resulting integer. */
int nextIntArg(const char *what, int minVal = INT_MIN, int maxVal = INT_MAX);

/** If the next arg matches the given string, consumes it and returns true. */
bool matchArg(const char *matchArg) {
if (_argTokenizer.argument() != matchArg)
Expand Down Expand Up @@ -255,7 +260,15 @@ protected:

if (flag == "--")
return; // marks end of flags
if (!processFlag(flag, specs)) {

bool handled;
try {
handled = processFlag(flag, specs);
} catch (std::exception const& x) {
fail("in flag " + flag + ": " + x.what());
}

if (!handled) {
// Flags all subcommands accept:
if (flag == "--help") {
usage();
Expand Down
62 changes: 56 additions & 6 deletions Xcode/Tools.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
objects = {

/* Begin PBXBuildFile section */
1A02703F2C2645A60025F2B5 /* EnrichCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */; };
2716F94D2491822700BE21D9 /* CheckCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 2716F94C2491822700BE21D9 /* CheckCommand.cc */; };
2716F95B2491857E00BE21D9 /* ReindexCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 2716F95A2491857E00BE21D9 /* ReindexCommand.cc */; };
27175C0F261CE5F40045F3AC /* MkCollCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 27175C0E261CE5F40045F3AC /* MkCollCommand.cc */; };
Expand All @@ -30,7 +31,6 @@
2751C83225D3650A00A9B39B /* linenoise.c in Sources */ = {isa = PBXBuildFile; fileRef = 2751C81C25D3650A00A9B39B /* linenoise.c */; };
2751C83325D3650A00A9B39B /* utf8.h in Headers */ = {isa = PBXBuildFile; fileRef = 2751C81F25D3650A00A9B39B /* utf8.h */; };
276CE5C9225FAA1600B681AC /* TokenizerTest.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5C8225FAA1600B681AC /* TokenizerTest.cc */; };
276CE5CC225FAC8500B681AC /* LibC++Debug.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */; };
276CE5CF225FACDD00B681AC /* ArgumentTokenizer.cc in Sources */ = {isa = PBXBuildFile; fileRef = 27FC8DF722137C580083B033 /* ArgumentTokenizer.cc */; };
276CE5D1225FADB200B681AC /* tests_main.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5D0225FADB200B681AC /* tests_main.cc */; };
276D4AC527864A5500F61A89 /* MkIndexCommand.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276D4AC427864A5500F61A89 /* MkIndexCommand.cc */; };
Expand Down Expand Up @@ -58,7 +58,6 @@
27FC8E64221381C60083B033 /* liblinenoise.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E5A221381890083B033 /* liblinenoise.a */; };
27FC8E67221383880083B033 /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E66221383880083B033 /* libz.tbd */; };
27FC8E6A221383AE0083B033 /* libLiteCoreREST-static.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 27FC8E1A22137CB60083B033 /* libLiteCoreREST-static.a */; };
42030A6E24AC14F900283CE8 /* LibC++Debug.cc in Sources */ = {isa = PBXBuildFile; fileRef = 276CE5CB225FAC8500B681AC /* LibC++Debug.cc */; };
42030A7024AC152000283CE8 /* StringUtil.cc in Sources */ = {isa = PBXBuildFile; fileRef = 42030A6F24AC152000283CE8 /* StringUtil.cc */; };
/* End PBXBuildFile section */

Expand Down Expand Up @@ -208,6 +207,7 @@
/* End PBXCopyFilesBuildPhase section */

/* Begin PBXFileReference section */
1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = EnrichCommand.cc; sourceTree = "<group>"; };
2716F94C2491822700BE21D9 /* CheckCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CheckCommand.cc; sourceTree = "<group>"; };
2716F95A2491857E00BE21D9 /* ReindexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = ReindexCommand.cc; sourceTree = "<group>"; };
2716F9942493E59B00BE21D9 /* BUILDING.md */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = net.daringfireball.markdown; name = BUILDING.md; path = ../BUILDING.md; sourceTree = "<group>"; };
Expand Down Expand Up @@ -237,7 +237,6 @@
276CE5C0225FA9F400B681AC /* Tests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Tests; sourceTree = BUILT_PRODUCTS_DIR; };
276CE5C8225FAA1600B681AC /* TokenizerTest.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TokenizerTest.cc; sourceTree = "<group>"; };
276CE5CA225FAA4C00B681AC /* tests.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = tests.xcconfig; sourceTree = "<group>"; };
276CE5CB225FAC8500B681AC /* LibC++Debug.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = "LibC++Debug.cc"; sourceTree = "<group>"; };
276CE5D0225FADB200B681AC /* tests_main.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tests_main.cc; sourceTree = "<group>"; };
276D4AC427864A5500F61A89 /* MkIndexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = MkIndexCommand.cc; sourceTree = "<group>"; };
276D4AD22786502600F61A89 /* RmIndexCommand.cc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = RmIndexCommand.cc; sourceTree = "<group>"; };
Expand Down Expand Up @@ -361,7 +360,6 @@
isa = PBXGroup;
children = (
276CE5C8225FAA1600B681AC /* TokenizerTest.cc */,
276CE5CB225FAC8500B681AC /* LibC++Debug.cc */,
276CE5D0225FADB200B681AC /* tests_main.cc */,
);
name = tests;
Expand Down Expand Up @@ -422,6 +420,7 @@
27FC8DD322137C330083B033 /* CpCommand.cc */,
273B20D3264B2D6900A14EC4 /* EditCommand.cc */,
271BA4AD227CC54300D49D13 /* EncryptCommand.cc */,
1A02703D2C2645A60025F2B5 /* EnrichCommand.cc */,
27FC8DD622137C330083B033 /* InfoCommand.cc */,
27FC8DD222137C330083B033 /* ListCommand.cc */,
27E95BAC24083D7C0013711C /* ListCommand.hh */,
Expand Down Expand Up @@ -760,7 +759,6 @@
files = (
272F00CF226F7FE500E62F72 /* cbl-log.cc in Sources */,
272F00D7226F956C00E62F72 /* LogDecoder_stub.cpp in Sources */,
42030A6E24AC14F900283CE8 /* LibC++Debug.cc in Sources */,
272F00D0226F7FFC00E62F72 /* Tool.cc in Sources */,
42030A7024AC152000283CE8 /* StringUtil.cc in Sources */,
272F00D1226F800000E62F72 /* ArgumentTokenizer.cc in Sources */,
Expand All @@ -772,7 +770,6 @@
buildActionMask = 2147483647;
files = (
276CE5D1225FADB200B681AC /* tests_main.cc in Sources */,
276CE5CC225FAC8500B681AC /* LibC++Debug.cc in Sources */,
276CE5CF225FACDD00B681AC /* ArgumentTokenizer.cc in Sources */,
276CE5C9225FAA1600B681AC /* TokenizerTest.cc in Sources */,
);
Expand Down Expand Up @@ -800,6 +797,7 @@
27FC8DDD22137C330083B033 /* ListCommand.cc in Sources */,
27FC8DF222137C490083B033 /* Endpoint.cc in Sources */,
27FC8DE522137C330083B033 /* SQLCommand.cc in Sources */,
1A02703F2C2645A60025F2B5 /* EnrichCommand.cc in Sources */,
27175C27261D097A0045F3AC /* MvCommand.cc in Sources */,
27E95BAB2408376B0013711C /* CBLiteCommand.cc in Sources */,
27175C21261D00200045F3AC /* CdCommand.cc in Sources */,
Expand Down Expand Up @@ -855,6 +853,19 @@
isa = XCBuildConfiguration;
baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */;
buildSettings = {
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(FLEECE)/API",
"$(FLEECE)/Fleece/Support",
"$(LITECORE)/C/include",
"$(LITECORE)/C/Cpp_include",
"$(LITECORE)/C",
"$(LITECORE)/Networking",
"$(LITECORE)/Replicator",
"$(LITECORE)/LiteCore/Support",
"$(LITECORE)/REST",
"$(LITECORE)/Networking/HTTP",
);
};
name = Debug_EE;
};
Expand Down Expand Up @@ -947,6 +958,19 @@
isa = XCBuildConfiguration;
baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */;
buildSettings = {
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(FLEECE)/API",
"$(FLEECE)/Fleece/Support",
"$(LITECORE)/C/include",
"$(LITECORE)/C/Cpp_include",
"$(LITECORE)/C",
"$(LITECORE)/Networking",
"$(LITECORE)/Replicator",
"$(LITECORE)/LiteCore/Support",
"$(LITECORE)/REST",
"$(LITECORE)/Networking/HTTP",
);
};
name = Release_EE;
};
Expand Down Expand Up @@ -1174,13 +1198,39 @@
isa = XCBuildConfiguration;
baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */;
buildSettings = {
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(FLEECE)/API",
"$(FLEECE)/Fleece/Support",
"$(LITECORE)/C/include",
"$(LITECORE)/C/Cpp_include",
"$(LITECORE)/C",
"$(LITECORE)/Networking",
"$(LITECORE)/Replicator",
"$(LITECORE)/LiteCore/Support",
"$(LITECORE)/REST",
"$(LITECORE)/Networking/HTTP",
);
};
name = Debug;
};
27FC8DD022137BF60083B033 /* Release */ = {
isa = XCBuildConfiguration;
baseConfigurationReference = 27FC8E4122137DAF0083B033 /* cblite.xcconfig */;
buildSettings = {
HEADER_SEARCH_PATHS = (
"$(inherited)",
"$(FLEECE)/API",
"$(FLEECE)/Fleece/Support",
"$(LITECORE)/C/include",
"$(LITECORE)/C/Cpp_include",
"$(LITECORE)/C",
"$(LITECORE)/Networking",
"$(LITECORE)/Replicator",
"$(LITECORE)/LiteCore/Support",
"$(LITECORE)/REST",
"$(LITECORE)/Networking/HTTP",
);
};
name = Release;
};
Expand Down
1 change: 1 addition & 0 deletions cblite/CBLiteCommand.hh
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ CBLiteCommand* newCheckCommand(CBLiteTool&);
CBLiteCommand* newCompactCommand(CBLiteTool&);
CBLiteCommand* newCpCommand(CBLiteTool&);
CBLiteCommand* newEditCommand(CBLiteTool&);
CBLiteCommand* newEnrichCommand(CBLiteTool&);
CBLiteCommand* newExportCommand(CBLiteTool&);
CBLiteCommand* newImportCommand(CBLiteTool&);
CBLiteCommand* newInfoCommand(CBLiteTool&);
Expand Down
5 changes: 5 additions & 0 deletions cblite/CBLiteTool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ void CBLiteTool::openDatabase(string pathStr, bool interactive) {
C4Error err;
const C4Error kEncryptedDBError = {LiteCoreDomain, kC4ErrorNotADatabaseFile};

if (const char* extPath = getenv("CBLITE_EXTENSION_PATH")) {
c4_setExtensionPath(slice(extPath));
}

if (!_dbNeedsPassword) {
_db = c4db_openNamed(slice(dbName), &config, &err);
} else {
Expand Down Expand Up @@ -287,6 +291,7 @@ static constexpr struct {const char* name; ToolFactory factory;} kSubcommands[]
{"compact", newCompactCommand},
{"cp", newCpCommand},
{"edit", newEditCommand},
{"enrich", newEnrichCommand},
{"export", newExportCommand},
{"file", newInfoCommand},
{"get", newCatCommand},
Expand Down
1 change: 1 addition & 0 deletions cblite/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ target_include_directories(
${LITECORE}Networking/
${LITECORE}Networking/HTTP
${LITECORE}Replicator
${LITECORE}REST/
${LITECORE}vendor/fleece/API/
${LITECORE}vendor/fleece/Fleece/Support/ # PlatformCompat.hh
${CMAKE_BINARY_DIR}/generated_headers/
Expand Down
6 changes: 3 additions & 3 deletions cblite/CompactCommand.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class CompactCommand : public CBLiteCommand {
void runSubcommand() override {
// Read params:
processFlags({
{"--prune", [&]{_prune = stoi(nextArg("depth for --prune"));}},
{"--prune", [&]{_prune = nextIntArg("depth for --prune", 1);}},
{"--purgeDeleted", [&]{_purgeDeleted = true;}},
});

Expand Down Expand Up @@ -177,14 +177,14 @@ class CompactCommand : public CBLiteCommand {
if (c4doc_selectCommonAncestorRevision(doc, doc->selectedRev.revID, currentRevID))
branchPoint = doc->selectedRev.revID;
// First count the number of revs on the branch:
c4doc_selectRevision(doc, closedBranch, false, nullptr);
[[maybe_unused]] bool _ = c4doc_selectRevision(doc, closedBranch, false, nullptr);
do {
++nPrunedRevs;
if (doc->selectedRev.flags & kRevKeepBody)
++nRemovedBodies;
} while (c4doc_selectParentRevision(doc) && doc->selectedRev.revID != branchPoint);
// Then prune the entire branch:
c4doc_purgeRevision(doc, closedBranch, nullptr);
_ = c4doc_purgeRevision(doc, closedBranch, nullptr);
} else {
// Walk its ancestor chain, counting how many revs are deeper than maxDepth:
unsigned branchDepth = 1, keepBodyDepth = 0;
Expand Down
2 changes: 1 addition & 1 deletion cblite/DocBranchIterator.hh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public:
}

DocBranchIterator& operator++() {
c4doc_selectRevision(_doc, _branchID, false, nullptr);
[[maybe_unused]] bool _ = c4doc_selectRevision(_doc, _branchID, false, nullptr);
_branchID = fleece::nullslice;
while (c4doc_selectNextRevision(_doc)) {
if (_doc->selectedRev.flags & kRevLeaf) {
Expand Down
Loading

0 comments on commit fa63e1d

Please sign in to comment.