From a090167b192e2a1a790bb1aec99217a9df45b25a Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 4 Nov 2024 14:56:17 +0100 Subject: [PATCH 01/12] Lazy `TransitivePath` operation (#1595) This PR enables the `TransitivePath` operation to yield its result lazily and to consume its left/right child lazily. Note that the graph which is transitively traversed needs to be fully materialized due to the underlying algorithm. E.g., when computing the (large) result of `wdt:P31/wdt:P279*`, the large result and the `wdt:P31` can be dealt with lazily, but the full `wdt:P279` predicate needs to be materialized. --- src/engine/TransitivePathBase.cpp | 124 +++++----- src/engine/TransitivePathBase.h | 65 +++-- src/engine/TransitivePathImpl.h | 375 +++++++++++++++-------------- src/engine/idTable/IdTable.h | 9 +- test/TransitivePathTest.cpp | 381 +++++++++++++++++++++--------- test/util/IdTableHelpers.cpp | 12 + test/util/IdTableHelpers.h | 5 + 7 files changed, 591 insertions(+), 380 deletions(-) diff --git a/src/engine/TransitivePathBase.cpp b/src/engine/TransitivePathBase.cpp index 63899fdb28..a833bfdfbd 100644 --- a/src/engine/TransitivePathBase.cpp +++ b/src/engine/TransitivePathBase.cpp @@ -63,76 +63,80 @@ TransitivePathBase::decideDirection() { } // _____________________________________________________________________________ -void TransitivePathBase::fillTableWithHull(IdTable& table, const Map& hull, - std::vector& nodes, - size_t startSideCol, - size_t targetSideCol, - const IdTable& startSideTable, - size_t skipCol) const { - CALL_FIXED_SIZE((std::array{table.numColumns(), startSideTable.numColumns()}), - &TransitivePathBase::fillTableWithHullImpl, this, table, hull, - nodes, startSideCol, targetSideCol, startSideTable, skipCol); +Result::Generator TransitivePathBase::fillTableWithHull( + NodeGenerator hull, size_t startSideCol, size_t targetSideCol, + size_t skipCol, bool yieldOnce, size_t inputWidth) const { + return 
ad_utility::callFixedSize( + std::array{inputWidth, getResultWidth()}, + [&]() { + return fillTableWithHullImpl( + std::move(hull), startSideCol, targetSideCol, yieldOnce, skipCol); + }); } // _____________________________________________________________________________ -template -void TransitivePathBase::fillTableWithHullImpl( - IdTable& tableDyn, const Map& hull, std::vector& nodes, - size_t startSideCol, size_t targetSideCol, const IdTable& startSideTable, - size_t skipCol) const { - IdTableStatic table = std::move(tableDyn).toStatic(); - IdTableView startView = - startSideTable.asStaticView(); - - size_t rowIndex = 0; - for (size_t i = 0; i < nodes.size(); i++) { - Id node = nodes[i]; - auto it = hull.find(node); - if (it == hull.end()) { - continue; - } - - for (Id otherNode : it->second) { - table.emplace_back(); - table(rowIndex, startSideCol) = node; - table(rowIndex, targetSideCol) = otherNode; - - copyColumns(startView, table, i, rowIndex, skipCol); - - rowIndex++; - } - } - - tableDyn = std::move(table).toDynamic(); -} - -// _____________________________________________________________________________ -void TransitivePathBase::fillTableWithHull(IdTable& table, const Map& hull, - size_t startSideCol, - size_t targetSideCol) const { - CALL_FIXED_SIZE((std::array{table.numColumns()}), - &TransitivePathBase::fillTableWithHullImpl, this, table, hull, - startSideCol, targetSideCol); +Result::Generator TransitivePathBase::fillTableWithHull(NodeGenerator hull, + size_t startSideCol, + size_t targetSideCol, + bool yieldOnce) const { + return ad_utility::callFixedSize(getResultWidth(), [&]() { + return fillTableWithHullImpl<0, WIDTH>(std::move(hull), startSideCol, + targetSideCol, yieldOnce); + }); } // _____________________________________________________________________________ -template -void TransitivePathBase::fillTableWithHullImpl(IdTable& tableDyn, - const Map& hull, - size_t startSideCol, - size_t targetSideCol) const { - IdTableStatic table = 
std::move(tableDyn).toStatic(); - size_t rowIndex = 0; - for (auto const& [node, linkedNodes] : hull) { +template +Result::Generator TransitivePathBase::fillTableWithHullImpl( + NodeGenerator hull, size_t startSideCol, size_t targetSideCol, + bool yieldOnce, size_t skipCol) const { + ad_utility::Timer timer{ad_utility::Timer::Stopped}; + size_t outputRow = 0; + IdTableStatic table{getResultWidth(), allocator()}; + std::vector storedLocalVocabs; + for (auto& [node, linkedNodes, localVocab, idTable, inputRow] : hull) { + timer.cont(); + // As an optimization nodes without any linked nodes should not get yielded + // in the first place. + AD_CONTRACT_CHECK(!linkedNodes.empty()); + if (!yieldOnce) { + table.reserve(linkedNodes.size()); + } + std::optional> inputView = std::nullopt; + if (idTable != nullptr) { + inputView = idTable->template asStaticView(); + } for (Id linkedNode : linkedNodes) { table.emplace_back(); - table(rowIndex, startSideCol) = node; - table(rowIndex, targetSideCol) = linkedNode; + table(outputRow, startSideCol) = node; + table(outputRow, targetSideCol) = linkedNode; - rowIndex++; + if (inputView.has_value()) { + copyColumns(inputView.value(), table, + inputRow, outputRow, skipCol); + } + + outputRow++; } + + if (yieldOnce) { + storedLocalVocabs.emplace_back(std::move(localVocab)); + } else { + timer.stop(); + runtimeInfo().addDetail("IdTable fill time", timer.msecs()); + co_yield {std::move(table).toDynamic(), std::move(localVocab)}; + table = IdTableStatic{getResultWidth(), allocator()}; + outputRow = 0; + } + timer.stop(); + } + if (yieldOnce) { + timer.start(); + LocalVocab mergedVocab{}; + mergedVocab.mergeWith(storedLocalVocabs); + runtimeInfo().addDetail("IdTable fill time", timer.msecs()); + co_yield {std::move(table).toDynamic(), std::move(mergedVocab)}; } - tableDyn = std::move(table).toDynamic(); } // _____________________________________________________________________________ @@ -405,7 +409,7 @@ void 
TransitivePathBase::copyColumns(const IdTableView& inputTable, continue; } - outputTable(outputRow, outCol) = inputTable(inputRow, inCol); + outputTable.at(outputRow, outCol) = inputTable.at(inputRow, inCol); inCol++; outCol++; } diff --git a/src/engine/TransitivePathBase.h b/src/engine/TransitivePathBase.h index ce7c32ac3e..a223e06d95 100644 --- a/src/engine/TransitivePathBase.h +++ b/src/engine/TransitivePathBase.h @@ -69,6 +69,31 @@ using Map = std::unordered_map< Id, Set, HashId, std::equal_to, ad_utility::AllocatorWithLimit>>; +// Helper struct that allows a generator to yield a node and all its +// connected nodes (the `targets`), along with a local vocabulary and the row +// index of the node in the input table. The `IdTable` pointer might be null if +// the `Id` is not associated with a table. In this case the `row` value does +// not represent anything meaningful and should not be used. +struct NodeWithTargets { + Id node_; + Set targets_; + LocalVocab localVocab_; + const IdTable* idTable_; + size_t row_; + + // Explicit to prevent issues with co_yield and lifetime. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103909 for more info. + NodeWithTargets(Id node, Set targets, LocalVocab localVocab, + const IdTable* idTable, size_t row) + : node_{node}, + targets_{std::move(targets)}, + localVocab_{std::move(localVocab)}, + idTable_{idTable}, + row_{row} {} +}; + +using NodeGenerator = cppcoro::generator; + /** * @class TransitivePathBase * @brief A common base class for different implementations of the Transitive @@ -147,37 +172,36 @@ class TransitivePathBase : public Operation { * startSideTable to fill in the rest of the columns. * This function is called if the start side is bound and a variable. * - * @param table The result table which will be filled. - * @param hull The transitive hull. - * @param nodes The start nodes of the transitive hull. These need to be in - * the same order and amount as the starting side nodes in the startTable. 
+ * @param hull The transitive hull, represented by a generator that yields + * sets of connected nodes with some metadata. * @param startSideCol The column of the result table for the startSide of the * hull * @param targetSideCol The column of the result table for the targetSide of * the hull - * @param startSideTable An IdTable that holds other results. The other - * results will be transferred to the new result table. * @param skipCol This column contains the Ids of the start side in the * startSideTable and will be skipped. + * @param yieldOnce If true, the generator will yield only a single time. + * @param inputWidth The width of the input table that is referenced by the + * elements of `hull`. */ - void fillTableWithHull(IdTable& table, const Map& hull, - std::vector& nodes, size_t startSideCol, - size_t targetSideCol, const IdTable& startSideTable, - size_t skipCol) const; + Result::Generator fillTableWithHull(NodeGenerator hull, size_t startSideCol, + size_t targetSideCol, size_t skipCol, + bool yieldOnce, size_t inputWidth) const; /** * @brief Fill the given table with the transitive hull. * This function is called if the sides are unbound or ids. * - * @param table The result table which will be filled. * @param hull The transitive hull. * @param startSideCol The column of the result table for the startSide of the * hull * @param targetSideCol The column of the result table for the targetSide of * the hull + * @param yieldOnce If true, the generator will yield only a single time. 
*/ - void fillTableWithHull(IdTable& table, const Map& hull, size_t startSideCol, - size_t targetSideCol) const; + Result::Generator fillTableWithHull(NodeGenerator hull, size_t startSideCol, + size_t targetSideCol, + bool yieldOnce) const; // Copy the columns from the input table to the output table template @@ -204,16 +228,11 @@ class TransitivePathBase : public Operation { private: uint64_t getSizeEstimateBeforeLimit() override; - template - void fillTableWithHullImpl(IdTable& table, const Map& hull, - std::vector& nodes, size_t startSideCol, - size_t targetSideCol, - const IdTable& startSideTable, - size_t skipCol) const; - - template - void fillTableWithHullImpl(IdTable& table, const Map& hull, - size_t startSideCol, size_t targetSideCol) const; + template + Result::Generator fillTableWithHullImpl(NodeGenerator hull, + size_t startSideCol, + size_t targetSideCol, bool yieldOnce, + size_t skipCol = 0) const; public: size_t getCostEstimate() override; diff --git a/src/engine/TransitivePathImpl.h b/src/engine/TransitivePathImpl.h index 55ce45ba4d..407b63a298 100644 --- a/src/engine/TransitivePathImpl.h +++ b/src/engine/TransitivePathImpl.h @@ -11,6 +11,25 @@ #include "util/Exception.h" #include "util/Timer.h" +namespace detail { + +// Helper struct that allows to group a read-only view of a column of a table +// with a reference to the table itself and a local vocabulary (used to ensure +// the correct lifetime). +template +struct TableColumnWithVocab { + const IdTable* table_; + ColumnType column_; + LocalVocab vocab_; + + // Explicit to prevent issues with co_yield and lifetime. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103909 for more info. 
+ TableColumnWithVocab(const IdTable* table, ColumnType column, + LocalVocab vocab) + : table_{table}, column_{std::move(column)}, vocab_{std::move(vocab)} {}; +}; +}; // namespace detail + /** * @class TransitivePathImpl * @brief This class implements common functions for the concrete TransitivePath @@ -22,6 +41,9 @@ */ template class TransitivePathImpl : public TransitivePathBase { + using TableColumnWithVocab = + detail::TableColumnWithVocab>; + public: TransitivePathImpl(QueryExecutionContext* qec, std::shared_ptr child, @@ -36,100 +58,88 @@ class TransitivePathImpl : public TransitivePathBase { * it is a variable. The other IdTable contains the result * of the start side and will be used to get the start nodes. * - * @tparam RES_WIDTH Number of columns of the result table - * @tparam SUB_WIDTH Number of columns of the sub table - * @tparam SIDE_WIDTH Number of columns of the - * @param res The result table which will be filled in-place - * @param sub The IdTable for the sub result + * @param sub A shared pointer to the sub result. Needs to be kept alive for + * the lifetime of this generator. * @param startSide The start side for the transitive hull * @param targetSide The target side for the transitive hull - * @param startSideTable The IdTable of the startSide + * @param startSideResult The Result of the startSide + * @param yieldOnce If true, the generator will yield only a single time. 
*/ - template - void computeTransitivePathBound(IdTable* dynRes, const IdTable& dynSub, - const TransitivePathSide& startSide, - const TransitivePathSide& targetSide, - const IdTable& startSideTable) const { - auto timer = ad_utility::Timer(ad_utility::Timer::Stopped); - timer.start(); - - auto [edges, nodes] = setupMapAndNodes( - dynSub, startSide, targetSide, startSideTable); - - timer.stop(); - auto initTime = timer.msecs(); - timer.start(); - - Map hull(allocator()); - if (!targetSide.isVariable()) { - hull = transitiveHull(edges, nodes, std::get(targetSide.value_)); - } else { - hull = transitiveHull(edges, nodes, std::nullopt); + Result::Generator computeTransitivePathBound( + std::shared_ptr sub, const TransitivePathSide& startSide, + const TransitivePathSide& targetSide, + std::shared_ptr startSideResult, bool yieldOnce) const { + ad_utility::Timer timer{ad_utility::Timer::Started}; + + auto edges = setupEdgesMap(sub->idTable(), startSide, targetSide); + auto nodes = setupNodes(startSide, std::move(startSideResult)); + // Setup nodes returns a generator, so this time measurement won't include + // the time for each iteration, but every iteration step should have + // constant overhead, which should be safe to ignore. + runtimeInfo().addDetail("Initialization time", timer.msecs().count()); + + NodeGenerator hull = + transitiveHull(edges, sub->getCopyOfLocalVocab(), std::move(nodes), + targetSide.isVariable() + ? 
std::nullopt + : std::optional{std::get(targetSide.value_)}); + + auto result = fillTableWithHull( + std::move(hull), startSide.outputCol_, targetSide.outputCol_, + startSide.treeAndCol_.value().second, yieldOnce, + startSide.treeAndCol_.value().first->getResultWidth()); + + // Iterate over generator to prevent lifetime issues + for (auto& pair : result) { + co_yield pair; } - - timer.stop(); - auto hullTime = timer.msecs(); - timer.start(); - - fillTableWithHull(*dynRes, hull, nodes, startSide.outputCol_, - targetSide.outputCol_, startSideTable, - startSide.treeAndCol_.value().second); - - timer.stop(); - auto fillTime = timer.msecs(); - - auto& info = runtimeInfo(); - info.addDetail("Initialization time", initTime.count()); - info.addDetail("Hull time", hullTime.count()); - info.addDetail("IdTable fill time", fillTime.count()); }; /** * @brief Compute the transitive hull. * This function is called when no side is bound (or an id). * - * @tparam RES_WIDTH Number of columns of the result table - * @tparam SUB_WIDTH Number of columns of the sub table - * @param res The result table which will be filled in-place - * @param sub The IdTable for the sub result + * @param sub A shared pointer to the sub result. Needs to be kept alive for + * the lifetime of this generator. * @param startSide The start side for the transitive hull * @param targetSide The target side for the transitive hull + * @param yieldOnce If true, the generator will yield only a single time. 
*/ - template - void computeTransitivePath(IdTable* dynRes, const IdTable& dynSub, - const TransitivePathSide& startSide, - const TransitivePathSide& targetSide) const { - auto timer = ad_utility::Timer(ad_utility::Timer::Stopped); - timer.start(); + Result::Generator computeTransitivePath(std::shared_ptr sub, + const TransitivePathSide& startSide, + const TransitivePathSide& targetSide, + bool yieldOnce) const { + ad_utility::Timer timer{ad_utility::Timer::Started}; + + auto edges = setupEdgesMap(sub->idTable(), startSide, targetSide); + auto nodesWithDuplicates = + setupNodes(sub->idTable(), startSide, targetSide); + Set nodesWithoutDuplicates{allocator()}; + for (const auto& span : nodesWithDuplicates) { + nodesWithoutDuplicates.insert(span.begin(), span.end()); + } - auto [edges, nodes] = - setupMapAndNodes(dynSub, startSide, targetSide); + runtimeInfo().addDetail("Initialization time", timer.msecs()); - timer.stop(); - auto initTime = timer.msecs(); - timer.start(); + // Technically we should pass the localVocab of `sub` here, but this will + // just lead to a merge with itself later on in the pipeline. + detail::TableColumnWithVocab tableInfo{ + nullptr, nodesWithoutDuplicates, LocalVocab{}}; - Map hull{allocator()}; - if (!targetSide.isVariable()) { - hull = transitiveHull(edges, nodes, std::get(targetSide.value_)); - } else { - hull = transitiveHull(edges, nodes, std::nullopt); - } - - timer.stop(); - auto hullTime = timer.msecs(); - timer.start(); + NodeGenerator hull = transitiveHull( + edges, sub->getCopyOfLocalVocab(), std::span{&tableInfo, 1}, + targetSide.isVariable() + ? 
std::nullopt + : std::optional{std::get(targetSide.value_)}); - fillTableWithHull(*dynRes, hull, startSide.outputCol_, - targetSide.outputCol_); - timer.stop(); - auto fillTime = timer.msecs(); + auto result = fillTableWithHull(std::move(hull), startSide.outputCol_, + targetSide.outputCol_, yieldOnce); - auto& info = runtimeInfo(); - info.addDetail("Initialization time", initTime.count()); - info.addDetail("Hull time", hullTime.count()); - info.addDetail("IdTable fill time", fillTime.count()); + // Iterate over generator to prevent lifetime issues + for (auto& pair : result) { + co_yield pair; + } }; protected: @@ -142,7 +152,7 @@ class TransitivePathImpl : public TransitivePathBase { * * @return Result The result of the TransitivePath operation */ - ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override { + ProtoResult computeResult(bool requestLaziness) override { if (minDist_ == 0 && !isBoundOrId() && lhs_.isVariable() && rhs_.isVariable()) { AD_THROW( @@ -151,161 +161,170 @@ class TransitivePathImpl : public TransitivePathBase { "not supported"); } auto [startSide, targetSide] = decideDirection(); - std::shared_ptr subRes = subtree_->getResult(); - - IdTable idTable{allocator()}; - - idTable.setNumColumns(getResultWidth()); - - size_t subWidth = subRes->idTable().numColumns(); + // In order to traverse the graph represented by this result, we need random + // access across the whole table, so it doesn't make sense to lazily compute + // the result. 
+ std::shared_ptr subRes = subtree_->getResult(false); if (startSide.isBoundVariable()) { std::shared_ptr sideRes = - startSide.treeAndCol_.value().first->getResult(); - size_t sideWidth = sideRes->idTable().numColumns(); + startSide.treeAndCol_.value().first->getResult(true); - CALL_FIXED_SIZE((std::array{resultWidth_, subWidth, sideWidth}), - &TransitivePathImpl::computeTransitivePathBound, this, - &idTable, subRes->idTable(), startSide, targetSide, - sideRes->idTable()); + auto gen = + computeTransitivePathBound(std::move(subRes), startSide, targetSide, + std::move(sideRes), !requestLaziness); - return {std::move(idTable), resultSortedOn(), - Result::getMergedLocalVocab(*sideRes, *subRes)}; + return requestLaziness + ? ProtoResult{std::move(gen), resultSortedOn()} + : ProtoResult{cppcoro::getSingleElement(std::move(gen)), + resultSortedOn()}; } - CALL_FIXED_SIZE((std::array{resultWidth_, subWidth}), - &TransitivePathImpl::computeTransitivePath, this, - &idTable, subRes->idTable(), startSide, targetSide); - - // NOTE: The only place, where the input to a transitive path operation is - // not an index scan (which has an empty local vocabulary by default) is the - // `LocalVocabTest`. But it doesn't harm to propagate the local vocab here - // either. - return {std::move(idTable), resultSortedOn(), - subRes->getSharedLocalVocab()}; - }; + auto gen = computeTransitivePath(std::move(subRes), startSide, targetSide, + !requestLaziness); + return requestLaziness + ? ProtoResult{std::move(gen), resultSortedOn()} + : ProtoResult{cppcoro::getSingleElement(std::move(gen)), + resultSortedOn()}; + } /** - * @brief Compute the transitive hull starting at the given nodes, - * using the given Map. - * - * @param edges Adjacency lists, mapping Ids (nodes) to their connected + * @brief Depth-first search to find connected nodes in the graph. + * @param edges The adjacency lists, mapping Ids (nodes) to their connected * Ids. - * @param nodes A list of Ids. 
These Ids are used as starting points for the - * transitive hull. Thus, this parameter guides the performance of this - * algorithm. - * @param target Optional target Id. If supplied, only paths which end - * in this Id are added to the hull. - * @return Map Maps each Id to its connected Ids in the transitive hull + * @param startNode The node to start the search from. + * @param target Optional target Id. If supplied, only paths which end in this + * Id are added to the result. + * @return A set of connected nodes in the graph. */ - Map transitiveHull(const T& edges, const std::vector& startNodes, - std::optional target) const { - // For every node do a dfs on the graph - Map hull{allocator()}; - + Set findConnectedNodes(const T& edges, Id startNode, + const std::optional& target) const { std::vector> stack; ad_utility::HashSetWithMemoryLimit marks{ getExecutionContext()->getAllocator()}; - for (auto startNode : startNodes) { - if (hull.contains(startNode)) { - // We have already computed the hull for this node - continue; - } + Set connectedNodes{getExecutionContext()->getAllocator()}; + stack.emplace_back(startNode, 0); - marks.clear(); - stack.clear(); - stack.push_back({startNode, 0}); + if (minDist_ == 0 && (!target.has_value() || startNode == target.value())) { + connectedNodes.insert(startNode); + } - if (minDist_ == 0 && - (!target.has_value() || startNode == target.value())) { - insertIntoMap(hull, startNode, startNode); - } + while (!stack.empty()) { + checkCancellation(); + auto [node, steps] = stack.back(); + stack.pop_back(); - while (!stack.empty()) { - checkCancellation(); - auto [node, steps] = stack.back(); - stack.pop_back(); - - if (steps <= maxDist_ && marks.count(node) == 0) { - if (steps >= minDist_) { - marks.insert(node); - if (!target.has_value() || node == target.value()) { - insertIntoMap(hull, startNode, node); - } + if (steps <= maxDist_ && marks.count(node) == 0) { + if (steps >= minDist_) { + marks.insert(node); + if 
(!target.has_value() || node == target.value()) { + connectedNodes.insert(node); } + } - const auto& successors = edges.successors(node); - for (auto successor : successors) { - stack.push_back({successor, steps + 1}); - } + const auto& successors = edges.successors(node); + for (auto successor : successors) { + stack.emplace_back(successor, steps + 1); + } + } + } + return connectedNodes; + } + + /** + * @brief Compute the transitive hull starting at the given nodes, + * using the given Map. + * + * @param edges Adjacency lists, mapping Ids (nodes) to their connected + * Ids. + * @param startNodes A range that yields an instantiation of + * `TableColumnWithVocab` that can be consumed to create a transitive hull. + * @param target Optional target Id. If supplied, only paths which end + * in this Id are added to the hull. + * @return Map Maps each Id to its connected Ids in the transitive hull + */ + NodeGenerator transitiveHull(const T& edges, LocalVocab edgesVocab, + std::ranges::range auto startNodes, + std::optional target) const { + ad_utility::Timer timer{ad_utility::Timer::Stopped}; + for (auto&& tableColumn : startNodes) { + timer.cont(); + LocalVocab mergedVocab = std::move(tableColumn.vocab_); + mergedVocab.mergeWith(std::span{&edgesVocab, 1}); + size_t currentRow = 0; + for (Id startNode : tableColumn.column_) { + Set connectedNodes = findConnectedNodes(edges, startNode, target); + if (!connectedNodes.empty()) { + runtimeInfo().addDetail("Hull time", timer.msecs()); + timer.stop(); + co_yield NodeWithTargets{startNode, std::move(connectedNodes), + mergedVocab.clone(), tableColumn.table_, + currentRow}; + timer.cont(); } + currentRow++; } + timer.stop(); } - return hull; } /** * @brief Prepare a Map and a nodes vector for the transitive hull * computation. 
* - * @tparam SUB_WIDTH Number of columns of the sub table * @param sub The sub table result * @param startSide The TransitivePathSide where the edges start * @param targetSide The TransitivePathSide where the edges end - * @return std::pair> A Map and Id vector (nodes) for the - * transitive hull computation + * @return std::vector> A vector of spans of nodes for + * the transitive hull computation */ - template - std::pair> setupMapAndNodes( + std::vector> setupNodes( const IdTable& sub, const TransitivePathSide& startSide, const TransitivePathSide& targetSide) const { - std::vector nodes; - auto edges = setupEdgesMap(sub, startSide, targetSide); + std::vector> result; // id -> var|id if (!startSide.isVariable()) { - nodes.push_back(std::get(startSide.value_)); + result.emplace_back(&std::get(startSide.value_), 1); // var -> var } else { std::span startNodes = sub.getColumn(startSide.subCol_); - // TODO Use ranges::to. - nodes.insert(nodes.end(), startNodes.begin(), startNodes.end()); + result.emplace_back(startNodes); if (minDist_ == 0) { std::span targetNodes = sub.getColumn(targetSide.subCol_); - nodes.insert(nodes.end(), targetNodes.begin(), targetNodes.end()); + result.emplace_back(targetNodes); } } - return {std::move(edges), std::move(nodes)}; + return result; }; /** * @brief Prepare a Map and a nodes vector for the transitive hull * computation. 
* - * @tparam SUB_WIDTH Number of columns of the sub table - * @tparam SIDE_WIDTH Number of columns of the startSideTable - * @param sub The sub table result * @param startSide The TransitivePathSide where the edges start - * @param targetSide The TransitivePathSide where the edges end * @param startSideTable An IdTable containing the Ids for the startSide - * @return std::pair> A Map and Id vector (nodes) for the - * transitive hull computation + * @return cppcoro::generator A generator for + * the transitive hull computation */ - template - std::pair> setupMapAndNodes( - const IdTable& sub, const TransitivePathSide& startSide, - const TransitivePathSide& targetSide, - const IdTable& startSideTable) const { - std::vector nodes; - auto edges = setupEdgesMap(sub, startSide, targetSide); - - // Bound -> var|id - std::span startNodes = - startSideTable.getColumn(startSide.treeAndCol_.value().second); - // TODO Use ranges::to. - nodes.insert(nodes.end(), startNodes.begin(), startNodes.end()); - - return {std::move(edges), std::move(nodes)}; + cppcoro::generator setupNodes( + const TransitivePathSide& startSide, + std::shared_ptr startSideResult) const { + if (startSideResult->isFullyMaterialized()) { + // Bound -> var|id + std::span startNodes = startSideResult->idTable().getColumn( + startSide.treeAndCol_.value().second); + co_yield TableColumnWithVocab{&startSideResult->idTable(), startNodes, + startSideResult->getCopyOfLocalVocab()}; + } else { + for (auto& [idTable, localVocab] : startSideResult->idTables()) { + // Bound -> var|id + std::span startNodes = + idTable.getColumn(startSide.treeAndCol_.value().second); + co_yield TableColumnWithVocab{&idTable, startNodes, + std::move(localVocab)}; + } + } }; virtual T setupEdgesMap(const IdTable& dynSub, diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index c615e8350c..c76ee1b9d6 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -330,9 +330,16 @@ class IdTable { T& 
at(size_t row, size_t column) requires(!isView) { return data().at(column).at(row); } - const T& at(size_t row, size_t column) const { + // TODO Remove overload for `isView` and drop requires clause. + const T& at(size_t row, size_t column) const requires(!isView) { return data().at(column).at(row); } + // `std::span::at` is a C++26 feature, so we have to implement it ourselves. + const T& at(size_t row, size_t column) const requires(isView) { + const auto& col = data().at(column); + AD_CONTRACT_CHECK(row < col.size()); + return col[row]; + } // Get a reference to the `i`-th row. The returned proxy objects can be // implicitly and trivially converted to `row_reference`. For the design diff --git a/test/TransitivePathTest.cpp b/test/TransitivePathTest.cpp index e616ad2e2b..2e6da1855b 100644 --- a/test/TransitivePathTest.cpp +++ b/test/TransitivePathTest.cpp @@ -4,7 +4,6 @@ // Johannes Herrmann (johannes.r.herrmann(at)gmail.com) #include -#include #include #include @@ -14,7 +13,6 @@ #include "engine/QueryExecutionTree.h" #include "engine/TransitivePathBase.h" #include "engine/ValuesForTesting.h" -#include "gtest/gtest.h" #include "util/GTestHelpers.h" #include "util/IdTableHelpers.h" #include "util/IndexTestHelpers.h" @@ -26,13 +24,17 @@ using Vars = std::vector>; } // namespace -class TransitivePathTest : public testing::TestWithParam { +// The first bool indicates if binary search should be used (true) or hash map +// based search (false). The second bool indicates if the result should be +// requested lazily. 
+class TransitivePathTest + : public testing::TestWithParam> { public: [[nodiscard]] static std::pair, QueryExecutionContext*> makePath(IdTable input, Vars vars, TransitivePathSide left, TransitivePathSide right, size_t minDist, size_t maxDist) { - bool useBinSearch = GetParam(); + bool useBinSearch = std::get<0>(GetParam()); auto qec = getQec(); auto subtree = ad_utility::makeExecutionTree( qec, std::move(input), vars); @@ -42,6 +44,7 @@ class TransitivePathTest : public testing::TestWithParam { qec}; } + // ___________________________________________________________________________ [[nodiscard]] static std::shared_ptr makePathUnbound( IdTable input, Vars vars, TransitivePathSide left, TransitivePathSide right, size_t minDist, size_t maxDist) { @@ -50,29 +53,75 @@ class TransitivePathTest : public testing::TestWithParam { return T; } - [[nodiscard]] static std::shared_ptr makePathLeftBound( - IdTable input, Vars vars, IdTable sideTable, size_t sideTableCol, - Vars sideVars, TransitivePathSide left, TransitivePathSide right, - size_t minDist, size_t maxDist) { + // Create bound transitive path with a side table that is either a single + // table or multiple ones. + [[nodiscard]] static std::shared_ptr makePathBound( + bool isLeft, IdTable input, Vars vars, + std::variant> sideTable, + size_t sideTableCol, Vars sideVars, TransitivePathSide left, + TransitivePathSide right, size_t minDist, size_t maxDist, + bool forceFullyMaterialized = false) { auto [T, qec] = makePath(std::move(input), vars, std::move(left), std::move(right), minDist, maxDist); - auto leftOp = ad_utility::makeExecutionTree( - qec, std::move(sideTable), sideVars); - return T->bindLeftSide(leftOp, sideTableCol); + auto operation = + std::holds_alternative(sideTable) + ? 
ad_utility::makeExecutionTree( + qec, std::move(std::get(sideTable)), sideVars, false, + std::vector{sideTableCol}, LocalVocab{}, + std::nullopt, forceFullyMaterialized) + : ad_utility::makeExecutionTree( + qec, std::move(std::get>(sideTable)), + sideVars, false, std::vector{sideTableCol}); + return isLeft ? T->bindLeftSide(operation, sideTableCol) + : T->bindRightSide(operation, sideTableCol); } - [[nodiscard]] static std::shared_ptr makePathRightBound( - IdTable input, Vars vars, IdTable sideTable, size_t sideTableCol, - Vars sideVars, TransitivePathSide left, TransitivePathSide right, - size_t minDist, size_t maxDist) { - auto [T, qec] = makePath(std::move(input), vars, std::move(left), - std::move(right), minDist, maxDist); - auto rightOp = ad_utility::makeExecutionTree( - qec, std::move(sideTable), sideVars); - return T->bindRightSide(rightOp, sideTableCol); + // ___________________________________________________________________________ + static std::vector split(const IdTable& idTable) { + std::vector result; + for (const auto& row : idTable) { + result.emplace_back(idTable.numColumns(), idTable.getAllocator()); + result.back().push_back(row); + } + return result; + } + + // ___________________________________________________________________________ + static bool requestLaziness() { return std::get<1>(GetParam()); } + + // ___________________________________________________________________________ + void assertResultMatchesIdTable(const Result& result, const IdTable& expected, + ad_utility::source_location loc = + ad_utility::source_location::current()) { + auto t = generateLocationTrace(loc); + using ::testing::UnorderedElementsAreArray; + ASSERT_NE(result.isFullyMaterialized(), requestLaziness()); + if (requestLaziness()) { + const auto& [idTable, localVocab] = + aggregateTables(std::move(result.idTables()), expected.numColumns()); + EXPECT_THAT(idTable, UnorderedElementsAreArray(expected)); + } else { + EXPECT_THAT(result.idTable(), 
UnorderedElementsAreArray(expected)); + } + } + + // Call testCase three times with differing arguments. This is used to test + // scenarios where the same input table is delivered in different splits + // either wrapped within a generator or as a single table. + static void runTestWithForcedSideTableScenarios( + const std::invocable>, + bool> auto& testCase, + IdTable idTable, + ad_utility::source_location loc = + ad_utility::source_location::current()) { + auto trace = generateLocationTrace(loc); + testCase(idTable.clone(), false); + testCase(split(idTable), false); + testCase(idTable.clone(), true); } }; +// _____________________________________________________________________________ TEST_P(TransitivePathTest, idToId) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}}); @@ -84,11 +133,11 @@ TEST_P(TransitivePathTest, idToId) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, idToVar) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}}); @@ -100,11 +149,11 @@ TEST_P(TransitivePathTest, idToVar) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, 
varToId) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}}); @@ -120,11 +169,11 @@ TEST_P(TransitivePathTest, varToId) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, idToVarMinLengthZero) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}}); @@ -136,11 +185,11 @@ TEST_P(TransitivePathTest, idToVarMinLengthZero) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 0, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, varToIdMinLengthZero) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}}); @@ -157,11 +206,11 @@ TEST_P(TransitivePathTest, varToIdMinLengthZero) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 0, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, varTovar) { auto sub = 
makeIdTableFromVector({ {0, 1}, @@ -185,11 +234,11 @@ TEST_P(TransitivePathTest, varTovar) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, unlimitedMaxLength) { auto sub = makeIdTableFromVector({{0, 2}, {2, 4}, @@ -225,11 +274,11 @@ TEST_P(TransitivePathTest, unlimitedMaxLength) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, std::numeric_limits::max()); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, idToLeftBound) { auto sub = makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 3}, {3, 4}}); @@ -247,29 +296,33 @@ TEST_P(TransitivePathTest, idToLeftBound) { TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); TransitivePathSide right(std::nullopt, 1, V(4), 1); - { - auto T = makePathLeftBound( - sub.clone(), {Variable{"?start"}, Variable{"?target"}}, - leftOpTable.clone(), 1, {Variable{"?x"}, Variable{"?start"}}, left, - right, 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); - } - { - auto T = makePathLeftBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(leftOpTable), 1, 
{std::nullopt, Variable{"?start"}}, - std::move(left), std::move(right), 0, - std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); - } + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + true, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 1, {Variable{"?x"}, Variable{"?start"}}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + leftOpTable.clone()); + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + true, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 1, {std::nullopt, Variable{"?start"}}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + std::move(leftOpTable)); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, idToRightBound) { auto sub = makeIdTableFromVector({ {0, 1}, @@ -293,29 +346,33 @@ TEST_P(TransitivePathTest, idToRightBound) { TransitivePathSide left(std::nullopt, 0, V(0), 0); TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); - { - auto T = makePathRightBound( - sub.clone(), {Variable{"?start"}, Variable{"?target"}}, - rightOpTable.clone(), 0, {Variable{"?target"}, Variable{"?x"}}, left, - right, 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); - } - { - auto T = makePathRightBound( - std::move(sub), 
{Variable{"?start"}, Variable{"?target"}}, - std::move(rightOpTable), 0, {Variable{"?target"}, std::nullopt}, - std::move(left), std::move(right), 0, - std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); - } + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + false, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 0, {Variable{"?target"}, Variable{"?x"}}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + rightOpTable.clone()); + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + false, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 0, {Variable{"?target"}, std::nullopt}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + std::move(rightOpTable)); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, leftBoundToVar) { auto sub = makeIdTableFromVector({ {1, 2}, @@ -344,19 +401,21 @@ TEST_P(TransitivePathTest, leftBoundToVar) { TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); - { - auto T = makePathLeftBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(leftOpTable), 1, {Variable{"?x"}, Variable{"?start"}}, - std::move(left), std::move(right), 0, - std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), 
- ::testing::UnorderedElementsAreArray(expected)); - } + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + true, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 1, {Variable{"?x"}, Variable{"?start"}}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + std::move(leftOpTable)); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, rightBoundToVar) { auto sub = makeIdTableFromVector({ {1, 2}, @@ -385,16 +444,98 @@ TEST_P(TransitivePathTest, rightBoundToVar) { TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); - auto T = makePathRightBound( - std::move(sub), {Variable{"?start"}, Variable{"?target"}}, - std::move(rightOpTable), 0, {Variable{"?target"}, Variable{"?x"}}, - std::move(left), std::move(right), 0, std::numeric_limits::max()); - - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + runTestWithForcedSideTableScenarios( + [&](auto tableVariant, bool forceFullyMaterialized) { + auto T = makePathBound( + false, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + std::move(tableVariant), 0, {Variable{"?target"}, Variable{"?x"}}, + left, right, 0, std::numeric_limits::max(), + forceFullyMaterialized); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); + }, + std::move(rightOpTable)); +} + +// _____________________________________________________________________________ +TEST_P(TransitivePathTest, startNodesWithNoMatchesRightBound) { + auto sub = makeIdTableFromVector({ + {1, 2}, + {3, 4}, + }); + + auto 
rightOpTable = makeIdTableFromVector({ + {2, 5}, + {3, 6}, + {4, 7}, + }); + + auto expected = makeIdTableFromVector({ + {1, 2, 5}, + {3, 4, 7}, + }); + + TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); + TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); + auto T = makePathBound( + false, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + split(rightOpTable), 0, {Variable{"?target"}, Variable{"?x"}}, left, + right, 1, std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); +} + +// _____________________________________________________________________________ +TEST_P(TransitivePathTest, emptySideTable) { + auto sub = makeIdTableFromVector({ + {1, 2}, + {3, 4}, + }); + + auto expected = makeIdTableFromVector({}); + + TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); + TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); + auto T = makePathBound(true, sub.clone(), + {Variable{"?start"}, Variable{"?target"}}, + std::vector{}, 0, {Variable{"?start"}}, left, + right, 0, std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); +} + +// _____________________________________________________________________________ +TEST_P(TransitivePathTest, startNodesWithNoMatchesLeftBound) { + auto sub = makeIdTableFromVector({ + {1, 2}, + {3, 4}, + }); + + auto leftOpTable = makeIdTableFromVector({ + {2, 5}, + {3, 6}, + {4, 7}, + }); + + auto expected = makeIdTableFromVector({ + {3, 4, 6}, + }); + + TransitivePathSide left(std::nullopt, 0, Variable{"?start"}, 0); + TransitivePathSide right(std::nullopt, 1, Variable{"?target"}, 1); + auto T = makePathBound( + true, sub.clone(), {Variable{"?start"}, Variable{"?target"}}, + split(leftOpTable), 0, {Variable{"?start"}, Variable{"?x"}}, left, right, + 1, 
std::numeric_limits::max()); + + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, maxLength2FromVariable) { auto sub = makeIdTableFromVector({ {0, 2}, @@ -426,11 +567,11 @@ TEST_P(TransitivePathTest, maxLength2FromVariable) { auto T = makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, 2); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, maxLength2FromId) { auto sub = makeIdTableFromVector({ {0, 2}, @@ -454,11 +595,11 @@ TEST_P(TransitivePathTest, maxLength2FromId) { auto T = makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, 2); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, maxLength2ToId) { auto sub = makeIdTableFromVector({ {0, 2}, @@ -481,11 +622,11 @@ TEST_P(TransitivePathTest, maxLength2ToId) { auto T = makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 1, 2); - auto resultTable = T->computeResultOnlyForTesting(); - ASSERT_THAT(resultTable.idTable(), - ::testing::UnorderedElementsAreArray(expected)); + auto resultTable = T->computeResultOnlyForTesting(requestLaziness()); + assertResultMatchesIdTable(resultTable, 
expected); } +// _____________________________________________________________________________ TEST_P(TransitivePathTest, zeroLengthException) { auto sub = makeIdTableFromVector({ {0, 2}, @@ -504,15 +645,19 @@ TEST_P(TransitivePathTest, zeroLengthException) { makePathUnbound(std::move(sub), {Variable{"?start"}, Variable{"?target"}}, left, right, 0, std::numeric_limits::max()); AD_EXPECT_THROW_WITH_MESSAGE( - T->computeResultOnlyForTesting(), + T->computeResultOnlyForTesting(requestLaziness()), ::testing::ContainsRegex("This query might have to evaluate the empty " "path, which is currently " "not supported")); } -INSTANTIATE_TEST_SUITE_P(TransitivePathTestSuite, TransitivePathTest, - testing::Bool(), - [](const testing::TestParamInfo& info) { - return info.param ? "TransitivePathBinSearch" - : "TransitivePathHashMap"; - }); +// _____________________________________________________________________________ +INSTANTIATE_TEST_SUITE_P( + TransitivePathTestSuite, TransitivePathTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool()), + [](const testing::TestParamInfo>& info) { + std::string result = std::get<0>(info.param) ? "TransitivePathBinSearch" + : "TransitivePathHashMap"; + result += std::get<1>(info.param) ? 
"Lazy" : "FullyMaterialized"; + return result; + }); diff --git a/test/util/IdTableHelpers.cpp b/test/util/IdTableHelpers.cpp index d643476256..0b3b0a6a2e 100644 --- a/test/util/IdTableHelpers.cpp +++ b/test/util/IdTableHelpers.cpp @@ -248,3 +248,15 @@ std::shared_ptr idTableToExecutionTree( return ad_utility::makeExecutionTree(qec, input.clone(), std::move(vars)); } + +// _____________________________________________________________________________ +std::pair> aggregateTables( + Result::Generator generator, size_t numColumns) { + IdTable aggregateTable{numColumns, ad_utility::makeUnlimitedAllocator()}; + std::vector localVocabs; + for (auto& [idTable, localVocab] : generator) { + localVocabs.emplace_back(std::move(localVocab)); + aggregateTable.insertAtEnd(idTable); + } + return {std::move(aggregateTable), std::move(localVocabs)}; +} diff --git a/test/util/IdTableHelpers.h b/test/util/IdTableHelpers.h index 474c0dfd03..bc7035cd2f 100644 --- a/test/util/IdTableHelpers.h +++ b/test/util/IdTableHelpers.h @@ -256,3 +256,8 @@ IdTable createRandomlyFilledIdTable( /// and filling it with dummy variables. std::shared_ptr idTableToExecutionTree( QueryExecutionContext*, const IdTable&); + +// Fully consume a given generator and store it in an `IdTable` and store the +// local vocabs in a vector. +std::pair> aggregateTables( + Result::Generator generator, size_t numColumns); From e5284804e27e5f6583a7dc0990947497404b8720 Mon Sep 17 00:00:00 2001 From: Joe Date: Mon, 4 Nov 2024 14:58:51 +0100 Subject: [PATCH 02/12] Add a numPathsPerTarget parameter to PathSearch (#1596) When this parameter is set, the `PathSearch` service limits the number of paths per `[source, target]` pair. This makes it possible to use the path search for cases where enumerating all paths would exhaust the available time and memory constraints. 
--- docs/path_search.md | 39 ++++++++- src/engine/PathSearch.cpp | 43 ++++++---- src/engine/PathSearch.h | 7 +- src/parser/GraphPatternOperation.cpp | 9 +- src/parser/GraphPatternOperation.h | 1 + test/PathSearchTest.cpp | 33 ++++++++ test/QueryPlannerTest.cpp | 120 +++++++++++++++++++++++++++ 7 files changed, 233 insertions(+), 19 deletions(-) diff --git a/docs/path_search.md b/docs/path_search.md index 10ae4e0f51..6c9d161377 100644 --- a/docs/path_search.md +++ b/docs/path_search.md @@ -48,6 +48,10 @@ SELECT ?start ?end ?path ?edge WHERE { **one target**. Sources and targets are paired based on their index (i.e. the paths from the first source to the first target are searched, then the second source and target, and so on). +- **pathSearch:numPathsPerTarget** (optional): The path search will only search and store paths, + if the number of found paths is lower or equal to the value of the parameter. Expects an integer. + Example: if the value is 5, then the search will enumerate all paths until 5 paths have been found. + Other paths will be ignored. ### Example 1: Single Source and Target @@ -170,7 +174,7 @@ SELECT ?start ?end ?path ?edge WHERE { } ``` -This is esecially useful for [N-ary relations](https://www.w3.org/TR/swbp-n-aryRelations/). +This is especially useful for [N-ary relations](https://www.w3.org/TR/swbp-n-aryRelations/). Considering the example above, it is possible to query additional relations of `?middle`: ```sparql @@ -255,6 +259,39 @@ SELECT ?start ?end ?path ?edge WHERE { } ``` +### Example 5: Limit Number of Paths per Target + +It is possible to limit how many paths per target are returned. This is especially useful if +the query uses a lot of memory. In that case, it is possible to query a limited number of +paths to debug where the problem is. + +The following query for example will only return one path per source and target pair. +I.e. one path for `(, )`, one path for `(, )` and so on. 
+ +```sparql +PREFIX pathSearch: + +SELECT ?start ?end ?path ?edge WHERE { + SERVICE pathSearch: { + _:path pathSearch:algorithm pathSearch:allPaths ; + pathSearch:source ; + pathSearch:source ; + pathSearch:target ; + pathSearch:target ; + pathSearch:pathColumn ?path ; + pathSearch:edgeColumn ?edge ; + pathSearch:start ?start ; + pathSearch:end ?end ; + pathSearch:numPathsPerTarget 1; + { + SELECT * WHERE { + ?start ?end. + } + } + } +} +``` + ## Error Handling The Path Search feature will throw errors in the following scenarios: diff --git a/src/engine/PathSearch.cpp b/src/engine/PathSearch.cpp index 50f10210a6..9291fba19a 100644 --- a/src/engine/PathSearch.cpp +++ b/src/engine/PathSearch.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -262,7 +263,8 @@ Result PathSearch::computeResult([[maybe_unused]] bool requestLaziness) { allSources = binSearch.getSources(); sources = allSources; } - paths = allPaths(sources, targets, binSearch, config_.cartesian_); + paths = allPaths(sources, targets, binSearch, config_.cartesian_, + config_.numPathsPerTarget_); timer.stop(); auto searchTime = timer.msecs(); @@ -326,15 +328,16 @@ PathSearch::handleSearchSides() const { } // _____________________________________________________________________________ -PathsLimited PathSearch::findPaths(const Id& source, - const std::unordered_set& targets, - const BinSearchWrapper& binSearch) const { +PathsLimited PathSearch::findPaths( + const Id& source, const std::unordered_set& targets, + const BinSearchWrapper& binSearch, + std::optional numPathsPerTarget) const { std::vector edgeStack; Path currentPath{EdgesLimited(allocator())}; std::unordered_map< - uint64_t, PathsLimited, std::hash, std::equal_to, - ad_utility::AllocatorWithLimit>> - pathCache{allocator()}; + uint64_t, uint64_t, std::hash, std::equal_to, + ad_utility::AllocatorWithLimit>> + numPathsPerNode{allocator()}; PathsLimited result{allocator()}; std::unordered_set, std::equal_to, 
ad_utility::AllocatorWithLimit> @@ -357,9 +360,18 @@ PathsLimited PathSearch::findPaths(const Id& source, currentPath.pop_back(); } + auto edgeEnd = edge.end_.getBits(); + if (numPathsPerTarget) { + auto numPaths = ++numPathsPerNode[edgeEnd]; + + if (numPaths > numPathsPerTarget) { + continue; + } + } + currentPath.push_back(edge); - if (targets.empty() || targets.contains(edge.end_.getBits())) { + if (targets.empty() || targets.contains(edgeEnd)) { result.push_back(currentPath); } @@ -374,10 +386,10 @@ PathsLimited PathSearch::findPaths(const Id& source, } // _____________________________________________________________________________ -PathsLimited PathSearch::allPaths(std::span sources, - std::span targets, - const BinSearchWrapper& binSearch, - bool cartesian) const { +PathsLimited PathSearch::allPaths( + std::span sources, std::span targets, + const BinSearchWrapper& binSearch, bool cartesian, + std::optional numPathsPerTarget) const { PathsLimited paths{allocator()}; Path path{EdgesLimited(allocator())}; @@ -387,14 +399,15 @@ PathsLimited PathSearch::allPaths(std::span sources, targetSet.insert(target.getBits()); } for (auto source : sources) { - for (const auto& path : findPaths(source, targetSet, binSearch)) { + for (const auto& path : + findPaths(source, targetSet, binSearch, numPathsPerTarget)) { paths.push_back(path); } } } else { for (size_t i = 0; i < sources.size(); i++) { - for (const auto& path : - findPaths(sources[i], {targets[i].getBits()}, binSearch)) { + for (const auto& path : findPaths(sources[i], {targets[i].getBits()}, + binSearch, numPathsPerTarget)) { paths.push_back(path); } } diff --git a/src/engine/PathSearch.h b/src/engine/PathSearch.h index 9e330d1d4e..b42f277eb7 100644 --- a/src/engine/PathSearch.h +++ b/src/engine/PathSearch.h @@ -98,6 +98,7 @@ struct PathSearchConfiguration { Variable edgeColumn_; std::vector edgeProperties_; bool cartesian_ = true; + std::optional numPathsPerTarget_ = std::nullopt; bool sourceIsVariable() const { 
return std::holds_alternative(sources_); @@ -260,7 +261,8 @@ class PathSearch : public Operation { */ pathSearch::PathsLimited findPaths( const Id& source, const std::unordered_set& targets, - const pathSearch::BinSearchWrapper& binSearch) const; + const pathSearch::BinSearchWrapper& binSearch, + std::optional numPathsPerTarget) const; /** * @brief Finds all paths in the graph. @@ -268,7 +270,8 @@ class PathSearch : public Operation { */ pathSearch::PathsLimited allPaths( std::span sources, std::span targets, - const pathSearch::BinSearchWrapper& binSearch, bool cartesian) const; + const pathSearch::BinSearchWrapper& binSearch, bool cartesian, + std::optional numPathsPerTarget) const; /** * @brief Converts paths to a result table with a specified width. diff --git a/src/parser/GraphPatternOperation.cpp b/src/parser/GraphPatternOperation.cpp index efceda159c..90356bb6fa 100644 --- a/src/parser/GraphPatternOperation.cpp +++ b/src/parser/GraphPatternOperation.cpp @@ -128,6 +128,12 @@ void PathQuery::addParameter(const SparqlTriple& triple) { throw PathSearchException("The parameter 'cartesian' expects a boolean"); } cartesian_ = object.getBool(); + } else if (predString.ends_with("numPathsPerTarget>")) { + if (!object.isInt()) { + throw PathSearchException( + "The parameter 'numPathsPerTarget' expects an integer"); + } + numPathsPerTarget_ = object.getInt(); } else if (predString.ends_with("algorithm>")) { if (!object.isIri()) { throw PathSearchException("The 'algorithm' value has to be an Iri"); @@ -209,7 +215,8 @@ PathSearchConfiguration PathQuery::toPathSearchConfiguration( return PathSearchConfiguration{ algorithm_, sources, targets, start_.value(), end_.value(), pathColumn_.value(), - edgeColumn_.value(), edgeProperties_, cartesian_}; + edgeColumn_.value(), edgeProperties_, cartesian_, + numPathsPerTarget_}; } // ____________________________________________________________________________ diff --git a/src/parser/GraphPatternOperation.h 
b/src/parser/GraphPatternOperation.h index 6367d4e510..060e548045 100644 --- a/src/parser/GraphPatternOperation.h +++ b/src/parser/GraphPatternOperation.h @@ -174,6 +174,7 @@ struct PathQuery { GraphPattern childGraphPattern_; bool cartesian_ = true; + std::optional numPathsPerTarget_ = std::nullopt; /** * @brief Add a parameter to the PathQuery from the given triple. diff --git a/test/PathSearchTest.cpp b/test/PathSearchTest.cpp index da8bd31c94..30ca2b42cf 100644 --- a/test/PathSearchTest.cpp +++ b/test/PathSearchTest.cpp @@ -543,6 +543,39 @@ TEST(PathSearchTest, elongatedDiamond) { ::testing::UnorderedElementsAreArray(expected)); } +// _____________________________________________________________________________ +TEST(PathSearchTest, numPathsPerTarget) { + auto sub = + makeIdTableFromVector({{0, 1}, {1, 2}, {1, 3}, {2, 4}, {3, 4}, {4, 5}}); + auto expected = makeIdTableFromVector({ + {V(0), V(1), I(0), I(0)}, + {V(1), V(3), I(0), I(1)}, + {V(3), V(4), I(0), I(2)}, + {V(0), V(1), I(1), I(0)}, + {V(1), V(3), I(1), I(1)}, + {V(3), V(4), I(1), I(2)}, + {V(4), V(5), I(1), I(3)}, + }); + + std::vector sources{V(0)}; + std::vector targets{V(4), V(5)}; + Vars vars = {Variable{"?start"}, Variable{"?end"}}; + PathSearchConfiguration config{PathSearchAlgorithm::ALL_PATHS, + sources, + targets, + Var{"?start"}, + Var{"?end"}, + Var{"?edgeIndex"}, + Var{"?pathIndex"}, + {}, + true, + 1}; + + auto resultTable = performPathSearch(config, std::move(sub), vars); + ASSERT_THAT(resultTable.idTable(), + ::testing::UnorderedElementsAreArray(expected)); +} + /** * Graph: * 0 4 diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index adb099ea52..d462a39d61 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -920,6 +920,7 @@ TEST(QueryPlanner, PathSearchMultipleSourcesAndTargetsCartesian) { "}}}}", h::PathSearch(config, true, true, scan("?start", "

", "?end")), qec); } + TEST(QueryPlanner, PathSearchMultipleSourcesAndTargetsNonCartesian) { auto scan = h::IndexScanFromStrings; auto qec = @@ -957,6 +958,45 @@ TEST(QueryPlanner, PathSearchMultipleSourcesAndTargetsNonCartesian) { h::PathSearch(config, true, true, scan("?start", "

", "?end")), qec); } +// _____________________________________________________________________________ +TEST(QueryPlanner, numPathsPerTarget) { + auto scan = h::IndexScanFromStrings; + auto qec = + ad_utility::testing::getQec("

.

.

"); + auto getId = ad_utility::testing::makeGetId(qec->getIndex()); + + std::vector sources{getId(""), getId("")}; + std::vector targets{getId(""), getId("")}; + PathSearchConfiguration config{PathSearchAlgorithm::ALL_PATHS, + sources, + targets, + Variable("?start"), + Variable("?end"), + Variable("?path"), + Variable("?edge"), + {}, + true, + 1}; + h::expect( + "PREFIX pathSearch: " + "SELECT ?start ?end ?path ?edge WHERE {" + "SERVICE pathSearch: {" + "_:path pathSearch:algorithm pathSearch:allPaths ;" + "pathSearch:source ;" + "pathSearch:source ;" + "pathSearch:target ;" + "pathSearch:target ;" + "pathSearch:pathColumn ?path ;" + "pathSearch:edgeColumn ?edge ;" + "pathSearch:start ?start;" + "pathSearch:end ?end;" + "pathSearch:numPathsPerTarget 1;" + "{SELECT * WHERE {" + "?start

?end." + "}}}}", + h::PathSearch(config, true, true, scan("?start", "

", "?end")), qec); +} + TEST(QueryPlanner, PathSearchWithEdgeProperties) { auto scan = h::IndexScanFromStrings; auto join = h::Join; @@ -1483,6 +1523,86 @@ TEST(QueryPlanner, PathSearchUnsupportedAlgorithm) { parsedQuery::PathSearchException); } +// __________________________________________________________________________ +TEST(QueryPlanner, PathSearchWrongArgumentCartesian) { + auto qec = ad_utility::testing::getQec("

.

"); + auto getId = ad_utility::testing::makeGetId(qec->getIndex()); + + auto query = + "PREFIX pathSearch: " + "SELECT ?start ?end ?path ?edge WHERE {" + "SERVICE pathSearch: {" + "_:path pathSearch:algorithm pathSearch:allPaths ;" + "pathSearch:source ?source1 ;" + "pathSearch:source ?source2 ;" + "pathSearch:target ;" + "pathSearch:pathColumn ?path ;" + "pathSearch:edgeColumn ?edge ;" + "pathSearch:start ?start;" + "pathSearch:end ?end;" + "pathSearch:cartesian ;" + "{SELECT * WHERE {" + "?start

?end." + "}}}}"; + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + h::parseAndPlan(std::move(query), qec), + HasSubstr("The parameter 'cartesian' expects a boolean"), + parsedQuery::PathSearchException); +} + +// __________________________________________________________________________ +TEST(QueryPlanner, PathSearchWrongArgumentNumPathsPerTarget) { + auto qec = ad_utility::testing::getQec("

.

"); + auto getId = ad_utility::testing::makeGetId(qec->getIndex()); + + auto query = + "PREFIX pathSearch: " + "SELECT ?start ?end ?path ?edge WHERE {" + "SERVICE pathSearch: {" + "_:path pathSearch:algorithm pathSearch:allPaths ;" + "pathSearch:source ?source1 ;" + "pathSearch:source ?source2 ;" + "pathSearch:target ;" + "pathSearch:pathColumn ?path ;" + "pathSearch:edgeColumn ?edge ;" + "pathSearch:start ?start;" + "pathSearch:end ?end;" + "pathSearch:numPathsPerTarget ;" + "{SELECT * WHERE {" + "?start

?end." + "}}}}"; + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + h::parseAndPlan(std::move(query), qec), + HasSubstr("The parameter 'numPathsPerTarget' expects an integer"), + parsedQuery::PathSearchException); +} + +// __________________________________________________________________________ +TEST(QueryPlanner, PathSearchWrongArgumentAlgorithm) { + auto qec = ad_utility::testing::getQec("

.

"); + auto getId = ad_utility::testing::makeGetId(qec->getIndex()); + + auto query = + "PREFIX pathSearch: " + "SELECT ?start ?end ?path ?edge WHERE {" + "SERVICE pathSearch: {" + "_:path pathSearch:algorithm 1 ;" + "pathSearch:source ?source1 ;" + "pathSearch:source ?source2 ;" + "pathSearch:target ;" + "pathSearch:pathColumn ?path ;" + "pathSearch:edgeColumn ?edge ;" + "pathSearch:start ?start;" + "pathSearch:end ?end;" + "{SELECT * WHERE {" + "?start

?end." + "}}}}"; + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + h::parseAndPlan(std::move(query), qec), + HasSubstr("The 'algorithm' value has to be an Iri"), + parsedQuery::PathSearchException); +} + TEST(QueryPlanner, SpatialJoinViaMaxDistPredicate) { auto scan = h::IndexScanFromStrings; h::expect( From 3d321c25c67c6d443a062fe1f3f3e060b1d62ef3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 5 Nov 2024 09:22:14 +0100 Subject: [PATCH 03/12] Allow `REGEX` for arbitrary expressions (not just a variable) (#1576) So far, the `REGEX` function was only implemented for the (frequent) special case, where the first argument is either a variable (like `?x`) or `STR` of a variable (like `STR(?x)`). Now `REGEX` works for arbitrary expressions. Use the occasion to clean up the code a little bit and improve the documentation. --- .../sparqlExpressions/RegexExpression.cpp | 181 +++++++++++------- .../sparqlExpressions/RegexExpression.h | 43 +++-- test/RegexExpressionTest.cpp | 66 +++++-- 3 files changed, 186 insertions(+), 104 deletions(-) diff --git a/src/engine/sparqlExpressions/RegexExpression.cpp b/src/engine/sparqlExpressions/RegexExpression.cpp index 9ee49dedab..2cf5757e64 100644 --- a/src/engine/sparqlExpressions/RegexExpression.cpp +++ b/src/engine/sparqlExpressions/RegexExpression.cpp @@ -72,19 +72,21 @@ std::optional getPrefixRegex(std::string regex) { } // namespace sparqlExpression::detail namespace sparqlExpression { + // ___________________________________________________________________________ RegexExpression::RegexExpression( SparqlExpression::Ptr child, SparqlExpression::Ptr regex, std::optional optionalFlags) : child_{std::move(child)} { + // If we have a `STR()` expression, remove the `STR()` and remember that it + // was there. 
if (child_->isStrExpression()) { child_ = std::move(std::move(*child_).moveChildrenOut().at(0)); childIsStrExpression_ = true; } - if (!dynamic_cast(child_.get())) { - throw std::runtime_error( - "REGEX expressions are currently supported only on variables."); - } + + // Get the regex string, which must be a string literal without a datatype or + // language tag. std::string regexString; if (auto regexPtr = dynamic_cast(regex.get())) { @@ -100,6 +102,9 @@ RegexExpression::RegexExpression( "The second argument to the REGEX function must be a " "string literal (which contains the regular expression)"); } + + // Parse the flags. The optional argument for that must, again, be a + // string literal without a datatype or language tag. if (optionalFlags.has_value()) { if (auto flagsPtr = dynamic_cast( optionalFlags.value().get())) { @@ -131,19 +136,18 @@ RegexExpression::RegexExpression( } } + // Create RE2 object from the regex string. If it is a simple prefix regex, + // store the prefix in `prefixRegex_` (otherwise that becomes `std::nullopt`). regexAsString_ = regexString; - if (auto opt = detail::getPrefixRegex(regexString)) { - regex_ = std::move(opt.value()); - } else { - regex_.emplace(regexString, RE2::Quiet); - const auto& r = std::get(regex_); - if (r.error_code() != RE2::NoError) { - throw std::runtime_error{absl::StrCat( - "The regex \"", regexString, - "\" is not supported by QLever (which uses Google's RE2 library). 
" - "Error from RE2 is: ", - r.error())}; - } + prefixRegex_ = detail::getPrefixRegex(regexString); + regex_.emplace(regexString, RE2::Quiet); + const auto& r = regex_.value(); + if (r.error_code() != RE2::NoError) { + throw std::runtime_error{absl::StrCat( + "The regex \"", regexString, + "\" is not supported by QLever (which uses Google's RE2 library); " + "the error from RE2 is: ", + r.error())}; } } @@ -163,17 +167,27 @@ std::span RegexExpression::childrenImpl() { ExpressionResult RegexExpression::evaluatePrefixRegex( const Variable& variable, sparqlExpression::EvaluationContext* context) const { - std::string prefixRegex = std::get(regex_); + // This function must only be called if we have a simple prefix regex. + AD_CORRECTNESS_CHECK(prefixRegex_.has_value()); + std::string prefixRegex = prefixRegex_.value(); + + // If the expression is enclosed in `STR()`, we have two ranges: for the + // prefix with and without leading "<". + // + // TODO prefix filters currently have false negatives when the prefix + // is not in the vocabulary, and there exist local vocab entries in the input + // that are between the prefix and the next local vocab entry. This is + // non-trivial to fix as it involves fiddling with Unicode prefix encodings. + // + // TODO prefix filters currently never find numbers or other + // datatypes that are encoded directly inside the IDs. std::vector actualPrefixes; actualPrefixes.push_back("\"" + prefixRegex); - // If the STR function was applied, we also look for prefix matches for IRIs. - // TODO prefix filters currently never find numbers or local vocab - // entries, numbers, or other datatypes that are encoded directly inside the - // IDs. if (childIsStrExpression_) { actualPrefixes.push_back("<" + prefixRegex); } - std::vector resultSetOfIntervals; + + // Compute the (one or two) ranges. 
std::vector> lowerAndUpperIds; lowerAndUpperIds.reserve(actualPrefixes.size()); for (const auto& prefix : actualPrefixes) { @@ -184,12 +198,21 @@ ExpressionResult RegexExpression::evaluatePrefixRegex( } } checkCancellation(context); + + // Begin and end of the input (for each row of which we want to + // evaluate the regex). auto beg = context->_inputTable.begin() + context->_beginIndex; auto end = context->_inputTable.begin() + context->_endIndex; AD_CONTRACT_CHECK(end <= context->_inputTable.end()); + + // In this function, the expression is a simple variable. If the input is + // sorted by that variable, the result can be computed by a constant number + // of binary searches and the result is a set of intervals. + std::vector resultSetOfIntervals; if (context->isResultSortedBy(variable)) { auto column = context->getColumnIndexForVariable(variable); for (auto [lowerId, upperId] : lowerAndUpperIds) { + // Two binary searches to find the lower and upper bounds of the range. auto lower = std::lower_bound( beg, end, nullptr, [column, lowerId = lowerId](const auto& l, const auto&) { @@ -200,7 +223,6 @@ ExpressionResult RegexExpression::evaluatePrefixRegex( [column, upperId = upperId](const auto& l, const auto&) { return l[column] < upperId; }); - // Return the empty result as an empty `SetOfIntervals` instead of as an // empty range. 
if (lower != upper) { @@ -212,47 +234,58 @@ ExpressionResult RegexExpression::evaluatePrefixRegex( return std::reduce(resultSetOfIntervals.begin(), resultSetOfIntervals.end(), ad_utility::SetOfIntervals{}, ad_utility::SetOfIntervals::Union{}); - } else { - auto resultSize = context->size(); - VectorWithMemoryLimit result{context->_allocator}; - result.reserve(resultSize); - for (auto id : detail::makeGenerator(variable, resultSize, context)) { - result.push_back(Id::makeFromBool( - std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) { - return !valueIdComparators::compareByBits(id, lowerUpper.first) && - valueIdComparators::compareByBits(id, lowerUpper.second); - }))); - checkCancellation(context); - } - return result; } + + // If the input is not sorted by the variable, we have to check each row + // individually (by checking inclusion in the ranges). + auto resultSize = context->size(); + VectorWithMemoryLimit result{context->_allocator}; + result.reserve(resultSize); + for (auto id : detail::makeGenerator(variable, resultSize, context)) { + result.push_back(Id::makeFromBool( + std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) { + return !valueIdComparators::compareByBits(id, lowerUpper.first) && + valueIdComparators::compareByBits(id, lowerUpper.second); + }))); + checkCancellation(context); + } + return result; } // ___________________________________________________________________________ -ExpressionResult RegexExpression::evaluateNonPrefixRegex( - const Variable& variable, - sparqlExpression::EvaluationContext* context) const { - AD_CONTRACT_CHECK(std::holds_alternative(regex_)); +template +ExpressionResult RegexExpression::evaluateGeneralCase( + T&& input, sparqlExpression::EvaluationContext* context) const { + // We have one result for each row of the input. 
auto resultSize = context->size(); VectorWithMemoryLimit result{context->_allocator}; result.reserve(resultSize); + AD_CORRECTNESS_CHECK(regex_.has_value()); - auto impl = [&](const ValueGetter& getter) { - for (auto id : detail::makeGenerator(variable, resultSize, context)) { - auto str = getter(id, context); - if (!str.has_value()) { - result.push_back(Id::makeUndefined()); - } else { - result.push_back(Id::makeFromBool( - RE2::PartialMatch(str.value(), std::get(regex_)))); - } - checkCancellation(context); - } + // Compute the result using the given value getter. If the getter returns + // `std::nullopt` for a row, the result is `UNDEF`. Otherwise, we have a + // string and evaluate the regex on it. + auto computeResult = [&](const ValueGetter& getter) { + std::ranges::for_each( + detail::makeGenerator(AD_FWD(input), resultSize, context), + [&getter, &context, &result, this](const auto& id) { + auto str = getter(id, context); + if (!str.has_value()) { + result.push_back(Id::makeUndefined()); + } else { + result.push_back(Id::makeFromBool( + RE2::PartialMatch(str.value(), regex_.value()))); + } + checkCancellation(context); + }); }; + + // Compute the result with the correct value getter (depending on whether the + // expression is enclosed in `STR()` or not), and return it. 
if (childIsStrExpression_) { - impl(detail::StringValueGetter{}); + computeResult(detail::StringValueGetter{}); } else { - impl(detail::LiteralFromIdGetter{}); + computeResult(detail::LiteralFromIdGetter{}); } return result; } @@ -262,51 +295,57 @@ ExpressionResult RegexExpression::evaluate( sparqlExpression::EvaluationContext* context) const { auto resultAsVariant = child_->evaluate(context); auto variablePtr = std::get_if(&resultAsVariant); - AD_CONTRACT_CHECK(variablePtr); - if (std::holds_alternative(regex_)) { + if (prefixRegex_.has_value() && variablePtr != nullptr) { return evaluatePrefixRegex(*variablePtr, context); } else { - return evaluateNonPrefixRegex(*variablePtr, context); + return std::visit( + [this, context](auto&& input) { + return evaluateGeneralCase(AD_FWD(input), context); + }, + std::move(resultAsVariant)); } } // ____________________________________________________________________________ bool RegexExpression::isPrefixExpression() const { - return std::holds_alternative(regex_); + return prefixRegex_.has_value(); } // ____________________________________________________________________________ auto RegexExpression::getEstimatesForFilterExpression( uint64_t inputSize, const std::optional& firstSortedVariable) const -> Estimates { + // If we have a simple prefix regex, assume that only 10^-k entries remain, + // where k is the length of the prefix. if (isPrefixExpression()) { - // Assume that only 10^-k entries remain, where k is the length of the - // prefix. The reason for the -2 is that at this point, _rhs always - // starts with ^" double reductionFactor = std::pow( - 10, std::max( - 0, static_cast(std::get(regex_).size()) - 2)); + 10, std::max(0, static_cast(prefixRegex_.value().size()))); // Cap to reasonable minimal and maximal values to prevent numerical // stability problems. 
reductionFactor = std::min(100000000.0, reductionFactor); reductionFactor = std::max(1.0, reductionFactor); size_t sizeEstimate = inputSize / static_cast(reductionFactor); auto varPtr = dynamic_cast(child_.get()); - AD_CONTRACT_CHECK(varPtr); - size_t costEstimate = firstSortedVariable == varPtr->value() + size_t costEstimate = (varPtr && firstSortedVariable == varPtr->value()) ? sizeEstimate : sizeEstimate + inputSize; - return {sizeEstimate, costEstimate}; - } else { // Not a prefix filter. - size_t sizeEstimate = inputSize / 2; - // We assume that checking a REGEX for an element is 10 times more - // expensive than an "ordinary" filter check. - size_t costEstimate = sizeEstimate + 10 * inputSize; - return {sizeEstimate, costEstimate}; } + + // For the general case, we make two assumptions. + // + // 1. Half of the entries remain after the filter. This is a very simple + // and arbitrary heuristic. + // + // 2. Checking a REGEX for an element is 10 times more expensive than a + // "simple" filter check. This is reasonable because regex evaluations are + // expensive, but the fixed factor disregards that it depends on the + // complexity of the regex how expensive it is. + size_t sizeEstimate = inputSize / 2; + size_t costEstimate = sizeEstimate + 10 * inputSize; + return {sizeEstimate, costEstimate}; } // ____________________________________________________________________________ diff --git a/src/engine/sparqlExpressions/RegexExpression.h b/src/engine/sparqlExpressions/RegexExpression.h index 783acc22db..cbf95c7f38 100644 --- a/src/engine/sparqlExpressions/RegexExpression.h +++ b/src/engine/sparqlExpressions/RegexExpression.h @@ -1,6 +1,6 @@ -// Copyright 2022, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Johannes Kalmbach +// Copyright 2022 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once @@ -11,22 +11,27 @@ #include "re2/re2.h" namespace sparqlExpression { +// Class implementing the REGEX function, which takes two mandatory arguments +// (an expression and a regex) and one optional argument (a string of flags). class RegexExpression : public SparqlExpression { private: SparqlExpression::Ptr child_; - // If this variant holds a string, we consider this string as the prefix of a - // prefix regex. - std::variant regex_; + // The regular expression. It needs to be a `std::optional` because `RE2` + // objects do not have a default constructor. + std::optional regex_; + // If this `std::optional` holds a string, we have a simple prefix regex + // (which translates to a range search) and this string holds the prefix. + std::optional prefixRegex_; // The regex as a string, used for the cache key. std::string regexAsString_; - // True if the STR() function is to be applied on the child before evaluating - // the regex. + // True iff the expression is enclosed in `STR()`. bool childIsStrExpression_ = false; public: - // `child` must be a `VariableExpression` and `regex` must be a - // `LiteralExpression` that stores a string, else an exception will be thrown. + // The `child` can be an arbitrary expression, but `regex` must be a + // `LiteralExpression` that stores a string, otherwise an exception will be + // thrown. RegexExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr regex, std::optional optionalFlags); @@ -46,17 +51,21 @@ class RegexExpression : public SparqlExpression { private: std::span childrenImpl() override; - // Internal implementations that are called by `evaluate`. + + // Evaluate for the special case, where the expression is a variable and we + // have a simple prefix regex (in which case the regex match translates to a + // simple range check). 
ExpressionResult evaluatePrefixRegex( const Variable& variable, sparqlExpression::EvaluationContext* context) const; - ExpressionResult evaluateNonPrefixRegex( - const Variable& variable, - sparqlExpression::EvaluationContext* context) const; - /// Helper function to check if the `CancellationHandle` of the passed - /// `EvaluationContext` has been cancelled and throw an exception if this is - /// the case. + // Evaluate for the general case. + template + ExpressionResult evaluateGeneralCase( + T&& input, sparqlExpression::EvaluationContext* context) const; + + // Check if the `CancellationHandle` of `context` has been cancelled and throw + // an exception if this is the case. static void checkCancellation( const sparqlExpression::EvaluationContext* context, ad_utility::source_location location = diff --git a/test/RegexExpressionTest.cpp b/test/RegexExpressionTest.cpp index 01643c1de8..0e94d3603d 100644 --- a/test/RegexExpressionTest.cpp +++ b/test/RegexExpressionTest.cpp @@ -1,6 +1,6 @@ -// Copyright 2022, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2022 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #include #include @@ -23,8 +23,11 @@ constexpr auto T = Id::makeFromBool(true); constexpr auto F = Id::makeFromBool(false); constexpr Id U = Id::makeUndefined(); +// Make `RegexExpression` from given the `child` (the expression on which to +// apply the regex), `regex`, and optional `flags`. The argument `childAsStr` is +// true iff the expression is enclosed in a `STR()` function. RegexExpression makeRegexExpression( - std::string variable, std::string regex, + SparqlExpression::Ptr child, std::string regex, std::optional flags = std::nullopt, bool childAsStr = false) { // The regex and the flags both have to be enquoted. This is normally ensured // by the SPARQL parser. 
For easier readability of the tests we add those @@ -33,10 +36,8 @@ RegexExpression makeRegexExpression( if (flags.has_value()) { flags.value() = absl::StrCat("\"", flags.value(), "\""); } - SparqlExpression::Ptr variableExpression = - std::make_unique(Variable{std::move(variable)}); if (childAsStr) { - variableExpression = makeStrExpression(std::move(variableExpression)); + child = makeStrExpression(std::move(child)); } auto regexExpression = std::make_unique(lit(regex)); std::optional flagsExpression = std::nullopt; @@ -45,18 +46,33 @@ RegexExpression makeRegexExpression( std::make_unique(lit(flags.value()))}; } - return {std::move(variableExpression), std::move(regexExpression), + return {std::move(child), std::move(regexExpression), std::move(flagsExpression)}; } + +// Special case of the `makeRegexExpression` above, where the `child` +// expression is a variable. +RegexExpression makeRegexExpression( + std::string variable, std::string regex, + std::optional flags = std::nullopt, bool childAsStr = false) { + SparqlExpression::Ptr variableExpression = + std::make_unique(Variable{std::move(variable)}); + return makeRegexExpression(std::move(variableExpression), std::move(regex), + std::move(flags), childAsStr); +} } // namespace // Test that the expression `leftValue Comparator rightValue`, when evaluated on // the `TestContext` (see above), yields the `expected` result. 
void testWithExplicitResult(const SparqlExpression& expression, std::vector expected, + std::optional numInputs = std::nullopt, source_location l = source_location::current()) { - static TestContext ctx; + TestContext ctx; auto trace = generateLocationTrace(l, "testWithExplicitResult"); + if (numInputs.has_value()) { + ctx.context._endIndex = numInputs.value(); + } auto resultAsVariant = expression.evaluate(&ctx.context); const auto& result = std::get>(resultAsVariant); @@ -74,6 +90,8 @@ auto testNonPrefixRegex = [](std::string variable, std::string regex, testWithExplicitResult(expr, expectedResult); }; +// Tests where the expression is a variable and the regex is not a simple prefix +// regex (that translates to a simple range search). TEST(RegexExpression, nonPrefixRegex) { // ?vocab column is `"Beta", "alpha", "älpha" // ?mixed column is `1, -0.1, ` @@ -83,10 +101,11 @@ TEST(RegexExpression, nonPrefixRegex) { test("?vocab", "l[^a]{2}a", {F, T, T}); test("?vocab", "[el][^a]*a", {T, T, T}); test("?vocab", "B", {T, F, F}); - // case-sensitive by default. + + // The match is case-sensitive by default. test("?vocab", "b", {F, F, F}); - // Not a prefix expression because of the "special" regex characters + // A prefix regex, but not a fixed string. test("?vocab", "^a.*", {F, T, F}); test("?mixed", "x", {U, U, U}); @@ -96,10 +115,27 @@ TEST(RegexExpression, nonPrefixRegex) { // ?localVocab column is "notInVocabA", "notInVocabB", <"notInVocabD"> test("?localVocab", "InV", {T, T, U}); + // The IRI is only considered when testing with a STR expression test("?localVocab", "Vocab[AD]", {T, F, T}, true); } +// Test where the expression is not simply a variable. +TEST(RegexExpression, inputNotVariable) { + // Our expression is a fixed string literal: "hallo". 
+ VectorWithMemoryLimit input{ + ad_utility::testing::getQec()->getAllocator()}; + input.push_back(ad_utility::triple_component::LiteralOrIri(lit("\"hallo\""))); + auto child = std::make_unique( + input.clone()); + + // "hallo" matches the regex "ha". + auto expr = makeRegexExpression(std::move(child), "ha", ""); + std::vector expected; + expected.push_back(Id::makeFromBool(true)); + testWithExplicitResult(expr, expected, input.size()); +} + auto testNonPrefixRegexWithFlags = [](std::string variable, std::string regex, std::string flags, const std::vector& expectedResult, @@ -111,6 +147,7 @@ auto testNonPrefixRegexWithFlags = testWithExplicitResult(expr, expectedResult); }; +// Fun with flags. TEST(RegexExpression, nonPrefixRegexWithFlags) { // ?vocab column is `"Beta", "alpha", "älpha" // ?mixed column is `1, -0.1, A` @@ -141,6 +178,8 @@ TEST(RegexExpression, nonPrefixRegexWithFlags) { // TODO Add tests for other flags (maybe the non-greedy one?) } +// Test the `getPrefixRegex` function (which returns `std::nullopt` if the regex +// is not a simple prefix regex). TEST(RegexExpression, getPrefixRegex) { using namespace sparqlExpression::detail; ASSERT_EQ(std::nullopt, getPrefixRegex("alpha")); @@ -265,11 +304,6 @@ TEST(RegexExpression, invalidConstruction) { return std::make_unique(Variable{std::move(literal)}); }; - // The first argument must be a variable. - EXPECT_THROW( - RegexExpression(literal("\"a\""), literal("\"b\""), std::nullopt), - std::runtime_error); - // The second argument must be a string literal. EXPECT_THROW(RegexExpression(variable("?a"), variable("?b"), std::nullopt), std::runtime_error); From f1490771f7a70d49bc1d3d08a95e31488aaa505e Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:03:13 +0100 Subject: [PATCH 04/12] Simplify the `CartesianProductJoin` class (#1598) Refactor some rather large functions into smaller ones. 
This will make the lazy implementation of this class much simpler to implement and to review. --- src/engine/CartesianProductJoin.cpp | 174 ++++++++++++++-------------- src/engine/CartesianProductJoin.h | 14 ++- 2 files changed, 96 insertions(+), 92 deletions(-) diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index c73361e001..b9bb514ba7 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -53,22 +53,21 @@ string CartesianProductJoin::getCacheKeyImpl() const { // ____________________________________________________________________________ size_t CartesianProductJoin::getResultWidth() const { auto view = childView() | std::views::transform(&Operation::getResultWidth); - return std::accumulate(view.begin(), view.end(), 0UL, std::plus{}); + return std::reduce(view.begin(), view.end(), 0UL, std::plus{}); } // ____________________________________________________________________________ size_t CartesianProductJoin::getCostEstimate() { auto childSizes = childView() | std::views::transform(&Operation::getCostEstimate); - return getSizeEstimate() + std::accumulate(childSizes.begin(), - childSizes.end(), 0UL, - std::plus{}); + return getSizeEstimate() + + std::reduce(childSizes.begin(), childSizes.end(), 0UL, std::plus{}); } // ____________________________________________________________________________ uint64_t CartesianProductJoin::getSizeEstimateBeforeLimit() { auto view = childView() | std::views::transform(&Operation::getSizeEstimate); - return std::accumulate(view.begin(), view.end(), 1UL, std::multiplies{}); + return std::reduce(view.begin(), view.end(), 1UL, std::multiplies{}); } // ____________________________________________________________________________ @@ -85,13 +84,10 @@ bool CartesianProductJoin::knownEmptyResult() { } // ____________________________________________________________________________ -template void CartesianProductJoin::writeResultColumn(std::span targetColumn, 
std::span inputColumn, - size_t groupSize, size_t offset) { - if (StaticGroupSize != 0) { - AD_CORRECTNESS_CHECK(StaticGroupSize == groupSize); - } + size_t groupSize, + size_t offset) const { // Copy each element from the `inputColumn` `groupSize` times to // the `targetColumn`, repeat until the `targetColumn` is completely filled. size_t numRowsWritten = 0; @@ -104,20 +100,13 @@ void CartesianProductJoin::writeResultColumn(std::span targetColumn, size_t groupStartIdx = offset % groupSize; while (true) { for (size_t i = firstInputElementIdx; i < inputSize; ++i) { - auto writeGroup = [&](size_t actualGroupSize) { - for (size_t u = groupStartIdx; u < actualGroupSize; ++u) { - if (numRowsWritten == targetSize) { - return; - } - targetColumn[numRowsWritten] = inputColumn[i]; - ++numRowsWritten; - checkCancellation(); + for (size_t u = groupStartIdx; u < groupSize; ++u) { + if (numRowsWritten == targetSize) { + return; } - }; - if constexpr (StaticGroupSize == 0) { - writeGroup(groupSize); - } else { - writeGroup(StaticGroupSize); + targetColumn[numRowsWritten] = inputColumn[i]; + ++numRowsWritten; + checkCancellation(); } if (numRowsWritten == targetSize) { return; @@ -131,61 +120,52 @@ void CartesianProductJoin::writeResultColumn(std::span targetColumn, firstInputElementIdx = 0; } } + // ____________________________________________________________________________ ProtoResult CartesianProductJoin::computeResult( [[maybe_unused]] bool requestLaziness) { - IdTable result{getExecutionContext()->getAllocator()}; - result.setNumColumns(getResultWidth()); - std::vector> subResults; + std::vector> subResults = calculateSubResults(); - // We don't need to fully materialize the child results if we have a LIMIT - // specified and an OFFSET of 0. - // TODO We could in theory also apply this optimization if a - // non-zero OFFSET is specified, but this would make the algorithm more - // complicated. 
- std::optional limitIfPresent = getLimit(); - if (!getLimit()._limit.has_value() || getLimit()._offset != 0) { - limitIfPresent = std::nullopt; - } - - // Get all child results (possibly with limit, see above). - for (auto& child : childView()) { - if (limitIfPresent.has_value() && child.supportsLimit()) { - child.setLimit(limitIfPresent.value()); - } - subResults.push_back(child.getResult()); + IdTable result = writeAllColumns(subResults); - const auto& table = subResults.back()->idTable(); - // Early stopping: If one of the results is empty, we can stop early. - if (table.empty()) { - break; - } + // Dereference all the subresult pointers because `getSharedLocalVocabFrom...` + // requires a range of references, not pointers. + auto subResultsDeref = std::views::transform( + subResults, [](auto& x) -> decltype(auto) { return *x; }); + return {std::move(result), resultSortedOn(), + Result::getMergedLocalVocab(subResultsDeref)}; +} - // If one of the children is the neutral element (because of a triple with - // zero variables), we can simply ignore it here. - if (table.numRows() == 1 && table.numColumns() == 0) { - subResults.pop_back(); - continue; - } - // Example for the following calculation: If we have a LIMIT of 1000 and - // the first child already has a result of size 100, then the second child - // needs to evaluate only its first 10 results. The +1 is because integer - // divisions are rounded down by default. - if (limitIfPresent.has_value()) { - limitIfPresent.value()._limit = limitIfPresent.value()._limit.value() / - subResults.back()->idTable().size() + - 1; +// ____________________________________________________________________________ +VariableToColumnMap CartesianProductJoin::computeVariableToColumnMap() const { + VariableToColumnMap result; + // It is crucial that we also count the columns in the inputs to which no + // variable was assigned. This is managed by the `offset` variable. 
+ size_t offset = 0; + for (const auto& child : childView()) { + for (auto varCol : child.getExternallyVisibleVariableColumns()) { + varCol.second.columnIndex_ += offset; + result.insert(std::move(varCol)); } + // `getResultWidth` contains all the columns, not only the ones to which a + // variable is assigned. + offset += child.getResultWidth(); } + return result; +} +// _____________________________________________________________________________ +IdTable CartesianProductJoin::writeAllColumns( + const std::vector>& subResults) const { + IdTable result{getResultWidth(), getExecutionContext()->getAllocator()}; // TODO Find a solution to cheaply handle the case, that only a // single result is left. This can probably be done by using the // `ProtoResult`. auto sizesView = std::views::transform( subResults, [](const auto& child) { return child->idTable().size(); }); - auto totalResultSize = std::accumulate(sizesView.begin(), sizesView.end(), - 1UL, std::multiplies{}); + auto totalResultSize = + std::reduce(sizesView.begin(), sizesView.end(), 1UL, std::multiplies{}); size_t totalSizeIncludingLimit = getLimit().actualSize(totalResultSize); size_t offset = getLimit().actualOffset(totalResultSize); @@ -211,37 +191,57 @@ ProtoResult CartesianProductJoin::computeResult( const auto& input = subResultPtr->idTable(); for (const auto& inputCol : input.getColumns()) { decltype(auto) resultCol = result.getColumn(resultColIdx); - ad_utility::callFixedSize(groupSize, [&]() { - writeResultColumn(resultCol, inputCol, groupSize, offset); - }); + writeResultColumn(resultCol, inputCol, groupSize, offset); ++resultColIdx; } groupSize *= input.numRows(); } } - - // Dereference all the subresult pointers because `getSharedLocalVocabFrom...` - // requires a range of references, not pointers. 
- auto subResultsDeref = std::views::transform( - subResults, [](auto& x) -> decltype(auto) { return *x; }); - return {std::move(result), resultSortedOn(), - Result::getMergedLocalVocab(subResultsDeref)}; + return result; } -// ____________________________________________________________________________ -VariableToColumnMap CartesianProductJoin::computeVariableToColumnMap() const { - VariableToColumnMap result; - // It is crucial that we also count the columns in the inputs to which no - // variable was assigned. This is managed by the `offset` variable. - size_t offset = 0; - for (const auto& child : childView()) { - for (auto varCol : child.getExternallyVisibleVariableColumns()) { - varCol.second.columnIndex_ += offset; - result.insert(std::move(varCol)); +// _____________________________________________________________________________ +std::vector> +CartesianProductJoin::calculateSubResults() { + std::vector> subResults; + // We don't need to fully materialize the child results if we have a LIMIT + // specified and an OFFSET of 0. + // TODO We could in theory also apply this optimization if a + // non-zero OFFSET is specified, but this would make the algorithm more + // complicated. + std::optional limitIfPresent = getLimit(); + if (!getLimit()._limit.has_value() || getLimit()._offset != 0) { + limitIfPresent = std::nullopt; + } + + // Get all child results (possibly with limit, see above). + for (auto& child : childView()) { + if (limitIfPresent.has_value() && child.supportsLimit()) { + child.setLimit(limitIfPresent.value()); + } + subResults.push_back(child.getResult()); + + const auto& table = subResults.back()->idTable(); + // Early stopping: If one of the results is empty, we can stop early. + if (table.empty()) { + break; + } + + // If one of the children is the neutral element (because of a triple with + // zero variables), we can simply ignore it here. 
+ if (table.numRows() == 1 && table.numColumns() == 0) { + subResults.pop_back(); + continue; + } + // Example for the following calculation: If we have a LIMIT of 1000 and + // the first child already has a result of size 100, then the second child + // needs to evaluate only its first 10 results. The +1 is because integer + // divisions are rounded down by default. + if (limitIfPresent.has_value()) { + limitIfPresent.value()._limit = limitIfPresent.value()._limit.value() / + subResults.back()->idTable().size() + + 1; } - // `getResultWidth` contains all the columns, not only the ones to which a - // variable is assigned. - offset += child.getResultWidth(); } - return result; + return subResults; } diff --git a/src/engine/CartesianProductJoin.h b/src/engine/CartesianProductJoin.h index 779adf1dba..de130a739e 100644 --- a/src/engine/CartesianProductJoin.h +++ b/src/engine/CartesianProductJoin.h @@ -82,11 +82,15 @@ class CartesianProductJoin : public Operation { // Copy each element from the `inputColumn` `groupSize` times to the // `targetColumn`. Repeat until the `targetColumn` is completely filled. Skip // the first `offset` write operations to the `targetColumn`. Call - // `checkCancellation` after each write. If `StaticGroupSize != 0`, then the - // group size is known at compile time which allows for more efficient loop - // processing for very small group sizes. - template + // `checkCancellation` after each write. void writeResultColumn(std::span targetColumn, std::span inputColumn, size_t groupSize, - size_t offset); + size_t offset) const; + + // Write all columns of the subresults into an `IdTable` and return it. + IdTable writeAllColumns( + const std::vector>& subResults) const; + + // Calculate the subresults of the children and store them into a vector. 
+ std::vector> calculateSubResults(); }; From 80938b74f8a7e4919abd479a357bbe759896363f Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 7 Nov 2024 11:48:34 +0100 Subject: [PATCH 05/12] Docker build no tests for ARM (#1599) The cross-compilation currently takes more than 6 hours and is then cancelled by GitHub actions. We thus disable the building and execution of unit tests for the ARM64 build. --- Dockerfile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd514e1e12..15d7754191 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,16 +8,21 @@ RUN apt-get update && apt-get install -y software-properties-common wget && add- RUN wget https://apt.kitware.com/kitware-archive.sh && chmod +x kitware-archive.sh &&./kitware-archive.sh FROM base as builder +ARG TARGETPLATFORM RUN apt-get update && apt-get install -y build-essential cmake libicu-dev tzdata pkg-config uuid-runtime uuid-dev git libjemalloc-dev ninja-build libzstd-dev libssl-dev libboost1.81-dev libboost-program-options1.81-dev libboost-iostreams1.81-dev libboost-url1.81-dev - COPY . /app/ WORKDIR /app/ ENV DEBIAN_FRONTEND=noninteractive WORKDIR /app/build/ -RUN cmake -DCMAKE_BUILD_TYPE=Release -DLOGLEVEL=INFO -DUSE_PARALLEL=true -D_NO_TIMING_TESTS=ON -GNinja .. && ninja -RUN ctest --rerun-failed --output-on-failure +RUN cmake -DCMAKE_BUILD_TYPE=Release -DLOGLEVEL=INFO -DUSE_PARALLEL=true -D_NO_TIMING_TESTS=ON -GNinja .. +# When cross-compiling the container for ARM64, then compiling and running all tests runs into a timeout on GitHub actions, +# so we disable tests for this platform. +# TODO(joka921) re-enable these tests as soon as we can use a native ARM64 platform to compile the docker container. +RUN if [ $TARGETPLATFORM = "linux/arm64" ] ; then echo "target is ARM64, don't build tests to avoid timeout"; fi +RUN if [ $TARGETPLATFORM = "linux/arm64" ] ; then cmake --build . --target IndexBuilderMain ServerMain; else cmake --build .
; fi +RUN if [ $TARGETPLATFORM = "linux/arm64" ] ; then echo "Skipping tests for ARM64" ; else ctest --rerun-failed --output-on-failure ; fi FROM base as runtime WORKDIR /app From 0421dab7d1a1e72b13f1a128565924631b3b37ae Mon Sep 17 00:00:00 2001 From: Julian <14220769+Qup42@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:01:21 +0100 Subject: [PATCH 06/12] Improve `SparqlQleverVisitor.cpp` code coverage (#1591) Fill some gaps in the test coverage for the code that parses UPDATE requests. --- .../sparqlParser/SparqlQleverVisitor.cpp | 16 +++++++++--- test/SparqlAntlrParserTest.cpp | 25 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 7e8ae8facf..db5002eea6 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -516,13 +516,23 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { return true; } }; + auto isVisibleIfVariableGraph = + [this](const SparqlTripleSimpleWithGraph::Graph& graph) { + if (std::holds_alternative(graph)) { + return ad_utility::contains(parsedQuery_.getVisibleVariables(), + std::get(graph)); + } else { + return true; + } + }; auto checkTriples = - [&isVisibleIfVariable, - &ctx](const std::vector& triples) { + [&isVisibleIfVariable, &ctx, &isVisibleIfVariableGraph]( + const std::vector& triples) { for (auto& triple : triples) { if (!(isVisibleIfVariable(triple.s_) && isVisibleIfVariable(triple.p_) && - isVisibleIfVariable(triple.o_))) { + isVisibleIfVariable(triple.o_) && + isVisibleIfVariableGraph(triple.g_))) { reportError(ctx, absl::StrCat("A triple contains a variable that was " "not bound in the query body.")); diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 06e8ae3a79..8983a27a3f 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -2047,7 +2047,14 @@ TEST(SparqlParser, 
UpdateQuery) { m::GraphUpdate({{Var("?a"), Iri(""), Iri(""), noGraph}}, {}, std::nullopt), m::GraphPattern(m::Triples({{Iri(""), "", Var{"?a"}}})))); + // Use variables that are not visible in the query body. Do this for all parts + // of the quad for coverage reasons. expectUpdateFails("DELETE { ?a } WHERE { ?b ?c }"); + expectUpdateFails("DELETE { . ?a } WHERE { ?b ?c }"); + expectUpdateFails( + "DELETE { GRAPH { . ?a } } WHERE { ?b ?c " + "}"); + expectUpdateFails("DELETE { GRAPH ?a { } } WHERE { ?b ?c }"); expectUpdate( "DELETE { ?a } INSERT { ?a } WHERE { ?a }", m::UpdateClause( @@ -2167,6 +2174,24 @@ TEST(SparqlParser, GraphRef) { expectGraphRefAll("GRAPH ", m::GraphRefIri("")); } +TEST(SparqlParser, QuadsNotTriples) { + auto expectQuadsNotTriples = + ExpectCompleteParse<&Parser::quadsNotTriples>{defaultPrefixMap}; + auto expectQuadsNotTriplesFails = + ExpectParseFails<&Parser::quadsNotTriples>{}; + const auto Iri = TripleComponent::Iri::fromIriref; + + expectQuadsNotTriples( + "GRAPH { }", + testing::ElementsAre( + m::Quad(Iri(""), Iri(""), Iri(""), ::Iri("")))); + expectQuadsNotTriples( + "GRAPH ?f { }", + ElementsAre(m::Quad(Iri(""), Iri(""), Iri(""), Var{"?f"}))); + expectQuadsNotTriplesFails("GRAPH \"foo\" { }"); + expectQuadsNotTriplesFails("GRAPH _:blankNode { }"); +} + TEST(SparqlParser, SourceSelector) { // This will be implemented soon, but for now we test the failure for the // coverage tool. From 3fa3ff08dd6b84fd7eed6859bc23c032d7489770 Mon Sep 17 00:00:00 2001 From: Julian <14220769+Qup42@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:22:33 +0100 Subject: [PATCH 07/12] Prepare the actual execution of UPDATE requests (#1592) Add some helper functions that can and will be used to convert A `QueryExecutionTree` and the parsed representation of an update clause into the format, that the `DeltaTriples` expect. 
--- src/engine/CMakeLists.txt | 2 +- src/engine/ExecuteUpdate.cpp | 101 +++++++++++ src/engine/ExecuteUpdate.h | 44 +++++ src/engine/ExportQueryExecutionTrees.h | 2 + src/engine/Server.cpp | 3 + src/parser/data/Iri.h | 1 + test/CMakeLists.txt | 2 + test/CachingMemoryResourceTest.cpp | 4 + test/DeltaTriplesTest.cpp | 37 +---- test/DeltaTriplesTestHelpers.h | 51 ++++++ test/ExecuteUpdateTest.cpp | 221 +++++++++++++++++++++++++ test/QueryPlannerTestHelpers.h | 3 +- 12 files changed, 434 insertions(+), 37 deletions(-) create mode 100644 src/engine/ExecuteUpdate.cpp create mode 100644 src/engine/ExecuteUpdate.h create mode 100644 test/DeltaTriplesTestHelpers.h create mode 100644 test/ExecuteUpdateTest.cpp diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 41a9a33a68..cbfb3344c3 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -13,5 +13,5 @@ add_library(engine VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp - CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp) + CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExecuteUpdate.cpp b/src/engine/ExecuteUpdate.cpp new file mode 100644 index 0000000000..ef27c6a8d4 --- /dev/null +++ b/src/engine/ExecuteUpdate.cpp @@ -0,0 +1,101 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Julian Mundhahs + +#include "engine/ExecuteUpdate.h" + +#include "engine/ExportQueryExecutionTrees.h" + +// _____________________________________________________________________________ +std::pair, LocalVocab> +ExecuteUpdate::transformTriplesTemplate( + const Index::Vocab& vocab, const VariableToColumnMap& variableColumns, + std::vector&& triples) { + // This LocalVocab only contains IDs that are related to the + // template. Most of the IDs will be added to the DeltaTriples' LocalVocab. An + // ID will only not be added if it belongs to a Quad with a variable that has + // no solutions. + LocalVocab localVocab{}; + + auto transformSparqlTripleComponent = + [&vocab, &localVocab, + &variableColumns](TripleComponent component) -> IdOrVariableIndex { + if (component.isVariable()) { + AD_CORRECTNESS_CHECK(variableColumns.contains(component.getVariable())); + return variableColumns.at(component.getVariable()).columnIndex_; + } else { + return std::move(component).toValueId(vocab, localVocab); + } + }; + Id defaultGraphIri = [&transformSparqlTripleComponent] { + IdOrVariableIndex defaultGraph = transformSparqlTripleComponent( + ad_utility::triple_component::Iri::fromIriref(DEFAULT_GRAPH_IRI)); + AD_CORRECTNESS_CHECK(std::holds_alternative(defaultGraph)); + return std::get(defaultGraph); + }(); + auto transformGraph = + [&vocab, &localVocab, &defaultGraphIri, + &variableColumns](SparqlTripleSimpleWithGraph::Graph graph) { + return std::visit( + ad_utility::OverloadCallOperator{ + [&defaultGraphIri](const std::monostate&) -> IdOrVariableIndex { + return defaultGraphIri; + }, + [&vocab, &localVocab](const Iri& iri) -> IdOrVariableIndex { + ad_utility::triple_component::Iri i = + ad_utility::triple_component::Iri::fromIriref(iri.iri()); + return TripleComponent(i).toValueId(vocab, localVocab); + }, + [&variableColumns](const Variable& var) -> IdOrVariableIndex { + AD_CORRECTNESS_CHECK(variableColumns.contains(var)); + return variableColumns.at(var).columnIndex_; 
+ }}, + graph); + }; + auto transformSparqlTripleSimple = + [&transformSparqlTripleComponent, + &transformGraph](SparqlTripleSimpleWithGraph triple) { + return std::array{transformSparqlTripleComponent(std::move(triple.s_)), + transformSparqlTripleComponent(std::move(triple.p_)), + transformSparqlTripleComponent(std::move(triple.o_)), + transformGraph(std::move(triple.g_))}; + }; + return { + ad_utility::transform(std::move(triples), transformSparqlTripleSimple), + std::move(localVocab)}; +} + +// _____________________________________________________________________________ +std::optional ExecuteUpdate::resolveVariable(const IdTable& idTable, + const uint64_t& rowIdx, + IdOrVariableIndex idOrVar) { + auto visitId = [](const Id& id) { + return id.isUndefined() ? std::optional{} : id; + }; + return std::visit( + ad_utility::OverloadCallOperator{ + [&idTable, &rowIdx, &visitId](const ColumnIndex& columnInfo) { + return visitId(idTable(rowIdx, columnInfo)); + }, + visitId}, + idOrVar); +} + +// _____________________________________________________________________________ +void ExecuteUpdate::computeAndAddQuadsForResultRow( + const std::vector& templates, + std::vector>& result, const IdTable& idTable, + const uint64_t rowIdx) { + for (const auto& [s, p, o, g] : templates) { + auto subject = resolveVariable(idTable, rowIdx, s); + auto predicate = resolveVariable(idTable, rowIdx, p); + auto object = resolveVariable(idTable, rowIdx, o); + auto graph = resolveVariable(idTable, rowIdx, g); + + if (!subject.has_value() || !predicate.has_value() || !object.has_value() || + !graph.has_value()) { + continue; + } + result.emplace_back(std::array{*subject, *predicate, *object, *graph}); + } +} diff --git a/src/engine/ExecuteUpdate.h b/src/engine/ExecuteUpdate.h new file mode 100644 index 0000000000..729e65d51c --- /dev/null +++ b/src/engine/ExecuteUpdate.h @@ -0,0 +1,44 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Julian Mundhahs + +#pragma once + +#include + +#include "index/Index.h" +#include "parser/ParsedQuery.h" +#include "util/CancellationHandle.h" + +class ExecuteUpdate { + public: + using CancellationHandle = ad_utility::SharedCancellationHandle; + using IdOrVariableIndex = std::variant; + using TransformedTriple = std::array; + + private: + // Resolve all `TripleComponent`s and `Graph`s in a vector of + // `SparqlTripleSimpleWithGraph` into `Variable`s or `Id`s. + static std::pair, LocalVocab> + transformTriplesTemplate(const Index::Vocab& vocab, + const VariableToColumnMap& variableColumns, + std::vector&& triples); + FRIEND_TEST(ExecuteUpdate, transformTriplesTemplate); + + // Resolve a single `IdOrVariable` to an `Id` by looking up the value in the + // result row. The `Id`s will never be undefined. If (and only if) the input + // `Id` or the `Id` looked up in the `IdTable` is undefined then + // `std::nullopt` is returned. + static std::optional resolveVariable(const IdTable& idTable, + const uint64_t& rowIdx, + IdOrVariableIndex idOrVar); + FRIEND_TEST(ExecuteUpdate, resolveVariable); + + // Calculate and add the set of quads for the update that results from + // interpolating one result row into the template. The resulting `IdTriple`s + // consist of only `Id`s. + static void computeAndAddQuadsForResultRow( + const std::vector& templates, + std::vector>& result, const IdTable& idTable, uint64_t rowIdx); + FRIEND_TEST(ExecuteUpdate, computeAndAddQuadsForResultRow); +}; diff --git a/src/engine/ExportQueryExecutionTrees.h b/src/engine/ExportQueryExecutionTrees.h index d8a42b4d48..91a37b6d40 100644 --- a/src/engine/ExportQueryExecutionTrees.h +++ b/src/engine/ExportQueryExecutionTrees.h @@ -192,9 +192,11 @@ class ExportQueryExecutionTrees { // Return a range that contains the indices of the rows that have to be // exported from the `idTable` given the `LimitOffsetClause`. 
It takes into // account the LIMIT, the OFFSET, and the actual size of the `idTable` + public: static cppcoro::generator getRowIndices( LimitOffsetClause limitOffset, const Result& result); + private: FRIEND_TEST(ExportQueryExecutionTrees, getIdTablesReturnsSingletonIterator); FRIEND_TEST(ExportQueryExecutionTrees, getIdTablesMirrorsGenerator); FRIEND_TEST(ExportQueryExecutionTrees, ensureCorrectSlicingOfSingleIdTable); diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index cb9bf96a34..5a5085a23c 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -797,6 +797,9 @@ Awaitable Server::processQuery( auto qet = plannedQuery.queryExecutionTree_; if (plannedQuery.parsedQuery_.hasUpdateClause()) { + // This may be caused by a bug (the code is not yet tested well) or by an + // attack which tries to circumvent (not yet existing) access controls for + // Update. throw std::runtime_error("Expected Query but received Update."); } diff --git a/src/parser/data/Iri.h b/src/parser/data/Iri.h index 7b4cfea167..10685f4bb4 100644 --- a/src/parser/data/Iri.h +++ b/src/parser/data/Iri.h @@ -8,6 +8,7 @@ #include "parser/data/ConstructQueryExportContext.h" +// TODO: replace usages of this class with `ad_utility::triple_component::Iri` class Iri { std::string _string; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cd64d5c0b6..e9b2cf8347 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -416,3 +416,5 @@ addLinkAndDiscoverTest(SparqlExpressionGeneratorsTest engine) addLinkAndDiscoverTest(UrlParserTest) addLinkAndDiscoverTest(ServerTest engine) + +addLinkAndDiscoverTest(ExecuteUpdateTest engine) diff --git a/test/CachingMemoryResourceTest.cpp b/test/CachingMemoryResourceTest.cpp index 5d3a977a5a..d376e35516 100644 --- a/test/CachingMemoryResourceTest.cpp +++ b/test/CachingMemoryResourceTest.cpp @@ -35,6 +35,10 @@ TEST(CachingMemoryResource, allocateAndDeallocate) { ptr->deallocate(p12a, 1, 2); ptr->deallocate(p12b, 1, 2); + + // Reset the 
default resource to the default resource, such that subsequent + // unit test running in the same binary won't run into trouble. + std::pmr::set_default_resource(nullptr); } TEST(CachingMemoryResource, equality) { diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 8871a43c0e..e9e858d727 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -6,6 +6,7 @@ #include +#include "./DeltaTriplesTestHelpers.h" #include "./util/GTestHelpers.h" #include "./util/IndexTestHelpers.h" #include "absl/strings/str_split.h" @@ -15,41 +16,7 @@ #include "index/Permutation.h" #include "parser/RdfParser.h" -namespace { -// A matcher that applies `InnerMatcher` to all `LocatedTriplesPerBlock` of a -// `DeltaTriples`. -auto InAllPermutations = - [](testing::Matcher InnerMatcher) - -> testing::Matcher { - return testing::AllOfArray(ad_utility::transform( - Permutation::ALL, [&InnerMatcher](const Permutation::Enum& perm) { - return testing::ResultOf( - absl::StrCat(".getLocatedTriplesPerBlock(", - Permutation::toString(perm), ")"), - [perm](const DeltaTriples& deltaTriples) { - return deltaTriples.getLocatedTriplesPerBlock(perm); - }, - InnerMatcher); - })); -}; -// A matcher that checks `numTriples()` for all `LocatedTriplesPerBlock` of a -// `DeltaTriples`. -auto NumTriplesInAllPermutations = - [](size_t expectedNumTriples) -> testing::Matcher { - return InAllPermutations(AD_PROPERTY(LocatedTriplesPerBlock, numTriples, - testing::Eq(expectedNumTriples))); -}; -// A matcher that checks `numInserted()` and `numDeleted()` of a `DeltaTriples` -// and `numTriples()` for all `LocatedTriplesPerBlock` of the `DeltaTriples`. 
-auto NumTriples = - [](size_t inserted, size_t deleted, - size_t inAllPermutations) -> testing::Matcher { - return testing::AllOf( - AD_PROPERTY(DeltaTriples, numInserted, testing::Eq(inserted)), - AD_PROPERTY(DeltaTriples, numDeleted, testing::Eq(deleted)), - NumTriplesInAllPermutations(inAllPermutations)); -}; -} // namespace +using namespace deltaTriplesTestHelpers; // Fixture that sets up a test index. class DeltaTriplesTest : public ::testing::Test { diff --git a/test/DeltaTriplesTestHelpers.h b/test/DeltaTriplesTestHelpers.h new file mode 100644 index 0000000000..586a54196a --- /dev/null +++ b/test/DeltaTriplesTestHelpers.h @@ -0,0 +1,51 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures. +// Authors: +// 2024 Julian Mundhahs + +#include +#include + +#include "index/DeltaTriples.h" +#include "index/LocatedTriples.h" +#include "util/GTestHelpers.h" + +#pragma once + +namespace deltaTriplesTestHelpers { + +// A matcher that applies `InnerMatcher` to all `LocatedTriplesPerBlock` of a +// `DeltaTriples`. +inline auto InAllPermutations = + [](testing::Matcher InnerMatcher) + -> testing::Matcher { + return testing::AllOfArray(ad_utility::transform( + Permutation::ALL, [&InnerMatcher](const Permutation::Enum& perm) { + return testing::ResultOf( + absl::StrCat(".getLocatedTriplesPerBlock(", + Permutation::toString(perm), ")"), + [perm](const DeltaTriples& deltaTriples) { + return deltaTriples.getLocatedTriplesPerBlock(perm); + }, + InnerMatcher); + })); +}; +// A matcher that checks `numTriples()` for all `LocatedTriplesPerBlock` of a +// `DeltaTriples`. +inline auto NumTriplesInAllPermutations = + [](size_t expectedNumTriples) -> testing::Matcher { + return InAllPermutations(AD_PROPERTY(LocatedTriplesPerBlock, numTriples, + testing::Eq(expectedNumTriples))); +}; +// A matcher that checks `numInserted()` and `numDeleted()` of a `DeltaTriples` +// and `numTriples()` for all `LocatedTriplesPerBlock` of the `DeltaTriples`. 
+inline auto NumTriples = + [](size_t inserted, size_t deleted, + size_t inAllPermutations) -> testing::Matcher { + return testing::AllOf( + AD_PROPERTY(DeltaTriples, numInserted, testing::Eq(inserted)), + AD_PROPERTY(DeltaTriples, numDeleted, testing::Eq(deleted)), + NumTriplesInAllPermutations(inAllPermutations)); +}; + +} // namespace deltaTriplesTestHelpers diff --git a/test/ExecuteUpdateTest.cpp b/test/ExecuteUpdateTest.cpp new file mode 100644 index 0000000000..08c4ec284e --- /dev/null +++ b/test/ExecuteUpdateTest.cpp @@ -0,0 +1,221 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Julian Mundhahs (mundhahj@tf.uni-freiburg.de) + +#include +#include + +#include "DeltaTriplesTestHelpers.h" +#include "QueryPlannerTestHelpers.h" +#include "engine/ExecuteUpdate.h" +#include "index/IndexImpl.h" +#include "parser/sparqlParser/SparqlQleverVisitor.h" +#include "util/GTestHelpers.h" +#include "util/IdTableHelpers.h" +#include "util/IndexTestHelpers.h" + +using namespace deltaTriplesTestHelpers; + +auto V = [](const uint64_t index) { + return Id::makeFromVocabIndex(VocabIndex::make(index)); +}; + +// `ExecuteUpdate::IdOrVariableIndex` extended by `LiteralOrIri` which denotes +// an entry from the local vocab. +using TripleComponentT = + std::variant; + +// A matcher that never matches and outputs the given message. +MATCHER_P(AlwaysFalse, msg, "") { + (void)arg; // avoid compiler warning for unused value. + *result_listener << msg; + return false; +} + +// _____________________________________________________________________________ +TEST(ExecuteUpdate, transformTriplesTemplate) { + // Create an index for testing. + const auto qec = ad_utility::testing::getQec(" \"foo\""); + const Index& index = qec->getIndex(); + // We need a non-const vocab for the test. 
+ auto& vocab = const_cast(index.getVocab()); + + // Helpers + const auto Id = ad_utility::testing::makeGetId(index); + using Graph = SparqlTripleSimpleWithGraph::Graph; + using LocalVocab = ad_utility::triple_component::LiteralOrIri; + auto defaultGraphId = Id(std::string{DEFAULT_GRAPH_IRI}); + auto Iri = [](const std::string& iri) { + return ad_utility::triple_component::Iri::fromIriref(iri); + }; + auto Literal = [](const std::string& literal) { + return ad_utility::triple_component::Literal::fromStringRepresentation( + literal); + }; + // Matchers + using MatcherType = testing::Matcher; + auto TripleComponentMatcher = [](const ::LocalVocab& localVocab, + TripleComponentT component) -> MatcherType { + return std::visit( + ad_utility::OverloadCallOperator{ + [](const ::Id& id) -> MatcherType { + return testing::VariantWith<::Id>(testing::Eq(id)); + }, + [](const ColumnIndex& index) -> MatcherType { + return testing::VariantWith(testing::Eq(index)); + }, + [&localVocab]( + const ad_utility::triple_component::LiteralOrIri& literalOrIri) + -> MatcherType { + const auto lviOpt = localVocab.getIndexOrNullopt(literalOrIri); + if (!lviOpt) { + return AlwaysFalse( + absl::StrCat(literalOrIri.toStringRepresentation(), + " not in local vocab")); + } + const auto id = Id::makeFromLocalVocabIndex(lviOpt.value()); + return testing::VariantWith<::Id>( + AD_PROPERTY(Id, getBits, testing::Eq(id.getBits()))); + }}, + component); + }; + auto expectTransformTriplesTemplate = + [&vocab, &TripleComponentMatcher]( + const VariableToColumnMap& variableColumns, + std::vector&& triples, + const std::vector>& + expectedTransformedTriples) { + auto [transformedTriples, localVocab] = + ExecuteUpdate::transformTriplesTemplate(vocab, variableColumns, + std::move(triples)); + const auto transformedTriplesMatchers = ad_utility::transform( + expectedTransformedTriples, + [&localVocab, &TripleComponentMatcher](const auto& expectedTriple) { + return ElementsAre( + 
TripleComponentMatcher(localVocab, expectedTriple.at(0)), + TripleComponentMatcher(localVocab, expectedTriple.at(1)), + TripleComponentMatcher(localVocab, expectedTriple.at(2)), + TripleComponentMatcher(localVocab, expectedTriple.at(3))); + }); + EXPECT_THAT(transformedTriples, + testing::ElementsAreArray(transformedTriplesMatchers)); + }; + auto expectTransformTriplesTemplateFails = + [&vocab](const VariableToColumnMap& variableColumns, + std::vector&& triples, + const testing::Matcher& messageMatcher) { + AD_EXPECT_THROW_WITH_MESSAGE( + ExecuteUpdate::transformTriplesTemplate(vocab, variableColumns, + std::move(triples)), + messageMatcher); + }; + // Transforming an empty vector of template results in no `TransformedTriple`s + // and leaves the `LocalVocab` empty. + expectTransformTriplesTemplate({}, {}, {}); + // Resolve a `SparqlTripleSimpleWithGraph` without variables. + expectTransformTriplesTemplate( + {}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Literal("\"foo\""), Graph{}}}, + {{Id("\"foo\""), Id(""), Id("\"foo\""), defaultGraphId}}); + // Literals in the template that are not in the index are added to the + // `LocalVocab`. + expectTransformTriplesTemplate( + {}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Literal("\"foo\""), Graph{::Iri("")}}}, + {{Id("\"foo\""), Id(""), Id("\"foo\""), LocalVocab(Iri(""))}}); + // A variable in the template (`?f`) is not mapped in the + // `VariableToColumnMap`. + expectTransformTriplesTemplateFails( + {}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Variable("?f"), Graph{}}}, + testing::HasSubstr( + "Assertion `variableColumns.contains(component.getVariable())` " + "failed.")); + expectTransformTriplesTemplateFails( + {}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Literal("\"foo\""), Graph{Variable("?f")}}}, + testing::HasSubstr("Assertion `variableColumns.contains(var)` failed.")); + // Variables in the template are mapped to their column index. 
+ expectTransformTriplesTemplate( + {{Variable("?f"), {0, ColumnIndexAndTypeInfo::PossiblyUndefined}}}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Variable("?f"), Graph{}}}, + {{Id("\"foo\""), Id(""), 0UL, defaultGraphId}}); + expectTransformTriplesTemplate( + {{Variable("?f"), {0, ColumnIndexAndTypeInfo::PossiblyUndefined}}}, + {SparqlTripleSimpleWithGraph{Literal("\"foo\""), Iri(""), + Literal("\"foo\""), Graph{Variable("?f")}}}, + {{Id("\"foo\""), Id(""), Id("\"foo\""), 0UL}}); +} + +// _____________________________________________________________________________ +TEST(ExecuteUpdate, resolveVariable) { + const auto idTable = + makeIdTableFromVector({{V(0), V(1), V(2)}, + {V(3), V(4), V(5)}, + {V(6), Id::makeUndefined(), V(8)}}); + auto resolveVariable = + std::bind_front(&ExecuteUpdate::resolveVariable, std::cref(idTable)); + EXPECT_THAT(resolveVariable(0, V(10)), testing::Eq(V(10))); + EXPECT_THAT(resolveVariable(0, 1UL), testing::Eq(V(1))); + EXPECT_THAT(resolveVariable(1, 1UL), testing::Eq(V(4))); + EXPECT_THAT(resolveVariable(2, 1UL), testing::Eq(std::nullopt)); + EXPECT_THAT(resolveVariable(2, Id::makeUndefined()), + testing::Eq(std::nullopt)); +} + +// _____________________________________________________________________________ +TEST(ExecuteUpdate, computeAndAddQuadsForResultRow) { + const auto idTable = + makeIdTableFromVector({{V(0), V(1), V(2)}, + {V(3), V(4), V(5)}, + {V(6), Id::makeUndefined(), V(8)}}); + auto expectComputeQuads = + [](const std::vector& templates, + const IdTable& idTable, uint64_t rowIdx, + const testing::Matcher>&>& + expectedQuads) { + std::vector> result; + ExecuteUpdate::computeAndAddQuadsForResultRow(templates, result, + idTable, rowIdx); + EXPECT_THAT(result, expectedQuads); + }; + // Compute the quads for an empty template set yields no quads. + expectComputeQuads({}, idTable, 0, testing::IsEmpty()); + // Compute the quads for template without variables yields the templates + // unmodified. 
+ expectComputeQuads( + {{V(0), V(1), V(2), V(3)}}, idTable, 0, + testing::ElementsAreArray({IdTriple{{V(0), V(1), V(2), V(3)}}})); + expectComputeQuads( + {{V(0), V(1), V(2), V(3)}}, idTable, 1, + testing::ElementsAreArray({IdTriple{{V(0), V(1), V(2), V(3)}}})); + // The variables in templates are resolved to the value of the variable in the + // specified row of the result. + expectComputeQuads( + {{0UL, V(1), 1UL, V(3)}}, idTable, 0, + testing::ElementsAreArray({IdTriple{{V(0), V(1), V(1), V(3)}}})); + expectComputeQuads( + {{0UL, V(1), 1UL, V(3)}}, idTable, 1, + testing::ElementsAreArray({IdTriple{{V(3), V(1), V(4), V(3)}}})); + // Quads with undefined IDs cannot be stored and are not returned. + expectComputeQuads({{0UL, V(1), 1UL, V(3)}}, idTable, 2, testing::IsEmpty()); + expectComputeQuads({{V(0), V(1), Id::makeUndefined(), V(3)}}, idTable, 0, + testing::IsEmpty()); + // Some extra cases to cover all branches. + expectComputeQuads({{Id::makeUndefined(), V(1), V(2), V(3)}}, idTable, 0, + testing::IsEmpty()); + expectComputeQuads({{V(0), Id::makeUndefined(), V(2), V(3)}}, idTable, 0, + testing::IsEmpty()); + expectComputeQuads({{V(0), V(1), V(2), Id::makeUndefined()}}, idTable, 0, + testing::IsEmpty()); + // All the templates are evaluated for the specified row of the result. 
+ expectComputeQuads( + {{0UL, V(1), 1UL, V(3)}, {V(0), 1UL, 2UL, V(3)}}, idTable, 0, + testing::ElementsAreArray({IdTriple{{V(0), V(1), V(1), V(3)}}, + IdTriple{{V(0), V(1), V(2), V(3)}}})); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index 48abc6d342..5791627d9e 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -384,7 +384,8 @@ constexpr auto Union = MatchTypeAndOrderedChildren<::Union>; /// Parse the given SPARQL `query`, pass it to a `QueryPlanner` with empty /// execution context, and return the resulting `QueryExecutionTree` -QueryExecutionTree parseAndPlan(std::string query, QueryExecutionContext* qec) { +inline QueryExecutionTree parseAndPlan(std::string query, + QueryExecutionContext* qec) { ParsedQuery pq = SparqlParser::parseQuery(std::move(query)); // TODO make it impossible to pass `nullptr` here, properly mock a // queryExecutionContext. From bb70c4a8bd9bb9505ba809ba845b0bd7df0dc9b6 Mon Sep 17 00:00:00 2001 From: Julian <14220769+Qup42@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:40:08 +0100 Subject: [PATCH 08/12] Enable parsing of Updates (#1604) The SPARQL parser now parses all UPDATE requests. In practice, this means that the `SPARQL UPDATE is not yet supported by QLever` message is now emitted by the Server, and not anymore by the Parser, which means that we are one step closer to the support of SPARQL UPDATE.
--- .../sparqlParser/SparqlQleverVisitor.cpp | 22 +++---- test/SparqlAntlrParserTest.cpp | 61 ++++++++++--------- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index db5002eea6..c51d1cb1c9 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -211,21 +211,17 @@ ParsedQuery Visitor::visit(Parser::QueryContext* ctx) { // ____________________________________________________________________________________ ParsedQuery Visitor::visit(Parser::QueryOrUpdateContext* ctx) { - if (ctx->update()) { + if (ctx->update() && !ctx->update()->update1()) { // An empty query currently matches the `update()` rule. We handle this // case manually to get a better error message. If an update query doesn't // have an `update1()`, then it consists of a (possibly empty) prologue, but // has not actual content, see the grammar in `SparqlAutomatic.g4` for // details. - if (!ctx->update()->update1()) { - reportError(ctx->update(), - "Empty query (this includes queries that only consist " - "of comments or prefix declarations)."); - } - reportNotSupported(ctx->update(), "SPARQL 1.1 Update is"); - } else { - return visit(ctx->query()); + reportError(ctx->update(), + "Empty query (this includes queries that only consist " + "of comments or prefix declarations)."); } + return visitAlternative(ctx->query(), ctx->update()); } // ____________________________________________________________________________________ @@ -392,16 +388,20 @@ std::optional Visitor::visit(Parser::ValuesClauseContext* ctx) { // ____________________________________________________________________________________ ParsedQuery Visitor::visit(Parser::UpdateContext* ctx) { + // The prologue (BASE and PREFIX declarations) only affects the internal + // state of the visitor. 
visit(ctx->prologue()); - auto query = visit(ctx->update1()); + auto update = visit(ctx->update1()); if (ctx->update()) { parsedQuery_ = ParsedQuery{}; reportNotSupported(ctx->update(), "Multiple updates in one query are"); } - return query; + update._originalString = ctx->getStart()->getInputStream()->toString(); + + return update; } // ____________________________________________________________________________________ diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 8983a27a3f..4b93153522 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -1926,32 +1926,6 @@ TEST(SparqlParser, aggregateExpressions) { matchAggregate(true, V{"?x"}, separator(";"))); } -// Update queries are WIP. The individual parts to parse some update queries -// are in place the code to process them is still unfinished. Therefore we -// don't accept update queries. -TEST(SparqlParser, updateQueryUnsupported) { - auto expectUpdateFails = ExpectParseFails<&Parser::queryOrUpdate>{}; - auto contains = [](const std::string& s) { return ::testing::HasSubstr(s); }; - auto updateUnsupported = - contains("SPARQL 1.1 Update is currently not supported by QLever."); - - // Test all the cases because some functionality will be enabled shortly. 
- expectUpdateFails("INSERT DATA { }", updateUnsupported); - expectUpdateFails("DELETE DATA { }", updateUnsupported); - expectUpdateFails("DELETE { } WHERE { ?s ?p ?o }", - updateUnsupported); - expectUpdateFails("INSERT { } WHERE { ?s ?p ?o }", - updateUnsupported); - expectUpdateFails("DELETE WHERE { }", updateUnsupported); - expectUpdateFails("LOAD ", updateUnsupported); - expectUpdateFails("CLEAR GRAPH ", updateUnsupported); - expectUpdateFails("DROP GRAPH ", updateUnsupported); - expectUpdateFails("CREATE GRAPH ", updateUnsupported); - expectUpdateFails("ADD GRAPH TO DEFAULT", updateUnsupported); - expectUpdateFails("MOVE DEFAULT TO GRAPH ", updateUnsupported); - expectUpdateFails("COPY GRAPH TO GRAPH ", updateUnsupported); -} - TEST(SparqlParser, Quads) { auto expectQuads = ExpectCompleteParse<&Parser::quads>{defaultPrefixMap}; auto expectQuadsFails = ExpectParseFails<&Parser::quads>{}; @@ -2011,8 +1985,14 @@ TEST(SparqlParser, QuadData) { expectQuadDataFails("{ GRAPH ?foo { } }"); } -TEST(SparqlParser, UpdateQuery) { - auto expectUpdate = ExpectCompleteParse<&Parser::update>{defaultPrefixMap}; +TEST(SparqlParser, Update) { + auto expectUpdate_ = ExpectCompleteParse<&Parser::update>{defaultPrefixMap}; + // Automatically test all updates for their `_originalString`. + auto expectUpdate = [&expectUpdate_](const std::string& query, + auto&& expected) { + expectUpdate_(query, + testing::AllOf(expected, m::pq::OriginalString(query))); + }; auto expectUpdateFails = ExpectParseFails<&Parser::update>{}; auto Iri = [](std::string_view stringWithBrackets) { return TripleComponent::Iri::fromIriref(stringWithBrackets); @@ -2022,6 +2002,7 @@ TEST(SparqlParser, UpdateQuery) { }; auto noGraph = std::monostate{}; + // Test the parsing of the update clause in the ParsedQuery. 
expectUpdate( "INSERT DATA { }", m::UpdateClause( @@ -2144,13 +2125,35 @@ TEST(SparqlParser, UpdateQuery) { m::GraphPattern())); } -TEST(SparqlParser, EmptyQuery) { +TEST(SparqlParser, QueryOrUpdate) { + auto expectQuery = + ExpectCompleteParse<&Parser::queryOrUpdate>{defaultPrefixMap}; auto expectQueryFails = ExpectParseFails<&Parser::queryOrUpdate>{}; + auto Iri = [](std::string_view stringWithBrackets) { + return TripleComponent::Iri::fromIriref(stringWithBrackets); + }; + // Empty queries (queries without any query or update operation) are + // forbidden. auto emptyMatcher = ::testing::HasSubstr("Empty quer"); expectQueryFails("", emptyMatcher); expectQueryFails(" ", emptyMatcher); expectQueryFails("PREFIX ex: ", emptyMatcher); expectQueryFails("### Some comment \n \n #someMoreComments", emptyMatcher); + // Hit all paths for coverage. + expectQuery("SELECT ?a WHERE { ?a }", + AllOf(m::SelectQuery(m::Select({Var{"?a"}}), + m::GraphPattern(m::Triples( + {{Var{"?a"}, "", Iri("")}}))), + m::pq::OriginalString("SELECT ?a WHERE { ?a }"), + m::VisibleVariables({Var{"?a"}}))); + expectQuery( + "INSERT DATA { }", + AllOf(m::UpdateClause(m::GraphUpdate({}, + {{Iri(""), Iri(""), Iri(""), + std::monostate{}}}, + std::nullopt), + m::GraphPattern()), + m::pq::OriginalString("INSERT DATA { }"))); } TEST(SparqlParser, GraphOrDefault) { From 1ddf5e09cc1a7819385e35cb09276c808c464301 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 8 Nov 2024 14:21:21 +0100 Subject: [PATCH 09/12] Add `DeltaTriplesManager` (#1603) The already existing `DeltaTriples` class maintains a dynamically changing set of insertions and deletions relative to the original input data, together with a (single) local vocab. The class is not threadsafe and has to be used with care. In particular, concurrent update queries have to be serialized, and while a query makes use of the "delta triples", it has to be made sure that they are not changed over the course of the processing of that query. 
Both of these problems are solved by the new `DeltaTriplesManager` class. The index has a single object of this class. It maintains a single `DeltaTriples` object, write access to which is strictly serialized. Each new query gets a so-called *snapshot* of the current delta triples. This is a full copy (of the delta triples located in each of the permutations and of the local vocab). These snapshots are read-only and multiple queries can share the same snapshot. A snapshot lives as long as one query using it is still being processed. --- src/engine/CountAvailablePredicates.cpp | 7 +- src/engine/GroupBy.cpp | 10 +-- src/engine/HasPredicateScan.cpp | 13 +-- src/engine/IndexScan.cpp | 12 +-- src/engine/Operation.h | 4 +- src/engine/QueryExecutionContext.h | 25 +++--- src/engine/QueryExecutionTree.cpp | 9 +-- src/engine/QueryExecutionTree.h | 8 +- src/index/DeltaTriples.cpp | 62 +++++++++++--- src/index/DeltaTriples.h | 94 +++++++++++++++++----- src/index/Index.cpp | 49 +++++++----- src/index/Index.h | 36 +++++---- src/index/IndexImpl.cpp | 35 ++++---- src/index/IndexImpl.h | 33 +++++--- src/index/LocatedTriples.cpp | 16 ++++ src/index/LocatedTriples.h | 10 ++- src/index/Permutation.cpp | 58 ++++++++------ src/index/Permutation.h | 41 +++++----- test/DeltaTriplesTest.cpp | 102 ++++++++++++++++++++++++ test/DeltaTriplesTestHelpers.h | 2 +- test/IndexTest.cpp | 66 +++++++++------ test/util/IndexTestHelpers.cpp | 16 ++-- 22 files changed, 497 insertions(+), 211 deletions(-) diff --git a/src/engine/CountAvailablePredicates.cpp b/src/engine/CountAvailablePredicates.cpp index 095015d6a2..e78fcca694 100644 --- a/src/engine/CountAvailablePredicates.cpp +++ b/src/engine/CountAvailablePredicates.cpp @@ -165,9 +165,10 @@ void CountAvailablePredicates::computePatternTrickAllEntities( TripleComponent::Iri::fromIriref(HAS_PATTERN_PREDICATE), std::nullopt, std::nullopt} .toScanSpecification(index); - auto fullHasPattern = index.getPermutation(Permutation::Enum::PSO) - 
.lazyScan(scanSpec, std::nullopt, {}, - cancellationHandle_, deltaTriples()); + auto fullHasPattern = + index.getPermutation(Permutation::Enum::PSO) + .lazyScan(scanSpec, std::nullopt, {}, cancellationHandle_, + locatedTriplesSnapshot()); for (const auto& idTable : fullHasPattern) { for (const auto& patternId : idTable.getColumn(1)) { AD_CORRECTNESS_CHECK(patternId.getDatatype() == Datatype::Int); diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index cc00887845..e6ff853c48 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -665,7 +665,7 @@ std::optional GroupBy::computeGroupByObjectWithCount() const { getExecutionContext()->getIndex().getPimpl().getPermutation( indexScan->permutation()); auto result = permutation.getDistinctCol1IdsAndCounts( - col0Id.value(), cancellationHandle_, deltaTriples()); + col0Id.value(), cancellationHandle_, locatedTriplesSnapshot()); indexScan->updateRuntimeInformationWhenOptimizedOut( {}, RuntimeInformation::Status::optimizedOut); @@ -717,8 +717,8 @@ std::optional GroupBy::computeGroupByForFullIndexScan() const { const auto& permutation = getExecutionContext()->getIndex().getPimpl().getPermutation( permutationEnum.value()); - auto table = permutation.getDistinctCol0IdsAndCounts(cancellationHandle_, - deltaTriples()); + auto table = permutation.getDistinctCol0IdsAndCounts( + cancellationHandle_, locatedTriplesSnapshot()); if (numCounts == 0) { table.setColumnSubset({{0}}); } @@ -840,7 +840,7 @@ std::optional GroupBy::computeGroupByForJoinWithFullScan() const { Id currentId = subresult->idTable()(0, columnIndex); size_t currentCount = 0; size_t currentCardinality = - index.getCardinality(currentId, permutation, deltaTriples()); + index.getCardinality(currentId, permutation, locatedTriplesSnapshot()); auto pushRow = [&]() { // If the count is 0 this means that the element with the `currentId` @@ -863,7 +863,7 @@ std::optional GroupBy::computeGroupByForJoinWithFullScan() const { // without the internally added 
triples, but that is not easy to // retrieve right now. currentCardinality = - index.getCardinality(id, permutation, deltaTriples()); + index.getCardinality(id, permutation, locatedTriplesSnapshot()); } currentCount += currentCardinality; } diff --git a/src/engine/HasPredicateScan.cpp b/src/engine/HasPredicateScan.cpp index 5c494ab13c..b01ede635b 100644 --- a/src/engine/HasPredicateScan.cpp +++ b/src/engine/HasPredicateScan.cpp @@ -267,9 +267,10 @@ ProtoResult HasPredicateScan::computeResult( TripleComponent::Iri::fromIriref(HAS_PATTERN_PREDICATE), std::nullopt, std::nullopt} .toScanSpecification(index); - auto hasPattern = index.getPermutation(Permutation::Enum::PSO) - .lazyScan(scanSpec, std::nullopt, {}, - cancellationHandle_, deltaTriples()); + auto hasPattern = + index.getPermutation(Permutation::Enum::PSO) + .lazyScan(scanSpec, std::nullopt, {}, cancellationHandle_, + locatedTriplesSnapshot()); auto getId = [this](const TripleComponent tc) { std::optional id = tc.toValueId(getIndex().getVocab()); @@ -339,9 +340,9 @@ void HasPredicateScan::computeFreeO( TripleComponent::Iri::fromIriref(HAS_PATTERN_PREDICATE), subjectAsId, std::nullopt} .toScanSpecification(index); - auto hasPattern = - index.getPermutation(Permutation::Enum::PSO) - .scan(std::move(scanSpec), {}, cancellationHandle_, deltaTriples()); + auto hasPattern = index.getPermutation(Permutation::Enum::PSO) + .scan(std::move(scanSpec), {}, cancellationHandle_, + locatedTriplesSnapshot()); AD_CORRECTNESS_CHECK(hasPattern.numRows() <= 1); for (Id patternId : hasPattern.getColumn(0)) { const auto& pattern = patterns[patternId.getInt()]; diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 9217e91ac0..dc6781caef 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -162,7 +162,7 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) { const auto& index = _executionContext->getIndex(); idTable = index.scan(getScanSpecification(), permutation_, additionalColumns(), - 
cancellationHandle_, deltaTriples(), getLimit()); + cancellationHandle_, locatedTriplesSnapshot(), getLimit()); AD_CORRECTNESS_CHECK(idTable.numColumns() == getResultWidth()); LOG(DEBUG) << "IndexScan result computation done.\n"; checkCancellation(); @@ -174,7 +174,7 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) { size_t IndexScan::computeSizeEstimate() const { AD_CORRECTNESS_CHECK(_executionContext); return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_, - deltaTriples()); + locatedTriplesSnapshot()); } // _____________________________________________________________________________ @@ -195,7 +195,7 @@ void IndexScan::determineMultiplicities() { return {1.0f}; } else if (numVariables_ == 2) { return idx.getMultiplicities(*getPermutedTriple()[0], permutation_, - deltaTriples()); + locatedTriplesSnapshot()); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); return idx.getMultiplicities(permutation_); @@ -245,8 +245,8 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( .getImpl() .getPermutation(permutation()) .lazyScan(getScanSpecification(), std::move(actualBlocks), - additionalColumns(), cancellationHandle_, deltaTriples(), - getLimit()); + additionalColumns(), cancellationHandle_, + locatedTriplesSnapshot(), getLimit()); }; // ________________________________________________________________ @@ -254,7 +254,7 @@ std::optional IndexScan::getMetadataForScan() const { const auto& index = getExecutionContext()->getIndex().getImpl(); return index.getPermutation(permutation()) - .getMetadataAndBlocks(getScanSpecification(), deltaTriples()); + .getMetadataAndBlocks(getScanSpecification(), locatedTriplesSnapshot()); }; // ________________________________________________________________ diff --git a/src/engine/Operation.h b/src/engine/Operation.h index 61702c7766..6f95633b33 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -69,8 +69,8 @@ class Operation { const Index& getIndex() const { return 
_executionContext->getIndex(); } - const DeltaTriples& deltaTriples() const { - return _executionContext->deltaTriples(); + const auto& locatedTriplesSnapshot() const { + return _executionContext->locatedTriplesSnapshot(); } // Get a unique, not ambiguous string representation for a subtree. diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index 70657da59b..de7b5a4f6e 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -1,8 +1,7 @@ -// Copyright 2011, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: -// 2011-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) -// 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) +// Copyright 2011 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Björn Buchhold [2011 - 2017] +// Johannes Kalmbach [2017 - 2024] #pragma once @@ -92,7 +91,10 @@ class QueryExecutionContext { [[nodiscard]] const Index& getIndex() const { return _index; } - const DeltaTriples& deltaTriples() const { return *deltaTriples_; } + const LocatedTriplesSnapshot& locatedTriplesSnapshot() const { + AD_CORRECTNESS_CHECK(sharedLocatedTriplesSnapshot_ != nullptr); + return *sharedLocatedTriplesSnapshot_; + } void clearCacheUnpinnedOnly() { getQueryTreeCache().clearUnpinnedOnly(); } @@ -123,10 +125,13 @@ class QueryExecutionContext { private: const Index& _index; - // TODO This has to be stored externally once we properly support - // SPARQL UPDATE, currently it is just a stub to make the interface work. - std::shared_ptr deltaTriples_{ - std::make_shared(_index)}; + + // When the `QueryExecutionContext` is constructed, get a stable read-only + // snapshot of the current (located) delta triples. These can then be used + // by the respective query without interfering with further incoming + // update operations. 
+ SharedLocatedTriplesSnapshot sharedLocatedTriplesSnapshot_{ + _index.deltaTriplesManager().getCurrentSnapshot()}; QueryResultCache* const _subtreeCache; // allocators are copied but hold shared state ad_utility::AllocatorWithLimit _allocator; diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index aed58d4fde..2b2c393928 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -1,8 +1,7 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: -// 2015-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) -// 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) +// Copyright 2015 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Björn Buchhold [2015 - 2017] +// Johannes Kalmbach [2017 - 2024] #include "./QueryExecutionTree.h" diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 0519082b78..6a4b63c712 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -1,6 +1,8 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Copyright 2015 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Björn Buchhold +// Johannes Kalmbach + #pragma once #include diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index a07d342ec6..0d2ac3bac9 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -1,8 +1,8 @@ // Copyright 2023 - 2024, University of Freiburg -// Chair of Algorithms and Data Structures. 
-// Authors: -// 2023 Hannah Bast -// 2024 Julian Mundhahs +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast +// Julian Mundhahs +// Johannes Kalmbach #include "index/DeltaTriples.h" @@ -21,8 +21,7 @@ LocatedTriples::iterator& DeltaTriples::LocatedTripleHandles::forPermutation( void DeltaTriples::clear() { triplesInserted_.clear(); triplesDeleted_.clear(); - std::ranges::for_each(locatedTriplesPerBlock_, - &LocatedTriplesPerBlock::clear); + std::ranges::for_each(locatedTriples(), &LocatedTriplesPerBlock::clear); } // ____________________________________________________________________________ @@ -33,7 +32,7 @@ DeltaTriples::locateAndAddTriples(CancellationHandle cancellationHandle, std::array, Permutation::ALL.size()> intermediateHandles; for (auto permutation : Permutation::ALL) { - auto& perm = index_.getImpl().getPermutation(permutation); + auto& perm = index_.getPermutation(permutation); auto locatedTriples = LocatedTriple::locateTriplesInPermutation( // TODO: replace with `getAugmentedMetadata` once integration // is done @@ -41,7 +40,7 @@ DeltaTriples::locateAndAddTriples(CancellationHandle cancellationHandle, cancellationHandle); cancellationHandle->throwIfCancelled(); intermediateHandles[static_cast(permutation)] = - locatedTriplesPerBlock_[static_cast(permutation)].add( + this->locatedTriples()[static_cast(permutation)].add( locatedTriples); cancellationHandle->throwIfCancelled(); } @@ -60,8 +59,8 @@ void DeltaTriples::eraseTripleInAllPermutations(LocatedTripleHandles& handles) { // Erase for all permutations. 
for (auto permutation : Permutation::ALL) { auto ltIter = handles.forPermutation(permutation); - locatedTriplesPerBlock_[static_cast(permutation)].erase( - ltIter->blockIndex_, ltIter); + locatedTriples()[static_cast(permutation)].erase(ltIter->blockIndex_, + ltIter); } } @@ -172,7 +171,48 @@ void DeltaTriples::modifyTriplesImpl(CancellationHandle cancellationHandle, } // ____________________________________________________________________________ -const LocatedTriplesPerBlock& DeltaTriples::getLocatedTriplesPerBlock( +const LocatedTriplesPerBlock& +LocatedTriplesSnapshot::getLocatedTriplesForPermutation( Permutation::Enum permutation) const { return locatedTriplesPerBlock_[static_cast(permutation)]; } + +// ____________________________________________________________________________ +SharedLocatedTriplesSnapshot DeltaTriples::getSnapshot() const { + // NOTE: Both members of the `LocatedTriplesSnapshot` are copied, but the + // `localVocab_` has no copy constructor (in order to avoid accidental + // copies), hence the explicit `clone`. + return SharedLocatedTriplesSnapshot{std::make_shared( + locatedTriples(), localVocab_.clone())}; +} + +// ____________________________________________________________________________ +DeltaTriples::DeltaTriples(const Index& index) + : DeltaTriples(index.getImpl()) {} + +// ____________________________________________________________________________ +DeltaTriplesManager::DeltaTriplesManager(const IndexImpl& index) + : deltaTriples_{index}, + currentLocatedTriplesSnapshot_{deltaTriples_.rlock()->getSnapshot()} {} + +// _____________________________________________________________________________ +void DeltaTriplesManager::modify( + const std::function& function) { + // While holding the lock for the underlying `DeltaTriples`, perform the + // actual `function` (typically some combination of insert and delete + // operations) and (while still holding the lock) update the + // `currentLocatedTriplesSnapshot_`. 
+ deltaTriples_.withWriteLock([this, &function](DeltaTriples& deltaTriples) { + function(deltaTriples); + auto newSnapshot = deltaTriples.getSnapshot(); + currentLocatedTriplesSnapshot_.withWriteLock( + [&newSnapshot](auto& currentSnapshot) { + currentSnapshot = std::move(newSnapshot); + }); + }); +} + +// _____________________________________________________________________________ +SharedLocatedTriplesSnapshot DeltaTriplesManager::getCurrentSnapshot() const { + return *currentLocatedTriplesSnapshot_.rlock(); +} diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 05342a845b..afe13c7c07 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -1,8 +1,8 @@ // Copyright 2023 - 2024, University of Freiburg -// Chair of Algorithms and Data Structures. -// Authors: -// 2023 Hannah Bast -// 2024 Julian Mundhahs +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast +// Julian Mundhahs +// Johannes Kalmbach #pragma once @@ -12,6 +12,29 @@ #include "index/IndexBuilderTypes.h" #include "index/LocatedTriples.h" #include "index/Permutation.h" +#include "util/Synchronized.h" + +// Typedef for one `LocatedTriplesPerBlock` object for each of the six +// permutations. +using LocatedTriplesPerBlockAllPermutations = + std::array; + +// The locations of a set of delta triples (triples that were inserted or +// deleted since the index was built) in each of the six permutations, and a +// local vocab. This is all the information that is required to perform a query +// that correctly respects these delta triples, hence the name. +struct LocatedTriplesSnapshot { + LocatedTriplesPerBlockAllPermutations locatedTriplesPerBlock_; + LocalVocab localVocab_; + // Get `TripleWithPosition` objects for given permutation. 
+ const LocatedTriplesPerBlock& getLocatedTriplesForPermutation( + Permutation::Enum permutation) const; +}; + +// A shared pointer to a constant `LocatedTriplesSnapshot`, but as an explicit +// class, such that it can be forward-declared. +class SharedLocatedTriplesSnapshot + : public std::shared_ptr {}; // A class for maintaining triples that are inserted or deleted after index // building, we call these delta triples. How it works in principle: @@ -33,9 +56,16 @@ class DeltaTriples { FRIEND_TEST(DeltaTriplesTest, clear); FRIEND_TEST(DeltaTriplesTest, addTriplesToLocalVocab); + public: + using Triples = std::vector>; + using CancellationHandle = ad_utility::SharedCancellationHandle; + private: // The index to which these triples are added. - const Index& index_; + const IndexImpl& index_; + + // The located triples for all the 6 permutations. + LocatedTriplesPerBlockAllPermutations locatedTriples_; // The local vocabulary of the delta triples (they may have components, // which are not contained in the vocabulary of the original index). @@ -52,10 +82,6 @@ class DeltaTriples { static_assert(static_cast(Permutation::Enum::OSP) == 5); static_assert(Permutation::ALL.size() == 6); - // The positions of the delta triples in each of the six permutations. - std::array - locatedTriplesPerBlock_; - // Each delta triple needs to know where it is stored in each of the six // `LocatedTriplesPerBlock` above. struct LocatedTripleHandles { @@ -66,8 +92,6 @@ class DeltaTriples { }; using TriplesToHandlesMap = ad_utility::HashMap, LocatedTripleHandles>; - using Triples = std::vector>; - using CancellationHandle = ad_utility::SharedCancellationHandle; // The sets of triples added to and subtracted from the original index. Any // triple can be at most in one of the sets. The information whether a triple @@ -78,15 +102,26 @@ class DeltaTriples { public: // Construct for given index. 
- explicit DeltaTriples(const Index& index) : index_(index) {} + explicit DeltaTriples(const Index& index); + explicit DeltaTriples(const IndexImpl& index) : index_{index} {}; + + DeltaTriples(const DeltaTriples&) = delete; + DeltaTriples& operator=(const DeltaTriples&) = delete; // Get the common `LocalVocab` of the delta triples. private: LocalVocab& localVocab() { return localVocab_; } + auto& locatedTriples() { return locatedTriples_; } + const auto& locatedTriples() const { return locatedTriples_; } public: const LocalVocab& localVocab() const { return localVocab_; } + const LocatedTriplesPerBlock& getLocatedTriplesForPermutation( + Permutation::Enum permutation) const { + return locatedTriples_.at(static_cast(permutation)); + } + // Clear `triplesAdded_` and `triplesSubtracted_` and all associated data // structures. void clear(); @@ -101,9 +136,10 @@ class DeltaTriples { // Delete triples. void deleteTriples(CancellationHandle cancellationHandle, Triples triples); - // Get `TripleWithPosition` objects for given permutation. - const LocatedTriplesPerBlock& getLocatedTriplesPerBlock( - Permutation::Enum permutation) const; + // Return a deep copy of the `LocatedTriples` and the corresponding + // `LocalVocab` which form a snapshot of the current status of this + // `DeltaTriples` object. + SharedLocatedTriplesSnapshot getSnapshot() const; private: // Find the position of the given triple in the given permutation and add it @@ -144,7 +180,27 @@ class DeltaTriples { void eraseTripleInAllPermutations(LocatedTripleHandles& handles); }; -// DELTA TRIPLES AND THE CACHE -// -// Changes to the DeltaTriples invalidate all cache results that have an index -// scan in their subtree, which is almost all entries in practice. +// This class synchronizes the access to a `DeltaTriples` object, thus avoiding +// race conditions between concurrent updates and queries. 
+class DeltaTriplesManager { + ad_utility::Synchronized deltaTriples_; + ad_utility::Synchronized + currentLocatedTriplesSnapshot_; + + public: + using CancellationHandle = DeltaTriples::CancellationHandle; + using Triples = DeltaTriples::Triples; + + explicit DeltaTriplesManager(const IndexImpl& index); + FRIEND_TEST(DeltaTriplesTest, DeltaTriplesManager); + + // Modify the underlying `DeltaTriples` by applying `function` and then update + // the current snapshot. Concurrent calls to `modify` will be serialized, and + // each call to `getCurrentSnapshot` will either return the snapshot before or + // after a modification, but never one of an ongoing modification. + void modify(const std::function& function); + + // Return a shared pointer to a deep copy of the current snapshot. This can + // be safely used to execute a query without interfering with future updates. + SharedLocatedTriplesSnapshot getCurrentSnapshot() const; +}; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 86af95a798..47fcad9c82 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -57,15 +57,17 @@ ad_utility::BlankNodeManager* Index::getBlankNodeManager() const { } // ____________________________________________________________________________ -size_t Index::getCardinality(const TripleComponent& comp, Permutation::Enum p, - const DeltaTriples& deltaTriples) const { - return pimpl_->getCardinality(comp, p, deltaTriples); +size_t Index::getCardinality( + const TripleComponent& comp, Permutation::Enum p, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + return pimpl_->getCardinality(comp, p, locatedTriplesSnapshot); } // ____________________________________________________________________________ -size_t Index::getCardinality(Id id, Permutation::Enum p, - const DeltaTriples& deltaTriples) const { - return pimpl_->getCardinality(id, p, deltaTriples); +size_t Index::getCardinality( + Id id, Permutation::Enum p, + const LocatedTriplesSnapshot& 
locatedTriplesSnapshot) const { + return pimpl_->getCardinality(id, p, locatedTriplesSnapshot); } // ____________________________________________________________________________ @@ -254,10 +256,10 @@ vector Index::getMultiplicities(Permutation::Enum p) const { } // ____________________________________________________________________________ -vector Index::getMultiplicities(const TripleComponent& key, - Permutation::Enum p, - const DeltaTriples& deltaTriples) const { - return pimpl_->getMultiplicities(key, p, deltaTriples); +vector Index::getMultiplicities( + const TripleComponent& key, Permutation::Enum p, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + return pimpl_->getMultiplicities(key, p, locatedTriplesSnapshot); } // ____________________________________________________________________________ @@ -265,10 +267,10 @@ IdTable Index::scan( const ScanSpecificationAsTripleComponent& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { return pimpl_->scan(scanSpecification, p, additionalColumns, - cancellationHandle, deltaTriples, limitOffset); + cancellationHandle, locatedTriplesSnapshot, limitOffset); } // ____________________________________________________________________________ @@ -276,21 +278,32 @@ IdTable Index::scan( const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { return pimpl_->scan(scanSpecification, p, additionalColumns, - cancellationHandle, deltaTriples, limitOffset); + cancellationHandle, locatedTriplesSnapshot, limitOffset); } // 
____________________________________________________________________________ -size_t Index::getResultSizeOfScan(const ScanSpecification& scanSpecification, - const Permutation::Enum& permutation, - const DeltaTriples& deltaTriples) const { +size_t Index::getResultSizeOfScan( + const ScanSpecification& scanSpecification, + const Permutation::Enum& permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { return pimpl_->getResultSizeOfScan(scanSpecification, permutation, - deltaTriples); + locatedTriplesSnapshot); } // ____________________________________________________________________________ void Index::createFromFiles(const std::vector& files) { return pimpl_->createFromFiles(files); } + +// ____________________________________________________________________________ +const DeltaTriplesManager& Index::deltaTriplesManager() const { + return pimpl_->deltaTriplesManager(); +} + +// ____________________________________________________________________________ +DeltaTriplesManager& Index::deltaTriplesManager() { + return pimpl_->deltaTriplesManager(); +} diff --git a/src/index/Index.h b/src/index/Index.h index d42c5ced42..ec408f15df 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -1,4 +1,3 @@ -// Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: // 2014-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) @@ -23,7 +22,8 @@ class IdTable; class TextBlockMetaData; class IndexImpl; -class DeltaTriples; +struct LocatedTriplesSnapshot; +class DeltaTriplesManager; class Index { private: @@ -116,14 +116,19 @@ class Index { // Get a (non-owning) pointer to the BlankNodeManager of this Index. ad_utility::BlankNodeManager* getBlankNodeManager() const; + // Get a reference to the `DeltaTriplesManager` of this Index.
+ DeltaTriplesManager& deltaTriplesManager(); + const DeltaTriplesManager& deltaTriplesManager() const; + // -------------------------------------------------------------------------- // RDF RETRIEVAL // -------------------------------------------------------------------------- - [[nodiscard]] size_t getCardinality(const TripleComponent& comp, - Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const; - [[nodiscard]] size_t getCardinality(Id id, Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const; + [[nodiscard]] size_t getCardinality( + const TripleComponent& comp, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; + [[nodiscard]] size_t getCardinality( + Id id, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // TODO Once we have an overview over the folding this logic should // probably not be in the index class. @@ -217,9 +222,9 @@ class Index { bool hasAllPermutations() const; // _____________________________________________________________________________ - vector getMultiplicities(const TripleComponent& key, - Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const; + vector getMultiplicities( + const TripleComponent& key, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________ vector getMultiplicities(Permutation::Enum p) const; @@ -243,21 +248,22 @@ class Index { Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; // Similar to the overload of `scan` above, but the keys are specified as IDs. 
IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; // Similar to the previous overload of `scan`, but only get the exact size of // the scan result. - size_t getResultSizeOfScan(const ScanSpecification& scanSpecification, - const Permutation::Enum& permutation, - const DeltaTriples& deltaTriples) const; + size_t getResultSizeOfScan( + const ScanSpecification& scanSpecification, + const Permutation::Enum& permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // Get access to the implementation. This should be used rarely as it // requires including the rather expensive `IndexImpl.h` header diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 7f5e479f59..1e69d9676b 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -44,6 +44,7 @@ static constexpr size_t NUM_EXTERNAL_SORTERS_AT_SAME_TIME = 2u; IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator) : allocator_{std::move(allocator)} { globalSingletonIndex_ = this; + deltaTriples_.emplace(*this); }; // _____________________________________________________________________________ @@ -1445,10 +1446,11 @@ Index::NumNormalAndInternal IndexImpl::numDistinctCol0( } // ___________________________________________________________________________ -size_t IndexImpl::getCardinality(Id id, Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const { +size_t IndexImpl::getCardinality( + Id id, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { if (const auto& meta = - getPermutation(permutation).getMetadata(id, deltaTriples); + getPermutation(permutation).getMetadata(id, locatedTriplesSnapshot); meta.has_value()) { return 
meta.value().numRows_; } @@ -1456,9 +1458,9 @@ size_t IndexImpl::getCardinality(Id id, Permutation::Enum permutation, } // ___________________________________________________________________________ -size_t IndexImpl::getCardinality(const TripleComponent& comp, - Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const { +size_t IndexImpl::getCardinality( + const TripleComponent& comp, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { // TODO This special case is only relevant for the `PSO` and `POS` // permutations, but this internal predicate should never appear in subjects // or objects anyway. @@ -1468,7 +1470,7 @@ size_t IndexImpl::getCardinality(const TripleComponent& comp, return TEXT_PREDICATE_CARDINALITY_ESTIMATE; } if (std::optional relId = comp.toValueId(getVocab()); relId.has_value()) { - return getCardinality(relId.value(), permutation, deltaTriples); + return getCardinality(relId.value(), permutation, locatedTriplesSnapshot); } return 0; } @@ -1491,10 +1493,10 @@ Index::Vocab::PrefixRanges IndexImpl::prefixRanges( // _____________________________________________________________________________ vector IndexImpl::getMultiplicities( const TripleComponent& key, Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const { + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { if (auto keyId = key.toValueId(getVocab()); keyId.has_value()) { - auto meta = - getPermutation(permutation).getMetadata(keyId.value(), deltaTriples); + auto meta = getPermutation(permutation) + .getMetadata(keyId.value(), locatedTriplesSnapshot); if (meta.has_value()) { return {meta.value().getCol1Multiplicity(), meta.value().getCol2Multiplicity()}; @@ -1520,30 +1522,31 @@ IdTable IndexImpl::scan( const Permutation::Enum& permutation, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const 
LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { auto scanSpecification = scanSpecificationAsTc.toScanSpecification(*this); return scan(scanSpecification, permutation, additionalColumns, - cancellationHandle, deltaTriples, limitOffset); + cancellationHandle, locatedTriplesSnapshot, limitOffset); } // _____________________________________________________________________________ IdTable IndexImpl::scan( const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { return getPermutation(p).scan(scanSpecification, additionalColumns, - cancellationHandle, deltaTriples, limitOffset); + cancellationHandle, locatedTriplesSnapshot, + limitOffset); } // _____________________________________________________________________________ size_t IndexImpl::getResultSizeOfScan( const ScanSpecification& scanSpecification, const Permutation::Enum& permutation, - const DeltaTriples& deltaTriples) const { + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { return getPermutation(permutation) - .getResultSizeOfScan(scanSpecification, deltaTriples); + .getResultSizeOfScan(scanSpecification, locatedTriplesSnapshot); } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d62f4a7e13..0d5b396ccc 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -18,6 +18,7 @@ #include "global/SpecialIds.h" #include "index/CompressedRelation.h" #include "index/ConstantsIndexBuilding.h" +#include "index/DeltaTriples.h" #include "index/DocsDB.h" #include "index/Index.h" #include "index/IndexBuilderTypes.h" @@ -188,6 +189,8 @@ class IndexImpl { // BlankNodeManager, initialized during `readConfiguration` 
std::unique_ptr blankNodeManager_{nullptr}; + std::optional deltaTriples_; + public: explicit IndexImpl(ad_utility::AllocatorWithLimit allocator); @@ -261,6 +264,11 @@ class IndexImpl { ad_utility::BlankNodeManager* getBlankNodeManager() const; + DeltaTriplesManager& deltaTriplesManager() { return deltaTriples_.value(); } + const DeltaTriplesManager& deltaTriplesManager() const { + return deltaTriples_.value(); + } + // -------------------------------------------------------------------------- // -- RETRIEVAL --- // -------------------------------------------------------------------------- @@ -283,12 +291,12 @@ class IndexImpl { // ___________________________________________________________________________ size_t getCardinality(Id id, Permutation::Enum permutation, - const DeltaTriples&) const; + const LocatedTriplesSnapshot&) const; // ___________________________________________________________________________ - size_t getCardinality(const TripleComponent& comp, - Permutation::Enum permutation, - const DeltaTriples& deltaTriples) const; + size_t getCardinality( + const TripleComponent& comp, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________________ std::string indexToString(VocabIndex id) const; @@ -420,9 +428,9 @@ class IndexImpl { bool hasAllPermutations() const { return SPO().isLoaded(); } // _____________________________________________________________________________ - vector getMultiplicities(const TripleComponent& key, - Permutation::Enum permutation, - const DeltaTriples&) const; + vector getMultiplicities( + const TripleComponent& key, Permutation::Enum permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // ___________________________________________________________________ vector getMultiplicities(Permutation::Enum permutation) const; @@ -432,20 +440,21 @@ class IndexImpl { const Permutation::Enum& permutation, 
Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ - size_t getResultSizeOfScan(const ScanSpecification& scanSpecification, - const Permutation::Enum& permutation, - const DeltaTriples& deltaTriples) const; + size_t getResultSizeOfScan( + const ScanSpecification& scanSpecification, + const Permutation::Enum& permutation, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; private: // Private member functions diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 9bcad6838b..e898327a9a 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -283,3 +283,19 @@ std::ostream& operator<<(std::ostream& os, const std::vector>& v) { std::ranges::copy(v, std::ostream_iterator>(os, ", ")); return os; } + +// ____________________________________________________________________________ +bool LocatedTriplesPerBlock::containsTriple(const IdTriple<0>& triple, + bool shouldExist) const { + auto blockContains = [&triple, shouldExist](const LocatedTriples& lt, + size_t blockIndex) { + LocatedTriple locatedTriple{blockIndex, triple, shouldExist}; + locatedTriple.blockIndex_ = blockIndex; + return ad_utility::contains(lt, locatedTriple); + }; + + return std::ranges::any_of(map_, [&blockContains](auto& indexAndBlock) { + const auto& [index, 
block] = indexAndBlock; + return blockContains(block, index); + }); +} diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index c9d82d6745..c1b612a775 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -127,14 +127,14 @@ class LocatedTriplesPerBlock { IdTable mergeTriples(size_t blockIndex, const IdTable& block, size_t numIndexColumns, bool includeGraphColumn) const; - // Add `locatedTriples` to the `LocatedTriplesPerBlock`. + // Add `locatedTriples` to the `LocatedTriplesPerBlock`. // Return handles to where they were added (`LocatedTriples` is a sorted set, // see above). We need the handles so that we can easily remove the - // `locatedTriples` from the set again in case we need to. + // `locatedTriples` from the set again in case we need to. // // PRECONDITIONS: // - // 1. The `locatedTriples` must not already exist in + // 1. The `locatedTriples` must not already exist in // `LocatedTriplesPerBlock`. std::vector add( std::span locatedTriples); @@ -167,6 +167,10 @@ class LocatedTriplesPerBlock { augmentedMetadata_ = originalMetadata_; } + // Only used for testing. Return `true` iff a `LocatedTriple` with the given + // value for `shouldExist` is contained in any block. + bool containsTriple(const IdTriple<0>& triple, bool shouldExist) const; + // This operator is only for debugging and testing. It returns a // human-readable representation.
friend std::ostream& operator<<(std::ostream& os, diff --git a/src/index/Permutation.cpp b/src/index/Permutation.cpp index 16f5113d68..cbe4b5dd1f 100644 --- a/src/index/Permutation.cpp +++ b/src/index/Permutation.cpp @@ -55,7 +55,7 @@ void Permutation::loadFromDisk(const std::string& onDiskBase, IdTable Permutation::scan(const ScanSpecification& scanSpec, ColumnIndicesRef additionalColumns, const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { if (!isLoaded_) { throw std::runtime_error("This query requires the permutation " + @@ -64,35 +64,38 @@ IdTable Permutation::scan(const ScanSpecification& scanSpec, const auto& p = getActualPermutation(scanSpec); - return p.reader().scan(scanSpec, p.meta_.blockData(), additionalColumns, - cancellationHandle, locatedTriples(deltaTriples), - limitOffset); + return p.reader().scan( + scanSpec, p.meta_.blockData(), additionalColumns, cancellationHandle, + getLocatedTriplesForPermutation(locatedTriplesSnapshot), limitOffset); } // _____________________________________________________________________ size_t Permutation::getResultSizeOfScan( - const ScanSpecification& scanSpec, const DeltaTriples& deltaTriples) const { + const ScanSpecification& scanSpec, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { const auto& p = getActualPermutation(scanSpec); - return p.reader().getResultSizeOfScan(scanSpec, p.meta_.blockData(), - locatedTriples(deltaTriples)); + return p.reader().getResultSizeOfScan( + scanSpec, p.meta_.blockData(), + getLocatedTriplesForPermutation(locatedTriplesSnapshot)); } // ____________________________________________________________________________ IdTable Permutation::getDistinctCol1IdsAndCounts( Id col0Id, const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples) const { + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { const auto& p = 
getActualPermutation(col0Id); - return p.reader().getDistinctCol1IdsAndCounts(col0Id, p.meta_.blockData(), - cancellationHandle, - locatedTriples(deltaTriples)); + return p.reader().getDistinctCol1IdsAndCounts( + col0Id, p.meta_.blockData(), cancellationHandle, + getLocatedTriplesForPermutation(locatedTriplesSnapshot)); } // ____________________________________________________________________________ IdTable Permutation::getDistinctCol0IdsAndCounts( const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples) const { + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { return reader().getDistinctCol0IdsAndCounts( - meta_.blockData(), cancellationHandle, locatedTriples(deltaTriples)); + meta_.blockData(), cancellationHandle, + getLocatedTriplesForPermutation(locatedTriplesSnapshot)); } // _____________________________________________________________________ @@ -137,25 +140,27 @@ std::string_view Permutation::toString(Permutation::Enum permutation) { // _____________________________________________________________________ std::optional Permutation::getMetadata( - Id col0Id, const DeltaTriples& deltaTriples) const { + Id col0Id, const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { const auto& p = getActualPermutation(col0Id); if (p.meta_.col0IdExists(col0Id)) { return p.meta_.getMetaData(col0Id); } - return p.reader().getMetadataForSmallRelation(p.meta_.blockData(), col0Id, - locatedTriples(deltaTriples)); + return p.reader().getMetadataForSmallRelation( + p.meta_.blockData(), col0Id, + getLocatedTriplesForPermutation(locatedTriplesSnapshot)); } // _____________________________________________________________________ std::optional Permutation::getMetadataAndBlocks( - const ScanSpecification& scanSpec, const DeltaTriples& deltaTriples) const { + const ScanSpecification& scanSpec, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { const auto& p = getActualPermutation(scanSpec); 
CompressedRelationReader::ScanSpecAndBlocks mb{ scanSpec, CompressedRelationReader::getRelevantBlocks( scanSpec, p.meta_.blockData())}; - auto firstAndLastTriple = - p.reader().getFirstAndLastTriple(mb, locatedTriples(deltaTriples)); + auto firstAndLastTriple = p.reader().getFirstAndLastTriple( + mb, getLocatedTriplesForPermutation(locatedTriplesSnapshot)); if (!firstAndLastTriple.has_value()) { return std::nullopt; } @@ -169,7 +174,7 @@ Permutation::IdTableGenerator Permutation::lazyScan( std::optional> blocks, ColumnIndicesRef additionalColumns, ad_utility::SharedCancellationHandle cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset) const { const auto& p = getActualPermutation(scanSpec); if (!blocks.has_value()) { @@ -178,9 +183,10 @@ Permutation::IdTableGenerator Permutation::lazyScan( blocks = std::vector(blockSpan.begin(), blockSpan.end()); } ColumnIndices columns{additionalColumns.begin(), additionalColumns.end()}; - return p.reader().lazyScan(scanSpec, std::move(blocks.value()), - std::move(columns), std::move(cancellationHandle), - locatedTriples(deltaTriples), limitOffset); + return p.reader().lazyScan( + scanSpec, std::move(blocks.value()), std::move(columns), + std::move(cancellationHandle), + getLocatedTriplesForPermutation(locatedTriplesSnapshot), limitOffset); } // ______________________________________________________________________ @@ -210,7 +216,7 @@ const Permutation& Permutation::getActualPermutation(Id id) const { } // ______________________________________________________________________ -const LocatedTriplesPerBlock& Permutation::locatedTriples( - const DeltaTriples& deltaTriples) const { - return deltaTriples.getLocatedTriplesPerBlock(permutation_); +const LocatedTriplesPerBlock& Permutation::getLocatedTriplesForPermutation( + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + return 
locatedTriplesSnapshot.getLocatedTriplesForPermutation(permutation_); } diff --git a/src/index/Permutation.h b/src/index/Permutation.h index 93cad7e775..118153708b 100644 --- a/src/index/Permutation.h +++ b/src/index/Permutation.h @@ -1,6 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) +// Copyright 2018 - 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + #pragma once #include @@ -18,7 +19,8 @@ class IdTable; // Forward declaration of `LocatedTriplesPerBlock` class LocatedTriplesPerBlock; -class DeltaTriples; +class SharedLocatedTriplesSnapshot; +struct LocatedTriplesSnapshot; // Helper class to store static properties of the different permutations to // avoid code duplication. The first template parameter is a search functor for @@ -66,7 +68,7 @@ class Permutation { IdTable scan(const ScanSpecification& scanSpec, ColumnIndicesRef additionalColumns, const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; // For a given relation, determine the `col1Id`s and their counts. This is @@ -74,11 +76,11 @@ class Permutation { // in `meta_`. IdTable getDistinctCol1IdsAndCounts( Id col0Id, const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples) const; + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; IdTable getDistinctCol0IdsAndCounts( const CancellationHandle& cancellationHandle, - const DeltaTriples& deltaTriples) const; + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // Typedef to propagate the `MetadataAndblocks` and `IdTableGenerator` type. 
using MetadataAndBlocks = @@ -102,11 +104,11 @@ class Permutation { const ScanSpecification& scanSpec, std::optional> blocks, ColumnIndicesRef additionalColumns, CancellationHandle cancellationHandle, - const DeltaTriples& deltaTriples, + const LocatedTriplesSnapshot& locatedTriplesSnapshot, const LimitOffsetClause& limitOffset = {}) const; std::optional getMetadata( - Id col0Id, const DeltaTriples& deltaTriples) const; + Id col0Id, const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // Return the metadata for the scan specified by the `scanSpecification` // along with the metadata for all the blocks that are relevant for this scan. @@ -114,12 +116,13 @@ class Permutation { // empty) return `nullopt`. std::optional getMetadataAndBlocks( const ScanSpecification& scanSpec, - const DeltaTriples& deltaTriples) const; + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; /// Similar to the previous `scan` function, but only get the size of the /// result - size_t getResultSizeOfScan(const ScanSpecification& scanSpec, - const DeltaTriples& deltaTriples) const; + size_t getResultSizeOfScan( + const ScanSpecification& scanSpec, + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; // _______________________________________________________ void setKbName(const string& name) { meta_.setName(name); } @@ -146,19 +149,21 @@ class Permutation { const Permutation& getActualPermutation(const ScanSpecification& spec) const; const Permutation& getActualPermutation(Id id) const; - const LocatedTriplesPerBlock& locatedTriples(const DeltaTriples&) const; + // From the given snapshot, get the located triples for this permutation. + const LocatedTriplesPerBlock& getLocatedTriplesForPermutation( + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; const CompressedRelationReader& reader() const { return reader_.value(); } private: - // for Log output, e.g. "POS" + // Readable name for this permutation, e.g., `POS`. std::string readableName_; - // e.g. 
".pos" + // File name suffix for this permutation, e.g., `.pos`. std::string fileSuffix_; - // order of the 3 keys S(0), P(1), and O(2) for which this permutation is - // sorted, for example {1, 0, 2} for PSO. + // The order of the three components (S=0, P=1, O=2) in this permutation, + // e.g., `{1, 0, 2}` for `PSO`. array keyOrder_; - + // The metadata for this permutation. MetaData meta_; // This member is `optional` because we initialize it in a deferred way in the diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index e9e858d727..cb481b43b8 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -319,3 +319,105 @@ TEST_F(DeltaTriplesTest, rewriteLocalVocabEntriesAndBlankNodes) { auto s4 = triples[0].ids_[0]; EXPECT_EQ(s4.getBits(), blank0.getBits()); } + +// _____________________________________________________________________________ +TEST_F(DeltaTriplesTest, DeltaTriplesManager) { + // Preparation. + DeltaTriplesManager deltaTriplesManager(testQec->getIndex().getImpl()); + auto& vocab = testQec->getIndex().getVocab(); + auto cancellationHandle = + std::make_shared>(); + std::vector threads; + static constexpr size_t numThreads = 18; + static constexpr size_t numIterations = 21; + + // Insert and delete a well-defined set of triples, some independent and some + // dependent on the thread index. Check that the snapshot before in the + // middle of these updates is as expected. + auto insertAndDelete = [&](size_t threadIdx) { + LocalVocab localVocab; + SharedLocatedTriplesSnapshot beforeUpdate = + deltaTriplesManager.getCurrentSnapshot(); + for (size_t i = 0; i < numIterations; ++i) { + // The first triple in both vectors is the same for all threads, the + // others are exclusive to this thread via the `threadIdx`. 
+ auto triplesToInsert = makeIdTriples( + vocab, localVocab, + {" ", absl::StrCat(" "), + absl::StrCat(" ")}); + auto triplesToDelete = makeIdTriples( + vocab, localVocab, + {" ", absl::StrCat(" "), + absl::StrCat(" ")}); + // Insert the `triplesToInsert`. + deltaTriplesManager.modify([&](DeltaTriples& deltaTriples) { + deltaTriples.insertTriples(cancellationHandle, triplesToInsert); + }); + // We should have successfully completed an update, so the snapshot + // pointer should have changed. + EXPECT_NE(beforeUpdate, deltaTriplesManager.getCurrentSnapshot()); + // Delete the `triplesToDelete`. + deltaTriplesManager.modify([&](DeltaTriples& deltaTriples) { + deltaTriples.deleteTriples(cancellationHandle, triplesToDelete); + }); + + // Make some checks in the middle of these updates (while the other + // threads are likely to be in the middle of their updates as well). + if (i == numIterations / 2) { + { + // None of the thread-exclusive triples should be contained in the + // original snapshot and this should not change over time. The + // Boolean argument specifies whether the triple was inserted (`true`) + // or deleted (`false`). + const auto& locatedSPO = + beforeUpdate->getLocatedTriplesForPermutation(Permutation::SPO); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToInsert.at(1), true)); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToInsert.at(1), false)); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToInsert.at(2), true)); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToInsert.at(2), false)); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToDelete.at(2), true)); + EXPECT_FALSE(locatedSPO.containsTriple(triplesToDelete.at(2), false)); + } + { + // Check for several of the thread-exclusive triples that they are + // properly contained in the current snapshot. 
+ // + auto p = deltaTriplesManager.getCurrentSnapshot(); + const auto& locatedSPO = + p->getLocatedTriplesForPermutation(Permutation::SPO); + EXPECT_TRUE(locatedSPO.containsTriple(triplesToInsert.at(1), true)); + // This triple is exclusive to the thread and is inserted and then + // immediately deleted again. The `DeltaTriples` thus only store it as + // deleted. It might be contained in the original input, hence we + // cannot simply drop it. + EXPECT_TRUE(locatedSPO.containsTriple(triplesToInsert.at(2), false)); + EXPECT_TRUE(locatedSPO.containsTriple(triplesToDelete.at(2), false)); + } + } + } + }; + + // Run the above for each of `numThreads` threads, where each thread knows + // its index (used to create the thread-exclusive triples). + for (size_t i = 0; i < numThreads; ++i) { + threads.emplace_back(insertAndDelete, i); + } + threads.clear(); + + // Check that without updates, the snapshot pointer does not change. + auto p1 = deltaTriplesManager.getCurrentSnapshot(); + auto p2 = deltaTriplesManager.getCurrentSnapshot(); + EXPECT_EQ(p1, p2); + + // Each of the threads above inserts one thread-exclusive triple, deletes one + // thread-exclusive triple and inserts one thread-exclusive triple that is + // deleted right after (This triple is stored as deleted in the `DeltaTriples` + // because it might be contained in the original input). Additionally, there + // is one common triple inserted by all the threads and one common triple + // that is deleted by all the threads.
+ // + + auto deltaImpl = deltaTriplesManager.deltaTriples_.rlock(); + EXPECT_THAT(*deltaImpl, NumTriples(numThreads + 1, 2 * numThreads + 1, + 3 * numThreads + 2)); +} diff --git a/test/DeltaTriplesTestHelpers.h b/test/DeltaTriplesTestHelpers.h index 586a54196a..bf64175a17 100644 --- a/test/DeltaTriplesTestHelpers.h +++ b/test/DeltaTriplesTestHelpers.h @@ -25,7 +25,7 @@ inline auto InAllPermutations = absl::StrCat(".getLocatedTriplesPerBlock(", Permutation::toString(perm), ")"), [perm](const DeltaTriples& deltaTriples) { - return deltaTriples.getLocatedTriplesPerBlock(perm); + return deltaTriples.getLocatedTriplesForPermutation(perm); }, InnerMatcher); })); diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index b707b11111..6ce7f6f732 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -41,7 +41,7 @@ auto makeTestScanWidthOne = [](const IndexImpl& index, IdTable result = index.scan({c0, c1, std::nullopt}, permutation, additionalColumns, std::make_shared>(), - qec.deltaTriples()); + qec.locatedTriplesSnapshot()); ASSERT_EQ(result.numColumns(), 1 + additionalColumns.size()); ASSERT_EQ(result, makeIdTableFromVector(expected)); }; @@ -62,7 +62,7 @@ auto makeTestScanWidthTwo = [](const IndexImpl& index, index.scan({c0, std::nullopt, std::nullopt}, permutation, Permutation::ColumnIndicesRef{}, std::make_shared>(), - qec.deltaTriples()); + qec.locatedTriplesSnapshot()); ASSERT_EQ(wol, makeIdTableFromVector(expected)); }; }; @@ -92,7 +92,7 @@ TEST(IndexTest, createFromTurtleTest) { return; } const auto& [index, qec] = getIndex(); - const auto& deltaTriples = qec.deltaTriples(); + const auto& locatedTriplesSnapshot = qec.locatedTriplesSnapshot(); auto getId = makeGetId(getQec(kb)->getIndex()); Id a = getId(""); @@ -103,33 +103,49 @@ TEST(IndexTest, createFromTurtleTest) { Id c2 = getId(""); // TODO We could also test the multiplicities here. 
- ASSERT_TRUE(index.PSO().getMetadata(b, deltaTriples).has_value()); - ASSERT_TRUE(index.PSO().getMetadata(b2, deltaTriples).has_value()); - ASSERT_FALSE(index.PSO().getMetadata(a2, deltaTriples).has_value()); - ASSERT_FALSE(index.PSO().getMetadata(c, deltaTriples).has_value()); + ASSERT_TRUE( + index.PSO().getMetadata(b, locatedTriplesSnapshot).has_value()); + ASSERT_TRUE( + index.PSO().getMetadata(b2, locatedTriplesSnapshot).has_value()); + ASSERT_FALSE( + index.PSO().getMetadata(a2, locatedTriplesSnapshot).has_value()); + ASSERT_FALSE( + index.PSO().getMetadata(c, locatedTriplesSnapshot).has_value()); ASSERT_FALSE( index.PSO() .getMetadata(Id::makeFromVocabIndex(VocabIndex::make(735)), - deltaTriples) + locatedTriplesSnapshot) .has_value()); - ASSERT_FALSE( - index.PSO().getMetadata(b, deltaTriples).value().isFunctional()); - ASSERT_TRUE( - index.PSO().getMetadata(b2, deltaTriples).value().isFunctional()); + ASSERT_FALSE(index.PSO() + .getMetadata(b, locatedTriplesSnapshot) + .value() + .isFunctional()); + ASSERT_TRUE(index.PSO() + .getMetadata(b2, locatedTriplesSnapshot) + .value() + .isFunctional()); - ASSERT_TRUE(index.POS().getMetadata(b, deltaTriples).has_value()); - ASSERT_TRUE(index.POS().getMetadata(b2, deltaTriples).has_value()); - ASSERT_FALSE(index.POS().getMetadata(a2, deltaTriples).has_value()); - ASSERT_FALSE(index.POS().getMetadata(c, deltaTriples).has_value()); + ASSERT_TRUE( + index.POS().getMetadata(b, locatedTriplesSnapshot).has_value()); + ASSERT_TRUE( + index.POS().getMetadata(b2, locatedTriplesSnapshot).has_value()); + ASSERT_FALSE( + index.POS().getMetadata(a2, locatedTriplesSnapshot).has_value()); + ASSERT_FALSE( + index.POS().getMetadata(c, locatedTriplesSnapshot).has_value()); ASSERT_FALSE( index.POS() .getMetadata(Id::makeFromVocabIndex(VocabIndex::make(735)), - deltaTriples) + locatedTriplesSnapshot) .has_value()); - ASSERT_TRUE( - index.POS().getMetadata(b, deltaTriples).value().isFunctional()); - ASSERT_TRUE( - 
index.POS().getMetadata(b2, deltaTriples).value().isFunctional()); + ASSERT_TRUE(index.POS() + .getMetadata(b, locatedTriplesSnapshot) + .value() + .isFunctional()); + ASSERT_TRUE(index.POS() + .getMetadata(b2, locatedTriplesSnapshot) + .value() + .isFunctional()); // Relation b // Pair index @@ -167,7 +183,7 @@ TEST(IndexTest, createFromTurtleTest) { const auto& qec = *getQec(kb); const IndexImpl& index = qec.getIndex().getImpl(); - const auto& deltaTriples = qec.deltaTriples(); + const auto& deltaTriples = qec.locatedTriplesSnapshot(); auto getId = makeGetId(getQec(kb)->getIndex()); Id zero = getId("<0>"); @@ -224,7 +240,7 @@ TEST(IndexTest, createFromOnDiskIndexTest) { " ."; const auto& qec = *getQec(kb); const IndexImpl& index = qec.getIndex().getImpl(); - const auto& deltaTriples = qec.deltaTriples(); + const auto& deltaTriples = qec.locatedTriplesSnapshot(); auto getId = makeGetId(getQec(kb)->getIndex()); Id b = getId(""); @@ -465,8 +481,8 @@ TEST(IndexTest, NumDistinctEntities) { EXPECT_FLOAT_EQ(multiplicities[1], 7.0 / 2.0); EXPECT_FLOAT_EQ(multiplicities[2], 7.0 / 7.0); - multiplicities = - index.getMultiplicities(iri(""), Permutation::SPO, qec.deltaTriples()); + multiplicities = index.getMultiplicities(iri(""), Permutation::SPO, + qec.locatedTriplesSnapshot()); EXPECT_FLOAT_EQ(multiplicities[0], 2.5); EXPECT_FLOAT_EQ(multiplicities[1], 1); } diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index ca8d1f09ee..79eb77b0d5 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -55,17 +55,19 @@ namespace { // folded into the permutations as additional columns. 
void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( const Index& index) { - DeltaTriples deltaTriples(index); + DeltaTriplesManager deltaTriplesManager(index.getImpl()); + auto sharedLocatedTriplesSnapshot = deltaTriplesManager.getCurrentSnapshot(); + const auto& locatedTriplesSnapshot = *sharedLocatedTriplesSnapshot; static constexpr size_t col0IdTag = 43; auto cancellationDummy = std::make_shared>(); auto iriOfHasPattern = TripleComponent::Iri::fromIriref(HAS_PATTERN_PREDICATE); auto checkSingleElement = [&cancellationDummy, &iriOfHasPattern, - &deltaTriples](const Index& index, - size_t patternIdx, Id id) { + &locatedTriplesSnapshot]( + const Index& index, size_t patternIdx, Id id) { auto scanResultHasPattern = index.scan( ScanSpecificationAsTripleComponent{iriOfHasPattern, id, std::nullopt}, - Permutation::Enum::PSO, {}, cancellationDummy, deltaTriples); + Permutation::Enum::PSO, {}, cancellationDummy, locatedTriplesSnapshot); // Each ID has at most one pattern, it can have none if it doesn't // appear as a subject in the knowledge graph. 
AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); @@ -86,7 +88,7 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( ScanSpecification{col0Id, std::nullopt, std::nullopt}, permutation, std::array{ColumnIndex{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN}, ColumnIndex{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}}, - cancellationDummy, deltaTriples); + cancellationDummy, locatedTriplesSnapshot); ASSERT_EQ(scanResult.numColumns(), 4u); for (const auto& row : scanResult) { auto patternIdx = row[2].getInt(); @@ -112,12 +114,12 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto cancellationHandle = std::make_shared>(); auto predicates = index.getImpl().PSO().getDistinctCol0IdsAndCounts( - cancellationHandle, deltaTriples); + cancellationHandle, locatedTriplesSnapshot); for (const auto& predicate : predicates.getColumn(0)) { checkConsistencyForPredicate(predicate); } auto objects = index.getImpl().OSP().getDistinctCol0IdsAndCounts( - cancellationHandle, deltaTriples); + cancellationHandle, locatedTriplesSnapshot); for (const auto& object : objects.getColumn(0)) { checkConsistencyForObject(object); } From 50eda6235af3b3025689af12f13df851800e00f5 Mon Sep 17 00:00:00 2001 From: Julian <14220769+Qup42@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:22:14 +0100 Subject: [PATCH 10/12] Implement a function that executes an UPDATE request (#1607) This is another step towards support for `SPARQL UPDATE`. The function takes a `ParsedQuery` that contains a graph update (an UPDATE request that inserts and/or deletes triples that are computed from a WHERE clause to/from a given graph), executes the query, and passes the result to a `DeltaTriples` object. 
--- src/engine/ExecuteUpdate.cpp | 73 ++++++++++++ src/engine/ExecuteUpdate.h | 20 ++++ test/ExecuteUpdateTest.cpp | 225 ++++++++++++++++++++++++++++------- 3 files changed, 277 insertions(+), 41 deletions(-) diff --git a/src/engine/ExecuteUpdate.cpp b/src/engine/ExecuteUpdate.cpp index ef27c6a8d4..55564de978 100644 --- a/src/engine/ExecuteUpdate.cpp +++ b/src/engine/ExecuteUpdate.cpp @@ -6,6 +6,21 @@ #include "engine/ExportQueryExecutionTrees.h" +// _____________________________________________________________________________ +void ExecuteUpdate::executeUpdate( + const Index& index, const ParsedQuery& query, const QueryExecutionTree& qet, + DeltaTriples& deltaTriples, const CancellationHandle& cancellationHandle) { + auto [toInsert, toDelete] = + computeGraphUpdateQuads(index, query, qet, cancellationHandle); + + // "The deletion of the triples happens before the insertion." (SPARQL 1.1 + // Update 3.1.3) + deltaTriples.deleteTriples(cancellationHandle, + std::move(toDelete.idTriples_)); + deltaTriples.insertTriples(cancellationHandle, + std::move(toInsert.idTriples_)); +} + // _____________________________________________________________________________ std::pair, LocalVocab> ExecuteUpdate::transformTriplesTemplate( @@ -99,3 +114,61 @@ void ExecuteUpdate::computeAndAddQuadsForResultRow( result.emplace_back(std::array{*subject, *predicate, *object, *graph}); } } + +// _____________________________________________________________________________ +std::pair +ExecuteUpdate::computeGraphUpdateQuads( + const Index& index, const ParsedQuery& query, const QueryExecutionTree& qet, + const CancellationHandle& cancellationHandle) { + AD_CONTRACT_CHECK(query.hasUpdateClause()); + auto updateClause = query.updateClause(); + if (!std::holds_alternative(updateClause.op_)) { + throw std::runtime_error( + "Only INSERT/DELETE update operations are currently supported."); + } + auto graphUpdate = std::get(updateClause.op_); + // Fully materialize the result for now. 
This makes it easier to execute the + // update. + auto res = qet.getResult(false); + + const auto& vocab = index.getVocab(); + + auto prepareTemplateAndResultContainer = + [&vocab, &qet, + &res](std::vector&& tripleTemplates) { + auto [transformedTripleTemplates, localVocab] = + transformTriplesTemplate(vocab, qet.getVariableColumns(), + std::move(tripleTemplates)); + std::vector> updateTriples; + // The maximum result size is size(query result) x num template rows. + // The actual result can be smaller if there are template rows with + // variables for which a result row does not have a value. + updateTriples.reserve(res->idTable().size() * + transformedTripleTemplates.size()); + + return std::tuple{std::move(transformedTripleTemplates), + std::move(updateTriples), std::move(localVocab)}; + }; + + auto [toInsertTemplates, toInsert, localVocabInsert] = + prepareTemplateAndResultContainer(std::move(graphUpdate.toInsert_)); + auto [toDeleteTemplates, toDelete, localVocabDelete] = + prepareTemplateAndResultContainer(std::move(graphUpdate.toDelete_)); + + for (const auto& [pair, range] : + ExportQueryExecutionTrees::getRowIndices(query._limitOffset, *res)) { + auto& idTable = pair.idTable_; + for (const uint64_t i : range) { + computeAndAddQuadsForResultRow(toInsertTemplates, toInsert, idTable, i); + cancellationHandle->throwIfCancelled(); + + computeAndAddQuadsForResultRow(toDeleteTemplates, toDelete, idTable, i); + cancellationHandle->throwIfCancelled(); + } + } + + return { + IdTriplesAndLocalVocab{std::move(toInsert), std::move(localVocabInsert)}, + IdTriplesAndLocalVocab{std::move(toDelete), std::move(localVocabDelete)}}; +} diff --git a/src/engine/ExecuteUpdate.h b/src/engine/ExecuteUpdate.h index 729e65d51c..3cf686ed14 100644 --- a/src/engine/ExecuteUpdate.h +++ b/src/engine/ExecuteUpdate.h @@ -16,6 +16,13 @@ class ExecuteUpdate { using IdOrVariableIndex = std::variant; using TransformedTriple = std::array; + // Execute an update. 
This function is comparable to + // `ExportQueryExecutionTrees::computeResult` for queries. + static void executeUpdate(const Index& index, const ParsedQuery& query, + const QueryExecutionTree& qet, + DeltaTriples& deltaTriples, + const CancellationHandle& cancellationHandle); + private: // Resolve all `TripleComponent`s and `Graph`s in a vector of // `SparqlTripleSimpleWithGraph` into `Variable`s or `Id`s. @@ -41,4 +48,17 @@ class ExecuteUpdate { const std::vector& templates, std::vector>& result, const IdTable& idTable, uint64_t rowIdx); FRIEND_TEST(ExecuteUpdate, computeAndAddQuadsForResultRow); + + struct IdTriplesAndLocalVocab { + std::vector> idTriples_; + LocalVocab localVocab_; + }; + // Compute the set of quads to insert and delete for the given update. The + // ParsedQuery's clause must be an UpdateClause. The UpdateClause's operation + // must be a GraphUpdate. + static std::pair + computeGraphUpdateQuads(const Index& index, const ParsedQuery& query, + const QueryExecutionTree& qet, + const CancellationHandle& cancellationHandle); + FRIEND_TEST(ExecuteUpdate, computeGraphUpdateQuads); }; diff --git a/test/ExecuteUpdateTest.cpp b/test/ExecuteUpdateTest.cpp index 08c4ec284e..5367d8ec7d 100644 --- a/test/ExecuteUpdateTest.cpp +++ b/test/ExecuteUpdateTest.cpp @@ -32,6 +32,153 @@ MATCHER_P(AlwaysFalse, msg, "") { return false; } +// _____________________________________________________________________________ +TEST(ExecuteUpdate, executeUpdate) { + auto executeUpdate = [](const std::string& update) { + // These tests run on the default dataset defined in + // `IndexTestHelpers::makeTestIndex`. 
+ QueryExecutionContext* qec = ad_utility::testing::getQec(std::nullopt); + const Index& index = qec->getIndex(); + DeltaTriples deltaTriples{index}; + const auto sharedHandle = + std::make_shared>(); + const std::vector datasets = {}; + auto pq = SparqlParser::parseQuery(update); + QueryPlanner qp{qec, sharedHandle}; + const auto qet = qp.createExecutionTree(pq); + ExecuteUpdate::executeUpdate(index, pq, qet, deltaTriples, sharedHandle); + return deltaTriples; + }; + auto expectExecuteUpdate = + [&executeUpdate]( + const std::string& update, + const testing::Matcher& deltaTriplesMatcher) { + EXPECT_THAT(executeUpdate(update), deltaTriplesMatcher); + }; + auto expectExecuteUpdateFails = + [&executeUpdate]( + const std::string& update, + const testing::Matcher& messageMatcher) { + AD_EXPECT_THROW_WITH_MESSAGE(executeUpdate(update), messageMatcher); + }; + expectExecuteUpdate("INSERT DATA {

. }", NumTriples(1, 0, 1)); + expectExecuteUpdate("DELETE DATA { } WHERE { ?s ?o }", + NumTriples(1, 2, 3)); + expectExecuteUpdate( + "DELETE { } INSERT { } WHERE { ?s ?o }", + NumTriples(1, 0, 1)); + expectExecuteUpdate( + "DELETE { ?s ?o } INSERT { ?s ?o } WHERE { ?s ?o }", + NumTriples(2, 0, 2)); + expectExecuteUpdate("DELETE WHERE { ?s ?p ?o }", NumTriples(0, 8, 8)); + expectExecuteUpdateFails( + "SELECT * WHERE { ?s ?p ?o }", + testing::HasSubstr("Assertion `query.hasUpdateClause()` failed.")); + expectExecuteUpdateFails( + "CLEAR DEFAULT", + testing::HasSubstr( + "Only INSERT/DELETE update operations are currently supported.")); +} + +// _____________________________________________________________________________ +TEST(ExecuteUpdate, computeGraphUpdateQuads) { + // These tests run on the default dataset defined in + // `IndexTestHelpers::makeTestIndex`. + QueryExecutionContext* qec = ad_utility::testing::getQec(std::nullopt); + const Index& index = qec->getIndex(); + const auto Id = ad_utility::testing::makeGetId(index); + auto defaultGraphId = Id(std::string{DEFAULT_GRAPH_IRI}); + + using namespace ::testing; + LocalVocab localVocab; + auto LVI = [&localVocab](const std::string& iri) { + return Id::makeFromLocalVocabIndex(localVocab.getIndexAndAddIfNotContained( + LocalVocabEntry(ad_utility::triple_component::Iri::fromIriref(iri)))); + }; + + auto IdTriple = [defaultGraphId](const ::Id s, const ::Id p, const ::Id o, + const std::optional<::Id> graph = + std::nullopt) -> ::IdTriple<> { + return ::IdTriple({s, p, o, graph.value_or(defaultGraphId)}); + }; + + auto executeComputeGraphUpdateQuads = [&qec, + &index](const std::string& update) { + const auto sharedHandle = + std::make_shared>(); + const std::vector datasets = {}; + auto pq = SparqlParser::parseQuery(update); + QueryPlanner qp{qec, sharedHandle}; + const auto qet = qp.createExecutionTree(pq); + return ExecuteUpdate::computeGraphUpdateQuads(index, pq, qet, sharedHandle); + }; + auto 
expectComputeGraphUpdateQuads = + [&executeComputeGraphUpdateQuads]( + const std::string& update, + const Matcher>&>& toInsertMatcher, + const Matcher>&>& toDeleteMatcher) { + EXPECT_THAT(executeComputeGraphUpdateQuads(update), + Pair(AD_FIELD(ExecuteUpdate::IdTriplesAndLocalVocab, + idTriples_, toInsertMatcher), + AD_FIELD(ExecuteUpdate::IdTriplesAndLocalVocab, + idTriples_, toDeleteMatcher))); + }; + auto expectComputeGraphUpdateQuadsFails = + [&executeComputeGraphUpdateQuads]( + const std::string& update, + const Matcher& messageMatcher) { + AD_EXPECT_THROW_WITH_MESSAGE(executeComputeGraphUpdateQuads(update), + messageMatcher); + }; + + expectComputeGraphUpdateQuads( + "INSERT DATA {

. }", + ElementsAreArray({IdTriple(LVI(""), LVI("

"), LVI(""))}), + IsEmpty()); + expectComputeGraphUpdateQuads( + "DELETE DATA {

} WHERE { ?s ?o }", + ElementsAreArray({IdTriple(LVI(""), LVI("

"), LVI("")), + IdTriple(LVI(""), LVI("

"), LVI(""))}), + ElementsAreArray({IdTriple(Id(""), Id(""), Id("")), + IdTriple(Id(""), Id(""), Id(""))})); + expectComputeGraphUpdateQuads( + "DELETE {

} INSERT {

} WHERE { ?s ?o }", + ElementsAreArray({IdTriple(LVI(""), LVI("

"), LVI("")), + IdTriple(LVI(""), LVI("

"), LVI(""))}), + ElementsAreArray({IdTriple(LVI(""), LVI("

"), LVI("")), + IdTriple(LVI(""), LVI("

"), LVI(""))})); + expectComputeGraphUpdateQuads( + "DELETE { ?s ?o } INSERT { ?s ?o } WHERE { ?s ?o }", + ElementsAreArray({IdTriple(Id(""), Id(""), Id("")), + IdTriple(Id(""), Id(""), Id(""))}), + ElementsAreArray({IdTriple(Id(""), Id(""), Id("")), + IdTriple(Id(""), Id(""), Id(""))})); + expectComputeGraphUpdateQuads( + "DELETE WHERE { ?s ?p ?o }", IsEmpty(), + UnorderedElementsAreArray( + {IdTriple(Id(""), Id("

. }", NumTriples(1, 0, 1)); expectExecuteUpdate("DELETE DATA {

\t-42019234865781\n" "\t

\t42\n" @@ -402,7 +431,7 @@ TEST(ExportQueryExecutionTrees, Bool) { runSelectQueryTestCase(testCase); TestCaseConstructQuery testCaseConstruct{ - kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 2, + kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 2, 2, // TSV "\t

\tfalse\n" "\t

\ttrue\n", @@ -444,10 +473,10 @@ TEST(ExportQueryExecutionTrees, UnusedVariable) { makeExpectedSparqlJSON({}), expectedXml}; runSelectQueryTestCase(testCase); - // If we use a variable that is always unbound in a CONSTRUCT triple, then - // the result for this triple will be empty. + // The `2` is the number of results including triples with UNDEF values. The + // `0` is the number of results excluding such triples. TestCaseConstructQuery testCaseConstruct{ - kg, "CONSTRUCT {?x ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 0, + kg, "CONSTRUCT {?x ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 2, 0, // TSV "", // CSV @@ -502,7 +531,7 @@ TEST(ExportQueryExecutionTrees, Floats) { runSelectQueryTestCase(testCaseFloat); TestCaseConstructQuery testCaseConstruct{ - kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 3, + kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 3, 3, // TSV "\t

\t-42019234865780982022144\n" "\t

\t4.01293e-12\n" @@ -559,6 +588,7 @@ TEST(ExportQueryExecutionTrees, Dates) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\"1950-01-01T00:00:00\"^^\n", // missing @@ -648,6 +678,7 @@ TEST(ExportQueryExecutionTrees, Entities) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\n", // CSV @@ -696,6 +727,7 @@ TEST(ExportQueryExecutionTrees, LiteralWithLanguageTag) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\"Some\"Where Over,\"@en-ca\n", // CSV @@ -744,6 +776,7 @@ TEST(ExportQueryExecutionTrees, LiteralWithDatatype) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\"something\"^^\n", // CSV @@ -791,6 +824,7 @@ TEST(ExportQueryExecutionTrees, LiteralPlain) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\"something\"\n", // CSV @@ -836,6 +870,7 @@ testIriKg kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\n", // CSV @@ -899,6 +934,7 @@ TEST(ExportQueryExecutionTrees, TestWithIriExtendedEscaped) { kg, "CONSTRUCT {?s ?p ?o} WHERE {?s ?p ?o} ORDER BY ?o", 1, + 1, // TSV "\t

\t\t

\t\" hallo\\n welt\"\n", // CSV @@ -993,12 +1030,13 @@ TEST(ExportQueryExecutionTrees, UndefinedValues) { expectedXml}; runSelectQueryTestCase(testCase); - // In CONSTRUCT queries, results with undefined values in the exported - // variables are filtered out, so the result is empty. + // The `1` is the number of results including triples with UNDEF values. The + // `0` is the number of results excluding such triples. TestCaseConstructQuery testCaseConstruct{ kg, "CONSTRUCT {?s ?o} WHERE {?s

OPTIONAL {?s ?o}} ORDER " "BY ?o", + 1, 0, "", "", @@ -1338,12 +1376,14 @@ TEST(ExportQueryExecutionTrees, ensureCorrectSlicingOfSingleIdTable) { }(); Result result{std::move(tableGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 1, ._offset = 1}, result); + LimitOffsetClause{._limit = 1, ._offset = 1}, result, resultSizeTotal); - auto referenceTable = makeIdTableFromVector({{2}}); + auto expectedResult = makeIdTableFromVector({{2}}); EXPECT_THAT(convertToVector(std::move(generator)), - matchesIdTables(referenceTable)); + matchesIdTables(expectedResult)); + EXPECT_EQ(resultSizeTotal, 1); } // _____________________________________________________________________________ @@ -1360,13 +1400,16 @@ TEST(ExportQueryExecutionTrees, }(); Result result{std::move(tableGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = std::nullopt, ._offset = 3}, result); + LimitOffsetClause{._limit = std::nullopt, ._offset = 3}, result, + resultSizeTotal); - auto referenceTable1 = makeIdTableFromVector({{4}, {5}}); + auto expectedResult = makeIdTableFromVector({{4}, {5}}); EXPECT_THAT(convertToVector(std::move(generator)), - matchesIdTables(referenceTable1)); + matchesIdTables(expectedResult)); + EXPECT_EQ(resultSizeTotal, 2); } // _____________________________________________________________________________ @@ -1383,13 +1426,15 @@ TEST(ExportQueryExecutionTrees, }(); Result result{std::move(tableGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 3}, result); + LimitOffsetClause{._limit = 3}, result, resultSizeTotal); - auto referenceTable1 = makeIdTableFromVector({{1}, {2}, {3}}); + auto expectedResult = makeIdTableFromVector({{1}, {2}, {3}}); EXPECT_THAT(convertToVector(std::move(generator)), - matchesIdTables(referenceTable1)); + 
matchesIdTables(expectedResult)); + EXPECT_EQ(resultSizeTotal, 3); } // _____________________________________________________________________________ @@ -1406,14 +1451,16 @@ TEST(ExportQueryExecutionTrees, }(); Result result{std::move(tableGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 3, ._offset = 1}, result); + LimitOffsetClause{._limit = 3, ._offset = 1}, result, resultSizeTotal); - auto referenceTable1 = makeIdTableFromVector({{2}, {3}}); - auto referenceTable2 = makeIdTableFromVector({{4}}); + auto expectedResult1 = makeIdTableFromVector({{2}, {3}}); + auto expectedResult2 = makeIdTableFromVector({{4}}); EXPECT_THAT(convertToVector(std::move(generator)), - matchesIdTables(referenceTable1, referenceTable2)); + matchesIdTables(expectedResult1, expectedResult2)); + EXPECT_EQ(resultSizeTotal, 3); } // _____________________________________________________________________________ @@ -1434,30 +1481,33 @@ TEST(ExportQueryExecutionTrees, }(); Result result{std::move(tableGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 5, ._offset = 2}, result); + LimitOffsetClause{._limit = 5, ._offset = 2}, result, resultSizeTotal); - auto referenceTable1 = makeIdTableFromVector({{3}}); - auto referenceTable2 = makeIdTableFromVector({{4}, {5}}); - auto referenceTable3 = makeIdTableFromVector({{6}, {7}}); + auto expectedTable1 = makeIdTableFromVector({{3}}); + auto expectedTable2 = makeIdTableFromVector({{4}, {5}}); + auto expectedTable3 = makeIdTableFromVector({{6}, {7}}); - EXPECT_THAT( - convertToVector(std::move(generator)), - matchesIdTables(referenceTable1, referenceTable2, referenceTable3)); + EXPECT_THAT(convertToVector(std::move(generator)), + matchesIdTables(expectedTable1, expectedTable2, expectedTable3)); + EXPECT_EQ(resultSizeTotal, 5); } // 
_____________________________________________________________________________ TEST(ExportQueryExecutionTrees, ensureGeneratorIsNotConsumedWhenNotRequired) { { auto throwingGenerator = []() -> Result::Generator { - ADD_FAILURE() << "Generator was started" << std::endl; - throw std::runtime_error("Generator was started"); + std::string message = "Generator was started, but should not have been"; + ADD_FAILURE() << message << std::endl; + throw std::runtime_error(message); co_return; }(); Result result{std::move(throwingGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 0, ._offset = 0}, result); + LimitOffsetClause{._limit = 0, ._offset = 0}, result, resultSizeTotal); EXPECT_NO_THROW(convertToVector(std::move(generator))); } @@ -1467,17 +1517,22 @@ TEST(ExportQueryExecutionTrees, ensureGeneratorIsNotConsumedWhenNotRequired) { LocalVocab{}}; co_yield pair1; - ADD_FAILURE() << "Generator was resumed" << std::endl; - throw std::runtime_error("Generator was resumed"); + std::string message = + "Generator was called a second time, but should not " + "have been"; + ADD_FAILURE() << message << std::endl; + throw std::runtime_error(message); }(); Result result{std::move(throwAfterYieldGenerator), {}}; + uint64_t resultSizeTotal = 0; auto generator = ExportQueryExecutionTrees::getRowIndices( - LimitOffsetClause{._limit = 1, ._offset = 0}, result); - IdTable referenceTable1 = makeIdTableFromVector({{1}}); + LimitOffsetClause{._limit = 1, ._offset = 0}, result, resultSizeTotal); + IdTable expectedTable = makeIdTableFromVector({{1}}); std::vector tables; EXPECT_NO_THROW({ tables = convertToVector(std::move(generator)); }); - EXPECT_THAT(tables, matchesIdTables(referenceTable1)); + EXPECT_THAT(tables, matchesIdTables(expectedTable)); + EXPECT_EQ(resultSizeTotal, 1); } }