diff --git a/src/engine/Describe.cpp b/src/engine/Describe.cpp index 02e2ceabf9..0d8418b686 100644 --- a/src/engine/Describe.cpp +++ b/src/engine/Describe.cpp @@ -9,6 +9,7 @@ #include "engine/Join.h" #include "engine/Values.h" +// _____________________________________________________________________________ Describe::Describe(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Describe describe) @@ -18,11 +19,14 @@ Describe::Describe(QueryExecutionContext* qec, AD_CORRECTNESS_CHECK(subtree_ != nullptr); } +// _____________________________________________________________________________ std::vector Describe::getChildren() { return {subtree_.get()}; } +// _____________________________________________________________________________ string Describe::getCacheKeyImpl() const { + // The cache key must repesent the `resources_` as well as the `subtree_`. std::string resourceKey; for (const auto& resource : describe_.resources_) { if (std::holds_alternative(resource)) { @@ -39,13 +43,14 @@ string Describe::getCacheKeyImpl() const { return absl::StrCat("DESCRIBE ", subtree_->getCacheKey(), resourceKey); } +// _____________________________________________________________________________ string Describe::getDescriptor() const { return "DESCRIBE"; } +// _____________________________________________________________________________ size_t Describe::getResultWidth() const { return 3; } -// As DESCRIBE is never part of the query planning, we can return dummy values -// for the following functions. - +// As DESCRIBE is never part of the query planning (it is always the root +// operation), we can return dummy values for the following functions. size_t Describe::getCostEstimate() { return 2 * subtree_->getCostEstimate(); } uint64_t Describe::getSizeEstimateBeforeLimit() { @@ -56,8 +61,14 @@ float Describe::getMultiplicity([[maybe_unused]] size_t col) { return 1.0f; } bool Describe::knownEmptyResult() { return false; } +// The result cannot easily be sorted, as it involves recursive expanding of +// graphs. vector Describe::resultSortedOn() const { return {}; } +// The result always consists of three hardcoded variables `?subject`, +// `?predicate`, `?object`. Note: The variable names must be in sync with the +// implicit CONSTRUCT query created by the parser (see +// `SparqlQleverVisitor::visitDescribe`) for details. VariableToColumnMap Describe::computeVariableToColumnMap() const { using V = Variable; auto col = makeAlwaysDefinedColumn; @@ -66,6 +77,12 @@ VariableToColumnMap Describe::computeVariableToColumnMap() const { {V("?object"), col(2)}}; } +// A helper function for the recursive BFS. Return the subset of `input` (as an +// `IdTable` with one column) that fulfills the following properties: +// 1. The ID is a blank node +// 2. The ID is not part of `alreadySeen`. +// The returned IDs are then also added to `alreadySeen`. The result contains no +// duplicates. static IdTable getNewBlankNodes( const auto& allocator, ad_utility::HashSetWithMemoryLimit& alreadySeen, std::span input) { @@ -88,41 +105,50 @@ static IdTable getNewBlankNodes( return result; } -// TODO Comment. +// _____________________________________________________________________________ void Describe::recursivelyAddBlankNodes( IdTable& finalResult, ad_utility::HashSetWithMemoryLimit& alreadySeen, IdTable blankNodes) { AD_CORRECTNESS_CHECK(blankNodes.numColumns() == 1); + + // Stop condition for the recursion, no new start nodes found. if (blankNodes.empty()) { return; } + + // Set up a join between the `blankNodes` and the full index. using V = Variable; - SparqlTripleSimple triple{V{"?subject"}, V{"?predicate"}, V{"?object"}}; + auto subjectVar = V{"?subject"}; + SparqlTripleSimple triple{subjectVar, V{"?predicate"}, V{"?object"}}; auto indexScan = ad_utility::makeExecutionTree( getExecutionContext(), Permutation::SPO, triple); auto valuesOp = ad_utility::makeExecutionTree( getExecutionContext(), std::move(blankNodes), - std::vector>{V{"?subject"}}); + std::vector>{subjectVar}); + + auto joinColValues = valuesOp->getVariableColumn(subjectVar); + auto joinColScan = indexScan->getVariableColumn(subjectVar); - // TODO It might be, that the Column index is definitely not 0, we - // have to store this separately. auto join = ad_utility::makeExecutionTree( - getExecutionContext(), valuesOp, std::move(indexScan), 0, 0); + getExecutionContext(), std::move(valuesOp), std::move(indexScan), + joinColValues, joinColScan); auto result = join->getResult(); - // TODO A lot of those things are inefficient, lets get it working - // first. + // TODO As soon as the join is lazy, we can compute the + // result lazy, and therefore avoid the copy via `clone` of the IdTable. auto table = result->idTable().clone(); - finalResult.reserve(finalResult.size() + table.size()); - auto s = join->getVariableColumn(V{"?subject"}); - auto p = join->getVariableColumn(V{"?predicate"}); - auto o = join->getVariableColumn(V{"?object"}); + CI s = join->getVariableColumn(V{"?subject"}); + CI p = join->getVariableColumn(V{"?predicate"}); + CI o = join->getVariableColumn(V{"?object"}); table.setColumnSubset(std::vector{s, p, o}); + // TODO Make the result of DESCRIBE lazy, then we avoid the + // additional Copying here. finalResult.insertAtEnd(table); + // Compute the set of newly found blank nodes and recurse. + // Note: The stop condition is at the beginning of the recursive call. auto newBlankNodes = getNewBlankNodes(allocator(), alreadySeen, table.getColumn(2)); - // recurse recursivelyAddBlankNodes(finalResult, alreadySeen, std::move(newBlankNodes)); } @@ -133,39 +159,49 @@ ProtoResult Describe::computeResult([[maybe_unused]] bool requestLaziness) { if (std::holds_alternative(resource)) { valuesToDescribe.push_back(std::get(resource)); } else { + // TODO Implement this, it should be fairly simple. throw std::runtime_error("DESCRIBE with a variable is not yet supported"); } } parsedQuery::SparqlValues values; using V = Variable; - values._variables.push_back(V{"?subject"}); + Variable subjectVar = V{"?subject"}; + values._variables.push_back(subjectVar); + + // Set up and execute a JOIN between the described IRIs and the full index. + // TODO There is a lot of code duplication in the following block + // with the recursive BFS on the blank nodes, factor that out. for (const auto& v : valuesToDescribe) { values._values.push_back(std::vector{v}); } - SparqlTripleSimple triple{V{"?subject"}, V{"?predicate"}, V{"?object"}}; + SparqlTripleSimple triple{subjectVar, V{"?predicate"}, V{"?object"}}; auto indexScan = ad_utility::makeExecutionTree( getExecutionContext(), Permutation::SPO, triple); auto valuesOp = ad_utility::makeExecutionTree(getExecutionContext(), values); + auto joinColValues = valuesOp->getVariableColumn(subjectVar); + auto joinColScan = indexScan->getVariableColumn(subjectVar); - // TODO It might be, that the Column index is definitely not 0, we - // have to store this separately. // TODO We have to (here, as well as in the recursion) also respect // the GRAPHS by which the result is filtered ( + add unit tests for that // case). auto join = ad_utility::makeExecutionTree( - getExecutionContext(), valuesOp, std::move(indexScan), 0, 0); + getExecutionContext(), std::move(valuesOp), std::move(indexScan), + joinColValues, joinColScan); + // TODO The following code (which extracts the required columns and + // writes the initial triples) is also duplicated, factor it out. auto result = join->getResult(); IdTable resultTable = result->idTable().clone(); - auto s = join->getVariableColumn(V{"?subject"}); - auto p = join->getVariableColumn(V{"?predicate"}); - auto o = join->getVariableColumn(V{"?object"}); + using CI = ColumnIndex; + CI s = join->getVariableColumn(V{"?subject"}); + CI p = join->getVariableColumn(V{"?predicate"}); + CI o = join->getVariableColumn(V{"?object"}); resultTable.setColumnSubset(std::vector{s, p, o}); - // Recursively follow blank nodes + // Recursively follow all blank nodes. ad_utility::HashSetWithMemoryLimit alreadySeen{allocator()}; auto blankNodes = getNewBlankNodes(allocator(), alreadySeen, resultTable.getColumn(2)); diff --git a/src/engine/Describe.h b/src/engine/Describe.h index d2f4ac40d8..b497dfd95e 100644 --- a/src/engine/Describe.h +++ b/src/engine/Describe.h @@ -7,12 +7,20 @@ #include "engine/Operation.h" #include "parser/GraphPatternOperation.h" +// An `Operation` which implements SPARQL DESCRIBE queries according to the +// Concise Bounded Description (CBD, see +// https://www.w3.org/submissions/2005/SUBM-CBD-20050603/) class Describe : public Operation { private: + // The subtree that computes the WHERE clause of the DESCRIBE query. std::shared_ptr subtree_; + + // The specification of the DESCRIBE clause. parsedQuery::Describe describe_; public: + // Constructor. For details see the documentation of the member variables + // above. Describe(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Describe describe); @@ -20,14 +28,14 @@ class Describe : public Operation { // Getter for testing. const auto& getDescribe() const { return describe_; } + // The following functions are all overridden from the base class `Operation`, + // see there for documentation. std::vector getChildren() override; - string getCacheKeyImpl() const override; public: string getDescriptor() const override; size_t getResultWidth() const override; - size_t getCostEstimate() override; private: @@ -40,11 +48,18 @@ class Describe : public Operation { private: [[nodiscard]] vector resultSortedOn() const override; ProtoResult computeResult(bool requestLaziness) override; - // Compute the variable to column index mapping. Is used internally by - // `getInternallyVisibleVariableColumns`. VariableToColumnMap computeVariableToColumnMap() const override; - // TODO Comment. + // Add all triples, where the subject is one of the `blankNodes` to the + // `finalResult`. `blankNodes` is a table with one column. Recursively + // continue for all newly found blank nodes (objects of the newly found + // triples, which are not contained in `alreadySeen`). This is a recursive + // implementation of breadth-first-search (BFS) where `blankNodes` is the set + // of start nodes, and `alreadySeen` is the set of nodes which have already + // been explored, which is needed to handle cycles in the graph. The explored + // graph is `all triples currently stored by QLever`. + // TODO We have to extend this by the information of the allowed + // graphs. void recursivelyAddBlankNodes( IdTable& finalResult, ad_utility::HashSetWithMemoryLimit& alreadySeen, IdTable blankNodes); diff --git a/src/parser/DatasetClauses.cpp b/src/parser/DatasetClauses.cpp index 553e2bdcba..4cf9b5526a 100644 --- a/src/parser/DatasetClauses.cpp +++ b/src/parser/DatasetClauses.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures // Author: Johannes Kalmbach -#include "DatasetClauses.h" +#include "parser/DatasetClauses.h" // _____________________________________________________________________________ parsedQuery::DatasetClauses parsedQuery::DatasetClauses::fromClauses( diff --git a/src/parser/GraphPatternOperation.h b/src/parser/GraphPatternOperation.h index a4ae51e0cb..973293f7c7 100644 --- a/src/parser/GraphPatternOperation.h +++ b/src/parser/GraphPatternOperation.h @@ -132,11 +132,16 @@ class Subquery { const ParsedQuery& get() const; }; -// A SPARQL `DESCRIBE` construct. +// A SPARQL `DESCRIBE` query. struct Describe { using VarOrIri = std::variant; + // The resources (variables or IRIs) that are to be described, for example + // `?x` and `` in `DESCRIBE ?x `. std::vector resources_; + // The FROM clauses of the DESCRIBE query DatasetClauses datasetClauses_; + // The WHERE clause of the describe query. It is used to compute the values + // for variables that are to be described. Subquery whereClause_; }; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 7af6a83247..0b589f64cc 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -279,45 +279,71 @@ ParsedQuery Visitor::visit(Parser::ConstructQueryContext* ctx) { // ____________________________________________________________________________________ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { - auto clause = parsedQuery::Describe{}; - auto specs = visitVector(ctx->varOrIri()); - if (specs.empty()) { - reportNotSupported(ctx, "DESCRIBE * is"); - } + auto describeClause = parsedQuery::Describe{}; + auto describedResources = visitVector(ctx->varOrIri()); std::vector describedVariables; - for (GraphTerm& spec : specs) { - if (std::holds_alternative(spec)) { - const auto& variable = std::get(spec); - clause.resources_.push_back(variable); - describedVariables.push_back(variable); - } else if (std::holds_alternative(spec)) { - auto iri = - TripleComponent::Iri::fromIriref(std::get(spec).toSparql()); - clause.resources_.push_back(std::move(iri)); - } + // Convert the describe resources (variables or IRIs) from the format that the + // parser delivers to the one that the `Describe` struct expects. + + auto addDescribedVariable = [&describeClause, + &describedVariables](const Variable& variable) { + describeClause.resources_.push_back(variable); + describedVariables.push_back(variable); + }; + auto addDescribedIri = [&describeClause](const Iri& iri) { + auto iriTc = + TripleComponent::Iri::fromIriref(std::get(resource).toSparql()); + describeClause.resources_.push_back(std::move(iriTc)); + }; + + for (GraphTerm& resource : describedResources) { + std::visit(ad_utility::OverloadCallOperator{addDescribedVariable, addDescribedIri}, + resource); } + // Parse the FROM (NAMED) clauses and store them in the `describeClause`. auto datasetClauses = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); - clause.datasetClauses_ = datasetClauses; + describeClause.datasetClauses_ = datasetClauses; + + // Parse the WHERE clause. + // TODO The following for lines are duplicated for all the different + // types of queries. add a `visitWhereClause` function. if (ctx->whereClause()) { auto [pattern, visibleVariables] = visit(ctx->whereClause()); parsedQuery_._rootGraphPattern = std::move(pattern); parsedQuery_.registerVariablesVisibleInQueryBody(visibleVariables); } - parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); + // HANDLE `DESCRIBE *` + if (describedResources.empty()) { + std::ranges::for_each(parsedQuery_.selectClause().getVisibleVariables(), + addDescribedVariable); + } auto& selectClause = parsedQuery_.selectClause(); selectClause.setSelected(std::move(describedVariables)); - clause.whereClause_ = std::move(parsedQuery_); + + // So far we have actually computed the subquery/WHERE clause of the DESCRIBE. + // We now store it inside the `describeClause` and setup the outer query, + // which is implemented as a CONSTRUCT query with a special DESCRIBE + // operation. + describeClause.whereClause_ = std::move(parsedQuery_); + parsedQuery_ = ParsedQuery{}; - parsedQuery_._rootGraphPattern._graphPatterns.push_back(std::move(clause)); + // The solution modifiers (in particular ORDER BY) have to be part of the + // outer query. + parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); + + parsedQuery_._rootGraphPattern._graphPatterns.push_back( + std::move(describeClause)); parsedQuery_.datasetClauses_ = datasetClauses; auto constructClause = ParsedQuery::ConstructClause{}; using G = GraphTerm; using V = Variable; + // The outer query has the form `CONSTRUCT { ?subject ?predicate ?object} + // {...}` constructClause.triples_.push_back( std::array{G(V("?subject")), G(V("?predicate")), G(V("?object"))}); parsedQuery_._clause = std::move(constructClause); diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index d3f114991c..78571162dd 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -389,19 +389,17 @@ constexpr auto Union = MatchTypeAndOrderedChildren<::Union>; // Match a `DESCRIBE` operation inline QetMatcher Describe( - const Matcher describeMatcher, + const Matcher& describeMatcher, const QetMatcher& childMatcher) { return RootOperation<::Describe>( AllOf(children(childMatcher), AD_PROPERTY(::Describe, getDescribe, describeMatcher))); } -// Match a `DISTINCT` operation - // inline QetMatcher QetWithWarnings( const std::vector& warningSubstrings, - QetMatcher actualMatcher) { + const QetMatcher& actualMatcher) { auto warningMatchers = ad_utility::transform( warningSubstrings, [](const std::string& s) { return ::testing::HasSubstr(s); }); diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 2eac6020bf..66d3a6626c 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -1368,24 +1368,33 @@ TEST(SparqlParser, Query) { "PREFIX doof: ")))); { - // DESCRIBE * queries are not yet supported. - expectQueryFails("DESCRIBE *"); using R = std::vector; auto i = [](const auto& x) { return TripleComponent::Iri::fromIriref(x); }; // The tested describe queries will always DESCRIBE ?y R xyz{i(""), Var{"?y"}, i("")}; + // A matcher for `?y ?v` + auto yIsAVGraphPattern = + m::GraphPattern(m::Triples({{Var{"?y"}, "", Var{"?v"}}})); // A matcher for the subquery `SELECT ?y { ?y ?v}` // This subquery computes the values for `?y` that are to // be described. - auto yIsAVMatcher = m::SelectQuery( - m::Select({Var{"?y"}}), - m::GraphPattern(m::Triples({{Var{"?y"}, "", Var{"?v"}}}))); + auto yIsAVMatcher = + m::SelectQuery(m::Select({Var{"?y"}}), yIsAVGraphPattern); // A test with empty dataset clauses (no FROM and FROM NAMED). expectQuery("DESCRIBE ?y { ?y ?v }", m::DescribeQuery(m::Describe(xyz, {}, yIsAVMatcher))); + // A test with DESCRIBE * + // `DESCRIBE * { ?y ?v}` is equivalent to + // `DESCRIBE ?y ?v { ?y ?v}` + auto yIsAVSelectBothMatcher = + m::SelectQuery(m::Select({Var{"?y"}, Var{"?v"}}), yIsAVGraphPattern); + R yv{Var{"?y"}, Var{"?v"}}; + expectQuery("DESCRIBE * { ?y ?v }", + m::DescribeQuery(m::Describe(yv, {}, yIsAVSelectBothMatcher))); + // A test with nonempty dataset clauses. // Note that the clauses are relevant for the retrieval of the values for // `?y`, as well as for the actual descriptions, so they have to be part of