From 4594765d543e5e69e3eab3e8c726efb8f6ff3e35 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Jul 2024 17:18:45 +0200 Subject: [PATCH 01/14] All tests passing, Let's feed this to the tools, and then implement the missing unit tests. --- src/engine/IndexScan.cpp | 62 +++++++++++++++++++++++----------- src/engine/QueryPlanner.cpp | 8 ++--- src/index/Index.cpp | 23 +++++++------ src/index/Index.h | 12 ++++--- src/index/IndexImpl.cpp | 42 ++++++++++++++++------- src/index/IndexImpl.h | 12 ++++--- test/IndexTest.cpp | 13 +++---- test/QueryPlannerTest.cpp | 57 +++++++++++++------------------ test/QueryPlannerTestHelpers.h | 3 ++ test/util/IndexTestHelpers.cpp | 7 ++-- 10 files changed, 140 insertions(+), 99 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 0aec6a3e6a..24b69df383 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -32,11 +32,10 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation, } sizeEstimate_ = computeSizeEstimate(); - // Check the following invariant: The permuted input triple must contain at - // least one variable, and all the variables must be at the end of the + // Check the following invariant: All the variables must be at the end of the // permuted triple. For example in the PSO permutation, either only the O, or - // the S and O, or all three of P, S, O can be variables, all other - // combinations are not supported. + // the S and O, or all three of P, S, O, or none of them can be variables, all + // other combinations are not supported. auto permutedTriple = getPermutedTriple(); for (size_t i = 0; i < 3 - numVariables_; ++i) { AD_CONTRACT_CHECK(!permutedTriple.at(i)->isVariable()); @@ -57,7 +56,7 @@ string IndexScan::getCacheKeyImpl() const { auto permutationString = Permutation::toString(permutation_); if (numVariables_ == 3) { - os << "SCAN FOR FULL INDEX " << permutationString << " (DUMMY OPERATION)"; + os << "SCAN FOR FULL INDEX " << permutationString; } else { os << "SCAN " << permutationString << " with "; @@ -66,10 +65,9 @@ string IndexScan::getCacheKeyImpl() const { const auto& key = getPermutedTriple().at(idx)->toRdfLiteral(); os << keyString << " = \"" << key << "\""; }; - addKey(0); - if (numVariables_ == 1) { + for (size_t i = 0; i < 3 - numVariables_; ++i) { + addKey(i); os << ", "; - addKey(1); } } if (!additionalColumns_.empty()) { @@ -93,6 +91,8 @@ size_t IndexScan::getResultWidth() const { // _____________________________________________________________________________ vector IndexScan::resultSortedOn() const { switch (numVariables_) { + case 0: + return {}; case 1: return {ColumnIndex{0}}; case 2: @@ -132,11 +132,18 @@ Result IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { const auto& index = _executionContext->getIndex(); const auto permutedTriple = getPermutedTriple(); if (numVariables_ == 2) { - idTable = index.scan(*permutedTriple[0], std::nullopt, permutation_, - additionalColumns(), cancellationHandle_); + idTable = + index.scan(*permutedTriple[0], std::nullopt, std::nullopt, permutation_, + additionalColumns(), cancellationHandle_); } else if (numVariables_ == 1) { - idTable = index.scan(*permutedTriple[0], *permutedTriple[1], permutation_, - additionalColumns(), cancellationHandle_); + idTable = + index.scan(*permutedTriple[0], *permutedTriple[1], std::nullopt, + permutation_, additionalColumns(), cancellationHandle_); + } else if (numVariables_ == 0) { + idTable = + index.scan(*permutedTriple[0], *permutedTriple[1], *permutedTriple[2], + permutation_, additionalColumns(), cancellationHandle_); + } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); computeFullScan(&idTable, permutation_); @@ -154,7 +161,11 @@ size_t IndexScan::computeSizeEstimate() const { // Should always be in this branch. Else is only for test cases. // We have to do a simple scan anyway so might as well do it now - if (numVariables_ == 1) { + if (numVariables_ == 0) { + return getIndex().getResultSizeOfScan( + *getPermutedTriple()[0], *getPermutedTriple().at(1), + *getPermutedTriple()[2], permutation_); + } else if (numVariables_ == 1) { // TODO Use the monadic operation `std::optional::or_else`. // Note: we cannot use `optional::value_or()` here, because the else // case is expensive to compute, and we need it lazily evaluated. @@ -165,8 +176,9 @@ size_t IndexScan::computeSizeEstimate() const { } else { // This call explicitly has to read two blocks of triples from memory to // obtain an exact size estimate. - return getIndex().getResultSizeOfScan( - *getPermutedTriple()[0], *getPermutedTriple().at(1), permutation_); + return getIndex().getResultSizeOfScan(*getPermutedTriple()[0], + *getPermutedTriple().at(1), + std::nullopt, permutation_); } } else if (numVariables_ == 2) { const TripleComponent& firstKey = *getPermutedTriple().at(0); @@ -197,6 +209,9 @@ size_t IndexScan::getCostEstimate() { return getSizeEstimateBeforeLimit(); } // _____________________________________________________________________________ void IndexScan::determineMultiplicities() { multiplicity_.clear(); + if (numVariables_ == 0) { + return; + } if (_executionContext) { const auto& idx = getIndex(); if (numVariables_ == 1) { @@ -285,8 +300,12 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( if (s.numVariables_ < 2) { col1Id = s.getPermutedTriple()[1]->toValueId(index.getVocab()).value(); } + std::optional col2Id; + if (s.numVariables_ < 1) { + col2Id = s.getPermutedTriple()[2]->toValueId(index.getVocab()).value(); + } return index.getPermutation(s.permutation()) - .lazyScan({col0Id, col1Id, std::nullopt}, std::move(blocks), + .lazyScan({col0Id, col1Id, col2Id}, std::move(blocks), s.additionalColumns(), s.cancellationHandle_); }; @@ -302,19 +321,24 @@ std::optional IndexScan::getMetadataForScan( std::optional col1Id = numVars >= 2 ? std::nullopt : permutedTriple[1]->toValueId(index.getVocab()); + std::optional col2Id = + numVars >= 1 ? std::nullopt + : permutedTriple[2]->toValueId(index.getVocab()); if ((!col0Id.has_value() && numVars < 3) || - (!col1Id.has_value() && numVars < 2)) { + (!col1Id.has_value() && numVars < 2) || + (!col2Id.has_value() && numVars < 1)) { return std::nullopt; } return index.getPermutation(s.permutation()) - .getMetadataAndBlocks({col0Id, col1Id, std::nullopt}); + .getMetadataAndBlocks({col0Id, col1Id, col2Id}); }; // ________________________________________________________________ std::array IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) { AD_CONTRACT_CHECK(s1.numVariables_ <= 3 && s2.numVariables_ <= 3); + AD_CONTRACT_CHECK(s1.numVariables_ >= 1 && s2.numVariables_ >= 1); // This function only works for single column joins. This means that the first // variable of both scans must be equal, but all other variables of the scans @@ -363,7 +387,7 @@ IndexScan::lazyScanForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2) { Permutation::IdTableGenerator IndexScan::lazyScanForJoinOfColumnWithScan( std::span joinColumn, const IndexScan& s) { AD_EXPENSIVE_CHECK(std::ranges::is_sorted(joinColumn)); - AD_CORRECTNESS_CHECK(s.numVariables_ <= 3); + AD_CORRECTNESS_CHECK(s.numVariables_ <= 3 && s.numVariables_ > 0); auto metaBlocks1 = getMetadataForScan(s); diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index a1978b3bd1..cbb9d15e35 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -629,7 +629,9 @@ void QueryPlanner::seedFromOrdinaryTriple( const size_t numVars = static_cast(isVariable(triple.s_)) + static_cast(isVariable(triple.p_)) + static_cast(isVariable(triple.o_)); - if (numVars == 1) { + if (numVars == 0) { + addIndexScan(Permutation::Enum::PSO); + } else if (numVars == 1) { indexScanSingleVarCase(triple, addIndexScan); } else if (numVars == 2) { indexScanTwoVarsCase(triple, addIndexScan, addFilter); @@ -673,10 +675,6 @@ auto QueryPlanner::seedWithScansAndText( seeds.push_back(getTextLeafPlan(node, textLimits)); continue; } - if (node._variables.empty()) { - AD_THROW("Triples should have at least one variable. Not the case in: " + - node.triple_.asString()); - } // Property paths must have been handled previously. AD_CORRECTNESS_CHECK(node.triple_.p_._operation == diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 8bc3fc351f..f8bf930f0b 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -278,24 +278,27 @@ vector Index::getMultiplicities(const TripleComponent& key, IdTable Index::scan( const TripleComponent& col0String, std::optional> col1String, + std::optional> col2String, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const { - return pimpl_->scan(col0String, col1String, p, additionalColumns, - std::move(cancellationHandle)); + return pimpl_->scan(col0String, col1String, col2String, p, additionalColumns, + cancellationHandle); } // ____________________________________________________________________________ IdTable Index::scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, - Permutation::ColumnIndicesRef additionalColumns, + Id col0Id, std::optional col1Id, std::optional col2Id, + Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const { - return pimpl_->scan(col0Id, col1Id, p, additionalColumns, - std::move(cancellationHandle)); + return pimpl_->scan({col0Id, col1Id, col2Id}, p, additionalColumns, + cancellationHandle); } // ____________________________________________________________________________ -size_t Index::getResultSizeOfScan(const TripleComponent& col0String, - const TripleComponent& col1String, - const Permutation::Enum& permutation) const { - return pimpl_->getResultSizeOfScan(col0String, col1String, permutation); +size_t Index::getResultSizeOfScan( + const TripleComponent& col0String, const TripleComponent& col1String, + std::optional> col2String, + const Permutation::Enum& permutation) const { + return pimpl_->getResultSizeOfScan(col0String, col1String, col2String, + permutation); } diff --git a/src/index/Index.h b/src/index/Index.h index 48c2717687..60489db04c 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -241,20 +241,22 @@ class Index { IdTable scan( const TripleComponent& col0String, std::optional> col1String, + std::optional> col2String, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const; // Similar to the overload of `scan` above, but the keys are specified as IDs. IdTable scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, - Permutation::ColumnIndicesRef additionalColumns, + Id col0Id, std::optional col1Id, std::optional col2Id, + Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const; // Similar to the previous overload of `scan`, but only get the exact size of // the scan result. - size_t getResultSizeOfScan(const TripleComponent& col0String, - const TripleComponent& col1String, - const Permutation::Enum& permutation) const; + size_t getResultSizeOfScan( + const TripleComponent& col0String, const TripleComponent& col1String, + std::optional> col2String, + const Permutation::Enum& permutation) const; // Get access to the implementation. This should be used rarely as it // requires including the rather expensive `IndexImpl.h` header diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 49b55cb763..15abe39f4b 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1369,41 +1369,59 @@ vector IndexImpl::getMultiplicities( IdTable IndexImpl::scan( const TripleComponent& col0String, std::optional> col1String, + std::optional> col2String, const Permutation::Enum& permutation, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const { + AD_CORRECTNESS_CHECK(!col2String.has_value() || col1String.has_value()); std::optional col0Id = col0String.toValueId(getVocab()); - std::optional col1Id = - col1String.has_value() ? col1String.value().get().toValueId(getVocab()) - : std::nullopt; - if (!col0Id.has_value() || (col1String.has_value() && !col1Id.has_value())) { - size_t numColumns = col1String.has_value() ? 1 : 2; + // TODO Use the monadic operations for std::optional. + bool elementNotFound = !col0Id.has_value(); + size_t numColumns = 2; + auto handleOptional = [&elementNotFound, &numColumns, + this](const auto& comp) -> std::optional { + if (!comp.has_value()) { + return std::nullopt; + } + --numColumns; + auto result = comp.value().get().toValueId(getVocab()); + elementNotFound = elementNotFound || !result.has_value(); + return result; + }; + std::optional col1Id = handleOptional(col1String); + std::optional col2Id = handleOptional(col2String); + if (elementNotFound) { cancellationHandle->throwIfCancelled(); return IdTable{numColumns + additionalColumns.size(), allocator_}; } - return scan(col0Id.value(), col1Id, permutation, additionalColumns, + return scan({col0Id.value(), col1Id, col2Id}, permutation, additionalColumns, cancellationHandle); } // _____________________________________________________________________________ IdTable IndexImpl::scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, - Permutation::ColumnIndicesRef additionalColumns, + const Permutation::ScanSpecification& scanSpecification, + Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const { - return getPermutation(p).scan({col0Id, col1Id, std::nullopt}, - additionalColumns, cancellationHandle); + return getPermutation(p).scan(scanSpecification, additionalColumns, + cancellationHandle); } // _____________________________________________________________________________ size_t IndexImpl::getResultSizeOfScan( const TripleComponent& col0, const TripleComponent& col1, + std::optional> col2, const Permutation::Enum& permutation) const { std::optional col0Id = col0.toValueId(getVocab()); std::optional col1Id = col1.toValueId(getVocab()); - if (!col0Id.has_value() || !col1Id.has_value()) { + std::optional col2Id = col2.has_value() + ? col2.value().get().toValueId(getVocab()) + : std::nullopt; + if (!col0Id.has_value() || !col1Id.has_value() || + (col2.has_value() && !col1Id.has_value())) { return 0; } const Permutation& p = getPermutation(permutation); - return p.getResultSizeOfScan({col0Id.value(), col1Id.value(), std::nullopt}); + return p.getResultSizeOfScan({col0Id.value(), col1Id.value(), col2Id}); } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 10c6ff0c79..ab7d5ce67a 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -400,20 +400,22 @@ class IndexImpl { IdTable scan( const TripleComponent& col0String, std::optional> col1String, + std::optional> col2String, const Permutation::Enum& permutation, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const; // _____________________________________________________________________________ IdTable scan( - Id col0Id, std::optional col1Id, Permutation::Enum p, - Permutation::ColumnIndicesRef additionalColumns, + const Permutation::ScanSpecification& scanSpecification, + Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle) const; // _____________________________________________________________________________ - size_t getResultSizeOfScan(const TripleComponent& col0, - const TripleComponent& col1, - const Permutation::Enum& permutation) const; + size_t getResultSizeOfScan( + const TripleComponent& col0, const TripleComponent& col1, + std::optional> col2, + const Permutation::Enum& permutation) const; private: // Private member functions diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 3aaed82ec1..2b38c14f75 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -36,9 +36,9 @@ auto makeTestScanWidthOne = [](const IndexImpl& index) { ad_utility::source_location l = ad_utility::source_location::current()) { auto t = generateLocationTrace(l); - IdTable result = - index.scan(c0, std::cref(c1), permutation, additionalColumns, - std::make_shared>()); + IdTable result = index.scan( + c0, std::cref(c1), std::nullopt, permutation, additionalColumns, + std::make_shared>()); ASSERT_EQ(result.numColumns(), 1 + additionalColumns.size()); ASSERT_EQ(result, makeIdTableFromVector(expected)); }; @@ -54,9 +54,10 @@ auto makeTestScanWidthTwo = [](const IndexImpl& index) { ad_utility::source_location l = ad_utility::source_location::current()) { auto t = generateLocationTrace(l); - IdTable wol = index.scan( - c0, std::nullopt, permutation, Permutation::ColumnIndicesRef{}, - std::make_shared>()); + IdTable wol = + index.scan(c0, std::nullopt, std::nullopt, permutation, + Permutation::ColumnIndicesRef{}, + std::make_shared>()); ASSERT_EQ(wol, makeIdTableFromVector(expected)); }; }; diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 7451016f2a..a32df5ac0c 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -287,31 +287,27 @@ TEST(QueryPlanner, testStarTwoFree) { } TEST(QueryPlanner, testFilterAfterSeed) { - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + auto qec = ad_utility::testing::getQec(" "); + h::expect( "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . " - "FILTER(?x != ?y) }"); - QueryPlanner qp = makeQueryPlanner(); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "FILTER JOIN\nSCAN POS with P = \"\" join-column: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-column: [0] with " - "N16sparqlExpression10relational20RelationalExpressionILN18valueIdC" - "omparators10ComparisonE3EEE#column_1##column_0#"); + "FILTER(?x != ?y) }", + h::Join(h::Filter("?x != ?y", scan("?x", "", "?y")), + scan("?y", "", "?z")), + qec); } TEST(QueryPlanner, testFilterAfterJoin) { - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + auto qec = ad_utility::testing::getQec(" "); + h::expect( "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . " - "FILTER(?x != ?z) }"); - QueryPlanner qp = makeQueryPlanner(); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "FILTER JOIN\nSCAN POS with P = \"\" join-column: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-column: [0] with " - "N16sparqlExpression10relational20RelationalExpressionILN18valueIdC" - "omparators10ComparisonE3EEE#column_1##column_2#"); + "FILTER(?x != ?z) }", + h::Filter("?x != ?z", + h::Join(scan("?x", "", "?y"), scan("?y", "", "?z"))), + qec); } TEST(QueryPlanner, threeVarTriples) { @@ -565,25 +561,18 @@ TEST(QueryExecutionTreeTest, testFormerSegfaultTriFilter) { } TEST(QueryPlanner, testSimpleOptional) { - QueryPlanner qp = makeQueryPlanner(); - - ParsedQuery pq = SparqlParser::parseQuery( + auto scan = h::IndexScanFromStrings; + h::expect( "SELECT ?a ?b \n " - "WHERE {?a ?b . OPTIONAL { ?a ?c }}"); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(qet.getCacheKey(), - "OPTIONAL_JOIN\nSCAN PSO with P = \"\" join-columns: " - "[0]\n|X|\nSCAN PSO with P = \"\" join-columns: [0]"); - - ParsedQuery pq2 = SparqlParser::parseQuery( + "WHERE {?a ?b . OPTIONAL { ?a ?c }}", + h::OptionalJoin(scan("?a", "", "?b"), scan("?a", "", "?c"))); + h::expect( "SELECT ?a ?b \n " "WHERE {?a ?b . " - "OPTIONAL { ?a ?c }} ORDER BY ?b"); - QueryExecutionTree qet2 = qp.createExecutionTree(pq2); - ASSERT_EQ(qet2.getCacheKey(), - "ORDER BY on columns:asc(1) \nOPTIONAL_JOIN\nSCAN PSO with P = " - "\"\" join-columns: [0]\n|X|\nSCAN PSO with P = \"\" " - "join-columns: [0]"); + "OPTIONAL { ?a ?c }} ORDER BY ?b", + h::OrderBy({{Variable{"?b"}, ::OrderBy::AscOrDesc::Asc}}, + h::OptionalJoin(scan("?a", "", "?b"), + scan("?a", "", "?c")))); } TEST(QueryPlanner, SimpleTripleOneVariable) { diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index c33f9a2faa..37c48f3124 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -16,6 +16,7 @@ #include "engine/Join.h" #include "engine/MultiColumnJoin.h" #include "engine/NeutralElementOperation.h" +#include "engine/OptionalJoin.h" #include "engine/OrderBy.h" #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" @@ -204,6 +205,8 @@ inline auto IndexScanFromStrings = inline auto MultiColumnJoin = MatchTypeAndUnorderedChildren<::MultiColumnJoin>; inline auto Join = MatchTypeAndUnorderedChildren<::Join>; +constexpr auto OptionalJoin = MatchTypeAndOrderedChildren<::OptionalJoin>; + // Return a matcher that matches a query execution tree that consists of // multiple JOIN operations that join the `children`. The `INTERNAL SORT BY` // operations required for the joins are also ignored by this matcher. diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 4128af6612..cf12731650 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -60,8 +60,9 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); auto checkSingleElement = [&cancellationDummy, &hasPatternId]( const Index& index, size_t patternIdx, Id id) { - auto scanResultHasPattern = index.scan( - hasPatternId, id, Permutation::Enum::PSO, {}, cancellationDummy); + auto scanResultHasPattern = + index.scan(hasPatternId, id, std::nullopt, Permutation::Enum::PSO, {}, + cancellationDummy); // Each ID has at most one pattern, it can have none if it doesn't // appear as a subject in the knowledge graph. AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); @@ -79,7 +80,7 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto cancellationDummy = std::make_shared>(); auto scanResult = index.scan( - col0Id, std::nullopt, permutation, + col0Id, std::nullopt, std::nullopt, permutation, std::array{ColumnIndex{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN}, ColumnIndex{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}}, cancellationDummy); From 4d6d981bd0e51a9264556efa236e6eac527bccb3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Jul 2024 20:36:02 +0200 Subject: [PATCH 02/14] Add some unit tests. --- src/index/IndexImpl.cpp | 2 +- test/QueryPlannerTest.cpp | 3 +- test/engine/IndexScanTest.cpp | 64 +++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 612c4db4bc..88ceee38ae 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1419,7 +1419,7 @@ size_t IndexImpl::getResultSizeOfScan( ? col2.value().get().toValueId(getVocab()) : std::nullopt; if (!col0Id.has_value() || !col1Id.has_value() || - (col2.has_value() && !col1Id.has_value())) { + (col2.has_value() && !col2Id.has_value())) { return 0; } const Permutation& p = getPermutation(permutation); diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index a32df5ac0c..5f01553168 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -288,7 +288,8 @@ TEST(QueryPlanner, testStarTwoFree) { TEST(QueryPlanner, testFilterAfterSeed) { auto scan = h::IndexScanFromStrings; - auto qec = ad_utility::testing::getQec(" "); + auto qec = ad_utility::testing::getQec( + " , , . , , ."); h::expect( "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . " diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp index 2920a2cd8e..abc5babd38 100644 --- a/test/engine/IndexScanTest.cpp +++ b/test/engine/IndexScanTest.cpp @@ -380,3 +380,67 @@ TEST(IndexScan, additionalColumn) { {{getId(""), getId(""), I(0), I(NO_PATTERN)}}); EXPECT_THAT(res.idTable(), ::testing::ElementsAreArray(exp)); } + +TEST(IndexScan, getResultSizeOfScan) { + auto qec = getQec("

, . ."); + auto getId = makeGetId(qec->getIndex()); + [[maybe_unused]] auto x = getId(""); + [[maybe_unused]] auto p = getId("

"); + [[maybe_unused]] auto s1 = getId(""); + [[maybe_unused]] auto s2 = getId(""); + [[maybe_unused]] auto p2 = getId(""); + using V = Variable; + using I = TripleComponent::Iri; + + { + SparqlTripleSimple scanTriple{V{"?x"}, V("?y"), V{"?z"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + // Note: this currently also contains the (internal) triple for the + // `ql:has-pattern` relation of ``. + EXPECT_EQ(scan.getSizeEstimate(), 4); + } + { + SparqlTripleSimple scanTriple{V{"?x"}, I::fromIriref("

"), V{"?y"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 2); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + V{"?y"}}; + IndexScan scan{qec, Permutation::Enum::PSO, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 2); + } + { + SparqlTripleSimple scanTriple{V("?x"), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 1); + } + // 0 variables + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 1); + EXPECT_ANY_THROW(scan.getMultiplicity(0)); + auto res = scan.computeResultOnlyForTesting(); + ASSERT_EQ(res.idTable().numRows(), 1); + ASSERT_EQ(res.idTable().numColumns(), 0); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 0); + } + { + SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), + I::fromIriref("

")}; + IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; + EXPECT_EQ(scan.getSizeEstimate(), 0); + EXPECT_ANY_THROW(scan.getMultiplicity(0)); + auto res = scan.computeResultOnlyForTesting(); + ASSERT_EQ(res.idTable().numRows(), 0); + ASSERT_EQ(res.idTable().numColumns(), 0); + } +} From 65e569b3b26db106df1d43d22e12f32ef1c365a6 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 12 Jul 2024 09:25:24 +0200 Subject: [PATCH 03/14] Fix the unrelated unit test. --- test/QueryPlannerTest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 5f01553168..59a3a5b8f5 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -294,8 +294,8 @@ TEST(QueryPlanner, testFilterAfterSeed) { "SELECT ?x ?y ?z WHERE {" "?x ?y . ?y ?z . " "FILTER(?x != ?y) }", - h::Join(h::Filter("?x != ?y", scan("?x", "", "?y")), - scan("?y", "", "?z")), + h::Filter("?x != ?y", + h::Join(scan("?x", "", "?y"), scan("?y", "", "?z"))), qec); } From ee0848fe465af4c9a41f941d3932666b1cb55d23 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 12 Jul 2024 18:30:42 +0200 Subject: [PATCH 04/14] Continue this work on Monday morning, we really have to start cleaning up this mess. --- src/engine/IndexScan.cpp | 59 +++++++++++++++------------------------ src/engine/IndexScan.h | 2 ++ src/index/Index.cpp | 32 ++++++++++++++------- src/index/Index.h | 32 ++++++++++++++------- src/index/IndexImpl.cpp | 46 ++++++++---------------------- src/index/IndexImpl.h | 16 ++++------- test/QueryPlannerTest.cpp | 14 ++++++++++ 7 files changed, 101 insertions(+), 100 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index a8f1daa28b..2ab72c8ac3 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -6,6 +6,7 @@ #include +#include #include #include @@ -130,20 +131,9 @@ Result IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { using enum Permutation::Enum; idTable.setNumColumns(numVariables_); const auto& index = _executionContext->getIndex(); - const auto permutedTriple = getPermutedTriple(); - if (numVariables_ == 2) { - idTable = - index.scan(*permutedTriple[0], std::nullopt, std::nullopt, permutation_, - additionalColumns(), cancellationHandle_, getLimit()); - } else if (numVariables_ == 1) { - idTable = index.scan(*permutedTriple[0], *permutedTriple[1], std::nullopt, - permutation_, additionalColumns(), cancellationHandle_, - getLimit()); - } else if (numVariables_ == 0) { - idTable = index.scan(*permutedTriple[0], *permutedTriple[1], - *permutedTriple[2], permutation_, additionalColumns(), - cancellationHandle_, getLimit()); - + if (numVariables_ < 3) { + idTable = index.scan(getPermutedTripleNoVariables(), permutation_, + additionalColumns(), cancellationHandle_, getLimit()); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); computeFullScan(&idTable, permutation_); @@ -158,14 +148,7 @@ Result IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { // _____________________________________________________________________________ size_t IndexScan::computeSizeEstimate() const { if (_executionContext) { - // Should always be in this branch. Else is only for test cases. - - // We have to do a simple scan anyway so might as well do it now - if (numVariables_ == 0) { - return getIndex().getResultSizeOfScan( - *getPermutedTriple()[0], *getPermutedTriple().at(1), - *getPermutedTriple()[2], permutation_); - } else if (numVariables_ == 1) { + if (numVariables_ == 1) { // TODO Use the monadic operation `std::optional::or_else`. // Note: we cannot use `optional::value_or()` here, because the else // case is expensive to compute, and we need it lazily evaluated. @@ -173,16 +156,13 @@ size_t IndexScan::computeSizeEstimate() const { getCacheKey()); size.has_value()) { return size.value(); - } else { - // This call explicitly has to read two blocks of triples from memory to - // obtain an exact size estimate. - return getIndex().getResultSizeOfScan(*getPermutedTriple()[0], - *getPermutedTriple().at(1), - std::nullopt, permutation_); } - } else if (numVariables_ == 2) { - const TripleComponent& firstKey = *getPermutedTriple().at(0); - return getIndex().getCardinality(firstKey, permutation_); + } + + // We have to do a simple scan anyway so might as well do it now + if (numVariables_ < 3) { + return getIndex().getResultSizeOfScan(getPermutedTripleNoVariables(), + permutation_); } else { // The triple consists of three variables. // TODO As soon as all implementations of a full index scan @@ -292,6 +272,13 @@ std::array IndexScan::getPermutedTriple() triple[permutation[2]]}; } +// ___________________________________________________________________________ +ScanSpecificationAsTripleComponent IndexScan::getPermutedTripleNoVariables() + const { + auto permutedTriple = getPermutedTriple(); + return {*permutedTriple[0], *permutedTriple[1], *permutedTriple[2]}; +} + // ___________________________________________________________________________ Permutation::IdTableGenerator IndexScan::getLazyScan( const IndexScan& s, std::vector blocks) { @@ -304,11 +291,11 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( if (s.numVariables_ < 2) { col1Id = s.getPermutedTriple()[1]->toValueId(index.getVocab()).value(); } - std::optional col2Id; - if (s.numVariables_ < 1) { - col2Id = s.getPermutedTriple()[2]->toValueId(index.getVocab()).value(); - } + // This function is currently only called by the `getLazyScanForJoin...` + // functions. In these cases we always have at least one variable in each of + // the scans, because otherwise there would be no join column. + AD_CORRECTNESS_CHECK(s.numVariables_ >= 1); // If there is a LIMIT or OFFSET clause that constrains the scan // (which can happen with an explicit subquery), we cannot use the prefiltered // blocks, as we currently have no mechanism to include limits and offsets @@ -318,7 +305,7 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( : std::nullopt; return index.getPermutation(s.permutation()) - .lazyScan({col0Id, col1Id, col2Id}, std::move(actualBlocks), + .lazyScan({col0Id, col1Id, std::nullopt}, std::move(actualBlocks), s.additionalColumns(), s.cancellationHandle_, s.getLimit()); }; diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index 6290dfb8d0..edee48a509 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -3,6 +3,7 @@ // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) #pragma once +#include #include #include "./Operation.h" @@ -100,6 +101,7 @@ class IndexScan final : public Operation { // `permutation_`. For example if `permutation_ == PSO` then the result is // {&predicate_, &subject_, &object_} std::array getPermutedTriple() const; + ScanSpecificationAsTripleComponent getPermutedTripleNoVariables() const; private: Result computeResult([[maybe_unused]] bool requestLaziness) override; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 1df87b5a82..39f255d58b 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -276,31 +276,43 @@ vector Index::getMultiplicities(const TripleComponent& key, // ____________________________________________________________________________ IdTable Index::scan( - const TripleComponent& col0String, - std::optional> col1String, - std::optional> col2String, + const ScanSpecificationAsTripleComponent& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - return pimpl_->scan(col0String, col1String, col2String, p, additionalColumns, + return pimpl_->scan(scanSpecification, p, additionalColumns, cancellationHandle, limitOffset); } // ____________________________________________________________________________ IdTable Index::scan( - Id col0Id, std::optional col1Id, std::optional col2Id, + const Permutation::ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - return pimpl_->scan({col0Id, col1Id, col2Id}, p, additionalColumns, + return pimpl_->scan(scanSpecification, p, additionalColumns, cancellationHandle, limitOffset); } // ____________________________________________________________________________ size_t Index::getResultSizeOfScan( - const TripleComponent& col0String, const TripleComponent& col1String, - std::optional> col2String, + const ScanSpecificationAsTripleComponent& scanSpecification, const Permutation::Enum& permutation) const { - return pimpl_->getResultSizeOfScan(col0String, col1String, col2String, - permutation); + return pimpl_->getResultSizeOfScan(scanSpecification, permutation); +} +// ____________________________________________________________________________ +std::optional +ScanSpecificationAsTripleComponent::toScanSpecification( + const IndexImpl& index) const { + std::optional col0Id = col0_.toValueId(index.getVocab()); + std::optional col1Id = col1_.has_value() + ? col1_.value().toValueId(index.getVocab()) + : std::nullopt; + std::optional col2Id = col2_.has_value() + ? col2_.value().toValueId(index.getVocab()) + : std::nullopt; + if (!col0Id.has_value() || (col1_.has_value() && !col1Id.has_value()) || + (col2_.has_value() && !col2Id.has_value())) { + return std::nullopt; + } } diff --git a/src/index/Index.h b/src/index/Index.h index e1322b5b5e..5bd88ea58b 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -23,6 +23,21 @@ class IdTable; class TextBlockMetaData; class IndexImpl; +// TODO Comment. +struct ScanSpecificationAsTripleComponent { + TripleComponent col0_; + std::optional col1_; + std::optional col2_; + ScanSpecificationAsTripleComponent(TripleComponent col0, + const TripleComponent& col1, + const TripleComponent& col2); + std::optional toScanSpecification( + const IndexImpl& index) const; + size_t numColumns() const { + return 2 - col1_.has_value() - col2_.has_value(); + } +}; + class Index { private: // Pimpl to reduce compile times. @@ -238,16 +253,14 @@ class Index { * @param p The Permutation::Enum to use (in particularly POS(), SOP,... * members of Index class). */ - IdTable scan( - const TripleComponent& col0String, - std::optional> col1String, - std::optional> col2String, - Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, - const ad_utility::SharedCancellationHandle& cancellationHandle, - const LimitOffsetClause& limitOffset = {}) const; + IdTable scan(const ScanSpecificationAsTripleComponent& scanSpecification, + Permutation::Enum p, + Permutation::ColumnIndicesRef additionalColumns, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const LimitOffsetClause& limitOffset = {}) const; // Similar to the overload of `scan` above, but the keys are specified as IDs. - IdTable scan(Id col0Id, std::optional col1Id, std::optional col2Id, + IdTable scan(const Permutation::ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, @@ -256,8 +269,7 @@ class Index { // Similar to the previous overload of `scan`, but only get the exact size of // the scan result. size_t getResultSizeOfScan( - const TripleComponent& col0String, const TripleComponent& col1String, - std::optional> col2String, + const ScanSpecificationAsTripleComponent& scanSpecification, const Permutation::Enum& permutation) const; // Get access to the implementation. This should be used rarely as it diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 88ceee38ae..71fec97cfa 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1367,35 +1367,19 @@ vector IndexImpl::getMultiplicities( // _____________________________________________________________________________ IdTable IndexImpl::scan( - const TripleComponent& col0String, - std::optional> col1String, - std::optional> col2String, + const ScanSpecificationAsTripleComponent& scanSpecificationAsTc, const Permutation::Enum& permutation, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { - AD_CORRECTNESS_CHECK(!col2String.has_value() || col1String.has_value()); - std::optional col0Id = col0String.toValueId(getVocab()); - // TODO Use the monadic operations for std::optional. - bool elementNotFound = !col0Id.has_value(); - size_t numColumns = 2; - auto handleOptional = [&elementNotFound, &numColumns, - this](const auto& comp) -> std::optional { - if (!comp.has_value()) { - return std::nullopt; - } - --numColumns; - auto result = comp.value().get().toValueId(getVocab()); - elementNotFound = elementNotFound || !result.has_value(); - return result; - }; - std::optional col1Id = handleOptional(col1String); - std::optional col2Id = handleOptional(col2String); - if (elementNotFound) { + auto scanSpecification = scanSpecificationAsTc.toScanSpecification(*this); + if (!scanSpecification.has_value()) { cancellationHandle->throwIfCancelled(); - return IdTable{numColumns + additionalColumns.size(), allocator_}; + return IdTable{ + scanSpecificationAsTc.numColumns() + additionalColumns.size(), + allocator_}; } - return scan({col0Id.value(), col1Id, col2Id}, permutation, additionalColumns, + return scan(scanSpecification.value(), permutation, additionalColumns, cancellationHandle, limitOffset); } // _____________________________________________________________________________ @@ -1410,20 +1394,14 @@ IdTable IndexImpl::scan( // _____________________________________________________________________________ size_t IndexImpl::getResultSizeOfScan( - const TripleComponent& col0, const TripleComponent& col1, - std::optional> col2, + const ScanSpecificationAsTripleComponent& scanSpecificationAsTc, const Permutation::Enum& permutation) const { - std::optional col0Id = col0.toValueId(getVocab()); - std::optional col1Id = col1.toValueId(getVocab()); - std::optional col2Id = col2.has_value() - ? col2.value().get().toValueId(getVocab()) - : std::nullopt; - if (!col0Id.has_value() || !col1Id.has_value() || - (col2.has_value() && !col2Id.has_value())) { + const Permutation& p = getPermutation(permutation); + auto scanSpecification = scanSpecificationAsTc.toScanSpecification(*this); + if (!scanSpecification.has_value()) { return 0; } - const Permutation& p = getPermutation(permutation); - return p.getResultSizeOfScan({col0Id.value(), col1Id.value(), col2Id}); + return p.getResultSizeOfScan(scanSpecification.value()); } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index d5020ac717..6caef088aa 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -397,14 +397,11 @@ class IndexImpl { vector getMultiplicities(Permutation::Enum permutation) const; // _____________________________________________________________________________ - IdTable scan( - const TripleComponent& col0String, - std::optional> col1String, - std::optional> col2String, - const Permutation::Enum& permutation, - Permutation::ColumnIndicesRef additionalColumns, - const ad_utility::SharedCancellationHandle& cancellationHandle, - const LimitOffsetClause& limitOffset = {}) const; + IdTable scan(const ScanSpecificationAsTripleComponent& scanSpecification, + const Permutation::Enum& permutation, + Permutation::ColumnIndicesRef additionalColumns, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ IdTable scan(const Permutation::ScanSpecification& scanSpecification, @@ -415,8 +412,7 @@ class IndexImpl { // _____________________________________________________________________________ size_t getResultSizeOfScan( - const TripleComponent& col0, const TripleComponent& col1, - std::optional> col2, + const ScanSpecificationAsTripleComponent& scanSpecification, const Permutation::Enum& permutation) const; private: diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 59a3a5b8f5..59465bfb40 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -204,6 +204,20 @@ TEST(QueryPlanner, testBFSLeaveOut) { } } +TEST(QueryPlanner, indexScanZeroVariables) { + auto scan = h::IndexScanFromStrings; + using enum Permutation::Enum; + h::expect( + "SELECT * \n " + "WHERE \t { }", + scan("", "", "")); + h::expect( + "SELECT * \n " + "WHERE \t { . ?z}", + h::CartesianProductJoin(scan("", "", ""), + scan("", "", "?z"))); +} + TEST(QueryPlanner, indexScanOneVariable) { auto scan = h::IndexScanFromStrings; using enum Permutation::Enum; From b9afa81335d66babedfadea85424e8ab0451ace1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 15 Jul 2024 09:58:30 +0200 Subject: [PATCH 05/14] Continue this work on Monday morning, we really have to start cleaning up this mess. --- src/index/Index.cpp | 34 +++++++++++++++++++++++++++++++++- src/index/Index.h | 15 +++++++++------ test/IndexTest.cpp | 8 ++++---- test/util/IndexTestHelpers.cpp | 4 ++-- 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 39f255d58b..f853d430df 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -300,11 +300,17 @@ size_t Index::getResultSizeOfScan( const Permutation::Enum& permutation) const { return pimpl_->getResultSizeOfScan(scanSpecification, permutation); } + // ____________________________________________________________________________ std::optional ScanSpecificationAsTripleComponent::toScanSpecification( const IndexImpl& index) const { - std::optional col0Id = col0_.toValueId(index.getVocab()); + // TODO Use `std::optional::transform`. + // TODO: We can also have LocalVocab entries is the + // ScanSpecification. + std::optional col0Id = col0_.has_value() + ? col0_.value().toValueId(index.getVocab()) + : std::nullopt; std::optional col1Id = col1_.has_value() ? col1_.value().toValueId(index.getVocab()) : std::nullopt; @@ -315,4 +321,30 @@ ScanSpecificationAsTripleComponent::toScanSpecification( (col2_.has_value() && !col2Id.has_value())) { return std::nullopt; } + return Permutation::ScanSpecification{col0Id, col1Id, col2Id}; +} + +// ____________________________________________________________________________ +ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, + T col1, + T col2) { + auto isUnbound = [](const T& t) { + return !t.has_value() || t.value().isVariable(); + }; + if (isUnbound(col0)) { + AD_CONTRACT_CHECK(isUnbound(col1)); + } + if (isUnbound(col1)) { + AD_CONTRACT_CHECK(isUnbound(col2)); + } + auto toNulloptIfVariable = [](T& tc) -> std::optional { + if (tc.has_value() && tc.value().isVariable()) { + return std::nullopt; + } else { + return std::move(tc); + } + }; + col0_ = toNulloptIfVariable(col0); + col1_ = toNulloptIfVariable(col1); + col2_ = toNulloptIfVariable(col2); } diff --git a/src/index/Index.h b/src/index/Index.h index 5bd88ea58b..a6c3fce5dc 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -24,17 +24,20 @@ class TextBlockMetaData; class IndexImpl; // TODO Comment. -struct ScanSpecificationAsTripleComponent { - TripleComponent col0_; +class ScanSpecificationAsTripleComponent { + using T = std::optional; + + private: + std::optional col0_; std::optional col1_; std::optional col2_; - ScanSpecificationAsTripleComponent(TripleComponent col0, - const TripleComponent& col1, - const TripleComponent& col2); + + public: + ScanSpecificationAsTripleComponent(T col0, T col1, T col2); std::optional toScanSpecification( const IndexImpl& index) const; size_t numColumns() const { - return 2 - col1_.has_value() - col2_.has_value(); + return 3 - col0_.has_value() - col1_.has_value() - col2_.has_value(); } }; diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 2b38c14f75..0e5e96267e 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -36,9 +36,9 @@ auto makeTestScanWidthOne = [](const IndexImpl& index) { ad_utility::source_location l = ad_utility::source_location::current()) { auto t = generateLocationTrace(l); - IdTable result = index.scan( - c0, std::cref(c1), std::nullopt, permutation, additionalColumns, - std::make_shared>()); + IdTable result = + index.scan({c0, c1, std::nullopt}, permutation, additionalColumns, + std::make_shared>()); ASSERT_EQ(result.numColumns(), 1 + additionalColumns.size()); ASSERT_EQ(result, makeIdTableFromVector(expected)); }; @@ -55,7 +55,7 @@ auto makeTestScanWidthTwo = [](const IndexImpl& index) { ad_utility::source_location::current()) { auto t = generateLocationTrace(l); IdTable wol = - index.scan(c0, std::nullopt, std::nullopt, permutation, + index.scan({c0, std::nullopt, std::nullopt}, permutation, Permutation::ColumnIndicesRef{}, std::make_shared>()); ASSERT_EQ(wol, makeIdTableFromVector(expected)); diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index cf12731650..17795652f8 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -61,7 +61,7 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto checkSingleElement = [&cancellationDummy, &hasPatternId]( const Index& index, size_t patternIdx, Id id) { auto scanResultHasPattern = - index.scan(hasPatternId, id, std::nullopt, Permutation::Enum::PSO, {}, + index.scan({hasPatternId, id, std::nullopt}, Permutation::Enum::PSO, {}, cancellationDummy); // Each ID has at most one pattern, it can have none if it doesn't // appear as a subject in the knowledge graph. @@ -80,7 +80,7 @@ void checkConsistencyBetweenPatternPredicateAndAdditionalColumn( auto cancellationDummy = std::make_shared>(); auto scanResult = index.scan( - col0Id, std::nullopt, std::nullopt, permutation, + {col0Id, std::nullopt, std::nullopt}, permutation, std::array{ColumnIndex{ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN}, ColumnIndex{ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}}, cancellationDummy); From 395dfa9fe20040e4a2dc9ded0a9b4a71b58d7c2f Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 15 Jul 2024 10:38:04 +0200 Subject: [PATCH 06/14] Removed the pinned sizes. --- src/engine/IndexScan.cpp | 90 +++++++++--------------------- src/engine/IndexScan.h | 1 - src/engine/Operation.cpp | 19 ------- src/engine/QueryExecutionContext.h | 24 +------- src/engine/Server.cpp | 1 - test/QueryPlannerTest.cpp | 2 +- 6 files changed, 28 insertions(+), 109 deletions(-) diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 2ab72c8ac3..bbd1c98cdd 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -27,6 +27,9 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation, numVariables_(static_cast(subject_.isVariable()) + static_cast(predicate_.isVariable()) + static_cast(object_.isVariable())) { + // We previously had `nullptr`s here in unit tests. This is no longer + // necessary nor allowed. + AD_CONTRACT_CHECK(qec != nullptr); for (auto& [idx, variable] : triple.additionalScanColumns_) { additionalColumns_.push_back(idx); additionalVariables_.push_back(variable); @@ -91,18 +94,8 @@ size_t IndexScan::getResultWidth() const { // _____________________________________________________________________________ vector IndexScan::resultSortedOn() const { - switch (numVariables_) { - case 0: - return {}; - case 1: - return {ColumnIndex{0}}; - case 2: - return {ColumnIndex{0}, ColumnIndex{1}}; - case 3: - return {ColumnIndex{0}, ColumnIndex{1}, ColumnIndex{2}}; - default: - AD_FAIL(); - } + auto resAsView = ad_utility::integerRange(ColumnIndex{numVariables_}); + return std::vector{resAsView.begin(), resAsView.end()}; } // _____________________________________________________________________________ @@ -147,39 +140,20 @@ Result IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { // _____________________________________________________________________________ size_t IndexScan::computeSizeEstimate() const { - if (_executionContext) { - if (numVariables_ == 1) { - // TODO Use the monadic operation `std::optional::or_else`. - // Note: we cannot use `optional::value_or()` here, because the else - // case is expensive to compute, and we need it lazily evaluated. - if (auto size = getExecutionContext()->getQueryTreeCache().getPinnedSize( - getCacheKey()); - size.has_value()) { - return size.value(); - } - } - - // We have to do a simple scan anyway so might as well do it now - if (numVariables_ < 3) { - return getIndex().getResultSizeOfScan(getPermutedTripleNoVariables(), - permutation_); - } else { - // The triple consists of three variables. - // TODO As soon as all implementations of a full index scan - // (Including the "dummy joins" in Join.cpp) consistently exclude the - // internal triples, this estimate should be changed to only return - // the number of triples in the actual knowledge graph (excluding the - // internal triples). - AD_CORRECTNESS_CHECK(numVariables_ == 3); - return getIndex().numTriples().normalAndInternal_(); - } + AD_CORRECTNESS_CHECK(_executionContext); + // We have to do a simple scan anyway so might as well do it now + if (numVariables_ < 3) { + return getIndex().getResultSizeOfScan(getPermutedTripleNoVariables(), + permutation_); } else { - // Only for test cases. The handling of the objects is to make the - // strange query planner tests pass. - auto strLen = [](const auto& el) { - return (el.isString() ? el.getString() : el.toString()).size(); - }; - return 1000 + strLen(subject_) + strLen(object_) + strLen(predicate_); + // The triple consists of three variables. + // TODO As soon as all implementations of a full index scan + // (Including the "dummy joins" in Join.cpp) consistently exclude the + // internal triples, this estimate should be changed to only return + // the number of triples in the actual knowledge graph (excluding the + // internal triples). + AD_CORRECTNESS_CHECK(numVariables_ == 3); + return getIndex().numTriples().normalAndInternal_(); } } @@ -192,32 +166,20 @@ size_t IndexScan::getCostEstimate() { // _____________________________________________________________________________ void IndexScan::determineMultiplicities() { - multiplicity_.clear(); - if (numVariables_ == 0) { - return; - } - if (_executionContext) { + multiplicity_ = [this]() -> std::vector { const auto& idx = getIndex(); - if (numVariables_ == 1) { + if (numVariables_ == 0) { + return {}; + } else if (numVariables_ == 1) { // There are no duplicate triples in RDF and two elements are fixed. - multiplicity_.emplace_back(1); + return {1.0f}; } else if (numVariables_ == 2) { - const auto permutedTriple = getPermutedTriple(); - multiplicity_ = idx.getMultiplicities(*permutedTriple[0], permutation_); + return idx.getMultiplicities(*getPermutedTriple()[0], permutation_); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); - multiplicity_ = idx.getMultiplicities(permutation_); + return idx.getMultiplicities(permutation_); } - } else { - // This branch is only used in certain unit tests. - multiplicity_.emplace_back(1); - if (numVariables_ == 2) { - multiplicity_.emplace_back(1); - } - if (numVariables_ == 3) { - multiplicity_.emplace_back(1); - } - } + }(); for ([[maybe_unused]] size_t i : std::views::iota(multiplicity_.size(), getResultWidth())) { multiplicity_.emplace_back(1); diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index edee48a509..cc66e89855 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -3,7 +3,6 @@ // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) #pragma once -#include #include #include "./Operation.h" diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index cd38e3d504..b29e2c6098 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -88,25 +88,6 @@ std::shared_ptr Operation::getResult( const bool pinResult = _executionContext->_pinSubtrees || pinFinalResultButNotSubtrees; - // When we pin the final result but no subtrees, we need to remember the sizes - // of all involved index scans that have only one free variable. Note that - // these index scans are executed already during query planning because they - // have to be executed anyway, for any query plan. If we don't remember these - // sizes here, future queries that take the result from the cache would redo - // these index scans. Note that we do not need to remember the multiplicity - // (and distinctness) because the multiplicity for an index scan with a single - // free variable is always 1. - if (pinFinalResultButNotSubtrees) { - auto lock = - getExecutionContext()->getQueryTreeCache().pinnedSizes().wlock(); - forAllDescendants([&lock](QueryExecutionTree* child) { - if (child->getRootOperation()->isIndexScanWithNumVariables(1)) { - (*lock)[child->getRootOperation()->getCacheKey()] = - child->getSizeEstimate(); - } - }); - } - try { // In case of an exception, create the correct runtime info, no matter which // exception handler is called. diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index ba2bc24487..8311c32eb7 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -55,34 +55,12 @@ class CacheValue { // by another query. using ConcurrentLruCache = ad_utility::ConcurrentCache< ad_utility::LRUCache>; -using PinnedSizes = - ad_utility::Synchronized, - std::shared_mutex>; class QueryResultCache : public ConcurrentLruCache { - private: - PinnedSizes _pinnedSizes; - public: virtual ~QueryResultCache() = default; - void clearAll() override { - // The _pinnedSizes are not part of the (otherwise threadsafe) _cache - // and thus have to be manually locked. - auto lock = _pinnedSizes.wlock(); - ConcurrentLruCache::clearAll(); - lock->clear(); - } + void clearAll() override { ConcurrentLruCache::clearAll(); } // Inherit the constructor. using ConcurrentLruCache::ConcurrentLruCache; - const PinnedSizes& pinnedSizes() const { return _pinnedSizes; } - PinnedSizes& pinnedSizes() { return _pinnedSizes; } - std::optional getPinnedSize(const std::string& key) { - auto rlock = _pinnedSizes.rlock(); - if (rlock->contains(key)) { - return rlock->at(key); - } else { - return std::nullopt; - } - } }; // Execution context for queries. diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 689d58c7c5..245f3cb94a 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -469,7 +469,6 @@ nlohmann::json Server::composeCacheStatsJson() const { // converter. result["non-pinned-size"] = cache_.nonPinnedSize().getBytes(); result["pinned-size"] = cache_.pinnedSize().getBytes(); - result["num-pinned-index-scan-sizes"] = cache_.pinnedSizes().rlock()->size(); return result; } diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 59465bfb40..537d4c088c 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -17,7 +17,7 @@ constexpr auto iri = ad_utility::testing::iri; using ::testing::HasSubstr; QueryPlanner makeQueryPlanner() { - return QueryPlanner{nullptr, + return QueryPlanner{ad_utility::testing::getQec(), std::make_shared>()}; } From 733229f8720af8507e0a0caa15c2054716fc88dd Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 15 Jul 2024 11:17:37 +0200 Subject: [PATCH 07/14] Cleaned up a lot of code. --- src/engine/CMakeLists.txt | 7 ++- src/engine/ScanSpecification.cpp | 55 +++++++++++++++++++++ src/engine/ScanSpecification.h | 83 ++++++++++++++++++++++++++++++++ src/index/CMakeLists.txt | 2 +- src/index/CompressedRelation.h | 43 +---------------- src/index/Index.cpp | 52 +------------------- src/index/Index.h | 21 +------- src/index/IndexImpl.cpp | 4 +- src/index/IndexImpl.h | 3 +- src/index/Permutation.h | 2 - test/CompressedRelationsTest.cpp | 7 ++- test/TriplesViewTest.cpp | 2 +- 12 files changed, 155 insertions(+), 126 deletions(-) create mode 100644 src/engine/ScanSpecification.cpp create mode 100644 src/engine/ScanSpecification.h diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 6aba4674f8..9570706075 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -13,5 +13,8 @@ add_library(engine VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp - idTable/CompressedExternalIdTable.h) -qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams) + idTable/CompressedExternalIdTable.h +) +add_library(scanSpecification ScanSpecification.cpp) +qlever_target_link_libraries(scanSpecification index) +qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) diff --git a/src/engine/ScanSpecification.cpp b/src/engine/ScanSpecification.cpp new file mode 100644 index 0000000000..bc0730bc79 --- /dev/null +++ b/src/engine/ScanSpecification.cpp @@ -0,0 +1,55 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include "engine/ScanSpecification.h" + +#include "index/IndexImpl.h" + +// ____________________________________________________________________________ +std::optional +ScanSpecificationAsTripleComponent::toScanSpecification( + const IndexImpl& index) const { + // TODO Use `std::optional::transform`. + // TODO: We can also have LocalVocab entries is the + // ScanSpecification. + std::optional col0Id = col0_.has_value() + ? col0_.value().toValueId(index.getVocab()) + : std::nullopt; + std::optional col1Id = col1_.has_value() + ? col1_.value().toValueId(index.getVocab()) + : std::nullopt; + std::optional col2Id = col2_.has_value() + ? col2_.value().toValueId(index.getVocab()) + : std::nullopt; + if (!col0Id.has_value() || (col1_.has_value() && !col1Id.has_value()) || + (col2_.has_value() && !col2Id.has_value())) { + return std::nullopt; + } + return ScanSpecification{col0Id, col1Id, col2Id}; +} + +// ____________________________________________________________________________ +ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, + T col1, + T col2) { + auto isUnbound = [](const T& t) { + return !t.has_value() || t.value().isVariable(); + }; + if (isUnbound(col0)) { + AD_CONTRACT_CHECK(isUnbound(col1)); + } + if (isUnbound(col1)) { + AD_CONTRACT_CHECK(isUnbound(col2)); + } + auto toNulloptIfVariable = [](T& tc) -> std::optional { + if (tc.has_value() && tc.value().isVariable()) { + return std::nullopt; + } else { + return std::move(tc); + } + }; + col0_ = toNulloptIfVariable(col0); + col1_ = toNulloptIfVariable(col1); + col2_ = toNulloptIfVariable(col2); +} diff --git a/src/engine/ScanSpecification.h b/src/engine/ScanSpecification.h new file mode 100644 index 0000000000..e315fd29ba --- /dev/null +++ b/src/engine/ScanSpecification.h @@ -0,0 +1,83 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#pragma once +#include + +#include "global/Id.h" +#include "parser/TripleComponent.h" + +// Forward declaration +class IndexImpl; + +// The specification of a scan operation for a given permutation. +// Can either be a full scan (all three elements are `std::nullopt`), +// a scan for a fixed `col0Id`, a scan for a fixed `col0Id` and `col1Id`, +// or even a scan for a single triple to check whether it is contained in +// the knowledge graph at all. The values which are `nullopt` become variables +// and are returned as columns in the result of the scan. +class ScanSpecification { + private: + using T = std::optional; + + T col0Id_; + T col1Id_; + T col2Id_; + + void validate() const { + bool c0 = col0Id_.has_value(); + bool c1 = col1Id_.has_value(); + bool c2 = col2Id_.has_value(); + if (!c0) { + AD_CORRECTNESS_CHECK(!c1 && !c2); + } + if (!c1) { + AD_CORRECTNESS_CHECK(!c2); + } + } + + public: + ScanSpecification(T col0Id, T col1Id, T col2Id) + : col0Id_{col0Id}, col1Id_{col1Id}, col2Id_{col2Id} { + validate(); + } + const T& col0Id() const { return col0Id_; } + const T& col1Id() const { return col1Id_; } + const T& col2Id() const { return col2Id_; } + + // Only used in tests. + void setCol1Id(T col1Id) { + col1Id_ = col1Id; + validate(); + } +}; + +// Same as `ScanSpecification` (see above), but stores `TripleComponent`s +// instead of `Id`s. +class ScanSpecificationAsTripleComponent { + using T = std::optional; + + private: + std::optional col0_; + std::optional col1_; + std::optional col2_; + + public: + // Construct from three optional `TripleComponent`s. If any of the three + // entries is unbound (`nullopt` or of type `Variable`), then all subsequent + // entries also have to be unbound. For example if `col0` is bound, but `col1` + // isn't, then `col2` also has to be unbound. + ScanSpecificationAsTripleComponent(T col0, T col1, T col2); + + // Convert to a `ScanSpecification`. The `index` is used to convert the + // `TripleComponent` to `Id`s by looking them up in the vocabulary. Return + // `nullopt` if one of the vocab lookup fails. Note: Once we implement SPARQL + // UPDATE, we possibly also have to use a `LocalVocab` here. + // TODO Try out if this can be done just now. + std::optional toScanSpecification( + const IndexImpl& index) const; + size_t numColumns() const { + return 3 - col0_.has_value() - col1_.has_value() - col2_.has_value(); + } +}; diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index d06778e2d7..3a6543c2ec 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -6,4 +6,4 @@ add_library(index DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp PatternCreator.cpp) -qlever_target_link_libraries(index util parser vocabulary compilationInfo ${STXXL_LIBRARIES}) +qlever_target_link_libraries(index util parser vocabulary compilationInfo scanSpecification ${STXXL_LIBRARIES}) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 5c7179c096..a46e5c6ba9 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -9,6 +9,7 @@ #include #include +#include "engine/ScanSpecification.h" #include "engine/idTable/IdTable.h" #include "global/Id.h" #include "parser/data/LimitOffsetClause.h" @@ -336,48 +337,6 @@ class CompressedRelationReader { using ColumnIndices = std::vector; using CancellationHandle = ad_utility::SharedCancellationHandle; - // The specification of a scan operation for a given permutation. - // Can either be a full scan (all three elements are `std::nullopt`), - // a scan for a fixed `col0Id`, a scan for a fixed `col0Id` and `col1Id`, - // or even a scan for a single triple to check whether it is contained in - // the knowledge graph at all. The values which are `nullopt` become variables - // and are returned as columns in the result of the scan. - class ScanSpecification { - private: - using T = std::optional; - - T col0Id_; - T col1Id_; - T col2Id_; - - void validate() const { - bool c0 = col0Id_.has_value(); - bool c1 = col1Id_.has_value(); - bool c2 = col2Id_.has_value(); - if (!c0) { - AD_CORRECTNESS_CHECK(!c1 && !c2); - } - if (!c1) { - AD_CORRECTNESS_CHECK(!c2); - } - } - - public: - ScanSpecification(T col0Id, T col1Id, T col2Id) - : col0Id_{col0Id}, col1Id_{col1Id}, col2Id_{col2Id} { - validate(); - } - const T& col0Id() const { return col0Id_; } - const T& col1Id() const { return col1Id_; } - const T& col2Id() const { return col2Id_; } - - // Only used in tests. - void setCol1Id(T col1Id) { - col1Id_ = col1Id; - validate(); - } - }; - // The metadata of a single relation together with a subset of its // blocks and possibly a `col1Id` for additional filtering. This is used as // the input to several functions below that take such an input. diff --git a/src/index/Index.cpp b/src/index/Index.cpp index f853d430df..8c2841ca22 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -286,8 +286,8 @@ IdTable Index::scan( // ____________________________________________________________________________ IdTable Index::scan( - const Permutation::ScanSpecification& scanSpecification, - Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, + const ScanSpecification& scanSpecification, Permutation::Enum p, + Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { return pimpl_->scan(scanSpecification, p, additionalColumns, @@ -300,51 +300,3 @@ size_t Index::getResultSizeOfScan( const Permutation::Enum& permutation) const { return pimpl_->getResultSizeOfScan(scanSpecification, permutation); } - -// ____________________________________________________________________________ -std::optional -ScanSpecificationAsTripleComponent::toScanSpecification( - const IndexImpl& index) const { - // TODO Use `std::optional::transform`. - // TODO: We can also have LocalVocab entries is the - // ScanSpecification. - std::optional col0Id = col0_.has_value() - ? col0_.value().toValueId(index.getVocab()) - : std::nullopt; - std::optional col1Id = col1_.has_value() - ? col1_.value().toValueId(index.getVocab()) - : std::nullopt; - std::optional col2Id = col2_.has_value() - ? col2_.value().toValueId(index.getVocab()) - : std::nullopt; - if (!col0Id.has_value() || (col1_.has_value() && !col1Id.has_value()) || - (col2_.has_value() && !col2Id.has_value())) { - return std::nullopt; - } - return Permutation::ScanSpecification{col0Id, col1Id, col2Id}; -} - -// ____________________________________________________________________________ -ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, - T col1, - T col2) { - auto isUnbound = [](const T& t) { - return !t.has_value() || t.value().isVariable(); - }; - if (isUnbound(col0)) { - AD_CONTRACT_CHECK(isUnbound(col1)); - } - if (isUnbound(col1)) { - AD_CONTRACT_CHECK(isUnbound(col2)); - } - auto toNulloptIfVariable = [](T& tc) -> std::optional { - if (tc.has_value() && tc.value().isVariable()) { - return std::nullopt; - } else { - return std::move(tc); - } - }; - col0_ = toNulloptIfVariable(col0); - col1_ = toNulloptIfVariable(col1); - col2_ = toNulloptIfVariable(col2); -} diff --git a/src/index/Index.h b/src/index/Index.h index a6c3fce5dc..e74690157e 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -23,24 +23,6 @@ class IdTable; class TextBlockMetaData; class IndexImpl; -// TODO Comment. -class ScanSpecificationAsTripleComponent { - using T = std::optional; - - private: - std::optional col0_; - std::optional col1_; - std::optional col2_; - - public: - ScanSpecificationAsTripleComponent(T col0, T col1, T col2); - std::optional toScanSpecification( - const IndexImpl& index) const; - size_t numColumns() const { - return 3 - col0_.has_value() - col1_.has_value() - col2_.has_value(); - } -}; - class Index { private: // Pimpl to reduce compile times. @@ -263,8 +245,7 @@ class Index { const LimitOffsetClause& limitOffset = {}) const; // Similar to the overload of `scan` above, but the keys are specified as IDs. - IdTable scan(const Permutation::ScanSpecification& scanSpecification, - Permutation::Enum p, + IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset = {}) const; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 71fec97cfa..b7596b5450 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1384,8 +1384,8 @@ IdTable IndexImpl::scan( } // _____________________________________________________________________________ IdTable IndexImpl::scan( - const Permutation::ScanSpecification& scanSpecification, - Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, + const ScanSpecification& scanSpecification, Permutation::Enum p, + Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset) const { return getPermutation(p).scan(scanSpecification, additionalColumns, diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 6caef088aa..d820ffc4d3 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -404,8 +404,7 @@ class IndexImpl { const LimitOffsetClause& limitOffset = {}) const; // _____________________________________________________________________________ - IdTable scan(const Permutation::ScanSpecification& scanSpecification, - Permutation::Enum p, + IdTable scan(const ScanSpecification& scanSpecification, Permutation::Enum p, Permutation::ColumnIndicesRef additionalColumns, const ad_utility::SharedCancellationHandle& cancellationHandle, const LimitOffsetClause& limitOffset = {}) const; diff --git a/src/index/Permutation.h b/src/index/Permutation.h index 2defb4c58e..db4b36635d 100644 --- a/src/index/Permutation.h +++ b/src/index/Permutation.h @@ -38,8 +38,6 @@ class Permutation { using ColumnIndices = CompressedRelationReader::ColumnIndices; using CancellationHandle = ad_utility::SharedCancellationHandle; - using ScanSpecification = CompressedRelationReader::ScanSpecification; - // Convert a permutation to the corresponding string, etc. `PSO` is converted // to "PSO". static std::string_view toString(Enum permutation); diff --git a/test/CompressedRelationsTest.cpp b/test/CompressedRelationsTest.cpp index c2aad9d7c6..24faf16c2e 100644 --- a/test/CompressedRelationsTest.cpp +++ b/test/CompressedRelationsTest.cpp @@ -174,8 +174,7 @@ void testCompressedRelations(const auto& inputs, std::string testCaseName, ASSERT_FLOAT_EQ(m.numRows_ / static_cast(i + 1), m.multiplicityCol1_); // Scan for all distinct `col0` and check that we get the expected result. - CompressedRelationReader::ScanSpecification scanSpec{ - metaData[i].col0Id_, std::nullopt, std::nullopt}; + ScanSpecification scanSpec{metaData[i].col0Id_, std::nullopt, std::nullopt}; IdTable table = reader.scan(scanSpec, blocks, additionalColumns, cancellationHandle); const auto& col1And2 = inputs[i].col1And2_; @@ -207,8 +206,8 @@ void testCompressedRelations(const auto& inputs, std::string testCaseName, std::vector> col3; auto scanAndCheck = [&]() { - CompressedRelationReader::ScanSpecification scanSpec{ - metaData[i].col0Id_, V(lastCol1Id), std::nullopt}; + ScanSpecification scanSpec{metaData[i].col0Id_, V(lastCol1Id), + std::nullopt}; auto size = reader.getResultSizeOfScan(scanSpec, blocks); IdTable tableWidthOne = reader.scan(scanSpec, blocks, Permutation::ColumnIndicesRef{}, diff --git a/test/TriplesViewTest.cpp b/test/TriplesViewTest.cpp index 4c9fd53825..956100991c 100644 --- a/test/TriplesViewTest.cpp +++ b/test/TriplesViewTest.cpp @@ -30,7 +30,7 @@ struct DummyPermutation { } cppcoro::generator lazyScan( - CompressedRelationReader::ScanSpecification scanSpec, + ScanSpecification scanSpec, std::optional> blocks, std::span, const auto&) const { AD_CORRECTNESS_CHECK(!blocks.has_value()); From fe183000e099c15e2852ae23499bea4f3a60b19a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 1 Aug 2024 19:34:02 +0200 Subject: [PATCH 08/14] A round of reviews. --- src/engine/CMakeLists.txt | 2 +- src/engine/CartesianProductJoin.cpp | 18 +++++++++++- src/engine/IndexScan.cpp | 29 +++++-------------- src/engine/IndexScan.h | 2 +- src/engine/QueryPlanner.cpp | 1 + src/engine/ScanSpecification.cpp | 45 ++++++++++++++++------------- src/engine/ScanSpecification.h | 7 +++-- 7 files changed, 57 insertions(+), 47 deletions(-) diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index a21afc093c..4587406bbe 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -17,4 +17,4 @@ qlever_target_link_libraries(engine util index parser sparqlExpressions http Sor add_library(scanSpecification ScanSpecification.cpp) qlever_target_link_libraries(scanSpecification index) -qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) +qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) \ No newline at end of file diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index 8f8147f5c2..d2ada341de 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -154,10 +154,18 @@ ProtoResult CartesianProductJoin::computeResult( child.setLimit(limitIfPresent.value()); } subResults.push_back(child.getResult()); + + const auto& table = subResults.back()->idTable(); // Early stopping: If one of the results is empty, we can stop early. - if (subResults.back()->idTable().size() == 0) { + if (table.size() == 0) { break; } + + // If one of the children is the neutral element (because of a triple with + // zero variables), we can simply ignore it here. + if (table.numRows() == 1 && table.numColumns() == 0) { + subResults.pop_back(); + } // Example for the following calculation: If we have a LIMIT of 1000 and // the first child already has a result of size 100, then the second child // needs to evaluate only its first 10 results. The +1 is because integer @@ -169,6 +177,14 @@ ProtoResult CartesianProductJoin::computeResult( } } + // TODO find a solution to simply return the subresult. + // This can probably be done by using the `ProtoResult`. + /* + if (subResults.size() == 1) { + return subResults.at(0); + } + */ + auto sizesView = std::views::transform( subResults, [](const auto& child) { return child->idTable().size(); }); auto totalResultSize = std::accumulate(sizesView.begin(), sizesView.end(), diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 93d0f2b0eb..15940b525b 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -125,7 +125,7 @@ ProtoResult IndexScan::computeResult([[maybe_unused]] bool requestLaziness) { idTable.setNumColumns(numVariables_); const auto& index = _executionContext->getIndex(); if (numVariables_ < 3) { - idTable = index.scan(getPermutedTripleNoVariables(), permutation_, + idTable = index.scan(getScanSpecification(), permutation_, additionalColumns(), cancellationHandle_, getLimit()); } else { AD_CORRECTNESS_CHECK(numVariables_ == 3); @@ -143,8 +143,7 @@ size_t IndexScan::computeSizeEstimate() const { AD_CORRECTNESS_CHECK(_executionContext); // We have to do a simple scan anyway so might as well do it now if (numVariables_ < 3) { - return getIndex().getResultSizeOfScan(getPermutedTripleNoVariables(), - permutation_); + return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_); } else { // The triple consists of three variables. // TODO As soon as all implementations of a full index scan @@ -235,8 +234,7 @@ std::array IndexScan::getPermutedTriple() } // ___________________________________________________________________________ -ScanSpecificationAsTripleComponent IndexScan::getPermutedTripleNoVariables() - const { +ScanSpecificationAsTripleComponent IndexScan::getScanSpecification() const { auto permutedTriple = getPermutedTriple(); return {*permutedTriple[0], *permutedTriple[1], *permutedTriple[2]}; } @@ -274,26 +272,13 @@ Permutation::IdTableGenerator IndexScan::getLazyScan( // ________________________________________________________________ std::optional IndexScan::getMetadataForScan( const IndexScan& s) { - auto permutedTriple = s.getPermutedTriple(); - const IndexImpl& index = s.getIndex().getImpl(); - auto numVars = s.numVariables_; - std::optional col0Id = - numVars == 3 ? std::nullopt - : permutedTriple[0]->toValueId(index.getVocab()); - std::optional col1Id = - numVars >= 2 ? std::nullopt - : permutedTriple[1]->toValueId(index.getVocab()); - std::optional col2Id = - numVars >= 1 ? std::nullopt - : permutedTriple[2]->toValueId(index.getVocab()); - if ((!col0Id.has_value() && numVars < 3) || - (!col1Id.has_value() && numVars < 2) || - (!col2Id.has_value() && numVars < 1)) { + const auto& index = s.getExecutionContext()->getIndex().getImpl(); + auto scanSpec = s.getScanSpecification().toScanSpecification(index); + if (!scanSpec.has_value()) { return std::nullopt; } - return index.getPermutation(s.permutation()) - .getMetadataAndBlocks({col0Id, col1Id, col2Id}); + .getMetadataAndBlocks(scanSpec.value()); }; // ________________________________________________________________ diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index f2fa4611cc..5914e27d71 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -100,7 +100,7 @@ class IndexScan final : public Operation { // `permutation_`. For example if `permutation_ == PSO` then the result is // {&predicate_, &subject_, &object_} std::array getPermutedTriple() const; - ScanSpecificationAsTripleComponent getPermutedTripleNoVariables() const; + ScanSpecificationAsTripleComponent getScanSpecification() const; private: ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index f12a4f6e4b..26f6077270 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -630,6 +630,7 @@ void QueryPlanner::seedFromOrdinaryTriple( static_cast(isVariable(triple.p_)) + static_cast(isVariable(triple.o_)); if (numVars == 0) { + // We could read this from any of the permutations. addIndexScan(Permutation::Enum::PSO); } else if (numVars == 1) { indexScanSingleVarCase(triple, addIndexScan); diff --git a/src/engine/ScanSpecification.cpp b/src/engine/ScanSpecification.cpp index bc0730bc79..d546491567 100644 --- a/src/engine/ScanSpecification.cpp +++ b/src/engine/ScanSpecification.cpp @@ -13,17 +13,24 @@ ScanSpecificationAsTripleComponent::toScanSpecification( // TODO Use `std::optional::transform`. // TODO: We can also have LocalVocab entries is the // ScanSpecification. - std::optional col0Id = col0_.has_value() - ? col0_.value().toValueId(index.getVocab()) - : std::nullopt; - std::optional col1Id = col1_.has_value() - ? col1_.value().toValueId(index.getVocab()) - : std::nullopt; - std::optional col2Id = col2_.has_value() - ? col2_.value().toValueId(index.getVocab()) - : std::nullopt; - if (!col0Id.has_value() || (col1_.has_value() && !col1Id.has_value()) || - (col2_.has_value() && !col2Id.has_value())) { + bool nonexistingVocabEntryFound = false; + auto getId = + [&index, &nonexistingVocabEntryFound]( + const std::optional& tc) -> std::optional { + if (!tc.has_value()) { + return std::nullopt; + } + auto id = tc.value().toValueId(index.getVocab()); + if (!id.has_value()) { + nonexistingVocabEntryFound = true; + } + return id; + }; + std::optional col0Id = getId(col0_); + std::optional col1Id = getId(col1_); + std::optional col2Id = getId(col2_); + + if (nonexistingVocabEntryFound) { return std::nullopt; } return ScanSpecification{col0Id, col1Id, col2Id}; @@ -33,15 +40,6 @@ ScanSpecificationAsTripleComponent::toScanSpecification( ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, T col1, T col2) { - auto isUnbound = [](const T& t) { - return !t.has_value() || t.value().isVariable(); - }; - if (isUnbound(col0)) { - AD_CONTRACT_CHECK(isUnbound(col1)); - } - if (isUnbound(col1)) { - AD_CONTRACT_CHECK(isUnbound(col2)); - } auto toNulloptIfVariable = [](T& tc) -> std::optional { if (tc.has_value() && tc.value().isVariable()) { return std::nullopt; @@ -52,4 +50,11 @@ ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, col0_ = toNulloptIfVariable(col0); col1_ = toNulloptIfVariable(col1); col2_ = toNulloptIfVariable(col2); + + if (!col0.has_value()) { + AD_CONTRACT_CHECK(!col1.has_value()); + } + if (col1.has_value()) { + AD_CONTRACT_CHECK(!col2.has_value()); + } } diff --git a/src/engine/ScanSpecification.h b/src/engine/ScanSpecification.h index e315fd29ba..9935fe64fc 100644 --- a/src/engine/ScanSpecification.h +++ b/src/engine/ScanSpecification.h @@ -5,6 +5,7 @@ #pragma once #include +#include "engine/LocalVocab.h" #include "global/Id.h" #include "parser/TripleComponent.h" @@ -24,6 +25,7 @@ class ScanSpecification { T col0Id_; T col1Id_; T col2Id_; + friend class ScanSpecificationAsTripleComponent; void validate() const { bool c0 = col0Id_.has_value(); @@ -72,8 +74,9 @@ class ScanSpecificationAsTripleComponent { // Convert to a `ScanSpecification`. The `index` is used to convert the // `TripleComponent` to `Id`s by looking them up in the vocabulary. Return - // `nullopt` if one of the vocab lookup fails. Note: Once we implement SPARQL - // UPDATE, we possibly also have to use a `LocalVocab` here. + // `nullopt` if and only if one of the vocab lookup fails and the result of + // the corresponding scan will thus always be empty. Note: Once we implement + // SPARQL UPDATE, we possibly also have to use a `LocalVocab` here. // TODO Try out if this can be done just now. std::optional toScanSpecification( const IndexImpl& index) const; From 6cabb80689c8b384f9b234ef5496ec2e6e5243b8 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 1 Aug 2024 19:38:42 +0200 Subject: [PATCH 09/14] A small change. --- src/engine/CMakeLists.txt | 2 +- src/engine/ScanSpecification.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 4587406bbe..a21afc093c 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -17,4 +17,4 @@ qlever_target_link_libraries(engine util index parser sparqlExpressions http Sor add_library(scanSpecification ScanSpecification.cpp) qlever_target_link_libraries(scanSpecification index) -qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) \ No newline at end of file +qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) diff --git a/src/engine/ScanSpecification.h b/src/engine/ScanSpecification.h index 9935fe64fc..dfdf931a56 100644 --- a/src/engine/ScanSpecification.h +++ b/src/engine/ScanSpecification.h @@ -74,10 +74,10 @@ class ScanSpecificationAsTripleComponent { // Convert to a `ScanSpecification`. The `index` is used to convert the // `TripleComponent` to `Id`s by looking them up in the vocabulary. Return - // `nullopt` if and only if one of the vocab lookup fails and the result of - // the corresponding scan will thus always be empty. Note: Once we implement - // SPARQL UPDATE, we possibly also have to use a `LocalVocab` here. - // TODO Try out if this can be done just now. + // `nullopt` if and only if one of the vocab lookup fails (then the result of + // the corresponding scan will be empty). + // TODO Once we implement SPARQL UPDATE, we possibly also to use the + // `LocalVocab` of the UPDATE triples here. std::optional toScanSpecification( const IndexImpl& index) const; size_t numColumns() const { From f9897ce6b083b258fa8e29ebd63864080db792b6 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 1 Aug 2024 19:43:31 +0200 Subject: [PATCH 10/14] BugfixOfBugfix --- src/engine/ScanSpecification.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/ScanSpecification.cpp b/src/engine/ScanSpecification.cpp index d546491567..4ede911a10 100644 --- a/src/engine/ScanSpecification.cpp +++ b/src/engine/ScanSpecification.cpp @@ -54,7 +54,7 @@ ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, if (!col0.has_value()) { AD_CONTRACT_CHECK(!col1.has_value()); } - if (col1.has_value()) { + if (!col1.has_value()) { AD_CONTRACT_CHECK(!col2.has_value()); } } From d71a2fccbeb3c8a5e7198da08907c797f595300d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 1 Aug 2024 19:48:21 +0200 Subject: [PATCH 11/14] Last thing that is left is To add the missing unit tests. --- src/engine/CartesianProductJoin.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index d2ada341de..ed4541d19a 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -177,13 +177,9 @@ ProtoResult CartesianProductJoin::computeResult( } } - // TODO find a solution to simply return the subresult. - // This can probably be done by using the `ProtoResult`. - /* - if (subResults.size() == 1) { - return subResults.at(0); - } - */ + // TODO Find a solution to cheaply handle the case, that only a + // single result is left. This can probably be done by using the + // `ProtoResult`. auto sizesView = std::views::transform( subResults, [](const auto& child) { return child->idTable().size(); }); From 2aefcb4bb4ace0e1b60a63c9149c10819a6bf82a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 2 Aug 2024 11:00:37 +0200 Subject: [PATCH 12/14] Add unit tests and inevitably fix the bugs that those tests discovered. --- src/engine/CMakeLists.txt | 2 +- src/engine/CartesianProductJoin.cpp | 3 +- src/engine/CartesianProductJoin.h | 6 +- src/engine/Operation.cpp | 2 +- src/engine/Operation.h | 2 +- src/engine/QueryExecutionContext.h | 9 +-- src/engine/ScanSpecification.cpp | 37 +++++++++-- src/engine/ScanSpecification.h | 23 +++---- src/engine/idTable/IdTable.h | 2 +- src/parser/CMakeLists.txt | 2 +- src/util/ConcurrentCache.h | 2 +- test/ValuesForTestingTest.cpp | 16 ++++- test/engine/CMakeLists.txt | 1 + test/engine/CartesianProductJoinTest.cpp | 16 ++++- test/engine/ScanSpecificationTest.cpp | 81 ++++++++++++++++++++++++ test/engine/ValuesForTesting.h | 3 +- 16 files changed, 163 insertions(+), 44 deletions(-) create mode 100644 test/engine/ScanSpecificationTest.cpp diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index a21afc093c..fbf6cad996 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -16,5 +16,5 @@ add_library(engine qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams) add_library(scanSpecification ScanSpecification.cpp) -qlever_target_link_libraries(scanSpecification index) +qlever_target_link_libraries(scanSpecification index engine) qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index ed4541d19a..c73361e001 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -157,7 +157,7 @@ ProtoResult CartesianProductJoin::computeResult( const auto& table = subResults.back()->idTable(); // Early stopping: If one of the results is empty, we can stop early. - if (table.size() == 0) { + if (table.empty()) { break; } @@ -165,6 +165,7 @@ ProtoResult CartesianProductJoin::computeResult( // zero variables), we can simply ignore it here. if (table.numRows() == 1 && table.numColumns() == 0) { subResults.pop_back(); + continue; } // Example for the following calculation: If we have a LIMIT of 1000 and // the first child already has a result of size 100, then the second child diff --git a/src/engine/CartesianProductJoin.h b/src/engine/CartesianProductJoin.h index 96047d62fa..779adf1dba 100644 --- a/src/engine/CartesianProductJoin.h +++ b/src/engine/CartesianProductJoin.h @@ -2,9 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#ifndef QLEVER_CARTESIANPRODUCTJOIN_H -#define QLEVER_CARTESIANPRODUCTJOIN_H - +#pragma once #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" @@ -92,5 +90,3 @@ class CartesianProductJoin : public Operation { std::span inputColumn, size_t groupSize, size_t offset); }; - -#endif // QLEVER_CARTESIANPRODUCTJOIN_H diff --git a/src/engine/Operation.cpp b/src/engine/Operation.cpp index b29e2c6098..4293aa2c9c 100644 --- a/src/engine/Operation.cpp +++ b/src/engine/Operation.cpp @@ -251,7 +251,7 @@ void Operation::updateRuntimeInformationOnSuccess( // ____________________________________________________________________________________________________________________ void Operation::updateRuntimeInformationOnSuccess( - const ConcurrentLruCache ::ResultAndCacheStatus& resultAndCacheStatus, + const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus, Milliseconds duration) { updateRuntimeInformationOnSuccess( *resultAndCacheStatus._resultPointer->resultTable(), diff --git a/src/engine/Operation.h b/src/engine/Operation.h index 920b09594f..9e23ee3fcf 100644 --- a/src/engine/Operation.h +++ b/src/engine/Operation.h @@ -263,7 +263,7 @@ class Operation { // Create and store the complete runtime information for this operation after // it has either been successfully computed or read from the cache. virtual void updateRuntimeInformationOnSuccess( - const ConcurrentLruCache::ResultAndCacheStatus& resultAndCacheStatus, + const QueryResultCache::ResultAndCacheStatus& resultAndCacheStatus, Milliseconds duration) final; // Similar to the function above, but the components are specified manually. diff --git a/src/engine/QueryExecutionContext.h b/src/engine/QueryExecutionContext.h index 8311c32eb7..e7c1ff37f6 100644 --- a/src/engine/QueryExecutionContext.h +++ b/src/engine/QueryExecutionContext.h @@ -53,15 +53,8 @@ class CacheValue { // Threadsafe LRU cache for (partial) query results, that // checks on insertion, if the result is currently being computed // by another query. -using ConcurrentLruCache = ad_utility::ConcurrentCache< +using QueryResultCache = ad_utility::ConcurrentCache< ad_utility::LRUCache>; -class QueryResultCache : public ConcurrentLruCache { - public: - virtual ~QueryResultCache() = default; - void clearAll() override { ConcurrentLruCache::clearAll(); } - // Inherit the constructor. - using ConcurrentLruCache::ConcurrentLruCache; -}; // Execution context for queries. // Holds references to index and engine, implements caching. diff --git a/src/engine/ScanSpecification.cpp b/src/engine/ScanSpecification.cpp index 4ede911a10..ee8a0543d3 100644 --- a/src/engine/ScanSpecification.cpp +++ b/src/engine/ScanSpecification.cpp @@ -4,8 +4,16 @@ #include "engine/ScanSpecification.h" +#include "index/Index.h" #include "index/IndexImpl.h" +// ____________________________________________________________________________ +std::optional +ScanSpecificationAsTripleComponent::toScanSpecification( + const Index& index) const { + return toScanSpecification(index.getImpl()); +} + // ____________________________________________________________________________ std::optional ScanSpecificationAsTripleComponent::toScanSpecification( @@ -51,10 +59,31 @@ ScanSpecificationAsTripleComponent::ScanSpecificationAsTripleComponent(T col0, col1_ = toNulloptIfVariable(col1); col2_ = toNulloptIfVariable(col2); - if (!col0.has_value()) { - AD_CONTRACT_CHECK(!col1.has_value()); + if (!col0_.has_value()) { + AD_CONTRACT_CHECK(!col1_.has_value()); + } + if (!col1_.has_value()) { + AD_CONTRACT_CHECK(!col2_.has_value()); + } +} + +// ____________________________________________________________________________ +size_t ScanSpecificationAsTripleComponent::numColumns() const { + auto i = [](const auto& x) -> size_t { + return static_cast(x.has_value()); + }; + return 3 - i(col0_) - i(col1_) - i(col2_); +} + +// _____________________________________________________________________________ +void ScanSpecification::validate() const { + bool c0 = col0Id_.has_value(); + bool c1 = col1Id_.has_value(); + bool c2 = col2Id_.has_value(); + if (!c0) { + AD_CORRECTNESS_CHECK(!c1 && !c2); } - if (!col1.has_value()) { - AD_CONTRACT_CHECK(!col2.has_value()); + if (!c1) { + AD_CORRECTNESS_CHECK(!c2); } } diff --git a/src/engine/ScanSpecification.h b/src/engine/ScanSpecification.h index dfdf931a56..de8e915cee 100644 --- a/src/engine/ScanSpecification.h +++ b/src/engine/ScanSpecification.h @@ -11,6 +11,7 @@ // Forward declaration class IndexImpl; +class Index; // The specification of a scan operation for a given permutation. // Can either be a full scan (all three elements are `std::nullopt`), @@ -27,17 +28,7 @@ class ScanSpecification { T col2Id_; friend class ScanSpecificationAsTripleComponent; - void validate() const { - bool c0 = col0Id_.has_value(); - bool c1 = col1Id_.has_value(); - bool c2 = col2Id_.has_value(); - if (!c0) { - AD_CORRECTNESS_CHECK(!c1 && !c2); - } - if (!c1) { - AD_CORRECTNESS_CHECK(!c2); - } - } + void validate() const; public: ScanSpecification(T col0Id, T col1Id, T col2Id) @@ -48,6 +39,8 @@ class ScanSpecification { const T& col1Id() const { return col1Id_; } const T& col2Id() const { return col2Id_; } + bool operator==(const ScanSpecification&) const = default; + // Only used in tests. void setCol1Id(T col1Id) { col1Id_ = col1Id; @@ -80,7 +73,9 @@ class ScanSpecificationAsTripleComponent { // `LocalVocab` of the UPDATE triples here. std::optional toScanSpecification( const IndexImpl& index) const; - size_t numColumns() const { - return 3 - col0_.has_value() - col1_.has_value() - col2_.has_value(); - } + std::optional toScanSpecification( + const Index& index) const; + + // The number of columns that the corresponding index scan will have. + size_t numColumns() const; }; diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index 4377b02445..c8fb92d84e 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -1,4 +1,4 @@ -// Copyright 2021, University of Freiburg, Chair of Algorithms and Data +/// Copyright 2021, University of Freiburg, Chair of Algorithms and Data // Structures. Author: Johannes Kalmbach #pragma once diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index 29ca836c71..8b1efdcb1e 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -25,5 +25,5 @@ add_library(parser Iri.cpp Literal.cpp LiteralOrIri.cpp) -qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine) +qlever_target_link_libraries(parser sparqlParser parserData sparqlExpressions rdfEscaping re2::re2 util engine index) diff --git a/src/util/ConcurrentCache.h b/src/util/ConcurrentCache.h index e76e2980b7..de8d0f270e 100644 --- a/src/util/ConcurrentCache.h +++ b/src/util/ConcurrentCache.h @@ -206,7 +206,7 @@ class ConcurrentCache { } /// Clear the cache, including the pinned entries. - virtual void clearAll() { _cacheAndInProgressMap.wlock()->_cache.clearAll(); } + void clearAll() { _cacheAndInProgressMap.wlock()->_cache.clearAll(); } /// Delete elements from the unpinned part of the cache of total size /// at least `size`; diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index 95ce76ce46..8c4b86d019 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -25,9 +25,9 @@ TEST(ValuesForTesting, valuesForTesting) { ASSERT_EQ(v.getMultiplicity(0), 42.0); ASSERT_EQ(v.getMultiplicity(1), 84.0); - ASSERT_THAT( - v.getCacheKey(), - ::testing::StartsWith("Values for testing with 2 columns. V:3 V:12")); + ASSERT_THAT(v.getCacheKey(), + ::testing::StartsWith( + "Values for testing with 2 columns and 3 rows. V:3 V:12")); ASSERT_THAT(v.getCacheKey(), ::testing::EndsWith("Supports limit: 0")); ASSERT_EQ(v.getDescriptor(), "explicit values for testing"); ASSERT_TRUE(v.resultSortedOn().empty()); @@ -36,3 +36,13 @@ TEST(ValuesForTesting, valuesForTesting) { auto result = v.getResult(); ASSERT_EQ(result->idTable(), table); } + +// ____________________________________________________________________________ +TEST(ValuesForTesting, cornerCasesCacheKey) { + auto empty = makeIdTableFromVector({}); + auto neutral = makeIdTableFromVector({{}}); + + ValuesForTesting vEmpty{getQec(), empty.clone(), {}}; + ValuesForTesting vNeutral{getQec(), neutral.clone(), {}}; + EXPECT_NE(vEmpty.getCacheKey(), vNeutral.getCacheKey()); +} diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index 4f6e09ff5f..d64860a72d 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -4,3 +4,4 @@ addLinkAndDiscoverTest(CartesianProductJoinTest engine) addLinkAndDiscoverTest(TextIndexScanForWordTest engine) addLinkAndDiscoverTest(TextIndexScanForEntityTest engine) addLinkAndDiscoverTest(DistinctTest engine) +addLinkAndDiscoverTestSerial(ScanSpecificationTest engine index parser scanSpecification) diff --git a/test/engine/CartesianProductJoinTest.cpp b/test/engine/CartesianProductJoinTest.cpp index f07bb11218..cd05e31b9b 100644 --- a/test/engine/CartesianProductJoinTest.cpp +++ b/test/engine/CartesianProductJoinTest.cpp @@ -90,9 +90,21 @@ void testCartesianProduct(VectorTable expected, std::vector inputs, TEST(CartesianProductJoin, computeResult) { // Simple base cases. VectorTable v{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; + VectorTable empty{}; testCartesianProduct(v, {v}); - testCartesianProduct({}, {{}, v, {}}); - testCartesianProduct({}, {{}, {}}); + testCartesianProduct(empty, {empty, v, empty}); + testCartesianProduct(empty, {empty, empty}); + + // Test cases where some or all of the inputs are Neutral elements (1 row, + // zero columns) that are automatically filtered out by the + // `CartesianProductJoin`. + VectorTable neutral{{}}; + testCartesianProduct(neutral, {neutral}); + testCartesianProduct(v, {v, neutral}); + testCartesianProduct(v, {neutral, v, neutral}); + testCartesianProduct(neutral, {neutral, neutral, neutral}); + testCartesianProduct(empty, {neutral, empty, neutral}); + testCartesianProduct(empty, {neutral, empty, v}); // Fails because of an empty input. EXPECT_ANY_THROW(makeJoin({})); diff --git a/test/engine/ScanSpecificationTest.cpp b/test/engine/ScanSpecificationTest.cpp new file mode 100644 index 0000000000..63c4386d67 --- /dev/null +++ b/test/engine/ScanSpecificationTest.cpp @@ -0,0 +1,81 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#include + +#include "../util/GTestHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "engine/ScanSpecification.h" + +// _____________________________________________________________________________ +TEST(ScanSpecification, validate) { + Id i = Id::makeFromInt(42); + auto n = std::nullopt; + using S = ScanSpecification; + EXPECT_NO_THROW(S(i, i, i)); + EXPECT_NO_THROW(S(i, i, n)); + EXPECT_NO_THROW(S(i, n, n)); + EXPECT_NO_THROW(S(n, n, n)); + + EXPECT_ANY_THROW(S(n, i, i)); + EXPECT_ANY_THROW(S(n, n, i)); + EXPECT_ANY_THROW(S(n, i, n)); + EXPECT_ANY_THROW(S(i, n, i)); +} + +// _____________________________________________________________________________ +TEST(ScanSpecification, ScanSpecificationAsTripleComponent) { + Id i = Id::makeFromInt(42); + TripleComponent iTc{42}; + auto n = std::nullopt; + using S = ScanSpecification; + using STc = ScanSpecificationAsTripleComponent; + + EXPECT_ANY_THROW(STc(n, iTc, iTc)); + EXPECT_ANY_THROW(STc(n, n, iTc)); + EXPECT_ANY_THROW(STc(n, iTc, n)); + EXPECT_ANY_THROW(STc(iTc, n, iTc)); + + const auto& index = ad_utility::testing::getQec()->getIndex(); + auto toScanSpec = [&index](const STc& s) { + return s.toScanSpecification(index); + }; + + // Match that a `ScanSpecificationAsTripleComponent` has the expected number + // of columns, and yields the expected `ScanSpecification` when + // `toScanSpecification` is called on it. + auto matchScanSpec = + [&toScanSpec](const std::optional spec, + size_t numColumns = 0) -> ::testing::Matcher { + auto innerMatcher = [&toScanSpec, &spec] { + return ::testing::ResultOf(toScanSpec, ::testing::Eq(spec)); + }; + if (!spec.has_value()) { + return innerMatcher(); + } else { + return ::testing::AllOf( + innerMatcher(), + AD_PROPERTY(STc, numColumns, ::testing::Eq(numColumns))); + } + }; + EXPECT_THAT(STc(iTc, iTc, iTc), matchScanSpec(S(i, i, i), 0)); + EXPECT_THAT(STc(iTc, iTc, n), matchScanSpec(S(i, i, n), 1)); + EXPECT_THAT(STc(iTc, n, n), matchScanSpec(S(i, n, n), 2)); + EXPECT_THAT(STc(n, n, n), matchScanSpec(S(n, n, n), 3)); + + // Test the resolution of vocab entries. + auto getId = ad_utility::testing::makeGetId(index); + auto x = getId(""); + TripleComponent xIri = TripleComponent::Iri::fromIriref(""); + + EXPECT_THAT(STc(xIri, xIri, xIri), matchScanSpec(S(x, x, x), 0)); + + // For an entry that is not in the vocabulary, the complete result of + // `toScanSpecification` is `nullopt`. + TripleComponent notInVocab = + TripleComponent::Iri::fromIriref(""); + EXPECT_THAT(STc(notInVocab, xIri, xIri), matchScanSpec(std::nullopt)); + EXPECT_THAT(STc(xIri, notInVocab, xIri), matchScanSpec(std::nullopt)); + EXPECT_THAT(STc(xIri, xIri, notInVocab), matchScanSpec(std::nullopt)); +} diff --git a/test/engine/ValuesForTesting.h b/test/engine/ValuesForTesting.h index dcbf130da8..ac7e363a95 100644 --- a/test/engine/ValuesForTesting.h +++ b/test/engine/ValuesForTesting.h @@ -65,7 +65,8 @@ class ValuesForTesting : public Operation { // ___________________________________________________________________________ string getCacheKeyImpl() const override { std::stringstream str; - str << "Values for testing with " << table_.numColumns() << " columns. "; + str << "Values for testing with " << table_.numColumns() << " columns and " + << table_.numRows() << " rows. "; if (table_.numRows() > 1000) { str << ad_utility::FastRandomIntGenerator{}(); } else { From a70579989cbe3dd9af00bb4e6df7a9ed9e30fbdd Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 2 Aug 2024 11:13:14 +0200 Subject: [PATCH 13/14] Fix linker troubles... --- src/engine/CMakeLists.txt | 4 ---- src/index/CMakeLists.txt | 4 ++-- src/index/CompressedRelation.h | 2 +- src/{engine => index}/ScanSpecification.cpp | 2 +- src/{engine => index}/ScanSpecification.h | 0 test/engine/CMakeLists.txt | 1 - test/index/CMakeLists.txt | 1 + test/{engine => index}/ScanSpecificationTest.cpp | 2 +- 8 files changed, 6 insertions(+), 10 deletions(-) rename src/{engine => index}/ScanSpecification.cpp (98%) rename src/{engine => index}/ScanSpecification.h (100%) rename test/{engine => index}/ScanSpecificationTest.cpp (98%) diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index fbf6cad996..92266f8de1 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,7 +14,3 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LocalVocabEntry.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams) - -add_library(scanSpecification ScanSpecification.cpp) -qlever_target_link_libraries(scanSpecification index engine) -qlever_target_link_libraries(engine util index parser sparqlExpressions scanSpecification http SortPerformanceEstimator Boost::iostreams) diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 7d4e7dd5b1..e41fd2471f 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -5,5 +5,5 @@ add_library(index LocatedTriples.cpp Permutation.cpp TextMetaData.cpp DocsDB.cpp FTSAlgorithms.cpp PrefixHeuristic.cpp CompressedRelation.cpp - PatternCreator.cpp) -qlever_target_link_libraries(index util parser vocabulary compilationInfo scanSpecification ${STXXL_LIBRARIES}) + PatternCreator.cpp ScanSpecification.cpp) +qlever_target_link_libraries(index util parser vocabulary compilationInfo ${STXXL_LIBRARIES}) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 42d3a3c200..af8725ec9a 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -9,9 +9,9 @@ #include #include -#include "engine/ScanSpecification.h" #include "engine/idTable/IdTable.h" #include "global/Id.h" +#include "index/ScanSpecification.h" #include "parser/data/LimitOffsetClause.h" #include "util/Cache.h" #include "util/CancellationHandle.h" diff --git a/src/engine/ScanSpecification.cpp b/src/index/ScanSpecification.cpp similarity index 98% rename from src/engine/ScanSpecification.cpp rename to src/index/ScanSpecification.cpp index ee8a0543d3..505dc1e051 100644 --- a/src/engine/ScanSpecification.cpp +++ b/src/index/ScanSpecification.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "engine/ScanSpecification.h" +#include "index/ScanSpecification.h" #include "index/Index.h" #include "index/IndexImpl.h" diff --git a/src/engine/ScanSpecification.h b/src/index/ScanSpecification.h similarity index 100% rename from src/engine/ScanSpecification.h rename to src/index/ScanSpecification.h diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index d64860a72d..4f6e09ff5f 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -4,4 +4,3 @@ addLinkAndDiscoverTest(CartesianProductJoinTest engine) addLinkAndDiscoverTest(TextIndexScanForWordTest engine) addLinkAndDiscoverTest(TextIndexScanForEntityTest engine) addLinkAndDiscoverTest(DistinctTest engine) -addLinkAndDiscoverTestSerial(ScanSpecificationTest engine index parser scanSpecification) diff --git a/test/index/CMakeLists.txt b/test/index/CMakeLists.txt index aa0d0950e1..3651259d1a 100644 --- a/test/index/CMakeLists.txt +++ b/test/index/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(vocabulary) addLinkAndDiscoverTest(PatternCreatorTest index) +addLinkAndDiscoverTestSerial(ScanSpecificationTest index) diff --git a/test/engine/ScanSpecificationTest.cpp b/test/index/ScanSpecificationTest.cpp similarity index 98% rename from test/engine/ScanSpecificationTest.cpp rename to test/index/ScanSpecificationTest.cpp index 63c4386d67..3c7df13755 100644 --- a/test/engine/ScanSpecificationTest.cpp +++ b/test/index/ScanSpecificationTest.cpp @@ -6,7 +6,7 @@ #include "../util/GTestHelpers.h" #include "../util/IndexTestHelpers.h" -#include "engine/ScanSpecification.h" +#include "index/ScanSpecification.h" // _____________________________________________________________________________ TEST(ScanSpecification, validate) { From bfa6dce35c2f08a260e18d9d5dc4d181124426b0 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 3 Aug 2024 17:44:59 +0200 Subject: [PATCH 14/14] Minor fix in layout. --- src/engine/idTable/IdTable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index c8fb92d84e..dd9895810c 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -1,5 +1,5 @@ -/// Copyright 2021, University of Freiburg, Chair of Algorithms and Data -// Structures. Author: Johannes Kalmbach +// Copyright 2021, University of Freiburg, Chair of Algorithms and Data +// Structures. Author: Johannes Kalmbach #pragma once