Store the internal and external triples separately (#1532)

The internal triples (for the predicates `ql:has-pattern` and `ql:langtag` and all the `@lang@...` predicates) are now stored in a separate PSO&POS permutation pair with corresponding index files. This makes the handling of internal triples in the rest of the code much simpler. Also, internal IRIs no longer leak into the result of any query, unless the query itself uses an internal IRI. In particular, this fixes #1513. This is an index-breaking change: among other things, it adds new index files with names `<basename>.internal.<suffix>`.
ad-freiburg · Oct 4, 2024 · 77ea2c6 · 77ea2c6
1 parent 342e06d
commit 77ea2c6
Show file tree

Hide file tree

Showing 24 changed files with 336 additions and 710 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -421,8 +421,5 @@ target_precompile_headers(ServerMain REUSE_FROM engine)
 add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
 qlever_target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})
 
-add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
-qlever_target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})
-
 add_executable(PrintIndexVersionMain src/PrintIndexVersionMain.cpp)
 qlever_target_link_libraries(PrintIndexVersionMain util)
diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt
@@ -12,5 +12,5 @@ add_library(engine
         Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
         VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
         CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
-        TextLimit.cpp LocalVocabEntry.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp)
+        TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp)
 qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams)
diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp
@@ -708,22 +708,13 @@ std::optional<IdTable> GroupBy::computeGroupByForFullIndexScan() const {
 
   _subtree->getRootOperation()->updateRuntimeInformationWhenOptimizedOut({});
 
-  auto ignoredRanges =
-      getIndex().getImpl().getIgnoredIdRanges(permutationEnum.value()).first;
   const auto& permutation =
       getExecutionContext()->getIndex().getPimpl().getPermutation(
           permutationEnum.value());
   auto table = permutation.getDistinctCol0IdsAndCounts(cancellationHandle_);
   if (numCounts == 0) {
     table.setColumnSubset({{0}});
   }
-  // TODO<joka921> This is only semi-efficient.
-  auto end = std::ranges::remove_if(table, [&ignoredRanges](const auto& row) {
-    return std::ranges::any_of(ignoredRanges, [id = row[0]](const auto& pair) {
-      return id >= pair.first && id < pair.second;
-    });
-  });
-  table.resize(end.begin() - table.begin());
 
   // TODO<joka921> This optimization should probably also apply if
   // the query is `SELECT DISTINCT ?s WHERE {?s ?p ?o} ` without a

diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp
@@ -11,7 +11,6 @@
 #include <string>
 
 #include "index/IndexImpl.h"
-#include "index/TriplesView.h"
 #include "parser/ParsedQuery.h"
 
 using std::string;
@@ -161,13 +160,8 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
   using enum Permutation::Enum;
   idTable.setNumColumns(numVariables_);
   const auto& index = _executionContext->getIndex();
-  if (numVariables_ < 3 || !additionalColumns().empty()) {
-    idTable = index.scan(getScanSpecification(), permutation_,
-                         additionalColumns(), cancellationHandle_, getLimit());
-  } else {
-    AD_CORRECTNESS_CHECK(numVariables_ == 3);
-    computeFullScan(&idTable, permutation_);
-  }
+  idTable = index.scan(getScanSpecification(), permutation_,
+                       additionalColumns(), cancellationHandle_, getLimit());
   AD_CORRECTNESS_CHECK(idTable.numColumns() == getResultWidth());
   LOG(DEBUG) << "IndexScan result computation done.\n";
   checkCancellation();
@@ -178,19 +172,7 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
 // _____________________________________________________________________________
 size_t IndexScan::computeSizeEstimate() const {
   AD_CORRECTNESS_CHECK(_executionContext);
-  // We have to do a simple scan anyway so might as well do it now
-  if (numVariables_ < 3) {
-    return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_);
-  } else {
-    // The triple consists of three variables.
-    // TODO<joka921> As soon as all implementations of a full index scan
-    // (Including the "dummy joins" in Join.cpp) consistently exclude the
-    // internal triples, this estimate should be changed to only return
-    // the number of triples in the actual knowledge graph (excluding the
-    // internal triples).
-    AD_CORRECTNESS_CHECK(numVariables_ == 3);
-    return getIndex().numTriples().normalAndInternal_();
-  }
+  return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_);
 }
 
 // _____________________________________________________________________________
@@ -223,44 +205,6 @@ void IndexScan::determineMultiplicities() {
   AD_CONTRACT_CHECK(multiplicity_.size() == getResultWidth());
 }
 
-// ________________________________________________________________________
-void IndexScan::computeFullScan(IdTable* result,
-                                const Permutation::Enum permutation) const {
-  auto [ignoredRanges, isTripleIgnored] =
-      getIndex().getImpl().getIgnoredIdRanges(permutation);
-
-  result->setNumColumns(3);
-
-  // This implementation computes the complete knowledge graph, except the
-  // internal triples.
-  uint64_t resultSize = getIndex().numTriples().normal;
-  if (getLimit()._limit.has_value() && getLimit()._limit < resultSize) {
-    resultSize = getLimit()._limit.value();
-  }
-
-  // TODO<joka921> Implement OFFSET
-  if (getLimit()._offset != 0) {
-    throw NotSupportedException{
-        "Scanning the complete index with an OFFSET clause is currently not "
-        "supported by QLever"};
-  }
-  result->reserve(resultSize);
-  auto table = std::move(*result).toStatic<3>();
-  size_t i = 0;
-  const auto& permutationImpl =
-      getExecutionContext()->getIndex().getImpl().getPermutation(permutation);
-  auto triplesView = TriplesView(permutationImpl, cancellationHandle_,
-                                 ignoredRanges, isTripleIgnored);
-  for (const auto& triple : triplesView) {
-    if (i >= resultSize) {
-      break;
-    }
-    table.push_back(triple);
-    ++i;
-  }
-  *result = std::move(table).toDynamic();
-}
-
 // ___________________________________________________________________________
 std::array<const TripleComponent* const, 3> IndexScan::getPermutedTriple()
     const {

diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h
@@ -124,8 +124,6 @@ class IndexScan final : public Operation {
 
   vector<QueryExecutionTree*> getChildren() override { return {}; }
 
-  void computeFullScan(IdTable* result, Permutation::Enum permutation) const;
-
   size_t computeSizeEstimate() const;
 
   std::string getCacheKeyImpl() const override;

diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
@@ -4,9 +4,9 @@
 
 #pragma once
 
-#include "engine/LocalVocabEntry.h"
 #include "global/TypedIndex.h"
 #include "global/VocabIndex.h"
+#include "index/LocalVocabEntry.h"
 
 // Typedefs for several kinds of typed indices that are used across QLever.
 

diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt
@@ -6,5 +6,5 @@ add_library(index
         DocsDB.cpp FTSAlgorithms.cpp
         PrefixHeuristic.cpp CompressedRelation.cpp
         PatternCreator.cpp ScanSpecification.cpp
-        DeltaTriples.cpp)
+        DeltaTriples.cpp LocalVocabEntry.cpp)
 qlever_target_link_libraries(index util parser vocabulary ${STXXL_LIBRARIES})
diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
@@ -62,6 +62,9 @@ constexpr inline std::string_view PARTIAL_MMAP_IDS = ".tmp.partial-ids-mmap.";
 constexpr inline std::string_view TMP_BASENAME_COMPRESSION =
     ".tmp.for-prefix-compression";
 
+// _________________________________________________________________
+constexpr inline std::string_view INTERNAL_INDEX_INFIX = ".internal";
+
 // _________________________________________________________________
 // The degree of parallelism that is used for the index building step, where the
 // unique elements of the vocabulary are identified via hash maps. Typically, 6

diff --git a/src/index/IndexBuilderTypes.h b/src/index/IndexBuilderTypes.h
@@ -262,14 +262,7 @@ auto getIdMapLambdas(
    * - All Ids are assigned according to itemArray[idx]
    */
   const auto itemMapLamdaCreator = [&itemArray, indexPtr](const size_t idx) {
-    auto& map = *itemArray[idx];
-    // Resolve the special IDs of the default and internal graph to their actual
-    // IDs. This is precomputed for efficiency gains.
-    auto internalGraphId =
-        map.getId(qlever::specialIds().at(INTERNAL_GRAPH_IRI));
-    auto defaultGraphId = map.getId(qlever::specialIds().at(DEFAULT_GRAPH_IRI));
-    return [&map = *itemArray[idx], indexPtr, internalGraphId,
-            defaultGraphId](ad_utility::Rvalue auto&& tr) {
+    return [&map = *itemArray[idx], indexPtr](ad_utility::Rvalue auto&& tr) {
       auto lt = indexPtr->tripleToInternalRepresentation(AD_FWD(tr));
       OptionalIds res;
       // get Ids for the actual triple and store them in the result.
@@ -293,24 +286,19 @@ auto getIdMapLambdas(
                       " The following lines probably have to be changed when "
                       "the number of payload columns changes");
         // extra triple <subject> @language@<predicate> <object>
-        // The additional triples have the graph ID of the internal graph if the
-        // triple was in the default/fallback graph, else they keep their graph
-        // ID.
-        // TODO<joka921> Maybe we should have an `internalGraph` per graph, but
-        // this requires further work. The current approach at least keeps the
-        // language filters working in combination with named graphs and doesn't
-        // add further inconsistencies.
+        // The additional triples have the same graph ID as the original triple.
+        // This makes optimizations such as language filters also work with
+        // named graphs. Note that we have a different mechanism in place to
+        // distinguish between normal and internal triples.
         auto tripleGraphId = res[0].value()[ADDITIONAL_COLUMN_GRAPH_ID];
-        auto addedTripleGraphId =
-            tripleGraphId == defaultGraphId ? internalGraphId : tripleGraphId;
         res[1].emplace(
-            Arr{spoIds[0], langTaggedPredId, spoIds[2], addedTripleGraphId});
+            Arr{spoIds[0], langTaggedPredId, spoIds[2], tripleGraphId});
         // extra triple <object> ql:language-tag <@language>
         res[2].emplace(Arr{spoIds[2],
                            map.getId(TripleComponent{
                                ad_utility::triple_component::Iri::fromIriref(
                                    LANGUAGE_PREDICATE)}),
-                           langTagId, addedTripleGraphId});
+                           langTagId, tripleGraphId});
       }
       return res;
     };

diff --git a/src/index/IndexFormatVersion.h b/src/index/IndexFormatVersion.h
@@ -36,5 +36,5 @@ struct IndexFormatVersion {
 // The actual index version. Change it once the binary format of the index
 // changes.
 inline const IndexFormatVersion& indexFormatVersion{
-    1506, DateYearOrDuration{Date{2024, 9, 27}}};
+    1532, DateYearOrDuration{Date{2024, 10, 4}}};
 }  // namespace qlever