Skip to content

Commit

Permalink
Store the internal and external triples separately (#1532)
Browse files Browse the repository at this point in the history
The internal triples (for the predicates `ql:has-pattern` and `ql:langtag` and all the `@lang@...` predicates) are now stored in a separate pair of permutations (PSO and POS) with their own index files. This makes the handling of internal triples in the rest of the code much simpler. Also, internal IRIs no longer leak for any query, unless the query itself uses an internal IRI. In particular, this fixes #1513.

This is an index-breaking change (the index format version is bumped to 1532). In particular, it adds new index files with names `<basename>.internal.<suffix>` in addition to the existing index files.
  • Loading branch information
joka921 authored Oct 4, 2024
1 parent 342e06d commit 77ea2c6
Show file tree
Hide file tree
Showing 24 changed files with 336 additions and 710 deletions.
3 changes: 0 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,5 @@ target_precompile_headers(ServerMain REUSE_FROM engine)
add_executable(VocabularyMergerMain src/VocabularyMergerMain.cpp)
qlever_target_link_libraries(VocabularyMergerMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(PermutationExporterMain src/index/PermutationExporterMain.cpp)
qlever_target_link_libraries(PermutationExporterMain index ${CMAKE_THREAD_LIBS_INIT})

add_executable(PrintIndexVersionMain src/PrintIndexVersionMain.cpp)
qlever_target_link_libraries(PrintIndexVersionMain util)
2 changes: 1 addition & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ add_library(engine
Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LocalVocabEntry.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp)
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams)
9 changes: 0 additions & 9 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,22 +708,13 @@ std::optional<IdTable> GroupBy::computeGroupByForFullIndexScan() const {

_subtree->getRootOperation()->updateRuntimeInformationWhenOptimizedOut({});

auto ignoredRanges =
getIndex().getImpl().getIgnoredIdRanges(permutationEnum.value()).first;
const auto& permutation =
getExecutionContext()->getIndex().getPimpl().getPermutation(
permutationEnum.value());
auto table = permutation.getDistinctCol0IdsAndCounts(cancellationHandle_);
if (numCounts == 0) {
table.setColumnSubset({{0}});
}
// TODO<joka921> This is only semi-efficient.
auto end = std::ranges::remove_if(table, [&ignoredRanges](const auto& row) {
return std::ranges::any_of(ignoredRanges, [id = row[0]](const auto& pair) {
return id >= pair.first && id < pair.second;
});
});
table.resize(end.begin() - table.begin());

// TODO<joka921> This optimization should probably also apply if
// the query is `SELECT DISTINCT ?s WHERE {?s ?p ?o} ` without a
Expand Down
62 changes: 3 additions & 59 deletions src/engine/IndexScan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <string>

#include "index/IndexImpl.h"
#include "index/TriplesView.h"
#include "parser/ParsedQuery.h"

using std::string;
Expand Down Expand Up @@ -161,13 +160,8 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
using enum Permutation::Enum;
idTable.setNumColumns(numVariables_);
const auto& index = _executionContext->getIndex();
if (numVariables_ < 3 || !additionalColumns().empty()) {
idTable = index.scan(getScanSpecification(), permutation_,
additionalColumns(), cancellationHandle_, getLimit());
} else {
AD_CORRECTNESS_CHECK(numVariables_ == 3);
computeFullScan(&idTable, permutation_);
}
idTable = index.scan(getScanSpecification(), permutation_,
additionalColumns(), cancellationHandle_, getLimit());
AD_CORRECTNESS_CHECK(idTable.numColumns() == getResultWidth());
LOG(DEBUG) << "IndexScan result computation done.\n";
checkCancellation();
Expand All @@ -178,19 +172,7 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
// _____________________________________________________________________________
size_t IndexScan::computeSizeEstimate() const {
AD_CORRECTNESS_CHECK(_executionContext);
// We have to do a simple scan anyway so might as well do it now
if (numVariables_ < 3) {
return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_);
} else {
// The triple consists of three variables.
// TODO<joka921> As soon as all implementations of a full index scan
// (Including the "dummy joins" in Join.cpp) consistently exclude the
// internal triples, this estimate should be changed to only return
// the number of triples in the actual knowledge graph (excluding the
// internal triples).
AD_CORRECTNESS_CHECK(numVariables_ == 3);
return getIndex().numTriples().normalAndInternal_();
}
return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_);
}

// _____________________________________________________________________________
Expand Down Expand Up @@ -223,44 +205,6 @@ void IndexScan::determineMultiplicities() {
AD_CONTRACT_CHECK(multiplicity_.size() == getResultWidth());
}

// ________________________________________________________________________
void IndexScan::computeFullScan(IdTable* result,
const Permutation::Enum permutation) const {
auto [ignoredRanges, isTripleIgnored] =
getIndex().getImpl().getIgnoredIdRanges(permutation);

result->setNumColumns(3);

// This implementation computes the complete knowledge graph, except the
// internal triples.
uint64_t resultSize = getIndex().numTriples().normal;
if (getLimit()._limit.has_value() && getLimit()._limit < resultSize) {
resultSize = getLimit()._limit.value();
}

// TODO<joka921> Implement OFFSET
if (getLimit()._offset != 0) {
throw NotSupportedException{
"Scanning the complete index with an OFFSET clause is currently not "
"supported by QLever"};
}
result->reserve(resultSize);
auto table = std::move(*result).toStatic<3>();
size_t i = 0;
const auto& permutationImpl =
getExecutionContext()->getIndex().getImpl().getPermutation(permutation);
auto triplesView = TriplesView(permutationImpl, cancellationHandle_,
ignoredRanges, isTripleIgnored);
for (const auto& triple : triplesView) {
if (i >= resultSize) {
break;
}
table.push_back(triple);
++i;
}
*result = std::move(table).toDynamic();
}

// ___________________________________________________________________________
std::array<const TripleComponent* const, 3> IndexScan::getPermutedTriple()
const {
Expand Down
2 changes: 0 additions & 2 deletions src/engine/IndexScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,6 @@ class IndexScan final : public Operation {

vector<QueryExecutionTree*> getChildren() override { return {}; }

void computeFullScan(IdTable* result, Permutation::Enum permutation) const;

size_t computeSizeEstimate() const;

std::string getCacheKeyImpl() const override;
Expand Down
2 changes: 1 addition & 1 deletion src/global/IndexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

#pragma once

#include "engine/LocalVocabEntry.h"
#include "global/TypedIndex.h"
#include "global/VocabIndex.h"
#include "index/LocalVocabEntry.h"

// Typedefs for several kinds of typed indices that are used across QLever.

Expand Down
2 changes: 1 addition & 1 deletion src/index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ add_library(index
DocsDB.cpp FTSAlgorithms.cpp
PrefixHeuristic.cpp CompressedRelation.cpp
PatternCreator.cpp ScanSpecification.cpp
DeltaTriples.cpp)
DeltaTriples.cpp LocalVocabEntry.cpp)
qlever_target_link_libraries(index util parser vocabulary ${STXXL_LIBRARIES})
3 changes: 3 additions & 0 deletions src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ constexpr inline std::string_view PARTIAL_MMAP_IDS = ".tmp.partial-ids-mmap.";
constexpr inline std::string_view TMP_BASENAME_COMPRESSION =
".tmp.for-prefix-compression";

// _________________________________________________________________
constexpr inline std::string_view INTERNAL_INDEX_INFIX = ".internal";

// _________________________________________________________________
// The degree of parallelism that is used for the index building step, where the
// unique elements of the vocabulary are identified via hash maps. Typically, 6
Expand Down
26 changes: 7 additions & 19 deletions src/index/IndexBuilderTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -262,14 +262,7 @@ auto getIdMapLambdas(
* - All Ids are assigned according to itemArray[idx]
*/
const auto itemMapLamdaCreator = [&itemArray, indexPtr](const size_t idx) {
auto& map = *itemArray[idx];
// Resolve the special IDs of the default and internal graph to their actual
// IDs. This is precomputed for efficiency gains.
auto internalGraphId =
map.getId(qlever::specialIds().at(INTERNAL_GRAPH_IRI));
auto defaultGraphId = map.getId(qlever::specialIds().at(DEFAULT_GRAPH_IRI));
return [&map = *itemArray[idx], indexPtr, internalGraphId,
defaultGraphId](ad_utility::Rvalue auto&& tr) {
return [&map = *itemArray[idx], indexPtr](ad_utility::Rvalue auto&& tr) {
auto lt = indexPtr->tripleToInternalRepresentation(AD_FWD(tr));
OptionalIds res;
// get Ids for the actual triple and store them in the result.
Expand All @@ -293,24 +286,19 @@ auto getIdMapLambdas(
" The following lines probably have to be changed when "
"the number of payload columns changes");
// extra triple <subject> @language@<predicate> <object>
// The additional triples have the graph ID of the internal graph if the
// triple was in the default/fallback graph, else they keep their graph
// ID.
// TODO<joka921> Maybe we should have an `internalGraph` per graph, but
// this requires further work. The current approach at least keeps the
// language filters working in combination with named graphs and doesn't
// add further inconsistencies.
// The additional triples have the same graph ID as the original triple.
// This makes optimizations such as language filters also work with
// named graphs. Note that we have a different mechanism in place to
// distinguish between normal and internal triples.
auto tripleGraphId = res[0].value()[ADDITIONAL_COLUMN_GRAPH_ID];
auto addedTripleGraphId =
tripleGraphId == defaultGraphId ? internalGraphId : tripleGraphId;
res[1].emplace(
Arr{spoIds[0], langTaggedPredId, spoIds[2], addedTripleGraphId});
Arr{spoIds[0], langTaggedPredId, spoIds[2], tripleGraphId});
// extra triple <object> ql:language-tag <@language>
res[2].emplace(Arr{spoIds[2],
map.getId(TripleComponent{
ad_utility::triple_component::Iri::fromIriref(
LANGUAGE_PREDICATE)}),
langTagId, addedTripleGraphId});
langTagId, tripleGraphId});
}
return res;
};
Expand Down
2 changes: 1 addition & 1 deletion src/index/IndexFormatVersion.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ struct IndexFormatVersion {
// The actual index version. Change it once the binary format of the index
// changes.
inline const IndexFormatVersion& indexFormatVersion{
1506, DateYearOrDuration{Date{2024, 9, 27}}};
1532, DateYearOrDuration{Date{2024, 10, 4}}};
} // namespace qlever
Loading

0 comments on commit 77ea2c6

Please sign in to comment.