Skip to content

Commit

Permalink
Merge located triples when performing index scans (#1597)
Browse files Browse the repository at this point in the history
PRs #1582 and #1603 gave all index-lookup methods access to a snapshot of the (located) delta triples. With this change, these triples are now merged with the original triples during query processing whenever necessary. When an index block does not contain any located triples, the performance for accessing that block is the same as before.

The methods for obtaining the result size of an index scan now have two versions: one for obtaining an approximate size (this is cheap because it can be computed from the metadata of the blocks and the located triples) and one for obtaining the exact size (if there are located triples this is expensive because it requires reading and decompressing a block and merging the located triples).
  • Loading branch information
joka921 authored Nov 14, 2024
1 parent b0a1d1e commit 77ac964
Show file tree
Hide file tree
Showing 13 changed files with 733 additions and 338 deletions.
15 changes: 13 additions & 2 deletions src/engine/IndexScan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ IndexScan::IndexScan(QueryExecutionContext* qec, Permutation::Enum permutation,
additionalColumns_.push_back(idx);
additionalVariables_.push_back(variable);
}
sizeEstimate_ = computeSizeEstimate();
std::tie(sizeEstimateIsExact_, sizeEstimate_) = computeSizeEstimate();

// Check the following invariant: All the variables must be at the end of the
// permuted triple. For example in the PSO permutation, either only the O, or
Expand Down Expand Up @@ -171,7 +171,18 @@ ProtoResult IndexScan::computeResult(bool requestLaziness) {
}

// _____________________________________________________________________________
size_t IndexScan::computeSizeEstimate() const {
std::pair<bool, size_t> IndexScan::computeSizeEstimate() const {
AD_CORRECTNESS_CHECK(_executionContext);
auto [lower, upper] = getIndex()
.getImpl()
.getPermutation(permutation())
.getSizeEstimateForScan(getScanSpecification(),
locatedTriplesSnapshot());
return {lower == upper, std::midpoint(lower, upper)};
}

// _____________________________________________________________________________
size_t IndexScan::getExactSize() const {
AD_CORRECTNESS_CHECK(_executionContext);
return getIndex().getResultSizeOfScan(getScanSpecification(), permutation_,
locatedTriplesSnapshot());
Expand Down
17 changes: 12 additions & 5 deletions src/engine/IndexScan.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class IndexScan final : public Operation {
Graphs graphsToFilter_;
size_t numVariables_;
size_t sizeEstimate_;
bool sizeEstimateIsExact_;
vector<float> multiplicity_;

// Additional columns (e.g. patterns) that are being retrieved in addition to
Expand Down Expand Up @@ -59,7 +60,7 @@ class IndexScan final : public Operation {

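// Return the exact result size of the index scan. Without located triples
// this can be read from the metadata of the blocks. If there are located
// triples, this is expensive because it requires reading and decompressing a
// block and merging the located triples.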
size_t getExactSize() const { return sizeEstimate_; }
size_t getExactSize() const;

// Return two generators that lazily yield the results of `s1` and `s2` in
// blocks, but only the blocks that can theoretically contain matching rows
Expand All @@ -78,7 +79,7 @@ class IndexScan final : public Operation {
private:
// TODO<joka921> Make the `getSizeEstimateBeforeLimit()` function `const` for
// ALL the `Operations`.
uint64_t getSizeEstimateBeforeLimit() override { return getExactSize(); }
// Return the stored size estimate (`sizeEstimate_`). This is the value that
// was computed once in the constructor; it is not necessarily exact.
uint64_t getSizeEstimateBeforeLimit() override {
  return static_cast<uint64_t>(sizeEstimate_);
}

public:
size_t getCostEstimate() override;
Expand All @@ -93,7 +94,9 @@ class IndexScan final : public Operation {
return multiplicity_[col];
}

bool knownEmptyResult() override { return getExactSize() == 0; }
// The result is known to be empty only if the stored size estimate is exact
// AND equal to zero. An inexact estimate never proves emptiness.
bool knownEmptyResult() override {
  if (!sizeEstimateIsExact_) {
    return false;
  }
  return sizeEstimate_ == 0;
}

bool isIndexScanWithNumVariables(size_t target) const override {
return numVariables() == target;
Expand All @@ -103,7 +106,7 @@ class IndexScan final : public Operation {
// size of wikidata, so we don't even need to try and waste performance.
bool unlikelyToFitInCache(
ad_utility::MemorySize maxCacheableSize) const override {
return ad_utility::MemorySize::bytes(getExactSize() * getResultWidth() *
return ad_utility::MemorySize::bytes(sizeEstimate_ * getResultWidth() *
sizeof(Id)) > maxCacheableSize;
}

Expand All @@ -124,7 +127,11 @@ class IndexScan final : public Operation {

vector<QueryExecutionTree*> getChildren() override { return {}; }

size_t computeSizeEstimate() const;
// Compute the size estimate of the index scan, taking delta triples (from
// the `queryExecutionContext_`) into account. The `bool` is true iff the
// estimate is exact. If not, the estimate is the mean of the lower and upper
// bound.
std::pair<bool, size_t> computeSizeEstimate() const;

std::string getCacheKeyImpl() const override;

Expand Down
19 changes: 12 additions & 7 deletions src/engine/Join.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -566,13 +566,18 @@ void updateRuntimeInfoForLazyScan(
rti.addDetail("num-blocks-read", metadata.numBlocksRead_);
rti.addDetail("num-blocks-all", metadata.numBlocksAll_);
rti.addDetail("num-elements-read", metadata.numElementsRead_);
if (metadata.numBlocksSkippedBecauseOfGraph_ > 0) {
rti.addDetail("num-blocks-skipped-graph",
metadata.numBlocksSkippedBecauseOfGraph_);
}
if (metadata.numBlocksPostprocessed_ > 0) {
rti.addDetail("num-blocks-postprocessed", metadata.numBlocksPostprocessed_);
}

// Add more details, but only if the respective value is non-zero.
auto updateIfPositive = [&rti](const auto& value, const std::string& key) {
if (value > 0) {
rti.addDetail(key, value);
}
};
updateIfPositive(metadata.numBlocksSkippedBecauseOfGraph_,
"num-blocks-skipped-graph");
updateIfPositive(metadata.numBlocksPostprocessed_,
"num-blocks-postprocessed");
updateIfPositive(metadata.numBlocksWithUpdate_, "num-blocks-with-update");
}
} // namespace

Expand Down
Loading

0 comments on commit 77ac964

Please sign in to comment.