Skip to content

Commit

Permalink
Merge ssh://github.com/ad-freiburg/qlever into nearest-neighbor
Browse files Browse the repository at this point in the history
  • Loading branch information
ullingerc committed Oct 10, 2024
2 parents 4cdebbb + b97c44c commit c4be461
Show file tree
Hide file tree
Showing 11 changed files with 470 additions and 27 deletions.
3 changes: 2 additions & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ add_library(engine
Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp)
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
CountConnectedSubgraphs.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
116 changes: 116 additions & 0 deletions src/engine/CountConnectedSubgraphs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <[email protected]>

#include "engine/CountConnectedSubgraphs.h"

#include "util/BitUtils.h"

namespace countConnectedSubgraphs {

// _____________________________________________________________________________
// Count the connected subgraphs of `graph`, but stop and return `budget + 1`
// as soon as more than `budget` subgraphs have been found.
size_t countSubgraphs(const Graph& graph, size_t budget) {
  size_t numFound = 0;
  const size_t numNodes = graph.size();
  // Handle the nodes in ascending order. The iteration for node `start` counts
  // exactly the subgraphs that contain `start`, but no node with a smaller
  // index (those subgraphs were already counted in an earlier iteration).
  for (size_t start = 0; start < numNodes; ++start) {
    // The subgraph that consists only of the node `start`.
    numFound += 1;
    if (numFound > budget) {
      return budget + 1;
    }
    // `startOnly` has a single `1` at bit `start`, and `lowerNodes` has `1`s
    // at all the bits below `start`. For example, if `start` is 3, then
    // `startOnly` is `[0 x 56] 0000 1000` and `lowerNodes` is
    // `[0 x 56] 0000 0111`.
    const uint64_t startOnly = 1ULL << start;
    const uint64_t lowerNodes = ad_utility::bitMaskForLowerBits(start);
    numFound = countSubgraphsRecursively(graph, startOnly, lowerNodes,
                                         numFound, budget);
  }
  return numFound;
}

// Compute the union of the adjacency bitmaps of all the nodes of `graph` whose
// bit is set in `nodes`, and then remove all the nodes that are `ignored`.
// The nodes from `nodes` themselves are not removed, so they may be part of the
// result. The result uses the same bitmap encoding as `nodes` and `ignored`.
static uint64_t computeNeighbors(const Graph& graph, uint64_t nodes,
                                 uint64_t ignored) {
  uint64_t adjacent = 0;
  for (size_t idx = 0; idx < 64; ++idx) {
    const bool isSelected = ((nodes >> idx) & 1ULL) != 0;
    if (!isSelected) {
      continue;
    }
    adjacent |= graph[idx].neighbors_;
  }
  return adjacent & ~ignored;
}

// For a number `i` from 0 .. 2^`neighbors.size()` - 1, return the `i`th
// subset of the elements of `neighbors`: the element `neighbors[k]` is part of
// the subset iff bit `k` of `i` is set. All elements in `neighbors` have to be
// from 0..63 so that the final result can be expressed as a bitmap (bit `n` of
// the result is set iff `n` is contained in the subset).
static uint64_t subsetIndexToBitmap(size_t i,
                                    const std::vector<uint8_t>& neighbors) {
  // Note: This can probably be done more efficiently using bit fiddling, but it
  // is efficient enough for now.
  uint64_t subset = 0;
  for (size_t k = 0; k < neighbors.size(); ++k) {
    // The mask has to be 64 bits wide, because `neighbors` may have more than
    // 31 elements and shifting a plain `int` that far is undefined behavior.
    const uint64_t selector = uint64_t{1} << k;
    if ((selector & i) != 0) {
      subset |= (uint64_t{1} << neighbors[k]);
    }
  }
  return subset;
}

// Return the positions of all the `1` bits of `bitset` in ascending order. For
// example, `13` (`1101` as bits) yields `[0, 2, 3]`.
static std::vector<uint8_t> bitsetToVector(uint64_t bitset) {
  std::vector<uint8_t> indices;
  // Consume the bitset from the least significant bit upwards; `position` is
  // the index that the currently lowest bit had in the original input.
  uint8_t position = 0;
  while (bitset != 0) {
    if ((bitset & 1ULL) != 0) {
      indices.push_back(position);
    }
    bitset >>= 1;
    ++position;
  }
  return indices;
}

// _____________________________________________________________________________
std::string toBitsetString(uint64_t x) {
  // All 64 bits as characters, most significant bit first.
  const std::string allBits = std::bitset<64>{x}.to_string();
  // Cut off the leading zeros. If there is no `1` at all, then the input was
  // zero, which is printed as a single "0".
  const auto firstOne = allBits.find('1');
  if (firstOne == std::string::npos) {
    return "0";
  }
  return allBits.substr(firstOne);
}

// _____________________________________________________________________________
// Extend the connected set `nodes` by every non-empty subset of its not yet
// ignored neighbors, count each such extension, and recurse on it. `count` is
// the number of subgraphs found so far; the updated number is returned, or
// `budget + 1` as soon as the budget is exceeded.
size_t countSubgraphsRecursively(const Graph& graph, uint64_t nodes,
                                 uint64_t ignored, size_t count,
                                 size_t budget) {
  // Compute the set of direct neighbors of the `nodes` that is not
  // ignored
  uint64_t neighbors = computeNeighbors(graph, nodes, ignored);

  std::vector<uint8_t> neighborsAsVector = bitsetToVector(neighbors);

  // This is the recursion level which handles all the subsets of the neighbors,
  // and the above recursion levels deal with `nodes`, so we have to exclude
  // them further down.
  auto newIgnored = ignored | neighbors | nodes;

  // The subsets of the neighbors are enumerated via the indices
  // `1 .. 2^numNeighbors - 1` (index 0 is the empty subset, which would not
  // yield a new subgraph). Shifting a 64-bit value by 64 is undefined behavior,
  // so for the extreme case of 64 neighbors the bound is saturated. This makes
  // no practical difference, because the `budget` check below ends the
  // enumeration long before that many subsets could be visited.
  const size_t numNeighbors = neighborsAsVector.size();
  const size_t upperBound = numNeighbors < 64
                                ? static_cast<size_t>(1ULL << numNeighbors)
                                : ~size_t{0};
  for (size_t i = 1; i < upperBound; ++i) {
    ++count;
    if (count > budget) {
      return budget + 1;
    }
    auto subset = subsetIndexToBitmap(i, neighborsAsVector);
    count = countSubgraphsRecursively(graph, nodes | subset, newIgnored, count,
                                      budget);
  }
  return count;
}
} // namespace countConnectedSubgraphs
41 changes: 41 additions & 0 deletions src/engine/CountConnectedSubgraphs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <[email protected]>

#pragma once

#include <cstdint>
#include <string>
#include <vector>

// This module implements the efficient counting of the number of connected
// subgraphs in a given graph. This routine can be used to analyze the
// complexity of query graphs and to choose an appropriate query planner (see
// `QueryPlanner.cpp`). The algorithm is taken from
// Neumann and Radke, Adaptive Optimization of Very Large Join Queries, see
// https://dl.acm.org/doi/pdf/10.1145/3183713.3183733
namespace countConnectedSubgraphs {

// A single node of an undirected graph with at most 64 nodes. The adjacency of
// the node is stored as a 64-bit number, where the i-th bit is 1 iff the i-th
// node of the graph is a neighbor of this node.
struct Node {
// Bitmap of the neighbors of this node; defaults to "no neighbors".
uint64_t neighbors_{};
};
// A graph is the list of its nodes. The position of a node in this vector is
// the bit index by which the node is referred to in all the bitmaps.
using Graph = std::vector<Node>;

// Compute the number of connected subgraphs in the `graph`. If the number of
// such subgraphs is `> budget`, return `budget + 1`.
size_t countSubgraphs(const Graph& graph, size_t budget);

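// Recursive implementation of `countSubgraphs`. Compute the number of connected
// subgraphs in `graph` that contain all the nodes in `nodes`, but none of the
// nodes in `ignored`. Assume that `count` subgraphs have been previously found
// and therefore count towards the `budget`. The returned value is the updated
// total, or `budget + 1` as soon as the budget is exceeded. The `nodes` and
// `ignored` are bitsets in which the `i`-th bit is set iff the `i`-th node of
// `graph` is contained (the same encoding as for the neighbors of a `Node`).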
size_t countSubgraphsRecursively(const Graph& graph, uint64_t nodes,
uint64_t ignored, size_t count, size_t budget);

// Convert `x` to a string of bits, with the leading zeros removed, e.g.,
// `3` will become "11". This is useful for debugging the functions above.
std::string toBitsetString(uint64_t x);

} // namespace countConnectedSubgraphs
134 changes: 120 additions & 14 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "engine/CartesianProductJoin.h"
#include "engine/CheckUsePatternTrick.h"
#include "engine/CountAvailablePredicates.h"
#include "engine/CountConnectedSubgraphs.h"
#include "engine/Distinct.h"
#include "engine/Filter.h"
#include "engine/GroupBy.h"
Expand All @@ -36,6 +37,7 @@
#include "engine/Values.h"
#include "engine/sparqlExpressions/LiteralExpression.h"
#include "engine/sparqlExpressions/RelationalExpressions.h"
#include "global/RuntimeParameters.h"
#include "parser/Alias.h"
#include "parser/SparqlParserHelpers.h"

Expand Down Expand Up @@ -361,8 +363,6 @@ vector<QueryPlanner::SubtreePlan> QueryPlanner::getGroupByRow(
"should have thrown an exception earlier");
groupVariables.push_back(activeGraphVariable_.value());
}
// The GroupBy constructor automatically takes care of sorting the input if
// necessary.
groupByPlan._qet = makeExecutionTree<GroupBy>(
_qec, groupVariables, std::move(aliases), parent._qet);
added.push_back(groupByPlan);
Expand Down Expand Up @@ -682,6 +682,11 @@ auto QueryPlanner::seedWithScansAndText(
// add all child plans as seeds
uint64_t idShift = tg._nodeMap.size();
for (const auto& vec : children) {
AD_CONTRACT_CHECK(
idShift < 64,
absl::StrCat("Group graph pattern too large: QLever currently supports "
"at most 64 elements (like triples), but found ",
idShift));
for (const SubtreePlan& plan : vec) {
SubtreePlan newIdPlan = plan;
// give the plan a unique id bit
Expand Down Expand Up @@ -1094,8 +1099,7 @@ bool QueryPlanner::connected(const QueryPlanner::SubtreePlan& a,

// _____________________________________________________________________________
std::vector<std::array<ColumnIndex, 2>> QueryPlanner::getJoinColumns(
const QueryPlanner::SubtreePlan& a,
const QueryPlanner::SubtreePlan& b) const {
const SubtreePlan& a, const SubtreePlan& b) {
AD_CORRECTNESS_CHECK(a._qet && b._qet);
return QueryExecutionTree::getJoinColumns(*a._qet, *b._qet);
}
Expand Down Expand Up @@ -1247,6 +1251,20 @@ void QueryPlanner::applyTextLimitsIfPossible(
row.insert(row.end(), addedPlans.begin(), addedPlans.end());
}

// _____________________________________________________________________________
size_t QueryPlanner::findUniqueNodeIds(
const std::vector<SubtreePlan>& connectedComponent) {
ad_utility::HashSet<uint64_t> uniqueNodeIds;
auto nodeIds = connectedComponent |
std::views::transform(&SubtreePlan::_idsOfIncludedNodes);
// Check that all the `_idsOfIncludedNodes` are one-hot encodings of a single
// value, i.e. they have exactly one bit set.
AD_CORRECTNESS_CHECK(std::ranges::all_of(
nodeIds, [](auto nodeId) { return std::popcount(nodeId) == 1; }));
std::ranges::copy(nodeIds, std::inserter(uniqueNodeIds, uniqueNodeIds.end()));
return uniqueNodeIds.size();
}

// _____________________________________________________________________________
std::vector<QueryPlanner::SubtreePlan>
QueryPlanner::runDynamicProgrammingOnConnectedComponent(
Expand All @@ -1260,16 +1278,12 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent(
dpTab.push_back(std::move(connectedComponent));
applyFiltersIfPossible<false>(dpTab.back(), filters);
applyTextLimitsIfPossible(dpTab.back(), textLimits, false);
ad_utility::HashSet<uint64_t> uniqueNodeIds;
std::ranges::copy(
dpTab.back() | std::views::transform(&SubtreePlan::_idsOfIncludedNodes),
std::inserter(uniqueNodeIds, uniqueNodeIds.end()));
size_t numSeeds = uniqueNodeIds.size();
size_t numSeeds = findUniqueNodeIds(dpTab.back());

for (size_t k = 2; k <= numSeeds; ++k) {
LOG(TRACE) << "Producing plans that unite " << k << " triples."
<< std::endl;
dpTab.emplace_back(vector<SubtreePlan>());
dpTab.emplace_back();
for (size_t i = 1; i * 2 <= k; ++i) {
checkCancellation();
auto newPlans = merge(dpTab[i - 1], dpTab[k - i - 1], tg);
Expand All @@ -1284,6 +1298,72 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent(
return std::move(dpTab.back());
}

// _____________________________________________________________________________
// Count the connected subgraphs of the join graph spanned by the plans in
// `graph` (two plans are adjacent iff they have at least one join column). The
// counting is delegated to `countConnectedSubgraphs::countSubgraphs` with the
// given `budget`.
size_t QueryPlanner::countSubgraphs(
    std::vector<const QueryPlanner::SubtreePlan*> graph, size_t budget) {
  // Keep only one plan per distinct `_idsOfIncludedNodes`.
  auto getId = [](const SubtreePlan* v) { return v->_idsOfIncludedNodes; };
  std::ranges::sort(graph, std::ranges::less{}, getId);
  auto duplicates = std::ranges::unique(graph, std::ranges::equal_to{}, getId);
  graph.erase(duplicates.begin(), duplicates.end());

  // Qlever currently limits the number of triples etc. per group to be <= 64
  // anyway, so we can simply assert here.
  AD_CORRECTNESS_CHECK(graph.size() <= 64,
                       "Should qlever ever support more than 64 elements per "
                       "group graph pattern, then the `countSubgraphs` "
                       "functionality also has to be changed");

  // Build the bitmap representation: bit `other` of `adjacency[self]` is set
  // iff the plans `other` and `self` are different and can be joined.
  const size_t numPlans = graph.size();
  countConnectedSubgraphs::Graph adjacency;
  adjacency.reserve(numPlans);
  for (size_t self = 0; self < numPlans; ++self) {
    countConnectedSubgraphs::Node node{0};
    for (size_t other = 0; other < numPlans; ++other) {
      if (other == self) {
        continue;
      }
      const bool joinable =
          !QueryPlanner::getJoinColumns(*graph.at(other), *graph.at(self))
               .empty();
      if (joinable) {
        node.neighbors_ |= (1ULL << other);
      }
    }
    adjacency.push_back(node);
  }

  return countConnectedSubgraphs::countSubgraphs(adjacency, budget);
}

// _____________________________________________________________________________
std::vector<QueryPlanner::SubtreePlan>
QueryPlanner::runGreedyPlanningOnConnectedComponent(
std::vector<SubtreePlan> connectedComponent,
const vector<SparqlFilter>& filters, const TextLimitMap& textLimits,
const TripleGraph& tg) const {
auto& result = connectedComponent;
applyFiltersIfPossible<true>(result, filters);
applyTextLimitsIfPossible(result, textLimits, true);
size_t numSeeds = findUniqueNodeIds(result);

while (numSeeds > 1) {
checkCancellation();
auto newPlans = merge(result, result, tg);
applyFiltersIfPossible<true>(newPlans, filters);
applyTextLimitsIfPossible(newPlans, textLimits, true);
auto smallestIdx = findSmallestExecutionTree(newPlans);
auto& cheapestNewTree = newPlans.at(smallestIdx);
size_t oldSize = result.size();
std::erase_if(result, [&cheapestNewTree](const auto& plan) {
// TODO<joka921> We can also assert some other invariants here.
return (cheapestNewTree._idsOfIncludedNodes & plan._idsOfIncludedNodes) !=
0;
});
result.push_back(std::move(cheapestNewTree));
AD_CORRECTNESS_CHECK(result.size() < oldSize);
numSeeds--;
}
// TODO<joka921> Assert that all seeds are covered by the result.
return std::move(result);
}

// _____________________________________________________________________________
vector<vector<QueryPlanner::SubtreePlan>> QueryPlanner::fillDpTab(
const QueryPlanner::TripleGraph& tg, vector<SparqlFilter> filters,
Expand All @@ -1302,8 +1382,22 @@ vector<vector<QueryPlanner::SubtreePlan>> QueryPlanner::fillDpTab(
}
vector<vector<SubtreePlan>> lastDpRowFromComponents;
for (auto& component : components | std::views::values) {
lastDpRowFromComponents.push_back(runDynamicProgrammingOnConnectedComponent(
std::move(component), filters, textLimits, tg));
std::vector<const SubtreePlan*> g;
for (const auto& plan : component) {
g.push_back(&plan);
}
const size_t budget = RuntimeParameters().get<"query-planning-budget">();
bool useGreedyPlanning = countSubgraphs(g, budget) > budget;
if (useGreedyPlanning) {
LOG(INFO)
<< "Using the greedy query planner for a large connected component"
<< std::endl;
}
auto impl = useGreedyPlanning
? &QueryPlanner::runGreedyPlanningOnConnectedComponent
: &QueryPlanner::runDynamicProgrammingOnConnectedComponent;
lastDpRowFromComponents.push_back(
std::invoke(impl, this, std::move(component), filters, textLimits, tg));
checkCancellation();
}
size_t numConnectedComponents = lastDpRowFromComponents.size();
Expand Down Expand Up @@ -1643,8 +1737,20 @@ size_t QueryPlanner::findCheapestExecutionTree(
return aCost < bCost;
}
};
return std::min_element(lastRow.begin(), lastRow.end(), compare) -
lastRow.begin();
return std::ranges::min_element(lastRow, compare) - lastRow.begin();
};

// _________________________________________________________________________________
// Return the index of the first plan in `lastRow` that has the smallest size
// estimate. `lastRow` must not be empty.
size_t QueryPlanner::findSmallestExecutionTree(
    const std::vector<SubtreePlan>& lastRow) {
  AD_CONTRACT_CHECK(!lastRow.empty());
  // NOTE(review): The previous implementation compared the tuples
  // `{getSizeEstimate(), getSizeEstimate()}`. The duplicated second element
  // could never break a tie, so comparing the size estimate alone selects
  // exactly the same plan. If a secondary criterion was intended for plans with
  // equal size estimates, it has to be added here explicitly.
  auto hasSmallerSize = [](const auto& a, const auto& b) {
    return a.getSizeEstimate() < b.getSizeEstimate();
  };
  return std::ranges::min_element(lastRow, hasSmallerSize) - lastRow.begin();
}

// _____________________________________________________________________________
Expand Down
Loading

0 comments on commit c4be461

Please sign in to comment.