From 2b4e6b38e605fb50d4c6f43eeb39da3e810e8cf3 Mon Sep 17 00:00:00 2001 From: "c.u." Date: Fri, 18 Oct 2024 08:12:23 +0200 Subject: [PATCH] Better size estimate for `SpatialJoin` (#1555) Up until now the `SpatialJoin` class always returned the product of the sizes of both children as its own size estimate. For a k-nearest-neighbor join this is usually way larger than the actual maximum size of the result. This PR implements a more precise size estimate. --- src/engine/SpatialJoin.cpp | 18 ++++++++++++++++-- src/global/Constants.h | 6 ++++++ test/engine/SpatialJoinTest.cpp | 25 ++++++++++--------------- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/engine/SpatialJoin.cpp b/src/engine/SpatialJoin.cpp index 19757f1edb..beb09ffe68 100644 --- a/src/engine/SpatialJoin.cpp +++ b/src/engine/SpatialJoin.cpp @@ -231,7 +231,19 @@ size_t SpatialJoin::getCostEstimate() { // ____________________________________________________________________________ uint64_t SpatialJoin::getSizeEstimateBeforeLimit() { if (childLeft_ && childRight_) { - return childLeft_->getSizeEstimate() * childRight_->getSizeEstimate(); + // If we limit the number of results to k, even in the largest scenario, the + // result can be at most `|childLeft| * k` + auto maxResults = getMaxResults(); + if (maxResults.has_value()) { + return childLeft_->getSizeEstimate() * maxResults.value(); + } + + // If we don't limit the number of results, we cannot draw conclusions about + // the size, other than the worst case `|childLeft| * |childRight|`. However + // to improve query planning for the average case, we apply a constant + // factor (the asymptotic behavior remains unchanged). + return (childLeft_->getSizeEstimate() * childRight_->getSizeEstimate()) / + SPATIAL_JOIN_MAX_DIST_SIZE_ESTIMATE; } return 1; // dummy return if not both children are added } @@ -263,7 +275,9 @@ float SpatialJoin::getMultiplicity(size_t col) { column -= childLeft_->getResultWidth(); } auto distinctnessChild = getDistinctness(child, column); - return static_cast(getSizeEstimate()) / distinctnessChild; + return static_cast(childLeft_->getSizeEstimate() * + childRight_->getSizeEstimate()) / + distinctnessChild; } else { return 1; } diff --git a/src/global/Constants.h b/src/global/Constants.h index 2eb156aef0..970cde9cd0 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -107,6 +107,12 @@ static const std::string MAX_DIST_IN_METERS = "[0-9]+)>"}; +// The spatial join operation without a limit on the maximum number of results +// can, in the worst case have a square number of results, but usually this is +// not the case. 1 divided by this constant is the damping factor for the +// estimated number of results. +static const size_t SPATIAL_JOIN_MAX_DIST_SIZE_ESTIMATE = 1000; + // This predicate is one of the supported identifiers for the SpatialJoin class. // For a given triple of the form ?a ?b, it will // produce for each value of ?a the nearest neighbors ?b to ?a. The number of diff --git a/test/engine/SpatialJoinTest.cpp b/test/engine/SpatialJoinTest.cpp index 4de1de25e9..e07422b5b4 100644 --- a/test/engine/SpatialJoinTest.cpp +++ b/test/engine/SpatialJoinTest.cpp @@ -21,6 +21,7 @@ #include "engine/Join.h" #include "engine/QueryExecutionTree.h" #include "engine/SpatialJoin.h" +#include "global/Constants.h" #include "parser/data/Variable.h" namespace { // anonymous namespace to avoid linker problems @@ -608,12 +609,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, } }; - auto assert_double_with_bounds = [](double value1, double value2) { - // ASSERT_DOUBLE_EQ did not work properly - ASSERT_TRUE(value1 * 0.99999 < value2); - ASSERT_TRUE(value1 * 1.00001 > value2); - }; - + const double doubleBound = 0.00001; std::string kg = createSmallDatasetWithPoints(); // add multiplicities to test knowledge graph @@ -704,7 +700,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, auto sizeEstimateChild = inputChild->getSizeEstimate(); double distinctnessChild = sizeEstimateChild / multiplicityChild; auto mult = spatialJoin->getMultiplicity(i); - auto sizeEst = spatialJoin->getSizeEstimate(); + auto sizeEst = std::pow(sizeEstimateChild, 2); double distinctness = sizeEst / mult; // multiplicity, distinctness and size are related via the formula // size = distinctness * multiplicity. Therefore if we have two of them, @@ -712,7 +708,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, // holds true. The distinctness must not change after the operation, the // other two variables can change. Therefore we check the correctness // via distinctness. - assert_double_with_bounds(distinctnessChild, distinctness); + ASSERT_NEAR(distinctnessChild, distinctness, doubleBound); } } } else { @@ -723,12 +719,12 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, ASSERT_EQ(spatialJoin->getSizeEstimate(), 1); auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable); spatialJoin = static_cast(spJoin2.get()); - // the size should be 49, because both input tables have 7 rows and it is - // assumed, that the whole cross product is build + // the size should be at most 49, because both input tables have 7 rows and + // it is assumed, that in the worst case the whole cross product is build auto estimate = spatialJoin->onlyForTestingGetLeftChild()->getSizeEstimate() * spatialJoin->onlyForTestingGetRightChild()->getSizeEstimate(); - ASSERT_EQ(estimate, spatialJoin->getSizeEstimate()); + ASSERT_LE(spatialJoin->getSizeEstimate(), estimate); } { // new block for hard coded testing @@ -790,9 +786,8 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, auto assertMultiplicity = [&](Variable var, double multiplicity, SpatialJoin* spatialJoin, VariableToColumnMap& varColsMap) { - assert_double_with_bounds( - spatialJoin->getMultiplicity(varColsMap[var].columnIndex_), - multiplicity); + ASSERT_NEAR(spatialJoin->getMultiplicity(varColsMap[var].columnIndex_), + multiplicity, doubleBound); }; multiplicitiesBeforeAllChildrenAdded(spatialJoin); auto spJoin1 = spatialJoin->addChild(firstChild, firstVariable); @@ -815,7 +810,7 @@ void testMultiplicitiesOrSizeEstimate(bool addLeftChildFirst, spatialJoin = static_cast(spJoin1.get()); auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable); spatialJoin = static_cast(spJoin2.get()); - ASSERT_EQ(spatialJoin->getSizeEstimate(), 49); + ASSERT_LE(spatialJoin->getSizeEstimate(), 49); } } }