From 7bffcd5393b941667cec6cc383b92523cfe06fc0 Mon Sep 17 00:00:00 2001 From: Jonathan24680 <93743118+Jonathan24680@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:36:50 +0100 Subject: [PATCH] Spatial join bounding box (#1546) We now have two efficient implementations of spatial joins: One using Google's S2 library, and now a new one using `boost::geometry::rtree`. Currently both of these implementations are limited to point geometries, the new, rtree-based implementation is easier to extend to arbitrary geometries, which will be implemented soon. As boost::geometry purely works on Cartesian coordinates, which are especially unsuited for the spherical form of the earth, this PR manually implements the required expansion of bounding boxes to also show the correct behavior for bounding boxes that cross one of the poles or the 180th degree of longitude. --- src/engine/SpatialJoin.cpp | 18 +- src/engine/SpatialJoin.h | 14 +- src/engine/SpatialJoinAlgorithms.cpp | 305 +++++++++++- src/engine/SpatialJoinAlgorithms.h | 82 +++- test/engine/SpatialJoinAlgorithmsTest.cpp | 573 +++++++++++++++++++++- 5 files changed, 971 insertions(+), 21 deletions(-) diff --git a/src/engine/SpatialJoin.cpp b/src/engine/SpatialJoin.cpp index beb09ffe68..1ad7d8179a 100644 --- a/src/engine/SpatialJoin.cpp +++ b/src/engine/SpatialJoin.cpp @@ -209,7 +209,7 @@ size_t SpatialJoin::getCostEstimate() { if (childLeft_ && childRight_) { size_t inputEstimate = childLeft_->getSizeEstimate() * childRight_->getSizeEstimate(); - if (useBaselineAlgorithm_) { + if (algorithm_ == Algorithm::Baseline) { return inputEstimate * inputEstimate; } else { // Let n be the size of the left table and m the size of the right table. @@ -336,11 +336,21 @@ PreparedSpatialJoinParams SpatialJoin::prepareJoin() const { // ____________________________________________________________________________ Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) { SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(), - addDistToResult_, config_}; - if (useBaselineAlgorithm_) { + addDistToResult_, config_, this}; + if (algorithm_ == Algorithm::Baseline) { return algorithms.BaselineAlgorithm(); - } else { + } else if (algorithm_ == Algorithm::S2Geometry) { return algorithms.S2geometryAlgorithm(); + } else { + AD_CORRECTNESS_CHECK(algorithm_ == Algorithm::BoundingBox); + // as the BoundingBoxAlgorithms only works for max distance and not for + // nearest neighbors, S2geometry gets called as a backup, if the query is + // asking for the nearest neighbors + if (std::get_if(&config_)) { + return algorithms.BoundingBoxAlgorithm(); + } else { + return algorithms.S2geometryAlgorithm(); + } } } diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index b62978cf90..b31b1b8866 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -96,14 +96,20 @@ class SpatialJoin : public Operation { // purposes std::optional getMaxResults() const; - void selectAlgorithm(bool useBaselineAlgorithm) { - useBaselineAlgorithm_ = useBaselineAlgorithm; - } + // options which can be used for the algorithm, which calculates the result + enum class Algorithm { Baseline, S2Geometry, BoundingBox }; + + void selectAlgorithm(Algorithm algorithm) { algorithm_ = algorithm; } std::pair onlyForTestingGetConfig() const { return std::pair{getMaxDist().value_or(-1), getMaxResults().value_or(-1)}; } + std::variant + onlyForTestingGetActualConfig() const { + return config_; + } + std::shared_ptr onlyForTestingGetLeftChild() const { return childLeft_; } @@ -135,5 +141,5 @@ class SpatialJoin : public Operation { // between the two objects bool addDistToResult_ = true; const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally"; - bool useBaselineAlgorithm_ = false; + Algorithm algorithm_ = Algorithm::S2Geometry; }; diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index dd7a9b1c7c..78eec1e2a2 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -6,17 +6,23 @@ #include #include +#include + #include "util/GeoSparqlHelpers.h" +using namespace BoostGeometryNamespace; + // ____________________________________________________________________________ SpatialJoinAlgorithms::SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config) + std::variant config, + std::optional spatialJoin) : qec_{qec}, params_{std::move(params)}, addDistToResult_{addDistToResult}, - config_{std::move(config)} {} + config_{std::move(config)}, + spatialJoin_{spatialJoin} {} // ____________________________________________________________________________ std::optional SpatialJoinAlgorithms::getPoint(const IdTable* restable, @@ -223,3 +229,298 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { return Result(std::move(result), std::vector{}, Result::getMergedLocalVocab(*resultLeft, *resultRight)); } + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeBoundingBox( + const Point& startPoint) const { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); + // haversine function + auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; }; + + // inverse haversine function + auto archaversine = [](double theta) { return std::acos(1 - 2 * theta); }; + + // safety buffer for numerical inaccuracies + double maxDistInMetersBuffer; + if (maxDist.value() < 10) { + maxDistInMetersBuffer = 10; + } else if (static_cast(maxDist.value()) < + static_cast(std::numeric_limits::max()) / + 1.02) { + maxDistInMetersBuffer = 1.01 * static_cast(maxDist.value()); + } else { + maxDistInMetersBuffer = + static_cast(std::numeric_limits::max()); + } + + // for large distances, where the lower calculation would just result in + // a single bounding box for the whole planet, do an optimized version + if (static_cast(maxDist.value()) > circumferenceMax_ / 4.0 && + static_cast(maxDist.value()) < circumferenceMax_ / 2.01) { + return computeBoundingBoxForLargeDistances(startPoint); + } + + // compute latitude bound + double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_); + double upperLatBound = startPoint.get<1>() + maxDistInDegrees; + double lowerLatBound = startPoint.get<1>() - maxDistInDegrees; + + auto southPoleReached = isAPoleTouched(lowerLatBound).at(1); + auto northPoleReached = isAPoleTouched(upperLatBound).at(0); + + if (southPoleReached || northPoleReached) { + return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))}; + } + + // compute longitude bound. For an explanation of the calculation and the + // naming convention see my master thesis + double alpha = maxDistInMetersBuffer / radius_; + double gamma = (90 - std::abs(startPoint.get<1>())) * (2 * M_PI / 360); + double beta = std::acos(std::cos(gamma) / std::cos(alpha)); + double delta = 0; + if (maxDistInMetersBuffer > circumferenceMax_ / 20) { + // use law of cosines + delta = std::acos((std::cos(alpha) - std::cos(gamma) * std::cos(beta)) / + (std::sin(gamma) * std::sin(beta))); + } else { + // use law of haversines for numerical stability + delta = archaversine((haversine(alpha - haversine(gamma - beta))) / + (std::sin(gamma) * std::sin(beta))); + } + double lonRange = delta * 360 / (2 * M_PI); + double leftLonBound = startPoint.get<0>() - lonRange; + double rightLonBound = startPoint.get<0>() + lonRange; + // test for "overflows" and create two bounding boxes if necessary + if (leftLonBound < -180) { + auto box1 = + Box(Point(-180, lowerLatBound), Point(rightLonBound, upperLatBound)); + auto box2 = Box(Point(leftLonBound + 360, lowerLatBound), + Point(180, upperLatBound)); + return {box1, box2}; + } else if (rightLonBound > 180) { + auto box1 = + Box(Point(leftLonBound, lowerLatBound), Point(180, upperLatBound)); + auto box2 = Box(Point(-180, lowerLatBound), + Point(rightLonBound - 360, upperLatBound)); + return {box1, box2}; + } + // default case, when no bound has an "overflow" + return {Box(Point(leftLonBound, lowerLatBound), + Point(rightLonBound, upperLatBound))}; +} + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances( + const Point& startPoint) const { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); + + // point on the opposite side of the globe + Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); + if (antiPoint.get<0>() > 180) { + antiPoint.set<0>(antiPoint.get<0>() - 360); + } + // for an explanation of the formula see the master thesis. Divide by two two + // only consider the distance from the point to the antiPoint, subtract + // maxDist and a safety margine from that + double antiDist = + (circumferenceMin_ / 2.0) - static_cast(maxDist.value()) * 1.01; + // use the bigger circumference as an additional safety margin, use 2.01 + // instead of 2.0 because of rounding inaccuracies in floating point + // operations + double distToAntiPoint = (360 / circumferenceMax_) * (antiDist / 2.01); + double upperBound = antiPoint.get<1>() + distToAntiPoint; + double lowerBound = antiPoint.get<1>() - distToAntiPoint; + double leftBound = antiPoint.get<0>() - distToAntiPoint; + double rightBound = antiPoint.get<0>() + distToAntiPoint; + bool northPoleTouched = false; + bool southPoleTouched = false; + bool boxCrosses180Longitude = false; // if the 180 to -180 line is touched + // if a pole is crossed, ignore the part after the crossing + if (upperBound > 90) { + upperBound = 90; + northPoleTouched = true; + } + if (lowerBound < -90) { + lowerBound = -90; + southPoleTouched = true; + } + if (leftBound < -180) { + leftBound += 360; + } + if (rightBound > 180) { + rightBound -= 360; + } + if (rightBound < leftBound) { + boxCrosses180Longitude = true; + } + // compute bounding boxes using the anti bounding box from above + std::vector boxes; + if (!northPoleTouched) { + // add upper bounding box(es) + if (boxCrosses180Longitude) { + boxes.emplace_back(Point(leftBound, upperBound), Point(180, 90)); + boxes.emplace_back(Point(-180, upperBound), Point(rightBound, 90)); + } else { + boxes.emplace_back(Point(leftBound, upperBound), Point(rightBound, 90)); + } + } + if (!southPoleTouched) { + // add lower bounding box(es) + if (boxCrosses180Longitude) { + boxes.emplace_back(Point(leftBound, -90), Point(180, lowerBound)); + boxes.emplace_back(Point(-180, -90), Point(rightBound, lowerBound)); + } else { + boxes.emplace_back(Point(leftBound, -90), Point(rightBound, lowerBound)); + } + } + // add the box(es) inbetween the longitude lines + if (boxCrosses180Longitude) { + // only one box needed to cover the longitudes + boxes.emplace_back(Point(rightBound, -90), Point(leftBound, 90)); + } else { + // two boxes needed, one left and one right of the anti bounding box + boxes.emplace_back(Point(-180, -90), Point(leftBound, 90)); + boxes.emplace_back(Point(rightBound, -90), Point(180, 90)); + } + return boxes; +} + +// ____________________________________________________________________________ +bool SpatialJoinAlgorithms::isContainedInBoundingBoxes( + const std::vector& boundingBox, Point point) const { + convertToNormalCoordinates(point); + + return std::ranges::any_of(boundingBox, [point](const Box& aBox) { + return boost::geometry::covered_by(point, aBox); + }); +} + +// ____________________________________________________________________________ +void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) const { + // correct lon and lat bounds if necessary + while (point.get<0>() < -180) { + point.set<0>(point.get<0>() + 360); + } + while (point.get<0>() > 180) { + point.set<0>(point.get<0>() - 360); + } + if (point.get<1>() < -90) { + point.set<1>(-90); + } else if (point.get<1>() > 90) { + point.set<1>(90); + } +} + +// ____________________________________________________________________________ +std::array SpatialJoinAlgorithms::isAPoleTouched( + const double& latitude) const { + bool northPoleReached = false; + bool southPoleReached = false; + if (latitude >= 90) { + northPoleReached = true; + } + if (latitude <= -90) { + southPoleReached = true; + } + return std::array{northPoleReached, southPoleReached}; +} + +// ____________________________________________________________________________ +Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { + auto printWarning = [alreadyWarned = false, + &spatialJoin = spatialJoin_]() mutable { + if (!alreadyWarned) { + std::string warning = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + AD_LOG_WARN << warning << std::endl; + alreadyWarned = true; + if (spatialJoin.has_value()) { + AD_CORRECTNESS_CHECK(spatialJoin.value() != nullptr); + spatialJoin.value()->addWarning(warning); + } + } + }; + + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + IdTable result{numColumns, qec_->getAllocator()}; + + // create r-tree for smaller result table + auto smallerResult = idTableLeft; + auto otherResult = idTableRight; + bool leftResSmaller = true; + auto smallerResJoinCol = leftJoinCol; + auto otherResJoinCol = rightJoinCol; + if (idTableLeft->numRows() > idTableRight->numRows()) { + std::swap(smallerResult, otherResult); + leftResSmaller = false; + std::swap(smallerResJoinCol, otherResJoinCol); + } + + bgi::rtree, bgi::indexable, + bgi::equal_to, ad_utility::AllocatorWithLimit> + rtree(bgi::quadratic<16>{}, bgi::indexable{}, + bgi::equal_to{}, qec_->getAllocator()); + for (size_t i = 0; i < smallerResult->numRows(); i++) { + // get point of row i + auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); + + if (!geopoint) { + printWarning(); + continue; + } + + Point p(geopoint.value().getLng(), geopoint.value().getLat()); + + // add every point together with the row number into the rtree + rtree.insert(std::make_pair(std::move(p), i)); + } + std::vector> results{ + qec_->getAllocator()}; + for (size_t i = 0; i < otherResult->numRows(); i++) { + auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); + + if (!geopoint1) { + printWarning(); + continue; + } + + Point p(geopoint1.value().getLng(), geopoint1.value().getLat()); + + // query the other rtree for every point using the following bounding box + std::vector bbox = computeBoundingBox(p); + results.clear(); + + std::ranges::for_each(bbox, [&](const Box& bbox) { + rtree.query(bgi::intersects(bbox), std::back_inserter(results)); + }); + + std::ranges::for_each(results, [&](const Value& res) { + size_t rowLeft = res.second; + size_t rowRight = i; + if (!leftResSmaller) { + std::swap(rowLeft, rowRight); + } + auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, + leftJoinCol, rightJoinCol); + AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int); + if (distance.getInt() <= maxDist) { + addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, + rowRight, distance); + } + }); + } + auto resTable = + Result(std::move(result), std::vector{}, + Result::getMergedLocalVocab(*resultLeft, *resultRight)); + return resTable; +} diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 00556f1b3d..0719b769b4 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -1,26 +1,53 @@ #pragma once +#include +#include +#include +#include + #include "engine/Result.h" #include "engine/SpatialJoin.h" +namespace BoostGeometryNamespace { +namespace bg = boost::geometry; +namespace bgi = boost::geometry::index; + +using Point = bg::model::point; +using Box = bg::model::box; +using Value = std::pair; +} // namespace BoostGeometryNamespace + class SpatialJoinAlgorithms { public: - // initialize the Algorithm with the needed parameters + // Initialize the Algorithm with the needed parameters SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config); + std::variant config, + std::optional spatialJoin = std::nullopt); Result BaselineAlgorithm(); Result S2geometryAlgorithm(); Result BoundingBoxAlgorithm(); + std::vector + OnlyForTestingWrapperComputeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const { + return computeBoundingBox(startPoint); + } + + bool OnlyForTestingWrapperContainedInBoundingBoxes( + const std::vector& boundingBox, + const BoostGeometryNamespace::Point& point) const { + return isContainedInBoundingBoxes(boundingBox, point); + } + private: - // helper function which returns a GeoPoint if the element of the given table + // Helper function which returns a GeoPoint if the element of the given table // represents a GeoPoint std::optional getPoint(const IdTable* restable, size_t row, ColumnIndex col) const; - // helper function, which computes the distance of two points, where each + // Helper function, which computes the distance of two points, where each // point comes from a different result table Id computeDist(const IdTable* resLeft, const IdTable* resRight, size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol, @@ -33,8 +60,55 @@ class SpatialJoinAlgorithms { const IdTable* resultRight, size_t rowLeft, size_t rowRight, Id distance) const; + // This function computes the bounding box(es) which represent all points, + // which are in reach of the starting point with a distance of at most + // 'maxDistanceInMeters'. In theory there is always only one bounding box, but + // when mapping the spherical surface on a cartesian plane there are borders. + // So when the "single true" bounding box crosses the left or right (+/-180 + // longitude line) or the poles (+/- 90 latitude, which on the cartesian + // mapping is the top and bottom edge of the rectangular mapping) then the + // single box gets split into multiple boxes (i.e. one on the left and one on + // the right, which when seen on the sphere look like a single box, but on the + // map and in the internal representation it looks like two/more boxes) + std::vector computeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const; + + // This helper function calculates the bounding boxes based on a box, where + // definitely no match can occur. This means every element in the anti + // bounding box is guaranteed to be more than 'maxDistanceInMeters' away from + // the startPoint. The function is then returning the set of boxes, which + // cover everything on earth, except for the anti bounding box. This function + // gets used, when the usual procedure, would just result in taking a big + // bounding box, which covers the whole planet (so for extremely large max + // distances) + std::vector computeBoundingBoxForLargeDistances( + const BoostGeometryNamespace::Point& startPoint) const; + + // This function returns true, iff the given point is contained in any of the + // bounding boxes + bool isContainedInBoundingBoxes( + const std::vector& boundingBox, + BoostGeometryNamespace::Point point) const; + QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; bool addDistToResult_; std::variant config_; + std::optional spatialJoin_; + + // circumference in meters at the equator (max) and the pole (min) (as the + // earth is not exactly a sphere the circumference is different. Note that + // the values are given in meters) + static constexpr double circumferenceMax_ = 40'075'000; + static constexpr double circumferenceMin_ = 40'007'000; + + // radius of the earth in meters (as the earth is not exactly a sphere the + // radius at the equator has been taken) + static constexpr double radius_ = 6'378'000; + + // convert coordinates to the usual ranges (-180 to 180 and -90 to 90) + void convertToNormalCoordinates(BoostGeometryNamespace::Point& point) const; + + // return whether one of the poles is being touched + std::array isAPoleTouched(const double& latitude) const; }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index f732c0d8e4..95fe8fa50f 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -25,14 +25,16 @@ using namespace SpatialJoinTestHelpers; namespace computeResultTest { -class SpatialJoinParamTest : public ::testing::TestWithParam { +class SpatialJoinParamTest + : public ::testing::TestWithParam { public: void createAndTestSpatialJoin( QueryExecutionContext* qec, SparqlTriple spatialJoinTriple, std::shared_ptr leftChild, std::shared_ptr rightChild, bool addLeftChildFirst, std::vector> expectedOutputUnorderedRows, - std::vector columnNames) { + std::vector columnNames, + bool isWrongPointInputTest = false) { // this function is like transposing a matrix. An entry which has been // stored at (i, k) is now stored at (k, i). The reason this is needed is // the following: this function receives the input as a vector of vector of @@ -106,6 +108,23 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { }*/ EXPECT_THAT(vec, ::testing::UnorderedElementsAreArray(expectedOutput)); + + if (isWrongPointInputTest && + GetParam() == SpatialJoin::Algorithm::BoundingBox) { + auto warnings = spatialJoin->collectWarnings(); + bool containsWrongPointWarning = false; + std::string warningMessage = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + for (const auto& warning : warnings) { + if (warning == warningMessage) { + containsWrongPointWarning = true; + } + } + ASSERT_TRUE(containsWrongPointWarning); + } } // build the test using the small dataset. Let the SpatialJoin operation be @@ -192,15 +211,13 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { auto qec = buildTestQEC(); auto numTriples = qec->getIndex().numTriples().normal; ASSERT_EQ(numTriples, 15); - // ========================= build big child - // ================================= + // ========================= build big child ============================= auto bigChild = buildMediumChild( qec, {"?obj1", std::string{""}, "?name1"}, {"?obj1", std::string{""}, "?geo1"}, {"?geo1", std::string{""}, "?point1"}, "?obj1", "?geo1"); - // ========================= build small child - // =============================== + // ========================= build small child =========================== TripleComponent point2{Variable{"?point2"}}; auto smallChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); @@ -218,6 +235,64 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { columnNames); } + void testDiffSizeIdTables( + std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames, bool bigChildLeft) { + auto qec = buildTestQEC(); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build small input ============================== + TripleComponent point1{Variable{"?point1"}}; + TripleComponent subject{ + ad_utility::triple_component::Iri::fromIriref("")}; + auto smallChild = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, + SparqlTriple{subject, std::string{""}, point1}); + // ====================== build big input ================================ + auto bigChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + auto firstChild = bigChildLeft ? bigChild : smallChild; + auto secondChild = bigChildLeft ? smallChild : bigChild; + auto firstVariable = + bigChildLeft ? TripleComponent{Variable{"?point2"}} : point1; + auto secondVariable = + bigChildLeft ? point1 : TripleComponent{Variable{"?point2"}}; + + createAndTestSpatialJoin( + qec, SparqlTriple{firstVariable, specialPredicate, secondVariable}, + firstChild, secondChild, addLeftChildFirst, expectedOutput, + columnNames); + } + + void testWrongPointInInput( + std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames) { + auto kg = createSmallDatasetWithPoints(); + // make first point wrong: + auto pos = kg.find("POINT("); + kg = kg.insert(pos + 7, "wrongStuff"); + + ad_utility::MemorySize blocksizePermutations = 128_MB; + auto qec = ad_utility::testing::getQec(kg, true, true, false, + blocksizePermutations, false); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build inputs ================================ + TripleComponent point1{Variable{"?point1"}}; + TripleComponent point2{Variable{"?point2"}}; + auto leftChild = + buildIndexScan(qec, {"?obj1", std::string{""}, "?point1"}); + auto rightChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + createAndTestSpatialJoin( + qec, SparqlTriple{point1, specialPredicate, point2}, leftChild, + rightChild, addLeftChildFirst, expectedOutput, columnNames, true); + } + protected: bool useBaselineAlgorithm_; }; @@ -487,6 +562,63 @@ std::vector> expectedMaxDist10000000_rows_small{ mergeToRow(sEif, sEye, expectedDistEyeEif), mergeToRow(sEif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), +}; + +std::vector> + expectedMaxDist5000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf)}; + +std::vector> + expectedMaxDist500000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist1000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist10000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sMun, sLib, expectedDistMunLib), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sEye, sLib, expectedDistEyeLib), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sLib, sMun, expectedDistMunLib), + mergeToRow(sLib, sEye, expectedDistEyeLib), + mergeToRow(sLib, sEif, expectedDistEifLib), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif), + mergeToRow(sEif, sLib, expectedDistEifLib)}; + std::vector> expectedMaxDist1_rows_diff{ mergeToRow(TF, sTF, expectedDistSelf), mergeToRow(Mun, sMun, expectedDistSelf), @@ -565,6 +697,31 @@ std::vector> expectedMaxDist10000000_rows_diff{ mergeToRow(Eif, sEye, expectedDistEyeEif), mergeToRow(Eif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf)}; + +std::vector> expectedMaxDist5000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun)}; + +std::vector> expectedMaxDist500000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif)}; + +std::vector> expectedMaxDist1000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye)}; + +std::vector> expectedMaxDist10000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye), + mergeToRow({sTF.at(1)}, sLib, expectedDistUniLib)}; + std::vector> expectedNearestNeighbors1{ mergeToRow(TF, TF, expectedDistSelf), mergeToRow(Mun, Mun, expectedDistSelf), @@ -780,8 +937,410 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { expectedMaxDist10000000_rows_diff, columnNames, false); } -INSTANTIATE_TEST_SUITE_P(SpatialJoin, SpatialJoinParamTest, ::testing::Bool()); +TEST_P(SpatialJoinParamTest, maxSizeMaxDistanceTest) { + auto maxDist = std::numeric_limits::max(); + std::string maxDistStr = + absl::StrCat(""); + + // test small children + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetSmallChildren( + maxDistStr, true, expectedMaxDist10000000_rows_small, columnNames); + buildAndTestSmallTestSetSmallChildren( + maxDistStr, false, expectedMaxDist10000000_rows_small, columnNames); + + // test diff size children + columnNames = {"?name1", + "?obj1", + "?geo1", + "?point1", + "?obj2", + "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, true, expectedMaxDist10000000_rows_diff, columnNames, false); + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, false, expectedMaxDist10000000_rows_diff, columnNames, false); + + // test large size children + columnNames = {"?name1", "?obj1", "?geo1", + "?point1", "?name2", "?obj2", + "?geo2", "?point2", "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetLargeChildren( + maxDistStr, true, expectedMaxDist10000000_rows, columnNames); + buildAndTestSmallTestSetLargeChildren( + maxDistStr, false, expectedMaxDist10000000_rows, columnNames); +} + +TEST_P(SpatialJoinParamTest, diffSizeIdTables) { + std::vector columnNames{"?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); +} + +TEST_P(SpatialJoinParamTest, wrongPointInInput) { + // expected behavior: point is skipped + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + testWrongPointInInput("", true, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", false, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", true, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); +} + +INSTANTIATE_TEST_SUITE_P( + SpatialJoin, SpatialJoinParamTest, + ::testing::Values(SpatialJoin::Algorithm::Baseline, + SpatialJoin::Algorithm::S2Geometry, + SpatialJoin::Algorithm::BoundingBox)); } // end of Namespace computeResultTest +namespace boundingBox { + +using BoostGeometryNamespace::Box; +using BoostGeometryNamespace::Point; +using BoostGeometryNamespace::Value; + +// usage of this helper function for the bounding box to test: +// Iterate over every edge of the bounding box. Furthermore iterate over each +// edge (i.e. use 100 points, which are on the edge of the bounding box). Then +// call this function for each of those points twice. Once slightly move the +// point inside the bounding box and give 'shouldBeWithin' = true to this +// function, the other time move it slightly outside of the bounding box and +// give 'shouldBeTrue' = false to the function. Do this for all edges. Note +// that this function is not taking a set of boxes, as neighboring boxes would +// not work with this approach (slightly outside of one box can be inside the +// neighboring box. For a set of boxes, check each box separately) +void testBounds(double x, double y, const Box& bbox, bool shouldBeWithin) { + // correct lon bounds if necessary + if (x < -180) { + x += 360; + } else if (x > 180) { + x -= 360; + } + + // testing only possible, if lat bounds are correct and the lon bounds + // don't cover everything (as then left or right of the box is again + // inside the box because of the spherical geometry). If we have a bounding + // box, which goes from -180 to 180 longitude, then left of the bounding box + // is just in the bounding box again. (i.e. -180.00001 is the same as + // +179.99999). As all longitudes are covered, a left or right bound does + // not exist (on the sphere this makes intuitive sense). A test in that case + // is not necessary, because this test is about testing the edges and if + // everything is covered an edge doesn't exist there is no need for testing + // in that case + double minLonBox = bbox.min_corner().get<0>(); + double maxLonBox = bbox.max_corner().get<0>(); + if (y < 90 && y > -90 && !(minLonBox < 179.9999 && maxLonBox > 179.9999)) { + bool within = boost::geometry::covered_by(Point(x, y), bbox); + ASSERT_TRUE(within == shouldBeWithin); + } +}; + +// This function performs multiple tests on the bounding box. First it asserts, +// that a point, which is not contained in any bounding box is more than +// 'maxDistInMeters' away from 'startPoint'. Second it iterates over the edges +// of the bounding box and checks, that points which are slightly inside or +// outside of the bounding box are correctly identified. +void testBoundingBox(const size_t& maxDistInMeters, const Point& startPoint) { + // this helper function checks, that a point, which is not contained in any + // bounding box, is really more than 'maxDistanceInMeters' away from + // 'startPoint' + auto checkOutside = [&](const Point& point1, const Point& startPoint, + const std::vector& bbox, + SpatialJoinAlgorithms* spatialJoinAlg) { + // check if the point is contained in any bounding box + bool within = spatialJoinAlg->OnlyForTestingWrapperContainedInBoundingBoxes( + bbox, point1); + if (!within) { + GeoPoint geo1{point1.get<1>(), point1.get<0>()}; + GeoPoint geo2{startPoint.get<1>(), startPoint.get<0>()}; + double dist = ad_utility::detail::wktDistImpl(geo1, geo2) * 1000; + ASSERT_GT(static_cast(dist), maxDistInMeters); + } + }; + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, 0, + 0, 1, maxDistInMeters, std::nullopt}; + + std::variant config{ + MaxDistanceConfig{maxDistInMeters}}; + + SpatialJoinAlgorithms spatialJoinAlgs{buildTestQEC(), params, true, config}; + + std::vector bbox = + spatialJoinAlgs.OnlyForTestingWrapperComputeBoundingBox(startPoint); + // broad grid test + for (int lon = -180; lon < 180; lon += 20) { + for (int lat = -90; lat < 90; lat += 20) { + checkOutside(Point(lon, lat), startPoint, bbox, &spatialJoinAlgs); + } + } + + // do tests at the border of the box. The exact usage of this is described in + // the function comment of the helper function 'testBounds' + for (size_t k = 0; k < bbox.size(); k++) { + // use a small delta for testing because of floating point inaccuracies + const double delta = 0.00000001; + const Point minPoint = bbox.at(k).min_corner(); + const Point maxPoint = bbox.at(k).max_corner(); + const double lowX = minPoint.get<0>(); + const double lowY = minPoint.get<1>(); + const double highX = maxPoint.get<0>(); + const double highY = maxPoint.get<1>(); + const double xRange = highX - lowX - 2 * delta; + const double yRange = highY - lowY - 2 * delta; + for (size_t i = 0; i <= 100; i++) { + // barely in or out at the left edge + testBounds(lowX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(lowX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(Point(lowX - delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the bottom edge + testBounds(lowX + delta + (xRange / 100) * i, lowY + delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, lowY - delta, bbox.at(k), + false); + checkOutside(Point(lowX + (xRange / 100) * i, lowY - delta), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the right edge + testBounds(highX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(highX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(Point(highX + delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the top edge + testBounds(lowX + delta + (xRange / 100) * i, highY - delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, highY + delta, bbox.at(k), + false); + checkOutside(Point(lowX + (xRange / 100) * i, highY + delta), startPoint, + bbox, &spatialJoinAlgs); + } + } +} + +TEST(SpatialJoin, computeBoundingBox) { + // ASSERT_EQ("", "uncomment the part below again"); + double circ = 40075 * 1000; // circumference of the earth (at the equator) + // 180.0001 in case 180 is represented internally as 180.0000000001 + for (double lon = -180; lon <= 180.0001; lon += 15) { + // 90.0001 in case 90 is represented internally as 90.000000001 + for (double lat = -90; lat <= 90.0001; lat += 15) { + // circ / 2 means, that all points on earth are within maxDist km of any + // starting point + for (int maxDist = 0; maxDist <= circ / 2.0; maxDist += circ / 36.0) { + testBoundingBox(maxDist, Point(lon, lat)); + } + } + } +} + +TEST(SpatialJoin, isContainedInBoundingBoxes) { + // build dummy join to access the containedInBoundingBox and + // computeBoundingBox functions + auto qec = buildTestQEC(); + auto spatialJoinTriple = SparqlTriple{TripleComponent{Variable{"?point1"}}, + "", + TripleComponent{Variable{"?point2"}}}; + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree(qec, spatialJoinTriple, + std::nullopt, std::nullopt); + + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, + 0, 0, 1, spatialJoin->getMaxDist(), + std::nullopt}; + + SpatialJoinAlgorithms spatialJoinAlgs{ + qec, params, true, spatialJoin->onlyForTestingGetActualConfig()}; + + // note that none of the boxes is overlapping, therefore we can check, that + // none of the points which should be contained in one box are contained in + // another box + std::vector boxes = { + Box(Point(20, 40), Point(40, 60)), + Box(Point(-180, -20), Point(-150, 30)), // touching left border + Box(Point(50, -30), Point(180, 10)), // touching right border + Box(Point(-30, 50), Point(10, 90)), // touching north pole + Box(Point(-45, -90), Point(0, -45)) // touching south pole + }; + + // the first entry in this vector is a vector of points, which is contained + // in the first box, the second entry contains points, which are contained in + // the second box and so on + std::vector> containedInBox = { + {Point(20, 40), Point(40, 40), Point(40, 60), Point(20, 60), + Point(30, 50)}, + {Point(-180, -20), Point(-150, -20), Point(-150, 30), Point(-180, 30), + Point(-150, 0)}, + {Point(50, -30), Point(180, -30), Point(180, 10), Point(50, 10), + Point(70, -10)}, + {Point(-30, 50), Point(10, 50), Point(10, 90), Point(-30, 90), + Point(-20, 60)}, + {Point(-45, -90), Point(0, -90), Point(0, -45), Point(-45, -45), + Point(-10, -60)}}; + + // all combinations of box is contained in bounding boxes and is not + // contained. a one encodes, the bounding box is contained in the set of + // bounding boxes, a zero encodes, it isn't. If a box is contained, it's + // checked, that the points which should be contained in the box are + // contained. If the box is not contained, it's checked, that the points which + // are contained in that box are not contained in the box set (because the + // boxes don't overlap) + for (size_t a = 0; a <= 1; a++) { + for (size_t b = 0; a <= 1; a++) { + for (size_t c = 0; a <= 1; a++) { + for (size_t d = 0; a <= 1; a++) { + for (size_t e = 0; a <= 1; a++) { + std::vector toTest; // the set of bounding boxes + std::vector> shouldBeContained; + std::vector> shouldNotBeContained; + if (a == 1) { // box nr. 0 is contained in the set of boxes + toTest.push_back(boxes.at(0)); + shouldBeContained.push_back(containedInBox.at(0)); + } else { // box nr. 0 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(0)); + } + if (b == 1) { // box nr. 1 is contained in the set of boxes + toTest.push_back(boxes.at(1)); + shouldBeContained.push_back(containedInBox.at(1)); + } else { // box nr. 1 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(1)); + } + if (c == 1) { + toTest.push_back(boxes.at(2)); + shouldBeContained.push_back(containedInBox.at(2)); + } else { + shouldNotBeContained.push_back(containedInBox.at(2)); + } + if (d == 1) { + toTest.push_back(boxes.at(3)); + shouldBeContained.push_back(containedInBox.at(3)); + } else { + shouldNotBeContained.push_back(containedInBox.at(3)); + } + if (e == 1) { + toTest.push_back(boxes.at(4)); + shouldBeContained.push_back(containedInBox.at(4)); + } else { + shouldNotBeContained.push_back(containedInBox.at(4)); + } + if (toTest.size() > 0) { + // test all points, which should be contained in the bounding + // boxes + for (size_t i = 0; i < shouldBeContained.size(); i++) { + for (size_t k = 0; k < shouldBeContained.at(i).size(); k++) { + ASSERT_TRUE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldBeContained.at(i).at(k))); + } + } + // test all points, which shouldn't be contained in the bounding + // boxes + for (size_t i = 0; i < shouldNotBeContained.size(); i++) { + for (size_t k = 0; k < shouldNotBeContained.at(i).size(); k++) { + ASSERT_FALSE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldNotBeContained.at(i).at(k))); + } + } + } + } + } + } + } + } +} + +} // namespace boundingBox + } // namespace