diff --git a/src/engine/SpatialJoin.cpp b/src/engine/SpatialJoin.cpp index beb09ffe68..1ad7d8179a 100644 --- a/src/engine/SpatialJoin.cpp +++ b/src/engine/SpatialJoin.cpp @@ -209,7 +209,7 @@ size_t SpatialJoin::getCostEstimate() { if (childLeft_ && childRight_) { size_t inputEstimate = childLeft_->getSizeEstimate() * childRight_->getSizeEstimate(); - if (useBaselineAlgorithm_) { + if (algorithm_ == Algorithm::Baseline) { return inputEstimate * inputEstimate; } else { // Let n be the size of the left table and m the size of the right table. @@ -336,11 +336,21 @@ PreparedSpatialJoinParams SpatialJoin::prepareJoin() const { // ____________________________________________________________________________ Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) { SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(), - addDistToResult_, config_}; - if (useBaselineAlgorithm_) { + addDistToResult_, config_, this}; + if (algorithm_ == Algorithm::Baseline) { return algorithms.BaselineAlgorithm(); - } else { + } else if (algorithm_ == Algorithm::S2Geometry) { return algorithms.S2geometryAlgorithm(); + } else { + AD_CORRECTNESS_CHECK(algorithm_ == Algorithm::BoundingBox); + // as the BoundingBoxAlgorithms only works for max distance and not for + // nearest neighbors, S2geometry gets called as a backup, if the query is + // asking for the nearest neighbors + if (std::get_if(&config_)) { + return algorithms.BoundingBoxAlgorithm(); + } else { + return algorithms.S2geometryAlgorithm(); + } } } diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index b62978cf90..b31b1b8866 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -96,14 +96,20 @@ class SpatialJoin : public Operation { // purposes std::optional getMaxResults() const; - void selectAlgorithm(bool useBaselineAlgorithm) { - useBaselineAlgorithm_ = useBaselineAlgorithm; - } + // options which can be used for the algorithm, which calculates the result + enum class Algorithm { Baseline, S2Geometry, BoundingBox }; + + void selectAlgorithm(Algorithm algorithm) { algorithm_ = algorithm; } std::pair onlyForTestingGetConfig() const { return std::pair{getMaxDist().value_or(-1), getMaxResults().value_or(-1)}; } + std::variant + onlyForTestingGetActualConfig() const { + return config_; + } + std::shared_ptr onlyForTestingGetLeftChild() const { return childLeft_; } @@ -135,5 +141,5 @@ class SpatialJoin : public Operation { // between the two objects bool addDistToResult_ = true; const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally"; - bool useBaselineAlgorithm_ = false; + Algorithm algorithm_ = Algorithm::S2Geometry; }; diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index dd7a9b1c7c..78eec1e2a2 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -6,17 +6,23 @@ #include #include +#include + #include "util/GeoSparqlHelpers.h" +using namespace BoostGeometryNamespace; + // ____________________________________________________________________________ SpatialJoinAlgorithms::SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config) + std::variant config, + std::optional spatialJoin) : qec_{qec}, params_{std::move(params)}, addDistToResult_{addDistToResult}, - config_{std::move(config)} {} + config_{std::move(config)}, + spatialJoin_{spatialJoin} {} // ____________________________________________________________________________ std::optional SpatialJoinAlgorithms::getPoint(const IdTable* restable, @@ -223,3 +229,298 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { return Result(std::move(result), std::vector{}, Result::getMergedLocalVocab(*resultLeft, *resultRight)); } + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeBoundingBox( + const Point& startPoint) const { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); + // haversine function + auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; }; + + // inverse haversine function + auto archaversine = [](double theta) { return std::acos(1 - 2 * theta); }; + + // safety buffer for numerical inaccuracies + double maxDistInMetersBuffer; + if (maxDist.value() < 10) { + maxDistInMetersBuffer = 10; + } else if (static_cast(maxDist.value()) < + static_cast(std::numeric_limits::max()) / + 1.02) { + maxDistInMetersBuffer = 1.01 * static_cast(maxDist.value()); + } else { + maxDistInMetersBuffer = + static_cast(std::numeric_limits::max()); + } + + // for large distances, where the lower calculation would just result in + // a single bounding box for the whole planet, do an optimized version + if (static_cast(maxDist.value()) > circumferenceMax_ / 4.0 && + static_cast(maxDist.value()) < circumferenceMax_ / 2.01) { + return computeBoundingBoxForLargeDistances(startPoint); + } + + // compute latitude bound + double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_); + double upperLatBound = startPoint.get<1>() + maxDistInDegrees; + double lowerLatBound = startPoint.get<1>() - maxDistInDegrees; + + auto southPoleReached = isAPoleTouched(lowerLatBound).at(1); + auto northPoleReached = isAPoleTouched(upperLatBound).at(0); + + if (southPoleReached || northPoleReached) { + return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))}; + } + + // compute longitude bound. For an explanation of the calculation and the + // naming convention see my master thesis + double alpha = maxDistInMetersBuffer / radius_; + double gamma = (90 - std::abs(startPoint.get<1>())) * (2 * M_PI / 360); + double beta = std::acos(std::cos(gamma) / std::cos(alpha)); + double delta = 0; + if (maxDistInMetersBuffer > circumferenceMax_ / 20) { + // use law of cosines + delta = std::acos((std::cos(alpha) - std::cos(gamma) * std::cos(beta)) / + (std::sin(gamma) * std::sin(beta))); + } else { + // use law of haversines for numerical stability + delta = archaversine((haversine(alpha - haversine(gamma - beta))) / + (std::sin(gamma) * std::sin(beta))); + } + double lonRange = delta * 360 / (2 * M_PI); + double leftLonBound = startPoint.get<0>() - lonRange; + double rightLonBound = startPoint.get<0>() + lonRange; + // test for "overflows" and create two bounding boxes if necessary + if (leftLonBound < -180) { + auto box1 = + Box(Point(-180, lowerLatBound), Point(rightLonBound, upperLatBound)); + auto box2 = Box(Point(leftLonBound + 360, lowerLatBound), + Point(180, upperLatBound)); + return {box1, box2}; + } else if (rightLonBound > 180) { + auto box1 = + Box(Point(leftLonBound, lowerLatBound), Point(180, upperLatBound)); + auto box2 = Box(Point(-180, lowerLatBound), + Point(rightLonBound - 360, upperLatBound)); + return {box1, box2}; + } + // default case, when no bound has an "overflow" + return {Box(Point(leftLonBound, lowerLatBound), + Point(rightLonBound, upperLatBound))}; +} + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances( + const Point& startPoint) const { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); + + // point on the opposite side of the globe + Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); + if (antiPoint.get<0>() > 180) { + antiPoint.set<0>(antiPoint.get<0>() - 360); + } + // for an explanation of the formula see the master thesis. Divide by two two + // only consider the distance from the point to the antiPoint, subtract + // maxDist and a safety margine from that + double antiDist = + (circumferenceMin_ / 2.0) - static_cast(maxDist.value()) * 1.01; + // use the bigger circumference as an additional safety margin, use 2.01 + // instead of 2.0 because of rounding inaccuracies in floating point + // operations + double distToAntiPoint = (360 / circumferenceMax_) * (antiDist / 2.01); + double upperBound = antiPoint.get<1>() + distToAntiPoint; + double lowerBound = antiPoint.get<1>() - distToAntiPoint; + double leftBound = antiPoint.get<0>() - distToAntiPoint; + double rightBound = antiPoint.get<0>() + distToAntiPoint; + bool northPoleTouched = false; + bool southPoleTouched = false; + bool boxCrosses180Longitude = false; // if the 180 to -180 line is touched + // if a pole is crossed, ignore the part after the crossing + if (upperBound > 90) { + upperBound = 90; + northPoleTouched = true; + } + if (lowerBound < -90) { + lowerBound = -90; + southPoleTouched = true; + } + if (leftBound < -180) { + leftBound += 360; + } + if (rightBound > 180) { + rightBound -= 360; + } + if (rightBound < leftBound) { + boxCrosses180Longitude = true; + } + // compute bounding boxes using the anti bounding box from above + std::vector boxes; + if (!northPoleTouched) { + // add upper bounding box(es) + if (boxCrosses180Longitude) { + boxes.emplace_back(Point(leftBound, upperBound), Point(180, 90)); + boxes.emplace_back(Point(-180, upperBound), Point(rightBound, 90)); + } else { + boxes.emplace_back(Point(leftBound, upperBound), Point(rightBound, 90)); + } + } + if (!southPoleTouched) { + // add lower bounding box(es) + if (boxCrosses180Longitude) { + boxes.emplace_back(Point(leftBound, -90), Point(180, lowerBound)); + boxes.emplace_back(Point(-180, -90), Point(rightBound, lowerBound)); + } else { + boxes.emplace_back(Point(leftBound, -90), Point(rightBound, lowerBound)); + } + } + // add the box(es) inbetween the longitude lines + if (boxCrosses180Longitude) { + // only one box needed to cover the longitudes + boxes.emplace_back(Point(rightBound, -90), Point(leftBound, 90)); + } else { + // two boxes needed, one left and one right of the anti bounding box + boxes.emplace_back(Point(-180, -90), Point(leftBound, 90)); + boxes.emplace_back(Point(rightBound, -90), Point(180, 90)); + } + return boxes; +} + +// ____________________________________________________________________________ +bool SpatialJoinAlgorithms::isContainedInBoundingBoxes( + const std::vector& boundingBox, Point point) const { + convertToNormalCoordinates(point); + + return std::ranges::any_of(boundingBox, [point](const Box& aBox) { + return boost::geometry::covered_by(point, aBox); + }); +} + +// ____________________________________________________________________________ +void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) const { + // correct lon and lat bounds if necessary + while (point.get<0>() < -180) { + point.set<0>(point.get<0>() + 360); + } + while (point.get<0>() > 180) { + point.set<0>(point.get<0>() - 360); + } + if (point.get<1>() < -90) { + point.set<1>(-90); + } else if (point.get<1>() > 90) { + point.set<1>(90); + } +} + +// ____________________________________________________________________________ +std::array SpatialJoinAlgorithms::isAPoleTouched( + const double& latitude) const { + bool northPoleReached = false; + bool southPoleReached = false; + if (latitude >= 90) { + northPoleReached = true; + } + if (latitude <= -90) { + southPoleReached = true; + } + return std::array{northPoleReached, southPoleReached}; +} + +// ____________________________________________________________________________ +Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { + auto printWarning = [alreadyWarned = false, + &spatialJoin = spatialJoin_]() mutable { + if (!alreadyWarned) { + std::string warning = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + AD_LOG_WARN << warning << std::endl; + alreadyWarned = true; + if (spatialJoin.has_value()) { + AD_CORRECTNESS_CHECK(spatialJoin.value() != nullptr); + spatialJoin.value()->addWarning(warning); + } + } + }; + + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + IdTable result{numColumns, qec_->getAllocator()}; + + // create r-tree for smaller result table + auto smallerResult = idTableLeft; + auto otherResult = idTableRight; + bool leftResSmaller = true; + auto smallerResJoinCol = leftJoinCol; + auto otherResJoinCol = rightJoinCol; + if (idTableLeft->numRows() > idTableRight->numRows()) { + std::swap(smallerResult, otherResult); + leftResSmaller = false; + std::swap(smallerResJoinCol, otherResJoinCol); + } + + bgi::rtree, bgi::indexable, + bgi::equal_to, ad_utility::AllocatorWithLimit> + rtree(bgi::quadratic<16>{}, bgi::indexable{}, + bgi::equal_to{}, qec_->getAllocator()); + for (size_t i = 0; i < smallerResult->numRows(); i++) { + // get point of row i + auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); + + if (!geopoint) { + printWarning(); + continue; + } + + Point p(geopoint.value().getLng(), geopoint.value().getLat()); + + // add every point together with the row number into the rtree + rtree.insert(std::make_pair(std::move(p), i)); + } + std::vector> results{ + qec_->getAllocator()}; + for (size_t i = 0; i < otherResult->numRows(); i++) { + auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); + + if (!geopoint1) { + printWarning(); + continue; + } + + Point p(geopoint1.value().getLng(), geopoint1.value().getLat()); + + // query the other rtree for every point using the following bounding box + std::vector bbox = computeBoundingBox(p); + results.clear(); + + std::ranges::for_each(bbox, [&](const Box& bbox) { + rtree.query(bgi::intersects(bbox), std::back_inserter(results)); + }); + + std::ranges::for_each(results, [&](const Value& res) { + size_t rowLeft = res.second; + size_t rowRight = i; + if (!leftResSmaller) { + std::swap(rowLeft, rowRight); + } + auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, + leftJoinCol, rightJoinCol); + AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int); + if (distance.getInt() <= maxDist) { + addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, + rowRight, distance); + } + }); + } + auto resTable = + Result(std::move(result), std::vector{}, + Result::getMergedLocalVocab(*resultLeft, *resultRight)); + return resTable; +} diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 00556f1b3d..0719b769b4 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -1,26 +1,53 @@ #pragma once +#include +#include +#include +#include + #include "engine/Result.h" #include "engine/SpatialJoin.h" +namespace BoostGeometryNamespace { +namespace bg = boost::geometry; +namespace bgi = boost::geometry::index; + +using Point = bg::model::point; +using Box = bg::model::box; +using Value = std::pair; +} // namespace BoostGeometryNamespace + class SpatialJoinAlgorithms { public: - // initialize the Algorithm with the needed parameters + // Initialize the Algorithm with the needed parameters SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config); + std::variant config, + std::optional spatialJoin = std::nullopt); Result BaselineAlgorithm(); Result S2geometryAlgorithm(); Result BoundingBoxAlgorithm(); + std::vector + OnlyForTestingWrapperComputeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const { + return computeBoundingBox(startPoint); + } + + bool OnlyForTestingWrapperContainedInBoundingBoxes( + const std::vector& boundingBox, + const BoostGeometryNamespace::Point& point) const { + return isContainedInBoundingBoxes(boundingBox, point); + } + private: - // helper function which returns a GeoPoint if the element of the given table + // Helper function which returns a GeoPoint if the element of the given table // represents a GeoPoint std::optional getPoint(const IdTable* restable, size_t row, ColumnIndex col) const; - // helper function, which computes the distance of two points, where each + // Helper function, which computes the distance of two points, where each // point comes from a different result table Id computeDist(const IdTable* resLeft, const IdTable* resRight, size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol, @@ -33,8 +60,55 @@ class SpatialJoinAlgorithms { const IdTable* resultRight, size_t rowLeft, size_t rowRight, Id distance) const; + // This function computes the bounding box(es) which represent all points, + // which are in reach of the starting point with a distance of at most + // 'maxDistanceInMeters'. In theory there is always only one bounding box, but + // when mapping the spherical surface on a cartesian plane there are borders. + // So when the "single true" bounding box crosses the left or right (+/-180 + // longitude line) or the poles (+/- 90 latitude, which on the cartesian + // mapping is the top and bottom edge of the rectangular mapping) then the + // single box gets split into multiple boxes (i.e. one on the left and one on + // the right, which when seen on the sphere look like a single box, but on the + // map and in the internal representation it looks like two/more boxes) + std::vector computeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const; + + // This helper function calculates the bounding boxes based on a box, where + // definitely no match can occur. This means every element in the anti + // bounding box is guaranteed to be more than 'maxDistanceInMeters' away from + // the startPoint. The function is then returning the set of boxes, which + // cover everything on earth, except for the anti bounding box. This function + // gets used, when the usual procedure, would just result in taking a big + // bounding box, which covers the whole planet (so for extremely large max + // distances) + std::vector computeBoundingBoxForLargeDistances( + const BoostGeometryNamespace::Point& startPoint) const; + + // This function returns true, iff the given point is contained in any of the + // bounding boxes + bool isContainedInBoundingBoxes( + const std::vector& boundingBox, + BoostGeometryNamespace::Point point) const; + QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; bool addDistToResult_; std::variant config_; + std::optional spatialJoin_; + + // circumference in meters at the equator (max) and the pole (min) (as the + // earth is not exactly a sphere the circumference is different. Note that + // the values are given in meters) + static constexpr double circumferenceMax_ = 40'075'000; + static constexpr double circumferenceMin_ = 40'007'000; + + // radius of the earth in meters (as the earth is not exactly a sphere the + // radius at the equator has been taken) + static constexpr double radius_ = 6'378'000; + + // convert coordinates to the usual ranges (-180 to 180 and -90 to 90) + void convertToNormalCoordinates(BoostGeometryNamespace::Point& point) const; + + // return whether one of the poles is being touched + std::array isAPoleTouched(const double& latitude) const; }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index f732c0d8e4..95fe8fa50f 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -25,14 +25,16 @@ using namespace SpatialJoinTestHelpers; namespace computeResultTest { -class SpatialJoinParamTest : public ::testing::TestWithParam { +class SpatialJoinParamTest + : public ::testing::TestWithParam { public: void createAndTestSpatialJoin( QueryExecutionContext* qec, SparqlTriple spatialJoinTriple, std::shared_ptr leftChild, std::shared_ptr rightChild, bool addLeftChildFirst, std::vector> expectedOutputUnorderedRows, - std::vector columnNames) { + std::vector columnNames, + bool isWrongPointInputTest = false) { // this function is like transposing a matrix. An entry which has been // stored at (i, k) is now stored at (k, i). The reason this is needed is // the following: this function receives the input as a vector of vector of @@ -106,6 +108,23 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { }*/ EXPECT_THAT(vec, ::testing::UnorderedElementsAreArray(expectedOutput)); + + if (isWrongPointInputTest && + GetParam() == SpatialJoin::Algorithm::BoundingBox) { + auto warnings = spatialJoin->collectWarnings(); + bool containsWrongPointWarning = false; + std::string warningMessage = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + for (const auto& warning : warnings) { + if (warning == warningMessage) { + containsWrongPointWarning = true; + } + } + ASSERT_TRUE(containsWrongPointWarning); + } } // build the test using the small dataset. Let the SpatialJoin operation be @@ -192,15 +211,13 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { auto qec = buildTestQEC(); auto numTriples = qec->getIndex().numTriples().normal; ASSERT_EQ(numTriples, 15); - // ========================= build big child - // ================================= + // ========================= build big child ============================= auto bigChild = buildMediumChild( qec, {"?obj1", std::string{""}, "?name1"}, {"?obj1", std::string{""}, "?geo1"}, {"?geo1", std::string{""}, "?point1"}, "?obj1", "?geo1"); - // ========================= build small child - // =============================== + // ========================= build small child =========================== TripleComponent point2{Variable{"?point2"}}; auto smallChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); @@ -218,6 +235,64 @@ class SpatialJoinParamTest : public ::testing::TestWithParam { columnNames); } + void testDiffSizeIdTables( + std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames, bool bigChildLeft) { + auto qec = buildTestQEC(); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build small input ============================== + TripleComponent point1{Variable{"?point1"}}; + TripleComponent subject{ + ad_utility::triple_component::Iri::fromIriref("")}; + auto smallChild = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, + SparqlTriple{subject, std::string{""}, point1}); + // ====================== build big input ================================ + auto bigChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + auto firstChild = bigChildLeft ? bigChild : smallChild; + auto secondChild = bigChildLeft ? smallChild : bigChild; + auto firstVariable = + bigChildLeft ? TripleComponent{Variable{"?point2"}} : point1; + auto secondVariable = + bigChildLeft ? point1 : TripleComponent{Variable{"?point2"}}; + + createAndTestSpatialJoin( + qec, SparqlTriple{firstVariable, specialPredicate, secondVariable}, + firstChild, secondChild, addLeftChildFirst, expectedOutput, + columnNames); + } + + void testWrongPointInInput( + std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames) { + auto kg = createSmallDatasetWithPoints(); + // make first point wrong: + auto pos = kg.find("POINT("); + kg = kg.insert(pos + 7, "wrongStuff"); + + ad_utility::MemorySize blocksizePermutations = 128_MB; + auto qec = ad_utility::testing::getQec(kg, true, true, false, + blocksizePermutations, false); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build inputs ================================ + TripleComponent point1{Variable{"?point1"}}; + TripleComponent point2{Variable{"?point2"}}; + auto leftChild = + buildIndexScan(qec, {"?obj1", std::string{""}, "?point1"}); + auto rightChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + createAndTestSpatialJoin( + qec, SparqlTriple{point1, specialPredicate, point2}, leftChild, + rightChild, addLeftChildFirst, expectedOutput, columnNames, true); + } + protected: bool useBaselineAlgorithm_; }; @@ -487,6 +562,63 @@ std::vector> expectedMaxDist10000000_rows_small{ mergeToRow(sEif, sEye, expectedDistEyeEif), mergeToRow(sEif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), +}; + +std::vector> + expectedMaxDist5000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf)}; + +std::vector> + expectedMaxDist500000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist1000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist10000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sMun, sLib, expectedDistMunLib), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sEye, sLib, expectedDistEyeLib), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sLib, sMun, expectedDistMunLib), + mergeToRow(sLib, sEye, expectedDistEyeLib), + mergeToRow(sLib, sEif, expectedDistEifLib), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif), + mergeToRow(sEif, sLib, expectedDistEifLib)}; + std::vector> expectedMaxDist1_rows_diff{ mergeToRow(TF, sTF, expectedDistSelf), mergeToRow(Mun, sMun, expectedDistSelf), @@ -565,6 +697,31 @@ std::vector> expectedMaxDist10000000_rows_diff{ mergeToRow(Eif, sEye, expectedDistEyeEif), mergeToRow(Eif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf)}; + +std::vector> expectedMaxDist5000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun)}; + +std::vector> expectedMaxDist500000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif)}; + +std::vector> expectedMaxDist1000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye)}; + +std::vector> expectedMaxDist10000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye), + mergeToRow({sTF.at(1)}, sLib, expectedDistUniLib)}; + std::vector> expectedNearestNeighbors1{ mergeToRow(TF, TF, expectedDistSelf), mergeToRow(Mun, Mun, expectedDistSelf), @@ -780,8 +937,410 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { expectedMaxDist10000000_rows_diff, columnNames, false); } -INSTANTIATE_TEST_SUITE_P(SpatialJoin, SpatialJoinParamTest, ::testing::Bool()); +TEST_P(SpatialJoinParamTest, maxSizeMaxDistanceTest) { + auto maxDist = std::numeric_limits::max(); + std::string maxDistStr = + absl::StrCat(""); + + // test small children + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetSmallChildren( + maxDistStr, true, expectedMaxDist10000000_rows_small, columnNames); + buildAndTestSmallTestSetSmallChildren( + maxDistStr, false, expectedMaxDist10000000_rows_small, columnNames); + + // test diff size children + columnNames = {"?name1", + "?obj1", + "?geo1", + "?point1", + "?obj2", + "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, true, expectedMaxDist10000000_rows_diff, columnNames, false); + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, false, expectedMaxDist10000000_rows_diff, columnNames, false); + + // test large size children + columnNames = {"?name1", "?obj1", "?geo1", + "?point1", "?name2", "?obj2", + "?geo2", "?point2", "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetLargeChildren( + maxDistStr, true, expectedMaxDist10000000_rows, columnNames); + buildAndTestSmallTestSetLargeChildren( + maxDistStr, false, expectedMaxDist10000000_rows, columnNames); +} + +TEST_P(SpatialJoinParamTest, diffSizeIdTables) { + std::vector columnNames{"?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); +} + +TEST_P(SpatialJoinParamTest, wrongPointInInput) { + // expected behavior: point is skipped + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + testWrongPointInInput("", true, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", false, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", true, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); +} + +INSTANTIATE_TEST_SUITE_P( + SpatialJoin, SpatialJoinParamTest, + ::testing::Values(SpatialJoin::Algorithm::Baseline, + SpatialJoin::Algorithm::S2Geometry, + SpatialJoin::Algorithm::BoundingBox)); } // end of Namespace computeResultTest +namespace boundingBox { + +using BoostGeometryNamespace::Box; +using BoostGeometryNamespace::Point; +using BoostGeometryNamespace::Value; + +// usage of this helper function for the bounding box to test: +// Iterate over every edge of the bounding box. Furthermore iterate over each +// edge (i.e. use 100 points, which are on the edge of the bounding box). Then +// call this function for each of those points twice. Once slightly move the +// point inside the bounding box and give 'shouldBeWithin' = true to this +// function, the other time move it slightly outside of the bounding box and +// give 'shouldBeTrue' = false to the function. Do this for all edges. Note +// that this function is not taking a set of boxes, as neighboring boxes would +// not work with this approach (slightly outside of one box can be inside the +// neighboring box. For a set of boxes, check each box separately) +void testBounds(double x, double y, const Box& bbox, bool shouldBeWithin) { + // correct lon bounds if necessary + if (x < -180) { + x += 360; + } else if (x > 180) { + x -= 360; + } + + // testing only possible, if lat bounds are correct and the lon bounds + // don't cover everything (as then left or right of the box is again + // inside the box because of the spherical geometry). If we have a bounding + // box, which goes from -180 to 180 longitude, then left of the bounding box + // is just in the bounding box again. (i.e. -180.00001 is the same as + // +179.99999). As all longitudes are covered, a left or right bound does + // not exist (on the sphere this makes intuitive sense). A test in that case + // is not necessary, because this test is about testing the edges and if + // everything is covered an edge doesn't exist there is no need for testing + // in that case + double minLonBox = bbox.min_corner().get<0>(); + double maxLonBox = bbox.max_corner().get<0>(); + if (y < 90 && y > -90 && !(minLonBox < 179.9999 && maxLonBox > 179.9999)) { + bool within = boost::geometry::covered_by(Point(x, y), bbox); + ASSERT_TRUE(within == shouldBeWithin); + } +}; + +// This function performs multiple tests on the bounding box. First it asserts, +// that a point, which is not contained in any bounding box is more than +// 'maxDistInMeters' away from 'startPoint'. Second it iterates over the edges +// of the bounding box and checks, that points which are slightly inside or +// outside of the bounding box are correctly identified. +void testBoundingBox(const size_t& maxDistInMeters, const Point& startPoint) { + // this helper function checks, that a point, which is not contained in any + // bounding box, is really more than 'maxDistanceInMeters' away from + // 'startPoint' + auto checkOutside = [&](const Point& point1, const Point& startPoint, + const std::vector& bbox, + SpatialJoinAlgorithms* spatialJoinAlg) { + // check if the point is contained in any bounding box + bool within = spatialJoinAlg->OnlyForTestingWrapperContainedInBoundingBoxes( + bbox, point1); + if (!within) { + GeoPoint geo1{point1.get<1>(), point1.get<0>()}; + GeoPoint geo2{startPoint.get<1>(), startPoint.get<0>()}; + double dist = ad_utility::detail::wktDistImpl(geo1, geo2) * 1000; + ASSERT_GT(static_cast(dist), maxDistInMeters); + } + }; + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, 0, + 0, 1, maxDistInMeters, std::nullopt}; + + std::variant config{ + MaxDistanceConfig{maxDistInMeters}}; + + SpatialJoinAlgorithms spatialJoinAlgs{buildTestQEC(), params, true, config}; + + std::vector bbox = + spatialJoinAlgs.OnlyForTestingWrapperComputeBoundingBox(startPoint); + // broad grid test + for (int lon = -180; lon < 180; lon += 20) { + for (int lat = -90; lat < 90; lat += 20) { + checkOutside(Point(lon, lat), startPoint, bbox, &spatialJoinAlgs); + } + } + + // do tests at the border of the box. The exact usage of this is described in + // the function comment of the helper function 'testBounds' + for (size_t k = 0; k < bbox.size(); k++) { + // use a small delta for testing because of floating point inaccuracies + const double delta = 0.00000001; + const Point minPoint = bbox.at(k).min_corner(); + const Point maxPoint = bbox.at(k).max_corner(); + const double lowX = minPoint.get<0>(); + const double lowY = minPoint.get<1>(); + const double highX = maxPoint.get<0>(); + const double highY = maxPoint.get<1>(); + const double xRange = highX - lowX - 2 * delta; + const double yRange = highY - lowY - 2 * delta; + for (size_t i = 0; i <= 100; i++) { + // barely in or out at the left edge + testBounds(lowX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(lowX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(Point(lowX - delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the bottom edge + testBounds(lowX + delta + (xRange / 100) * i, lowY + delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, lowY - delta, bbox.at(k), + false); + checkOutside(Point(lowX + (xRange / 100) * i, lowY - delta), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the right edge + testBounds(highX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(highX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(Point(highX + delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the top edge + testBounds(lowX + delta + (xRange / 100) * i, highY - delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, highY + delta, bbox.at(k), + false); + checkOutside(Point(lowX + (xRange / 100) * i, highY + delta), startPoint, + bbox, &spatialJoinAlgs); + } + } +} + +TEST(SpatialJoin, computeBoundingBox) { + // ASSERT_EQ("", "uncomment the part below again"); + double circ = 40075 * 1000; // circumference of the earth (at the equator) + // 180.0001 in case 180 is represented internally as 180.0000000001 + for (double lon = -180; lon <= 180.0001; lon += 15) { + // 90.0001 in case 90 is represented internally as 90.000000001 + for (double lat = -90; lat <= 90.0001; lat += 15) { + // circ / 2 means, that all points on earth are within maxDist km of any + // starting point + for (int maxDist = 0; maxDist <= circ / 2.0; maxDist += circ / 36.0) { + testBoundingBox(maxDist, Point(lon, lat)); + } + } + } +} + +TEST(SpatialJoin, isContainedInBoundingBoxes) { + // build dummy join to access the containedInBoundingBox and + // computeBoundingBox functions + auto qec = buildTestQEC(); + auto spatialJoinTriple = SparqlTriple{TripleComponent{Variable{"?point1"}}, + "", + TripleComponent{Variable{"?point2"}}}; + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree(qec, spatialJoinTriple, + std::nullopt, std::nullopt); + + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, + 0, 0, 1, spatialJoin->getMaxDist(), + std::nullopt}; + + SpatialJoinAlgorithms spatialJoinAlgs{ + qec, params, true, spatialJoin->onlyForTestingGetActualConfig()}; + + // note that none of the boxes is overlapping, therefore we can check, that + // none of the points which should be contained in one box are contained in + // another box + std::vector boxes = { + Box(Point(20, 40), Point(40, 60)), + Box(Point(-180, -20), Point(-150, 30)), // touching left border + Box(Point(50, -30), Point(180, 10)), // touching right border + Box(Point(-30, 50), Point(10, 90)), // touching north pole + Box(Point(-45, -90), Point(0, -45)) // touching south pole + }; + + // the first entry in this vector is a vector of points, which is contained + // in the first box, the second entry contains points, which are contained in + // the second box and so on + std::vector> containedInBox = { + {Point(20, 40), Point(40, 40), Point(40, 60), Point(20, 60), + Point(30, 50)}, + {Point(-180, -20), Point(-150, -20), Point(-150, 30), Point(-180, 30), + Point(-150, 0)}, + {Point(50, -30), Point(180, -30), Point(180, 10), Point(50, 10), + Point(70, -10)}, + {Point(-30, 50), Point(10, 50), Point(10, 90), Point(-30, 90), + Point(-20, 60)}, + {Point(-45, -90), Point(0, -90), Point(0, -45), Point(-45, -45), + Point(-10, -60)}}; + + // all combinations of box is contained in bounding boxes and is not + // contained. a one encodes, the bounding box is contained in the set of + // bounding boxes, a zero encodes, it isn't. If a box is contained, it's + // checked, that the points which should be contained in the box are + // contained. If the box is not contained, it's checked, that the points which + // are contained in that box are not contained in the box set (because the + // boxes don't overlap) + for (size_t a = 0; a <= 1; a++) { + for (size_t b = 0; a <= 1; a++) { + for (size_t c = 0; a <= 1; a++) { + for (size_t d = 0; a <= 1; a++) { + for (size_t e = 0; a <= 1; a++) { + std::vector toTest; // the set of bounding boxes + std::vector> shouldBeContained; + std::vector> shouldNotBeContained; + if (a == 1) { // box nr. 0 is contained in the set of boxes + toTest.push_back(boxes.at(0)); + shouldBeContained.push_back(containedInBox.at(0)); + } else { // box nr. 0 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(0)); + } + if (b == 1) { // box nr. 1 is contained in the set of boxes + toTest.push_back(boxes.at(1)); + shouldBeContained.push_back(containedInBox.at(1)); + } else { // box nr. 1 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(1)); + } + if (c == 1) { + toTest.push_back(boxes.at(2)); + shouldBeContained.push_back(containedInBox.at(2)); + } else { + shouldNotBeContained.push_back(containedInBox.at(2)); + } + if (d == 1) { + toTest.push_back(boxes.at(3)); + shouldBeContained.push_back(containedInBox.at(3)); + } else { + shouldNotBeContained.push_back(containedInBox.at(3)); + } + if (e == 1) { + toTest.push_back(boxes.at(4)); + shouldBeContained.push_back(containedInBox.at(4)); + } else { + shouldNotBeContained.push_back(containedInBox.at(4)); + } + if (toTest.size() > 0) { + // test all points, which should be contained in the bounding + // boxes + for (size_t i = 0; i < shouldBeContained.size(); i++) { + for (size_t k = 0; k < shouldBeContained.at(i).size(); k++) { + ASSERT_TRUE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldBeContained.at(i).at(k))); + } + } + // test all points, which shouldn't be contained in the bounding + // boxes + for (size_t i = 0; i < shouldNotBeContained.size(); i++) { + for (size_t k = 0; k < shouldNotBeContained.at(i).size(); k++) { + ASSERT_FALSE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldNotBeContained.at(i).at(k))); + } + } + } + } + } + } + } + } +} + +} // namespace boundingBox + } // namespace