From e9ed3e4c886a3a77c5ba03c7a356d9cf456b57cc Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Fri, 18 Oct 2024 23:23:54 +0200 Subject: [PATCH 01/18] add BoundingBox Algorithm --- src/engine/SpatialJoin.cpp | 17 +- src/engine/SpatialJoin.h | 14 +- src/engine/SpatialJoinAlgorithms.cpp | 264 +++++++++++++++++++++ src/engine/SpatialJoinAlgorithms.h | 47 ++++ test/engine/SpatialJoinAlgorithmsTest.cpp | 274 +++++++++++++++++++++- 5 files changed, 607 insertions(+), 9 deletions(-) diff --git a/src/engine/SpatialJoin.cpp b/src/engine/SpatialJoin.cpp index beb09ffe68..4bf1eb372d 100644 --- a/src/engine/SpatialJoin.cpp +++ b/src/engine/SpatialJoin.cpp @@ -209,7 +209,7 @@ size_t SpatialJoin::getCostEstimate() { if (childLeft_ && childRight_) { size_t inputEstimate = childLeft_->getSizeEstimate() * childRight_->getSizeEstimate(); - if (useBaselineAlgorithm_) { + if (algorithm_ == Algorithm::Baseline) { return inputEstimate * inputEstimate; } else { // Let n be the size of the left table and m the size of the right table. @@ -337,10 +337,21 @@ PreparedSpatialJoinParams SpatialJoin::prepareJoin() const { Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) { SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(), addDistToResult_, config_}; - if (useBaselineAlgorithm_) { + if (algorithm_ == Algorithm::Baseline) { return algorithms.BaselineAlgorithm(); - } else { + } else if (algorithm_ == Algorithm::S2Geometry) { return algorithms.S2geometryAlgorithm(); + } else if (algorithm_ == Algorithm::BoundingBox) { + // as the BoundingBoxAlgorithms only works for max distance and not for + // nearest neighbors, S2geometry gets called as a backup, if the query is + // asking for the nearest neighbors + if (std::get_if(&config_)) { + return algorithms.BoundingBoxAlgorithm(); + } else { + return algorithms.S2geometryAlgorithm(); + } + } else { + AD_THROW("choose a valid Algorithm"); } } diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index b62978cf90..8c67682238 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -96,14 +96,20 @@ class SpatialJoin : public Operation { // purposes std::optional getMaxResults() const; - void selectAlgorithm(bool useBaselineAlgorithm) { - useBaselineAlgorithm_ = useBaselineAlgorithm; - } + // options which can be used for the algorithm, which calculates the result + enum Algorithm { Baseline, S2Geometry, BoundingBox }; + + void selectAlgorithm(Algorithm algorithm) { algorithm_ = algorithm; } std::pair onlyForTestingGetConfig() const { return std::pair{getMaxDist().value_or(-1), getMaxResults().value_or(-1)}; } + std::variant + onlyForTestingGetRealConfig() const { + return config_; + } + std::shared_ptr onlyForTestingGetLeftChild() const { return childLeft_; } @@ -135,5 +141,5 @@ class SpatialJoin : public Operation { // between the two objects bool addDistToResult_ = true; const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally"; - bool useBaselineAlgorithm_ = false; + Algorithm algorithm_ = BoundingBox; }; diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index dd7a9b1c7c..2935d90003 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -223,3 +223,267 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { return Result(std::move(result), std::vector{}, Result::getMergedLocalVocab(*resultLeft, *resultRight)); } + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeBoundingBox( + const point& startPoint) { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + if (!maxDist.has_value()) { + AD_THROW("Max distance must have a value for this operation"); + } + // haversine function + auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; }; + + // inverse haversine function + auto archaversine = [](double theta) { return std::acos(1 - 2 * theta); }; + + // safety buffer for numerical inaccuracies + double maxDistInMetersBuffer = static_cast(maxDist.value()); + if (maxDist.value() < 10) { + maxDistInMetersBuffer = 10; + } else if (maxDist.value() < std::numeric_limits::max() / 1.02) { + maxDistInMetersBuffer = 1.01 * maxDist.value(); + } else { + maxDistInMetersBuffer = std::numeric_limits::max(); + } + + // for large distances, where the lower calculation would just result in + // a single bounding box for the whole planet, do an optimized version + if (maxDist.value() > circumferenceMax_ / 4.0 && + maxDist.value() < circumferenceMax_ / 2.01) { + return computeAntiBoundingBox(startPoint); + } + + // compute latitude bound + double upperLatBound = + startPoint.get<1>() + maxDistInMetersBuffer * (360 / circumferenceMax_); + double lowerLatBound = + startPoint.get<1>() - maxDistInMetersBuffer * (360 / circumferenceMax_); + bool poleReached = false; + // test for "overflows" + if (lowerLatBound <= -90) { + lowerLatBound = -90; + poleReached = true; // south pole reached + } + if (upperLatBound >= 90) { + upperLatBound = 90; + poleReached = true; // north pole reached + } + if (poleReached) { + return {box(point(-180.0f, lowerLatBound), point(180.0f, upperLatBound))}; + } + + // compute longitude bound. For an explanation of the calculation and the + // naming convention see my master thesis + double alpha = maxDistInMetersBuffer / radius_; + double gamma = + (90 - std::abs(startPoint.get<1>())) * (2 * std::numbers::pi / 360); + double beta = std::acos((std::cos(gamma) / std::cos(alpha))); + double delta = 0; + if (maxDistInMetersBuffer > circumferenceMax_ / 20) { + // use law of cosines + delta = std::acos((std::cos(alpha) - std::cos(gamma) * std::cos(beta)) / + (std::sin(gamma) * std::sin(beta))); + } else { + // use law of haversines for numerical stability + delta = archaversine((haversine(alpha - haversine(gamma - beta))) / + (std::sin(gamma) * std::sin(beta))); + } + double lonRange = delta * 360 / (2 * std::numbers::pi); + double leftLonBound = startPoint.get<0>() - lonRange; + double rightLonBound = startPoint.get<0>() + lonRange; + // test for "overflows" and create two bounding boxes if necessary + if (leftLonBound < -180) { + box box1 = + box(point(-180, lowerLatBound), point(rightLonBound, upperLatBound)); + box box2 = box(point(leftLonBound + 360, lowerLatBound), + point(180, upperLatBound)); + return {box1, box2}; + } else if (rightLonBound > 180) { + box box1 = + box(point(leftLonBound, lowerLatBound), point(180, upperLatBound)); + box box2 = box(point(-180, lowerLatBound), + point(rightLonBound - 360, upperLatBound)); + return {box1, box2}; + } + // default case, when no bound has an "overflow" + return {box(point(leftLonBound, lowerLatBound), + point(rightLonBound, upperLatBound))}; +} + +// ____________________________________________________________________________ +std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( + const point& startPoint) { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + if (!maxDist.has_value()) { + AD_THROW("Max distance must have a value for this operation"); + } + // point on the opposite side of the globe + point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); + if (antiPoint.get<0>() > 180) { + antiPoint.set<0>(antiPoint.get<0>() - 360); + } + // for an explanation of the formula see the master thesis. Divide by two two + // only consider the distance from the point to the antiPoint, subtract + // maxDist and a safety margine from that + double antiDist = + (circumferenceMin_ / 2.0) - maxDist.value() * 1.01; // safety margin + // use the bigger circumference as an additional safety margin, use 2.01 + // instead of 2.0 because of rounding inaccuracies in floating point + // operations + double distToAntiPoint = (360 / circumferenceMax_) * (antiDist / 2.01); + double upperBound = antiPoint.get<1>() + distToAntiPoint; + double lowerBound = antiPoint.get<1>() - distToAntiPoint; + double leftBound = antiPoint.get<0>() - distToAntiPoint; + double rightBound = antiPoint.get<0>() + distToAntiPoint; + bool northPoleTouched = false; + bool southPoleTouched = false; + bool boxCrosses180Longitude = false; // if the 180 to -180 line is touched + // if a pole is crossed, ignore the part after the crossing + if (upperBound > 90) { + upperBound = 90; + northPoleTouched = true; + } + if (lowerBound < -90) { + lowerBound = -90; + southPoleTouched = true; + } + if (leftBound < -180) { + leftBound += 360; + } + if (rightBound > 180) { + rightBound -= 360; + } + if (rightBound < leftBound) { + boxCrosses180Longitude = true; + } + // compute bounding boxes using the anti bounding box from above + std::vector boxes; + if (!northPoleTouched) { + // add upper bounding box(es) + if (boxCrosses180Longitude) { + boxes.push_back(box(point(leftBound, upperBound), point(180, 90))); + boxes.push_back(box(point(-180, upperBound), point(rightBound, 90))); + } else { + boxes.push_back(box(point(leftBound, upperBound), point(rightBound, 90))); + } + } + if (!southPoleTouched) { + // add lower bounding box(es) + if (boxCrosses180Longitude) { + boxes.push_back(box(point(leftBound, -90), point(180, lowerBound))); + boxes.push_back(box(point(-180, -90), point(rightBound, lowerBound))); + } else { + boxes.push_back( + box(point(leftBound, -90), point(rightBound, lowerBound))); + } + } + // add the box(es) inbetween the longitude lines + if (boxCrosses180Longitude) { + // only one box needed to cover the longitudes + boxes.push_back(box(point(rightBound, -90), point(leftBound, 90))); + } else { + // two boxes needed, one left and one right of the anti bounding box + boxes.push_back(box(point(-180, -90), point(leftBound, 90))); + boxes.push_back(box(point(rightBound, -90), point(180, 90))); + } + return boxes; +} + +// ____________________________________________________________________________ +bool SpatialJoinAlgorithms::containedInBoundingBoxes( + const std::vector& bbox, point point1) { + // correct lon bounds if necessary + while (point1.get<0>() < -180) { + point1.set<0>(point1.get<0>() + 360); + } + while (point1.get<0>() > 180) { + point1.set<0>(point1.get<0>() - 360); + } + if (point1.get<1>() < -90) { + point1.set<1>(-90); + } else if (point1.get<1>() > 90) { + point1.set<1>(90); + } + + for (size_t i = 0; i < bbox.size(); i++) { + if (boost::geometry::covered_by(point1, bbox.at(i))) { + return true; + } + } + return false; +} + +// ____________________________________________________________________________ +Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, + rightJoinCol, numColumns, maxDist, maxResults] = params_; + IdTable result{numColumns, qec_->getAllocator()}; + + // create r-tree for smaller result table + auto smallerResult = idTableLeft; + auto otherResult = idTableRight; + bool leftResSmaller = true; + /*auto smallerChild = childLeft_; + auto otherChild = childRight_; + auto smallerVariable = leftChildVariable_; + auto otherVariable = rightChildVariable_;*/ + auto smallerResJoinCol = leftJoinCol; + auto otherResJoinCol = rightJoinCol; + if (idTableLeft->numRows() > idTableRight->numRows()) { + smallerResult = idTableRight; + otherResult = idTableLeft; + leftResSmaller = false; + /*smallerChild = childRight_; + otherChild = childLeft_; + smallerVariable = rightChildVariable_; + otherVariable = leftChildVariable_; */ + smallerResJoinCol = rightJoinCol; + otherResJoinCol = leftJoinCol; + } + + // Todo in the benchmark: use different algorithms and compare their + // performance + bgi::rtree> rtree; + for (size_t i = 0; i < smallerResult->numRows(); i++) { + // get point of row i + // ColumnIndex smallerJoinCol = getJoinCol(smallerChild, smallerVariable); + auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); + point p(geopoint.value().getLng(), geopoint.value().getLat()); + + // add every point together with the row number into the rtree + rtree.insert(std::make_pair(p, i)); + } + for (size_t i = 0; i < otherResult->numRows(); i++) { + // ColumnIndex otherJoinCol = getJoinCol(otherChild, otherVariable); + auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); + point p(geopoint1.value().getLng(), geopoint1.value().getLat()); + + // query the other rtree for every point using the following bounding box + std::vector bbox = computeBoundingBox(p); + std::vector results; + for (size_t k = 0; k < bbox.size(); k++) { + rtree.query(bgi::intersects(bbox.at(k)), std::back_inserter(results)); + } + for (size_t k = 0; k < results.size(); k++) { + size_t rowLeft = results.at(k).second; + size_t rowRight = i; + if (!leftResSmaller) { + rowLeft = i; + rowRight = results.at(k).second; + } + auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, + leftJoinCol, rightJoinCol); + if (distance.getDatatype() == Datatype::Int && + distance.getInt() <= maxDist) { + addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, + rowRight, distance); + } + } + } + Result resTable = + Result(std::move(result), std::vector{}, LocalVocab{}); + return resTable; +} diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 00556f1b3d..316be773f6 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -1,8 +1,20 @@ #pragma once +#include +#include +#include +#include + #include "engine/Result.h" #include "engine/SpatialJoin.h" +namespace bg = boost::geometry; +namespace bgi = boost::geometry::index; + +typedef bg::model::point point; +typedef bg::model::box box; +typedef std::pair value; + class SpatialJoinAlgorithms { public: // initialize the Algorithm with the needed parameters @@ -14,6 +26,16 @@ class SpatialJoinAlgorithms { Result S2geometryAlgorithm(); Result BoundingBoxAlgorithm(); + std::vector OnlyForTestingWrapperComputeBoundingBox( + const point& startPoint) { + return computeBoundingBox(startPoint); + } + + bool OnlyForTestingWrapperContainedInBoundingBoxes( + const std::vector& bbox, point point1) { + return containedInBoundingBoxes(bbox, point1); + } + private: // helper function which returns a GeoPoint if the element of the given table // represents a GeoPoint @@ -33,8 +55,33 @@ class SpatialJoinAlgorithms { const IdTable* resultRight, size_t rowLeft, size_t rowRight, Id distance) const; + // this function computes the bounding box(es), which represent all points, + // which are in reach of the starting point with a distance of at most + // maxDistanceInMeters + std::vector computeBoundingBox(const point& startPoint); + + // this helper function calculates the bounding boxes based on a box, where + // definetly no match can occur. This function gets used, when the usual + // procedure, would just result in taking a big bounding box, which covers + // the whole planet (so for large max distances) + std::vector computeAntiBoundingBox(const point& startPoint); + + // this function returns true, when the given point is contained in any of the + // bounding boxes + bool containedInBoundingBoxes(const std::vector& bbox, point point1); + QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; bool addDistToResult_; std::variant config_; + + // circumference in meters at the equator (max) and the pole (min) (as the + // earth is not exactly a sphere the circumference is different. Note the + // factor of 1000 to convert to meters) + static constexpr double circumferenceMax_ = 40075 * 1000; + static constexpr double circumferenceMin_ = 40007 * 1000; + + // radius of the earth in meters (as the earth is not exactly a sphere the + // radius at the equator has been taken) + static constexpr double radius_ = 6378 * 1000; // * 1000 to convert to meters }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index f732c0d8e4..19382430e4 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -25,7 +25,8 @@ using namespace SpatialJoinTestHelpers; namespace computeResultTest { -class SpatialJoinParamTest : public ::testing::TestWithParam { +class SpatialJoinParamTest + : public ::testing::TestWithParam { public: void createAndTestSpatialJoin( QueryExecutionContext* qec, SparqlTriple spatialJoinTriple, @@ -780,8 +781,277 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { expectedMaxDist10000000_rows_diff, columnNames, false); } -INSTANTIATE_TEST_SUITE_P(SpatialJoin, SpatialJoinParamTest, ::testing::Bool()); +INSTANTIATE_TEST_SUITE_P( + SpatialJoin, SpatialJoinParamTest, + ::testing::Values(SpatialJoin::Algorithm::Baseline, + SpatialJoin::Algorithm::S2Geometry, + SpatialJoin::Algorithm::BoundingBox)); } // end of Namespace computeResultTest +namespace boundingBox { + +namespace bg = boost::geometry; +namespace bgi = boost::geometry::index; + +typedef bg::model::point point; +typedef bg::model::box box; +typedef std::pair value; + +inline void testBoundingBox(const long long& maxDistInMeters, + const point& startPoint) { + auto checkOutside = [&](const point& point1, const point& startPoint, + const std::vector& bbox, + SpatialJoinAlgorithms* spatialJoinAlg) { + // check if the point is contained in any bounding box + bool within = spatialJoinAlg->OnlyForTestingWrapperContainedInBoundingBoxes( + bbox, point1); + if (!within) { + GeoPoint geo1{point1.get<1>(), point1.get<0>()}; + GeoPoint geo2{startPoint.get<1>(), startPoint.get<0>()}; + double dist = ad_utility::detail::wktDistImpl(geo1, geo2) * 1000; + ASSERT_GT(static_cast(dist), maxDistInMeters); + } + }; + + auto testBounds = [](double x, double y, const box& bbox, + bool shouldBeWithin) { + // correct lon bounds if necessary + if (x < -180) { + x += 360; + } else if (x > 180) { + x -= 360; + } + + // testing only possible, if lat bounds are correct and the lon bounds + // don't cover everything (as then left or right of the box is again + // inside the box because of the spherical geometry) + double minLonBox = bbox.min_corner().get<0>(); + double maxLonBox = bbox.max_corner().get<0>(); + if (y < 90 && y > -90 && !(minLonBox < 179.9999 && maxLonBox > 179.9999)) { + bool within = boost::geometry::covered_by(point(x, y), bbox); + ASSERT_TRUE(within == shouldBeWithin); + } + }; + + // build dummy join to access the containedInBoundingBox and + // computeBoundingBox functions. Note that maxDistInMeters has to be accurate, + // otherwise the functions of spatialJoin don't work correctly + auto maxDistInMetersStr = + ""; + auto qec = buildTestQEC(); + auto spatialJoinTriple = + SparqlTriple{TripleComponent{Variable{"?point1"}}, maxDistInMetersStr, + TripleComponent{Variable{"?point2"}}}; + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree(qec, spatialJoinTriple, + std::nullopt, std::nullopt); + + // add children and test, that multiplicity is a dummy return before all + // children are added + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, + 0, 0, 1, spatialJoin->getMaxDist(), + std::nullopt}; + + SpatialJoinAlgorithms spatialJoinAlgs{ + qec, params, true, spatialJoin->onlyForTestingGetRealConfig()}; + + std::vector bbox = + spatialJoinAlgs.OnlyForTestingWrapperComputeBoundingBox(startPoint); + // broad grid test + for (int lon = -180; lon < 180; lon += 20) { + for (int lat = -90; lat < 90; lat += 20) { + checkOutside(point(lon, lat), startPoint, bbox, &spatialJoinAlgs); + } + } + + // do tests at the border of the box + for (size_t k = 0; k < bbox.size(); k++) { + // use a small delta for testing because of floating point inaccuracies + const double delta = 0.00000001; + const point minPoint = bbox.at(k).min_corner(); + const point maxPoint = bbox.at(k).max_corner(); + const double lowX = minPoint.get<0>(); + const double lowY = minPoint.get<1>(); + const double highX = maxPoint.get<0>(); + const double highY = maxPoint.get<1>(); + const double xRange = highX - lowX - 2 * delta; + const double yRange = highY - lowY - 2 * delta; + for (size_t i = 0; i <= 100; i++) { + // barely in or out at the left edge + testBounds(lowX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(lowX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(point(lowX - delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the bottom edge + testBounds(lowX + delta + (xRange / 100) * i, lowY + delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, lowY - delta, bbox.at(k), + false); + checkOutside(point(lowX + (xRange / 100) * i, lowY - delta), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the right edge + testBounds(highX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + true); + testBounds(highX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), + false); + checkOutside(point(highX + delta, lowY + (yRange / 100) * i), startPoint, + bbox, &spatialJoinAlgs); + // barely in or out at the top edge + testBounds(lowX + delta + (xRange / 100) * i, highY - delta, bbox.at(k), + true); + testBounds(lowX + delta + (xRange / 100) * i, highY + delta, bbox.at(k), + false); + checkOutside(point(lowX + (xRange / 100) * i, highY + delta), startPoint, + bbox, &spatialJoinAlgs); + } + } +} + +TEST(SpatialJoin, computeBoundingBox) { + // ASSERT_EQ("", "uncomment the part below again"); + double circ = 40075 * 1000; // circumference of the earth (at the equator) + // 180.0001 in case 180 is represented internally as 180.0000000001 + for (double lon = -180; lon <= 180.0001; lon += 15) { + // 90.0001 in case 90 is represented internally as 90.000000001 + for (double lat = -90; lat <= 90.0001; lat += 15) { + // circ / 2 means, that all points on earth are within maxDist km of any + // starting point + for (int maxDist = 0; maxDist <= circ / 2.0; maxDist += circ / 36.0) { + testBoundingBox(maxDist, point(lon, lat)); + } + } + } +} + +TEST(SpatialJoin, containedInBoundingBoxes) { + // build dummy join to access the containedInBoundingBox and + // computeBoundingBox functions + auto qec = buildTestQEC(); + auto spatialJoinTriple = SparqlTriple{TripleComponent{Variable{"?point1"}}, + "", + TripleComponent{Variable{"?point2"}}}; + std::shared_ptr spatialJoinOperation = + ad_utility::makeExecutionTree(qec, spatialJoinTriple, + std::nullopt, std::nullopt); + + std::shared_ptr op = spatialJoinOperation->getRootOperation(); + SpatialJoin* spatialJoin = static_cast(op.get()); + + PreparedSpatialJoinParams params{ + nullptr, nullptr, nullptr, nullptr, + 0, 0, 1, spatialJoin->getMaxDist(), + std::nullopt}; + + SpatialJoinAlgorithms spatialJoinAlgs{ + qec, params, true, spatialJoin->onlyForTestingGetRealConfig()}; + + // note that none of the boxes is overlapping, therefore we can check, that + // none of the points which should be contained in one box are contained in + // another box + std::vector boxes = { + box(point(20, 40), point(40, 60)), + box(point(-180, -20), point(-150, 30)), // touching left border + box(point(50, -30), point(180, 10)), // touching right border + box(point(-30, 50), point(10, 90)), // touching north pole + box(point(-45, -90), point(0, -45)) // touching south pole + }; + + // the first entry in this vector is a vector of points, which is contained + // in the first box, the second entry contains points, which are contained in + // the second box and so on + std::vector> containedInBox = { + {point(20, 40), point(40, 40), point(40, 60), point(20, 60), + point(30, 50)}, + {point(-180, -20), point(-150, -20), point(-150, 30), point(-180, 30), + point(-150, 0)}, + {point(50, -30), point(180, -30), point(180, 10), point(50, 10), + point(70, -10)}, + {point(-30, 50), point(10, 50), point(10, 90), point(-30, 90), + point(-20, 60)}, + {point(-45, -90), point(0, -90), point(0, -45), point(-45, -45), + point(-10, -60)}}; + + // all combinations of box is contained in bounding boxes and is not + // contained. a one encodes, the bounding box is contained in the set of + // bounding boxes, a zero encodes, it isn't. If a box is contained, it's + // checked, that the points which should be contained in the box are + // contained. If the box is not contained, it's checked, that the points which + // are contained in that box are not contained in the box set (because the + // boxes don't overlap) + for (size_t a = 0; a <= 1; a++) { + for (size_t b = 0; a <= 1; a++) { + for (size_t c = 0; a <= 1; a++) { + for (size_t d = 0; a <= 1; a++) { + for (size_t e = 0; a <= 1; a++) { + std::vector toTest; // the set of bounding boxes + std::vector> shouldBeContained; + std::vector> shouldNotBeContained; + if (a == 1) { // box nr. 0 is contained in the set of boxes + toTest.push_back(boxes.at(0)); + shouldBeContained.push_back(containedInBox.at(0)); + } else { // box nr. 0 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(0)); + } + if (b == 1) { // box nr. 1 is contained in the set of boxes + toTest.push_back(boxes.at(1)); + shouldBeContained.push_back(containedInBox.at(1)); + } else { // box nr. 1 is not contained in the set of boxes + shouldNotBeContained.push_back(containedInBox.at(1)); + } + if (c == 1) { + toTest.push_back(boxes.at(2)); + shouldBeContained.push_back(containedInBox.at(2)); + } else { + shouldNotBeContained.push_back(containedInBox.at(2)); + } + if (d == 1) { + toTest.push_back(boxes.at(3)); + shouldBeContained.push_back(containedInBox.at(3)); + } else { + shouldNotBeContained.push_back(containedInBox.at(3)); + } + if (e == 1) { + toTest.push_back(boxes.at(4)); + shouldBeContained.push_back(containedInBox.at(4)); + } else { + shouldNotBeContained.push_back(containedInBox.at(4)); + } + if (toTest.size() > 0) { + // test all points, which should be contained in the bounding + // boxes + for (size_t i = 0; i < shouldBeContained.size(); i++) { + for (size_t k = 0; k < shouldBeContained.at(i).size(); k++) { + ASSERT_TRUE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldBeContained.at(i).at(k))); + } + } + // test all points, which shouldn't be contained in the bounding + // boxes + for (size_t i = 0; i < shouldNotBeContained.size(); i++) { + for (size_t k = 0; k < shouldNotBeContained.at(i).size(); k++) { + ASSERT_FALSE( + spatialJoinAlgs + .OnlyForTestingWrapperContainedInBoundingBoxes( + toTest, shouldNotBeContained.at(i).at(k))); + } + } + } + } + } + } + } + } +} + +} // namespace boundingBox + } // namespace From ada29787e076769eb7098780263badd6b0b18982 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Sun, 3 Nov 2024 15:00:07 +0100 Subject: [PATCH 02/18] Sonarqube issues --- src/engine/SpatialJoin.h | 4 +- src/engine/SpatialJoinAlgorithms.cpp | 91 ++++++++++++++-------------- src/engine/SpatialJoinAlgorithms.h | 15 ++--- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index 8c67682238..61fa5247aa 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -97,7 +97,7 @@ class SpatialJoin : public Operation { std::optional getMaxResults() const; // options which can be used for the algorithm, which calculates the result - enum Algorithm { Baseline, S2Geometry, BoundingBox }; + enum class Algorithm { Baseline, S2Geometry, BoundingBox }; void selectAlgorithm(Algorithm algorithm) { algorithm_ = algorithm; } @@ -141,5 +141,5 @@ class SpatialJoin : public Operation { // between the two objects bool addDistToResult_ = true; const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally"; - Algorithm algorithm_ = BoundingBox; + Algorithm algorithm_ = Algorithm::BoundingBox; }; diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 2935d90003..aa71feaf29 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -226,7 +226,7 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { // ____________________________________________________________________________ std::vector SpatialJoinAlgorithms::computeBoundingBox( - const point& startPoint) { + const point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; if (!maxDist.has_value()) { @@ -239,19 +239,22 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( auto archaversine = [](double theta) { return std::acos(1 - 2 * theta); }; // safety buffer for numerical inaccuracies - double maxDistInMetersBuffer = static_cast(maxDist.value()); + double maxDistInMetersBuffer; if (maxDist.value() < 10) { maxDistInMetersBuffer = 10; - } else if (maxDist.value() < std::numeric_limits::max() / 1.02) { - maxDistInMetersBuffer = 1.01 * maxDist.value(); + } else if (static_cast(maxDist.value()) < + static_cast(std::numeric_limits::max()) / + 1.02) { + maxDistInMetersBuffer = 1.01 * static_cast(maxDist.value()); } else { - maxDistInMetersBuffer = std::numeric_limits::max(); + maxDistInMetersBuffer = + static_cast(std::numeric_limits::max()); } // for large distances, where the lower calculation would just result in // a single bounding box for the whole planet, do an optimized version - if (maxDist.value() > circumferenceMax_ / 4.0 && - maxDist.value() < circumferenceMax_ / 2.01) { + if (static_cast(maxDist.value()) > circumferenceMax_ / 4.0 && + static_cast(maxDist.value()) < circumferenceMax_ / 2.01) { return computeAntiBoundingBox(startPoint); } @@ -279,7 +282,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( double alpha = maxDistInMetersBuffer / radius_; double gamma = (90 - std::abs(startPoint.get<1>())) * (2 * std::numbers::pi / 360); - double beta = std::acos((std::cos(gamma) / std::cos(alpha))); + double beta = std::acos(std::cos(gamma) / std::cos(alpha)); double delta = 0; if (maxDistInMetersBuffer > circumferenceMax_ / 20) { // use law of cosines @@ -295,16 +298,16 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( double rightLonBound = startPoint.get<0>() + lonRange; // test for "overflows" and create two bounding boxes if necessary if (leftLonBound < -180) { - box box1 = + auto box1 = box(point(-180, lowerLatBound), point(rightLonBound, upperLatBound)); - box box2 = box(point(leftLonBound + 360, lowerLatBound), - point(180, upperLatBound)); + auto box2 = box(point(leftLonBound + 360, lowerLatBound), + point(180, upperLatBound)); return {box1, box2}; } else if (rightLonBound > 180) { - box box1 = + auto box1 = box(point(leftLonBound, lowerLatBound), point(180, upperLatBound)); - box box2 = box(point(-180, lowerLatBound), - point(rightLonBound - 360, upperLatBound)); + auto box2 = box(point(-180, lowerLatBound), + point(rightLonBound - 360, upperLatBound)); return {box1, box2}; } // default case, when no bound has an "overflow" @@ -314,7 +317,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( // ____________________________________________________________________________ std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( - const point& startPoint) { + const point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; if (!maxDist.has_value()) { @@ -329,7 +332,7 @@ std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( // only consider the distance from the point to the antiPoint, subtract // maxDist and a safety margine from that double antiDist = - (circumferenceMin_ / 2.0) - maxDist.value() * 1.01; // safety margin + (circumferenceMin_ / 2.0) - static_cast(maxDist.value()) * 1.01; // use the bigger circumference as an additional safety margin, use 2.01 // instead of 2.0 because of rounding inaccuracies in floating point // operations @@ -364,37 +367,38 @@ std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( if (!northPoleTouched) { // add upper bounding box(es) if (boxCrosses180Longitude) { - boxes.push_back(box(point(leftBound, upperBound), point(180, 90))); - boxes.push_back(box(point(-180, upperBound), point(rightBound, 90))); + boxes.emplace_back(box(point(leftBound, upperBound), point(180, 90))); + boxes.emplace_back(box(point(-180, upperBound), point(rightBound, 90))); } else { - boxes.push_back(box(point(leftBound, upperBound), point(rightBound, 90))); + boxes.emplace_back( + box(point(leftBound, upperBound), point(rightBound, 90))); } } if (!southPoleTouched) { // add lower bounding box(es) if (boxCrosses180Longitude) { - boxes.push_back(box(point(leftBound, -90), point(180, lowerBound))); - boxes.push_back(box(point(-180, -90), point(rightBound, lowerBound))); + boxes.emplace_back(box(point(leftBound, -90), point(180, lowerBound))); + boxes.emplace_back(box(point(-180, -90), point(rightBound, lowerBound))); } else { - boxes.push_back( + boxes.emplace_back( box(point(leftBound, -90), point(rightBound, lowerBound))); } } // add the box(es) inbetween the longitude lines if (boxCrosses180Longitude) { // only one box needed to cover the longitudes - boxes.push_back(box(point(rightBound, -90), point(leftBound, 90))); + boxes.emplace_back(box(point(rightBound, -90), point(leftBound, 90))); } else { // two boxes needed, one left and one right of the anti bounding box - boxes.push_back(box(point(-180, -90), point(leftBound, 90))); - boxes.push_back(box(point(rightBound, -90), point(180, 90))); + boxes.emplace_back(box(point(-180, -90), point(leftBound, 90))); + boxes.emplace_back(box(point(rightBound, -90), point(180, 90))); } return boxes; } // ____________________________________________________________________________ bool SpatialJoinAlgorithms::containedInBoundingBoxes( - const std::vector& bbox, point point1) { + const std::vector& bbox, point point1) const { // correct lon bounds if necessary while (point1.get<0>() < -180) { point1.set<0>(point1.get<0>() + 360); @@ -408,10 +412,10 @@ bool SpatialJoinAlgorithms::containedInBoundingBoxes( point1.set<1>(90); } - for (size_t i = 0; i < bbox.size(); i++) { - if (boost::geometry::covered_by(point1, bbox.at(i))) { - return true; - } + if (std::ranges::any_of(bbox.cbegin(), bbox.cend(), [point1](box box_) { + return boost::geometry::covered_by(point1, box_); + })) { + return true; } return false; } @@ -426,20 +430,12 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto smallerResult = idTableLeft; auto otherResult = idTableRight; bool leftResSmaller = true; - /*auto smallerChild = childLeft_; - auto otherChild = childRight_; - auto smallerVariable = leftChildVariable_; - auto otherVariable = rightChildVariable_;*/ auto smallerResJoinCol = leftJoinCol; auto otherResJoinCol = rightJoinCol; if (idTableLeft->numRows() > idTableRight->numRows()) { smallerResult = idTableRight; otherResult = idTableLeft; leftResSmaller = false; - /*smallerChild = childRight_; - otherChild = childLeft_; - smallerVariable = rightChildVariable_; - otherVariable = leftChildVariable_; */ smallerResJoinCol = rightJoinCol; otherResJoinCol = leftJoinCol; } @@ -457,22 +453,23 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { rtree.insert(std::make_pair(p, i)); } for (size_t i = 0; i < otherResult->numRows(); i++) { - // ColumnIndex otherJoinCol = getJoinCol(otherChild, otherVariable); auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); point p(geopoint1.value().getLng(), geopoint1.value().getLat()); // query the other rtree for every point using the following bounding box std::vector bbox = computeBoundingBox(p); std::vector results; - for (size_t k = 0; k < bbox.size(); k++) { - rtree.query(bgi::intersects(bbox.at(k)), std::back_inserter(results)); - } - for (size_t k = 0; k < results.size(); k++) { - size_t rowLeft = results.at(k).second; + + std::ranges::for_each(bbox, [&](box bbox) { + rtree.query(bgi::intersects(bbox), std::back_inserter(results)); + }); + + std::ranges::for_each(results, [&](value res) { + size_t rowLeft = res.second; size_t rowRight = i; if (!leftResSmaller) { rowLeft = i; - rowRight = results.at(k).second; + rowRight = res.second; } auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, leftJoinCol, rightJoinCol); @@ -481,9 +478,9 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, rowRight, distance); } - } + }); } - Result resTable = + auto resTable = Result(std::move(result), std::vector{}, LocalVocab{}); return resTable; } diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 316be773f6..3565950cac 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -11,9 +11,9 @@ namespace bg = boost::geometry; namespace bgi = boost::geometry::index; -typedef bg::model::point point; -typedef bg::model::box box; -typedef std::pair value; +using point = bg::model::point; +using box = bg::model::box; +using value = std::pair; class SpatialJoinAlgorithms { public: @@ -58,17 +58,18 @@ class SpatialJoinAlgorithms { // this function computes the bounding box(es), which represent all points, // which are in reach of the starting point with a distance of at most // maxDistanceInMeters - std::vector computeBoundingBox(const point& startPoint); + std::vector computeBoundingBox(const point& startPoint) const; // this helper function calculates the bounding boxes based on a box, where - // definetly no match can occur. This function gets used, when the usual + // definitely no match can occur. This function gets used, when the usual // procedure, would just result in taking a big bounding box, which covers // the whole planet (so for large max distances) - std::vector computeAntiBoundingBox(const point& startPoint); + std::vector computeAntiBoundingBox(const point& startPoint) const; // this function returns true, when the given point is contained in any of the // bounding boxes - bool containedInBoundingBoxes(const std::vector& bbox, point point1); + bool containedInBoundingBoxes(const std::vector& bbox, + point point1) const; QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; From eeadcd7d5b42900dac8dbe283568e7555c019c53 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Mon, 4 Nov 2024 12:44:23 +0100 Subject: [PATCH 03/18] Sonarqube issues --- src/engine/SpatialJoinAlgorithms.cpp | 26 ++++++++++++-------------- src/engine/SpatialJoinAlgorithms.h | 4 ++-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index aa71feaf29..a86bf94a0c 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -367,31 +367,29 @@ std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( if (!northPoleTouched) { // add upper bounding box(es) if (boxCrosses180Longitude) { - boxes.emplace_back(box(point(leftBound, upperBound), point(180, 90))); - boxes.emplace_back(box(point(-180, upperBound), point(rightBound, 90))); + boxes.emplace_back(point(leftBound, upperBound), point(180, 90)); + boxes.emplace_back(point(-180, upperBound), point(rightBound, 90)); } else { - boxes.emplace_back( - box(point(leftBound, upperBound), point(rightBound, 90))); + boxes.emplace_back(point(leftBound, upperBound), point(rightBound, 90)); } } if (!southPoleTouched) { // add lower bounding box(es) if (boxCrosses180Longitude) { - boxes.emplace_back(box(point(leftBound, -90), point(180, lowerBound))); - boxes.emplace_back(box(point(-180, -90), point(rightBound, lowerBound))); + boxes.emplace_back(point(leftBound, -90), point(180, lowerBound)); + boxes.emplace_back(point(-180, -90), point(rightBound, lowerBound)); } else { - boxes.emplace_back( - box(point(leftBound, -90), point(rightBound, lowerBound))); + boxes.emplace_back(point(leftBound, -90), point(rightBound, lowerBound)); } } // add the box(es) inbetween the longitude lines if (boxCrosses180Longitude) { // only one box needed to cover the longitudes - boxes.emplace_back(box(point(rightBound, -90), point(leftBound, 90))); + boxes.emplace_back(point(rightBound, -90), point(leftBound, 90)); } else { // two boxes needed, one left and one right of the anti bounding box - boxes.emplace_back(box(point(-180, -90), point(leftBound, 90))); - boxes.emplace_back(box(point(rightBound, -90), point(180, 90))); + boxes.emplace_back(point(-180, -90), point(leftBound, 90)); + boxes.emplace_back(point(rightBound, -90), point(180, 90)); } return boxes; } @@ -412,8 +410,8 @@ bool SpatialJoinAlgorithms::containedInBoundingBoxes( point1.set<1>(90); } - if (std::ranges::any_of(bbox.cbegin(), bbox.cend(), [point1](box box_) { - return boost::geometry::covered_by(point1, box_); + if (std::ranges::any_of(bbox, [point1](const box& aBox) { + return boost::geometry::covered_by(point1, aBox); })) { return true; } @@ -460,7 +458,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { std::vector bbox = computeBoundingBox(p); std::vector results; - std::ranges::for_each(bbox, [&](box bbox) { + std::ranges::for_each(bbox, [&](const box& bbox) { rtree.query(bgi::intersects(bbox), std::back_inserter(results)); }); diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 3565950cac..1bee40950a 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -27,12 +27,12 @@ class SpatialJoinAlgorithms { Result BoundingBoxAlgorithm(); std::vector OnlyForTestingWrapperComputeBoundingBox( - const point& startPoint) { + const point& startPoint) const { return computeBoundingBox(startPoint); } bool OnlyForTestingWrapperContainedInBoundingBoxes( - const std::vector& bbox, point point1) { + const std::vector& bbox, point point1) const { return containedInBoundingBoxes(bbox, point1); } From 127ba5314849b36e8de651b8d80cfc2d88c10927 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Mon, 4 Nov 2024 19:21:38 +0100 Subject: [PATCH 04/18] dont use cpp 20 version constant of pi --- src/engine/SpatialJoinAlgorithms.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index a86bf94a0c..3d7f5bb57d 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -6,6 +6,7 @@ #include #include +#include "math.h" #include "util/GeoSparqlHelpers.h" // ____________________________________________________________________________ @@ -280,8 +281,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( // compute longitude bound. For an explanation of the calculation and the // naming convention see my master thesis double alpha = maxDistInMetersBuffer / radius_; - double gamma = - (90 - std::abs(startPoint.get<1>())) * (2 * std::numbers::pi / 360); + double gamma = (90 - std::abs(startPoint.get<1>())) * (2 * M_PI / 360); double beta = std::acos(std::cos(gamma) / std::cos(alpha)); double delta = 0; if (maxDistInMetersBuffer > circumferenceMax_ / 20) { @@ -293,7 +293,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( delta = archaversine((haversine(alpha - haversine(gamma - beta))) / (std::sin(gamma) * std::sin(beta))); } - double lonRange = delta * 360 / (2 * std::numbers::pi); + double lonRange = delta * 360 / (2 * M_PI); double leftLonBound = startPoint.get<0>() - lonRange; double rightLonBound = startPoint.get<0>() + lonRange; // test for "overflows" and create two bounding boxes if necessary @@ -438,8 +438,6 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { otherResJoinCol = leftJoinCol; } - // Todo in the benchmark: use different algorithms and compare their - // performance bgi::rtree> rtree; for (size_t i = 0; i < smallerResult->numRows(); i++) { // get point of row i From be09a86c04259e8640736c5d3f6211c6a1f6ba82 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Fri, 8 Nov 2024 19:26:18 +0100 Subject: [PATCH 05/18] part of first rework --- src/engine/SpatialJoin.h | 2 +- src/engine/SpatialJoinAlgorithms.cpp | 117 ++++++++++++---------- src/engine/SpatialJoinAlgorithms.h | 72 ++++++++----- test/engine/SpatialJoinAlgorithmsTest.cpp | 4 +- 4 files changed, 113 insertions(+), 82 deletions(-) diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index 61fa5247aa..35d2701379 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -106,7 +106,7 @@ class SpatialJoin : public Operation { } std::variant - onlyForTestingGetRealConfig() const { + onlyForTestingGetActualConfig() const { return config_; } diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 3d7f5bb57d..4885b14881 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -9,6 +9,8 @@ #include "math.h" #include "util/GeoSparqlHelpers.h" +using namespace BoostGeometryNamespace; + // ____________________________________________________________________________ SpatialJoinAlgorithms::SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, @@ -226,8 +228,8 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { } // ____________________________________________________________________________ -std::vector SpatialJoinAlgorithms::computeBoundingBox( - const point& startPoint) const { +std::vector SpatialJoinAlgorithms::computeBoundingBox( + const Point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; if (!maxDist.has_value()) { @@ -256,14 +258,13 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( // a single bounding box for the whole planet, do an optimized version if (static_cast(maxDist.value()) > circumferenceMax_ / 4.0 && static_cast(maxDist.value()) < circumferenceMax_ / 2.01) { - return computeAntiBoundingBox(startPoint); + return computeUsingAntiBoundingBox(startPoint); } // compute latitude bound - double upperLatBound = - startPoint.get<1>() + maxDistInMetersBuffer * (360 / circumferenceMax_); - double lowerLatBound = - startPoint.get<1>() - maxDistInMetersBuffer * (360 / circumferenceMax_); + double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_); + double upperLatBound = startPoint.get<1>() + maxDistInDegrees; + double lowerLatBound = startPoint.get<1>() - maxDistInDegrees; bool poleReached = false; // test for "overflows" if (lowerLatBound <= -90) { @@ -275,7 +276,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( poleReached = true; // north pole reached } if (poleReached) { - return {box(point(-180.0f, lowerLatBound), point(180.0f, upperLatBound))}; + return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))}; } // compute longitude bound. For an explanation of the calculation and the @@ -299,32 +300,33 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( // test for "overflows" and create two bounding boxes if necessary if (leftLonBound < -180) { auto box1 = - box(point(-180, lowerLatBound), point(rightLonBound, upperLatBound)); - auto box2 = box(point(leftLonBound + 360, lowerLatBound), - point(180, upperLatBound)); + Box(Point(-180, lowerLatBound), Point(rightLonBound, upperLatBound)); + auto box2 = Box(Point(leftLonBound + 360, lowerLatBound), + Point(180, upperLatBound)); return {box1, box2}; } else if (rightLonBound > 180) { auto box1 = - box(point(leftLonBound, lowerLatBound), point(180, upperLatBound)); - auto box2 = box(point(-180, lowerLatBound), - point(rightLonBound - 360, upperLatBound)); + Box(Point(leftLonBound, lowerLatBound), Point(180, upperLatBound)); + auto box2 = Box(Point(-180, lowerLatBound), + Point(rightLonBound - 360, upperLatBound)); return {box1, box2}; } // default case, when no bound has an "overflow" - return {box(point(leftLonBound, lowerLatBound), - point(rightLonBound, upperLatBound))}; + return {Box(Point(leftLonBound, lowerLatBound), + Point(rightLonBound, upperLatBound))}; } // ____________________________________________________________________________ -std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( - const point& startPoint) const { +std::vector SpatialJoinAlgorithms::computeUsingAntiBoundingBox( + const Point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; if (!maxDist.has_value()) { AD_THROW("Max distance must have a value for this operation"); } + // point on the opposite side of the globe - point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); + Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); if (antiPoint.get<0>() > 180) { antiPoint.set<0>(antiPoint.get<0>() - 360); } @@ -363,41 +365,41 @@ std::vector SpatialJoinAlgorithms::computeAntiBoundingBox( boxCrosses180Longitude = true; } // compute bounding boxes using the anti bounding box from above - std::vector boxes; + std::vector boxes; if (!northPoleTouched) { // add upper bounding box(es) if (boxCrosses180Longitude) { - boxes.emplace_back(point(leftBound, upperBound), point(180, 90)); - boxes.emplace_back(point(-180, upperBound), point(rightBound, 90)); + boxes.emplace_back(Point(leftBound, upperBound), Point(180, 90)); + boxes.emplace_back(Point(-180, upperBound), Point(rightBound, 90)); } else { - boxes.emplace_back(point(leftBound, upperBound), point(rightBound, 90)); + boxes.emplace_back(Point(leftBound, upperBound), Point(rightBound, 90)); } } if (!southPoleTouched) { // add lower bounding box(es) if (boxCrosses180Longitude) { - boxes.emplace_back(point(leftBound, -90), point(180, lowerBound)); - boxes.emplace_back(point(-180, -90), point(rightBound, lowerBound)); + boxes.emplace_back(Point(leftBound, -90), Point(180, lowerBound)); + boxes.emplace_back(Point(-180, -90), Point(rightBound, lowerBound)); } else { - boxes.emplace_back(point(leftBound, -90), point(rightBound, lowerBound)); + boxes.emplace_back(Point(leftBound, -90), Point(rightBound, lowerBound)); } } // add the box(es) inbetween the longitude lines if (boxCrosses180Longitude) { // only one box needed to cover the longitudes - boxes.emplace_back(point(rightBound, -90), point(leftBound, 90)); + boxes.emplace_back(Point(rightBound, -90), Point(leftBound, 90)); } else { // two boxes needed, one left and one right of the anti bounding box - boxes.emplace_back(point(-180, -90), point(leftBound, 90)); - boxes.emplace_back(point(rightBound, -90), point(180, 90)); + boxes.emplace_back(Point(-180, -90), Point(leftBound, 90)); + boxes.emplace_back(Point(rightBound, -90), Point(180, 90)); } return boxes; } // ____________________________________________________________________________ bool SpatialJoinAlgorithms::containedInBoundingBoxes( - const std::vector& bbox, point point1) const { - // correct lon bounds if necessary + const std::vector& bbox, Point point1) const { + // correct lon and lat bounds if necessary while (point1.get<0>() < -180) { point1.set<0>(point1.get<0>() + 360); } @@ -410,16 +412,18 @@ bool SpatialJoinAlgorithms::containedInBoundingBoxes( point1.set<1>(90); } - if (std::ranges::any_of(bbox, [point1](const box& aBox) { - return boost::geometry::covered_by(point1, aBox); - })) { - return true; - } - return false; + return std::ranges::any_of(bbox, [point1](const Box& aBox) { + return boost::geometry::covered_by(point1, aBox); + }); } // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { + auto printWarning = []() { + LOG(WARN) + << "expected a point here, but no point is given. Skipping this point" + << std::endl; + }; const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; IdTable result{numColumns, qec_->getAllocator()}; @@ -431,41 +435,49 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto smallerResJoinCol = leftJoinCol; auto otherResJoinCol = rightJoinCol; if (idTableLeft->numRows() > idTableRight->numRows()) { - smallerResult = idTableRight; - otherResult = idTableLeft; + std::swap(smallerResult, otherResult); leftResSmaller = false; - smallerResJoinCol = rightJoinCol; - otherResJoinCol = leftJoinCol; + std::swap(smallerResJoinCol, otherResJoinCol); } - bgi::rtree> rtree; + bgi::rtree> rtree; for (size_t i = 0; i < smallerResult->numRows(); i++) { // get point of row i - // ColumnIndex smallerJoinCol = getJoinCol(smallerChild, smallerVariable); auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); - point p(geopoint.value().getLng(), geopoint.value().getLat()); + + if (!geopoint) { + printWarning(); + continue; + } + + Point p(geopoint.value().getLng(), geopoint.value().getLat()); // add every point together with the row number into the rtree rtree.insert(std::make_pair(p, i)); } for (size_t i = 0; i < otherResult->numRows(); i++) { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); - point p(geopoint1.value().getLng(), geopoint1.value().getLat()); + + if (!geopoint1) { + printWarning(); + continue; + } + + Point p(geopoint1.value().getLng(), geopoint1.value().getLat()); // query the other rtree for every point using the following bounding box - std::vector bbox = computeBoundingBox(p); - std::vector results; + std::vector bbox = computeBoundingBox(p); + std::vector results; - std::ranges::for_each(bbox, [&](const box& bbox) { + std::ranges::for_each(bbox, [&](const Box& bbox) { rtree.query(bgi::intersects(bbox), std::back_inserter(results)); }); - std::ranges::for_each(results, [&](value res) { + std::ranges::for_each(results, [&](const Value& res) { size_t rowLeft = res.second; size_t rowRight = i; if (!leftResSmaller) { - rowLeft = i; - rowRight = res.second; + std::swap(rowLeft, rowRight); } auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, leftJoinCol, rightJoinCol); @@ -477,6 +489,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { }); } auto resTable = - Result(std::move(result), std::vector{}, LocalVocab{}); + Result(std::move(result), std::vector{}, + Result::getMergedLocalVocab(*resultLeft, *resultRight)); return resTable; } diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 1bee40950a..eabf681620 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -8,16 +8,18 @@ #include "engine/Result.h" #include "engine/SpatialJoin.h" +namespace BoostGeometryNamespace { namespace bg = boost::geometry; namespace bgi = boost::geometry::index; -using point = bg::model::point; -using box = bg::model::box; -using value = std::pair; +using Point = bg::model::point; +using Box = bg::model::box; +using Value = std::pair; +} // namespace BoostGeometryNamespace class SpatialJoinAlgorithms { public: - // initialize the Algorithm with the needed parameters + // Initialize the Algorithm with the needed parameters SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, @@ -26,23 +28,25 @@ class SpatialJoinAlgorithms { Result S2geometryAlgorithm(); Result BoundingBoxAlgorithm(); - std::vector OnlyForTestingWrapperComputeBoundingBox( - const point& startPoint) const { + std::vector + OnlyForTestingWrapperComputeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const { return computeBoundingBox(startPoint); } bool OnlyForTestingWrapperContainedInBoundingBoxes( - const std::vector& bbox, point point1) const { + const std::vector& bbox, + const BoostGeometryNamespace::Point& point1) const { return containedInBoundingBoxes(bbox, point1); } private: - // helper function which returns a GeoPoint if the element of the given table + // Helper function which returns a GeoPoint if the element of the given table // represents a GeoPoint std::optional getPoint(const IdTable* restable, size_t row, ColumnIndex col) const; - // helper function, which computes the distance of two points, where each + // Helper function, which computes the distance of two points, where each // point comes from a different result table Id computeDist(const IdTable* resLeft, const IdTable* resRight, size_t rowLeft, size_t rowRight, ColumnIndex leftPointCol, @@ -55,21 +59,35 @@ class SpatialJoinAlgorithms { const IdTable* resultRight, size_t rowLeft, size_t rowRight, Id distance) const; - // this function computes the bounding box(es), which represent all points, + // This function computes the bounding box(es) which represent all points, // which are in reach of the starting point with a distance of at most - // maxDistanceInMeters - std::vector computeBoundingBox(const point& startPoint) const; - - // this helper function calculates the bounding boxes based on a box, where - // definitely no match can occur. This function gets used, when the usual - // procedure, would just result in taking a big bounding box, which covers - // the whole planet (so for large max distances) - std::vector computeAntiBoundingBox(const point& startPoint) const; - - // this function returns true, when the given point is contained in any of the + // 'maxDistanceInMeters'. In theory there is always only one bounding box, but + // when mapping the spherical surface on a cartesian plane there are borders. + // So when the "single true" bounding box crosses the left or right (+/-180 + // longitude line) or the poles (+/- 90 latitude, which on the cartesian + // mapping is the top and bottom edge of the rectangular mapping) then the + // single box gets split into multiple boxes (i.e. one on the left and one on + // the right, which when seen on the sphere look like a single box, but on the + // map and in the internal representation it looks like two/more boxes) + std::vector computeBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const; + + // This helper function calculates the bounding boxes based on a box, where + // definitely no match can occur. This means every element in the anti + // bounding box is guaranteed to be more than 'maxDistanceInMeters' away from + // the startPoint. The function is then returning the set of boxes, which + // cover everything on earth, except for the anti bounding box. This function + // gets used, when the usual procedure, would just result in taking a big + // bounding box, which covers the whole planet (so for extremely large max + // distances) + std::vector computeUsingAntiBoundingBox( + const BoostGeometryNamespace::Point& startPoint) const; + + // This function returns true, iff the given point is contained in any of the // bounding boxes - bool containedInBoundingBoxes(const std::vector& bbox, - point point1) const; + bool containedInBoundingBoxes( + const std::vector& bbox, + BoostGeometryNamespace::Point point1) const; QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; @@ -77,12 +95,12 @@ class SpatialJoinAlgorithms { std::variant config_; // circumference in meters at the equator (max) and the pole (min) (as the - // earth is not exactly a sphere the circumference is different. Note the - // factor of 1000 to convert to meters) - static constexpr double circumferenceMax_ = 40075 * 1000; - static constexpr double circumferenceMin_ = 40007 * 1000; + // earth is not exactly a sphere the circumference is different. Note that + // the values are given in meters) + static constexpr double circumferenceMax_ = 40'075'000; + static constexpr double circumferenceMin_ = 40'007'000; // radius of the earth in meters (as the earth is not exactly a sphere the // radius at the equator has been taken) - static constexpr double radius_ = 6378 * 1000; // * 1000 to convert to meters + static constexpr double radius_ = 6'378'000; }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 19382430e4..d00d870348 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -858,7 +858,7 @@ inline void testBoundingBox(const long long& maxDistInMeters, std::nullopt}; SpatialJoinAlgorithms spatialJoinAlgs{ - qec, params, true, spatialJoin->onlyForTestingGetRealConfig()}; + qec, params, true, spatialJoin->onlyForTestingGetActualConfig()}; std::vector bbox = spatialJoinAlgs.OnlyForTestingWrapperComputeBoundingBox(startPoint); @@ -950,7 +950,7 @@ TEST(SpatialJoin, containedInBoundingBoxes) { std::nullopt}; SpatialJoinAlgorithms spatialJoinAlgs{ - qec, params, true, spatialJoin->onlyForTestingGetRealConfig()}; + qec, params, true, spatialJoin->onlyForTestingGetActualConfig()}; // note that none of the boxes is overlapping, therefore we can check, that // none of the points which should be contained in one box are contained in From c4ddec6cbb44524775bb908f4b7b7623b9fcbb3b Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Sat, 9 Nov 2024 15:04:18 +0100 Subject: [PATCH 06/18] rework of some parts of the tests --- test/engine/SpatialJoinAlgorithmsTest.cpp | 168 +++++++++++----------- 1 file changed, 86 insertions(+), 82 deletions(-) diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index d00d870348..95f4d774a6 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -791,17 +791,57 @@ INSTANTIATE_TEST_SUITE_P( namespace boundingBox { -namespace bg = boost::geometry; -namespace bgi = boost::geometry::index; +using BoostGeometryNamespace::Box; +using BoostGeometryNamespace::Point; +using BoostGeometryNamespace::Value; + +// usage of this helper function for the bounding box to test: +// Iterate over every edge of the bounding box. Furthermore iterate over each +// edge (i.e. use 100 points, which are on the edge of the bounding box). Then +// call this function for each of those points twice. Once slightly move the +// point inside the bounding box and give 'shouldBeWithin' = true to this +// function, the other time move it slightly outside of the bounding box and +// give 'shouldBeTrue' = false to the function. Do this for all edges. Note +// that this function is not taking a set of boxes, as neighboring boxes would +// not work with this approach (slighly outside of one box can be inside the +// neighboring box. For a set of boxes, check each box separately) +void testBounds(double x, double y, const Box& bbox, bool shouldBeWithin) { + // correct lon bounds if necessary + if (x < -180) { + x += 360; + } else if (x > 180) { + x -= 360; + } -typedef bg::model::point point; -typedef bg::model::box box; -typedef std::pair value; + // testing only possible, if lat bounds are correct and the lon bounds + // don't cover everything (as then left or right of the box is again + // inside the box because of the spherical geometry). If we have a bounding + // box, which goes from -180 to 180 longitude, then left of the bounding box + // is just in the bounding box again. (i.e. -180.00001 is the same as + // +179.99999). As all longitudes are covered, a left or right bound does + // not exist (on the sphere this makes intuitive sense). A test in that case + // is not necessary, because this test is about testing the edges and if + // everything is covered an edge doesn't exist there is no need for testing + // in that case + double minLonBox = bbox.min_corner().get<0>(); + double maxLonBox = bbox.max_corner().get<0>(); + if (y < 90 && y > -90 && !(minLonBox < 179.9999 && maxLonBox > 179.9999)) { + bool within = boost::geometry::covered_by(Point(x, y), bbox); + ASSERT_TRUE(within == shouldBeWithin); + } +}; -inline void testBoundingBox(const long long& maxDistInMeters, - const point& startPoint) { - auto checkOutside = [&](const point& point1, const point& startPoint, - const std::vector& bbox, +// This function performs multiple tests on the bounding box. First it asserts, +// that a point, which is not contained in any bounding box is more than +// 'maxDistInMeters' away from 'startPoint'. Second it iterates over the edges +// of the bounding box and checks, that points which are slightly inside or +// outside of the bounding box are correctly identified. +void testBoundingBox(const size_t& maxDistInMeters, const Point& startPoint) { + // this helper function checks, that a point, which is not contained in any + // bounding box, is really more than 'maxDistanceInMeters' away from + // 'startPoint' + auto checkOutside = [&](const Point& point1, const Point& startPoint, + const std::vector& bbox, SpatialJoinAlgorithms* spatialJoinAlg) { // check if the point is contained in any bounding box bool within = spatialJoinAlg->OnlyForTestingWrapperContainedInBoundingBoxes( @@ -814,67 +854,31 @@ inline void testBoundingBox(const long long& maxDistInMeters, } }; - auto testBounds = [](double x, double y, const box& bbox, - bool shouldBeWithin) { - // correct lon bounds if necessary - if (x < -180) { - x += 360; - } else if (x > 180) { - x -= 360; - } - - // testing only possible, if lat bounds are correct and the lon bounds - // don't cover everything (as then left or right of the box is again - // inside the box because of the spherical geometry) - double minLonBox = bbox.min_corner().get<0>(); - double maxLonBox = bbox.max_corner().get<0>(); - if (y < 90 && y > -90 && !(minLonBox < 179.9999 && maxLonBox > 179.9999)) { - bool within = boost::geometry::covered_by(point(x, y), bbox); - ASSERT_TRUE(within == shouldBeWithin); - } - }; - - // build dummy join to access the containedInBoundingBox and - // computeBoundingBox functions. Note that maxDistInMeters has to be accurate, - // otherwise the functions of spatialJoin don't work correctly - auto maxDistInMetersStr = - ""; - auto qec = buildTestQEC(); - auto spatialJoinTriple = - SparqlTriple{TripleComponent{Variable{"?point1"}}, maxDistInMetersStr, - TripleComponent{Variable{"?point2"}}}; - std::shared_ptr spatialJoinOperation = - ad_utility::makeExecutionTree(qec, spatialJoinTriple, - std::nullopt, std::nullopt); - - // add children and test, that multiplicity is a dummy return before all - // children are added - std::shared_ptr op = spatialJoinOperation->getRootOperation(); - SpatialJoin* spatialJoin = static_cast(op.get()); - PreparedSpatialJoinParams params{ - nullptr, nullptr, nullptr, nullptr, - 0, 0, 1, spatialJoin->getMaxDist(), - std::nullopt}; + nullptr, nullptr, nullptr, nullptr, 0, + 0, 1, maxDistInMeters, std::nullopt}; - SpatialJoinAlgorithms spatialJoinAlgs{ - qec, params, true, spatialJoin->onlyForTestingGetActualConfig()}; + std::variant config{ + MaxDistanceConfig{maxDistInMeters}}; + + SpatialJoinAlgorithms spatialJoinAlgs{buildTestQEC(), params, true, config}; - std::vector bbox = + std::vector bbox = spatialJoinAlgs.OnlyForTestingWrapperComputeBoundingBox(startPoint); // broad grid test for (int lon = -180; lon < 180; lon += 20) { for (int lat = -90; lat < 90; lat += 20) { - checkOutside(point(lon, lat), startPoint, bbox, &spatialJoinAlgs); + checkOutside(Point(lon, lat), startPoint, bbox, &spatialJoinAlgs); } } - // do tests at the border of the box + // do tests at the border of the box. The exact usage of this is described in + // the function comment of the helper function 'testBounds' for (size_t k = 0; k < bbox.size(); k++) { // use a small delta for testing because of floating point inaccuracies const double delta = 0.00000001; - const point minPoint = bbox.at(k).min_corner(); - const point maxPoint = bbox.at(k).max_corner(); + const Point minPoint = bbox.at(k).min_corner(); + const Point maxPoint = bbox.at(k).max_corner(); const double lowX = minPoint.get<0>(); const double lowY = minPoint.get<1>(); const double highX = maxPoint.get<0>(); @@ -887,28 +891,28 @@ inline void testBoundingBox(const long long& maxDistInMeters, true); testBounds(lowX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), false); - checkOutside(point(lowX - delta, lowY + (yRange / 100) * i), startPoint, + checkOutside(Point(lowX - delta, lowY + (yRange / 100) * i), startPoint, bbox, &spatialJoinAlgs); // barely in or out at the bottom edge testBounds(lowX + delta + (xRange / 100) * i, lowY + delta, bbox.at(k), true); testBounds(lowX + delta + (xRange / 100) * i, lowY - delta, bbox.at(k), false); - checkOutside(point(lowX + (xRange / 100) * i, lowY - delta), startPoint, + checkOutside(Point(lowX + (xRange / 100) * i, lowY - delta), startPoint, bbox, &spatialJoinAlgs); // barely in or out at the right edge testBounds(highX - delta, lowY + delta + (yRange / 100) * i, bbox.at(k), true); testBounds(highX + delta, lowY + delta + (yRange / 100) * i, bbox.at(k), false); - checkOutside(point(highX + delta, lowY + (yRange / 100) * i), startPoint, + checkOutside(Point(highX + delta, lowY + (yRange / 100) * i), startPoint, bbox, &spatialJoinAlgs); // barely in or out at the top edge testBounds(lowX + delta + (xRange / 100) * i, highY - delta, bbox.at(k), true); testBounds(lowX + delta + (xRange / 100) * i, highY + delta, bbox.at(k), false); - checkOutside(point(lowX + (xRange / 100) * i, highY + delta), startPoint, + checkOutside(Point(lowX + (xRange / 100) * i, highY + delta), startPoint, bbox, &spatialJoinAlgs); } } @@ -924,7 +928,7 @@ TEST(SpatialJoin, computeBoundingBox) { // circ / 2 means, that all points on earth are within maxDist km of any // starting point for (int maxDist = 0; maxDist <= circ / 2.0; maxDist += circ / 36.0) { - testBoundingBox(maxDist, point(lon, lat)); + testBoundingBox(maxDist, Point(lon, lat)); } } } @@ -955,28 +959,28 @@ TEST(SpatialJoin, containedInBoundingBoxes) { // note that none of the boxes is overlapping, therefore we can check, that // none of the points which should be contained in one box are contained in // another box - std::vector boxes = { - box(point(20, 40), point(40, 60)), - box(point(-180, -20), point(-150, 30)), // touching left border - box(point(50, -30), point(180, 10)), // touching right border - box(point(-30, 50), point(10, 90)), // touching north pole - box(point(-45, -90), point(0, -45)) // touching south pole + std::vector boxes = { + Box(Point(20, 40), Point(40, 60)), + Box(Point(-180, -20), Point(-150, 30)), // touching left border + Box(Point(50, -30), Point(180, 10)), // touching right border + Box(Point(-30, 50), Point(10, 90)), // touching north pole + Box(Point(-45, -90), Point(0, -45)) // touching south pole }; // the first entry in this vector is a vector of points, which is contained // in the first box, the second entry contains points, which are contained in // the second box and so on - std::vector> containedInBox = { - {point(20, 40), point(40, 40), point(40, 60), point(20, 60), - point(30, 50)}, - {point(-180, -20), point(-150, -20), point(-150, 30), point(-180, 30), - point(-150, 0)}, - {point(50, -30), point(180, -30), point(180, 10), point(50, 10), - point(70, -10)}, - {point(-30, 50), point(10, 50), point(10, 90), point(-30, 90), - point(-20, 60)}, - {point(-45, -90), point(0, -90), point(0, -45), point(-45, -45), - point(-10, -60)}}; + std::vector> containedInBox = { + {Point(20, 40), Point(40, 40), Point(40, 60), Point(20, 60), + Point(30, 50)}, + {Point(-180, -20), Point(-150, -20), Point(-150, 30), Point(-180, 30), + Point(-150, 0)}, + {Point(50, -30), Point(180, -30), Point(180, 10), Point(50, 10), + Point(70, -10)}, + {Point(-30, 50), Point(10, 50), Point(10, 90), Point(-30, 90), + Point(-20, 60)}, + {Point(-45, -90), Point(0, -90), Point(0, -45), Point(-45, -45), + Point(-10, -60)}}; // all combinations of box is contained in bounding boxes and is not // contained. a one encodes, the bounding box is contained in the set of @@ -990,9 +994,9 @@ TEST(SpatialJoin, containedInBoundingBoxes) { for (size_t c = 0; a <= 1; a++) { for (size_t d = 0; a <= 1; a++) { for (size_t e = 0; a <= 1; a++) { - std::vector toTest; // the set of bounding boxes - std::vector> shouldBeContained; - std::vector> shouldNotBeContained; + std::vector toTest; // the set of bounding boxes + std::vector> shouldBeContained; + std::vector> shouldNotBeContained; if (a == 1) { // box nr. 0 is contained in the set of boxes toTest.push_back(boxes.at(0)); shouldBeContained.push_back(containedInBox.at(0)); From 122372bff6a3585e351cd1d34b6781b200499ea4 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Tue, 19 Nov 2024 00:47:35 +0100 Subject: [PATCH 07/18] mr feedback --- src/engine/SpatialJoinAlgorithms.cpp | 46 ++++++++++++++++++++-------- src/engine/SpatialJoinAlgorithms.h | 14 ++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 4885b14881..c7d1a788ce 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -229,7 +229,7 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { // ____________________________________________________________________________ std::vector SpatialJoinAlgorithms::computeBoundingBox( - const Point& startPoint) const { + const Point& startPoint) { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; if (!maxDist.has_value()) { @@ -265,17 +265,11 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_); double upperLatBound = startPoint.get<1>() + maxDistInDegrees; double lowerLatBound = startPoint.get<1>() - maxDistInDegrees; - bool poleReached = false; - // test for "overflows" - if (lowerLatBound <= -90) { - lowerLatBound = -90; - poleReached = true; // south pole reached - } - if (upperLatBound >= 90) { - upperLatBound = 90; - poleReached = true; // north pole reached - } - if (poleReached) { + + auto southPoleReached = isAPoleTouched(lowerLatBound).at(1); + auto northPoleReached = isAPoleTouched(upperLatBound).at(0); + + if (southPoleReached || northPoleReached) { return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))}; } @@ -398,7 +392,7 @@ std::vector SpatialJoinAlgorithms::computeUsingAntiBoundingBox( // ____________________________________________________________________________ bool SpatialJoinAlgorithms::containedInBoundingBoxes( - const std::vector& bbox, Point point1) const { + const std::vector& bbox, Point point1) { // correct lon and lat bounds if necessary while (point1.get<0>() < -180) { point1.set<0>(point1.get<0>() + 360); @@ -417,6 +411,32 @@ bool SpatialJoinAlgorithms::containedInBoundingBoxes( }); } +// ____________________________________________________________________________ +bool SpatialJoinAlgorithms::containedInBoundingBoxes( + const std::vector& bbox, Point point1) { + convertToNormalCoordinates(point1); + + return std::ranges::any_of(bbox, [point1](const Box& aBox) { + return boost::geometry::covered_by(point1, aBox); + }); +} + +// ____________________________________________________________________________ +void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) { + // correct lon and lat bounds if necessary + while (point.get<0>() < -180) { + point.set<0>(point.get<0>() + 360); + } + while (point.get<0>() > 180) { + point.set<0>(point.get<0>() - 360); + } + if (point.get<1>() < -90) { + point.set<1>(-90); + } else if (point.get<1>() > 90) { + point.set<1>(90); + } +} + // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto printWarning = []() { diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index eabf681620..34c7ca95e6 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -30,13 +30,13 @@ class SpatialJoinAlgorithms { std::vector OnlyForTestingWrapperComputeBoundingBox( - const BoostGeometryNamespace::Point& startPoint) const { + const BoostGeometryNamespace::Point& startPoint) { return computeBoundingBox(startPoint); } bool OnlyForTestingWrapperContainedInBoundingBoxes( const std::vector& bbox, - const BoostGeometryNamespace::Point& point1) const { + const BoostGeometryNamespace::Point& point1) { return containedInBoundingBoxes(bbox, point1); } @@ -70,7 +70,7 @@ class SpatialJoinAlgorithms { // the right, which when seen on the sphere look like a single box, but on the // map and in the internal representation it looks like two/more boxes) std::vector computeBoundingBox( - const BoostGeometryNamespace::Point& startPoint) const; + const BoostGeometryNamespace::Point& startPoint); // This helper function calculates the bounding boxes based on a box, where // definitely no match can occur. This means every element in the anti @@ -87,7 +87,7 @@ class SpatialJoinAlgorithms { // bounding boxes bool containedInBoundingBoxes( const std::vector& bbox, - BoostGeometryNamespace::Point point1) const; + BoostGeometryNamespace::Point point1); QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; @@ -103,4 +103,10 @@ class SpatialJoinAlgorithms { // radius of the earth in meters (as the earth is not exactly a sphere the // radius at the equator has been taken) static constexpr double radius_ = 6'378'000; + + // convert coordinates to the usual ranges (-180 to 180 and -90 to 90) + void convertToNormalCoordinates(BoostGeometryNamespace::Point& point); + + // return whether one of the poles is beeing touched + std::array isAPoleTouched(const double& latitude); }; From 213b02e5b43c0d392acd9e75c64ba0efd4876b33 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Tue, 19 Nov 2024 01:12:29 +0100 Subject: [PATCH 08/18] remove second definition of function --- src/engine/SpatialJoinAlgorithms.cpp | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index c7d1a788ce..d3df1b6140 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -265,10 +265,10 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( double maxDistInDegrees = maxDistInMetersBuffer * (360 / circumferenceMax_); double upperLatBound = startPoint.get<1>() + maxDistInDegrees; double lowerLatBound = startPoint.get<1>() - maxDistInDegrees; - + auto southPoleReached = isAPoleTouched(lowerLatBound).at(1); auto northPoleReached = isAPoleTouched(upperLatBound).at(0); - + if (southPoleReached || northPoleReached) { return {Box(Point(-180.0f, lowerLatBound), Point(180.0f, upperLatBound))}; } @@ -390,27 +390,6 @@ std::vector SpatialJoinAlgorithms::computeUsingAntiBoundingBox( return boxes; } -// ____________________________________________________________________________ -bool SpatialJoinAlgorithms::containedInBoundingBoxes( - const std::vector& bbox, Point point1) { - // correct lon and lat bounds if necessary - while (point1.get<0>() < -180) { - point1.set<0>(point1.get<0>() + 360); - } - while (point1.get<0>() > 180) { - point1.set<0>(point1.get<0>() - 360); - } - if (point1.get<1>() < -90) { - point1.set<1>(-90); - } else if (point1.get<1>() > 90) { - point1.set<1>(90); - } - - return std::ranges::any_of(bbox, [point1](const Box& aBox) { - return boost::geometry::covered_by(point1, aBox); - }); -} - // ____________________________________________________________________________ bool SpatialJoinAlgorithms::containedInBoundingBoxes( const std::vector& bbox, Point point1) { From ecda2496fab52836d6140ca52913189b65c7611b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 19 Nov 2024 10:16:55 +0100 Subject: [PATCH 09/18] add new macros AD_LOG vs LOG Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.cpp | 271 ++++++++++++++++++++++------------------ src/util/Log.h | 21 ++++ 2 files changed, 167 insertions(+), 125 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 93b2fc09ea..c49419e197 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -245,10 +245,10 @@ IndexImpl::buildOspWithPatterns( secondSorter->clear(); // Add the `ql:has-pattern` predicate to the sorter such that it will become // part of the PSO and POS permutation. - LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() - << " triples to the POS and PSO permutation for " - "the internal `ql:has-pattern` ..." - << std::endl; + AD_LOG(AD_INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() + << " triples to the POS and PSO permutation for " + "the internal `ql:has-pattern` ..." + << std::endl; static_assert(NumColumnsIndexBuilding == 4, "When adding additional payload columns, the following code " "has to be changed"); @@ -297,9 +297,10 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // for a single input stream and forbidden for multiple input streams. if (parallelParsingSpecifiedViaJson.has_value()) { if (spec.size() == 1) { - LOG(WARN) << "Parallel parsing set in the `.settings.json` file; this is " - "deprecated, " - << pleaseUseParallelParsingOption << std::endl; + AD_LOG(AD_WARN) + << "Parallel parsing set in the `.settings.json` file; this is " + "deprecated, " + << pleaseUseParallelParsingOption << std::endl; spec.at(0).parseInParallel_ = parallelParsingSpecifiedViaJson.value(); } else { throw std::runtime_error{absl::StrCat( @@ -312,21 +313,22 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // on the command line, we set if implicitly for backward compatibility. if (!parallelParsingSpecifiedViaJson.has_value() && spec.size() == 1 && !spec.at(0).parseInParallelSetExplicitly_) { - LOG(WARN) << "Implicitly using the parallel parser for a single input file " - "for reasons of backward compatibility; this is deprecated, " - << pleaseUseParallelParsingOption << std::endl; + AD_LOG(AD_WARN) + << "Implicitly using the parallel parser for a single input file " + "for reasons of backward compatibility; this is deprecated, " + << pleaseUseParallelParsingOption << std::endl; spec.at(0).parseInParallel_ = true; } // For a single input stream, show the name and whether we parse in parallel. // For multiple input streams, only show the number of streams. if (spec.size() == 1) { - LOG(INFO) << "Parsing triples from single input stream " - << spec.at(0).filename_ << " (parallel = " - << (spec.at(0).parseInParallel_ ? "true" : "false") << ") ..." - << std::endl; + AD_LOG(AD_INFO) << "Parsing triples from single input stream " + << spec.at(0).filename_ << " (parallel = " + << (spec.at(0).parseInParallel_ ? "true" : "false") + << ") ..." << std::endl; } else { - LOG(INFO) << "Processing triples from " << spec.size() - << " input streams ..." << std::endl; + AD_LOG(AD_INFO) << "Processing triples from " << spec.size() + << " input streams ..." << std::endl; } } @@ -412,7 +414,7 @@ void IndexImpl::createFromFiles( addInternalStatisticsToConfiguration(numTriplesInternal, numPredicatesInternal); - LOG(INFO) << "Index build completed" << std::endl; + AD_LOG(AD_INFO) << "Index build completed" << std::endl; } // _____________________________________________________________________________ @@ -439,9 +441,10 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( ad_utility::Synchronized> idTriples( std::make_unique(onDiskBase_ + ".unsorted-triples.dat", 1_GB, allocator_)); - LOG(INFO) << "Parsing input triples and creating partial vocabularies, one " - "per batch ..." - << std::endl; + AD_LOG(AD_INFO) + << "Parsing input triples and creating partial vocabularies, one " + "per batch ..." + << std::endl; bool parserExhausted = false; // already count the numbers of triples that will be used for the language @@ -492,12 +495,12 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( } numTriplesProcessed++; if (progressBar.update()) { - LOG(INFO) << progressBar.getProgressString() << std::flush; + AD_LOG(AD_INFO) << progressBar.getProgressString() << std::flush; } } - LOG(TIMING) << "WaitTimes for Pipeline in msecs\n"; + AD_LOG(AD_TIMING) << "WaitTimes for Pipeline in msecs\n"; for (const auto& t : p.getWaitingTime()) { - LOG(TIMING) + AD_LOG(AD_TIMING) << std::chrono::duration_cast(t).count() << " msecs" << std::endl; } @@ -514,7 +517,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( if (writePartialVocabularyFuture[0].valid()) { writePartialVocabularyFuture[0].get(); } - LOG(TIMING) + AD_LOG(AD_TIMING) << "Time spent waiting for the writing of a previous vocabulary: " << sortFutureTimer.msecs().count() << "ms." << std::endl; auto moveMap = [](std::optional&& el) { @@ -537,20 +540,21 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( // ids actualPartialSizes.push_back(actualCurrentPartialSize); } - LOG(INFO) << progressBar.getFinalProgressString() << std::flush; + AD_LOG(AD_INFO) << progressBar.getFinalProgressString() << std::flush; for (auto& future : writePartialVocabularyFuture) { if (future.valid()) { future.get(); } } - LOG(INFO) << "Number of triples created (including QLever-internal ones): " - << (*idTriples.wlock())->size() << " [may contain duplicates]" - << std::endl; + AD_LOG(AD_INFO) + << "Number of triples created (including QLever-internal ones): " + << (*idTriples.wlock())->size() << " [may contain duplicates]" + << std::endl; size_t sizeInternalVocabulary = 0; std::vector prefixes; - LOG(INFO) << "Merging partial vocabularies ..." << std::endl; + AD_LOG(AD_INFO) << "Merging partial vocabularies ..." << std::endl; const ad_utility::vocabulary_merger::VocabularyMetaData mergeRes = [&]() { auto sortPred = [cmp = &(vocab_.getCaseComparator())](std::string_view a, std::string_view b) { @@ -562,21 +566,22 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); }(); - LOG(DEBUG) << "Finished merging partial vocabularies" << std::endl; + AD_LOG(AD_DEBUG) << "Finished merging partial vocabularies" << std::endl; IndexBuilderDataAsStxxlVector res; res.vocabularyMetaData_ = mergeRes; idOfHasPatternDuringIndexBuilding_ = mergeRes.specialIdMapping().at(HAS_PATTERN_PREDICATE); idOfInternalGraphDuringIndexBuilding_ = mergeRes.specialIdMapping().at(QLEVER_INTERNAL_GRAPH_IRI); - LOG(INFO) << "Number of words in external vocabulary: " - << res.vocabularyMetaData_.numWordsTotal() - sizeInternalVocabulary - << std::endl; + AD_LOG(AD_INFO) << "Number of words in external vocabulary: " + << res.vocabularyMetaData_.numWordsTotal() - + sizeInternalVocabulary + << std::endl; res.idTriples = std::move(*idTriples.wlock()); res.actualPartialSizes = std::move(actualPartialSizes); - LOG(DEBUG) << "Removing temporary files ..." << std::endl; + AD_LOG(AD_DEBUG) << "Removing temporary files ..." << std::endl; for (size_t n = 0; n < numFiles; ++n) { deleteTemporaryFile(absl::StrCat(onDiskBase_, PARTIAL_VOCAB_FILE_NAME, n)); } @@ -589,10 +594,10 @@ auto IndexImpl::convertPartialToGlobalIds( TripleVec& data, const vector& actualLinesPerPartial, size_t linesPerPartial, auto isQLeverInternalTriple) -> FirstPermutationSorterAndInternalTriplesAsPso { - LOG(INFO) << "Converting triples from local IDs to global IDs ..." - << std::endl; - LOG(DEBUG) << "Triples per partial vocabulary: " << linesPerPartial - << std::endl; + AD_LOG(AD_INFO) << "Converting triples from local IDs to global IDs ..." + << std::endl; + AD_LOG(AD_DEBUG) << "Triples per partial vocabulary: " << linesPerPartial + << std::endl; // Iterate over all partial vocabularies. auto resultPtr = @@ -660,7 +665,7 @@ auto IndexImpl::convertPartialToGlobalIds( numTriplesConverted += triples->size(); numTriplesConverted += internalTriples->size(); if (progressBar.update()) { - LOG(INFO) << progressBar.getProgressString() << std::flush; + AD_LOG(AD_INFO) << progressBar.getProgressString() << std::flush; } }; }; @@ -751,7 +756,7 @@ auto IndexImpl::convertPartialToGlobalIds( } lookupQueue.finish(); writeQueue.finish(); - LOG(INFO) << progressBar.getFinalProgressString() << std::flush; + AD_LOG(AD_INFO) << progressBar.getFinalProgressString() << std::flush; return {std::move(resultPtr), std::move(internalTriplesPtr)}; } @@ -811,8 +816,8 @@ std::tuple> configurationJson_; if (configurationJson_.find("git-hash") != configurationJson_.end()) { - LOG(INFO) << "The git hash used to build this index was " - << configurationJson_["git-hash"] << std::endl; + AD_LOG(AD_INFO) << "The git hash used to build this index was " + << configurationJson_["git-hash"] << std::endl; } else { - LOG(INFO) << "The index was built before git commit hashes were stored in " - "the index meta data" - << std::endl; + AD_LOG(AD_INFO) + << "The index was built before git commit hashes were stored in " + "the index meta data" + << std::endl; } if (configurationJson_.find("index-format-version") != @@ -1011,31 +1019,34 @@ void IndexImpl::readConfiguration() { const auto& currentVersion = qlever::indexFormatVersion; if (indexFormatVersion != currentVersion) { if (indexFormatVersion.date_.toBits() > currentVersion.date_.toBits()) { - LOG(ERROR) << "The version of QLever you are using is too old for this " - "index. Please use a version of QLever that is " - "compatible with this index" - " (PR = " - << indexFormatVersion.prNumber_ << ", Date = " - << indexFormatVersion.date_.toStringAndType().first << ")." - << std::endl; + AD_LOG(AD_ERROR) + << "The version of QLever you are using is too old for this " + "index. Please use a version of QLever that is " + "compatible with this index" + " (PR = " + << indexFormatVersion.prNumber_ + << ", Date = " << indexFormatVersion.date_.toStringAndType().first + << ")." << std::endl; } else { - LOG(ERROR) << "The index is too old for this version of QLever. " - "We recommend that you rebuild the index and start the " - "server with the current master. Alternatively start the " - "engine with a version of QLever that is compatible with " - "this index (PR = " - << indexFormatVersion.prNumber_ << ", Date = " - << indexFormatVersion.date_.toStringAndType().first << ")." - << std::endl; + AD_LOG(AD_ERROR) + << "The index is too old for this version of QLever. " + "We recommend that you rebuild the index and start the " + "server with the current master. Alternatively start the " + "engine with a version of QLever that is compatible with " + "this index (PR = " + << indexFormatVersion.prNumber_ + << ", Date = " << indexFormatVersion.date_.toStringAndType().first + << ")." << std::endl; } throw std::runtime_error{ "Incompatible index format, see log message for details"}; } } else { - LOG(ERROR) << "This index was built before versioning was introduced for " - "QLever's index format. Please rebuild your index using the " - "current version of QLever." - << std::endl; + AD_LOG(AD_ERROR) + << "This index was built before versioning was introduced for " + "QLever's index format. Please rebuild your index using the " + "current version of QLever." + << std::endl; throw std::runtime_error{ "Incompatible index format, see log message for details"}; } @@ -1047,7 +1058,7 @@ void IndexImpl::readConfiguration() { } if (configurationJson_.count("ignore-case")) { - LOG(ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; + AD_LOG(AD_ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; throw std::runtime_error("Deprecated key \"ignore-case\" in index build"); } @@ -1058,9 +1069,10 @@ void IndexImpl::readConfiguration() { vocab_.setLocale(lang, country, ignorePunctuation); textVocab_.setLocale(lang, country, ignorePunctuation); } else { - LOG(ERROR) << "Key \"locale\" is missing in the metadata. This is probably " - "and old index build that is no longer supported by QLever. " - "Please rebuild your index\n"; + AD_LOG(AD_ERROR) + << "Key \"locale\" is missing in the metadata. This is probably " + "and old index build that is no longer supported by QLever. " + "Please rebuild your index\n"; throw std::runtime_error( "Missing required key \"locale\" in index build's metadata"); } @@ -1186,7 +1198,7 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { } if (j.count("ignore-case")) { - LOG(ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; + AD_LOG(AD_ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; throw std::runtime_error("Deprecated key \"ignore-case\" in settings JSON"); } @@ -1206,22 +1218,25 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { country = std::string{j["locale"]["country"]}; ignorePunctuation = bool{j["locale"]["ignore-punctuation"]}; } else { - LOG(INFO) << "Locale was not specified in settings file, default is " - "en_US" - << std::endl; + AD_LOG(AD_INFO) + << "Locale was not specified in settings file, default is " + "en_US" + << std::endl; } - LOG(INFO) << "You specified \"locale = " << lang << "_" << country << "\" " - << "and \"ignore-punctuation = " << ignorePunctuation << "\"" - << std::endl; + AD_LOG(AD_INFO) << "You specified \"locale = " << lang << "_" << country + << "\" " + << "and \"ignore-punctuation = " << ignorePunctuation + << "\"" << std::endl; if (lang != LOCALE_DEFAULT_LANG || country != LOCALE_DEFAULT_COUNTRY) { - LOG(WARN) << "You are using Locale settings that differ from the default " - "language or country.\n\t" - << "This should work but is untested by the QLever team. If " - "you are running into unexpected problems,\n\t" - << "Please make sure to also report your used locale when " - "filing a bug report. Also note that changing the\n\t" - << "locale requires to completely rebuild the index\n"; + AD_LOG(AD_WARN) + << "You are using Locale settings that differ from the default " + "language or country.\n\t" + << "This should work but is untested by the QLever team. If " + "you are running into unexpected problems,\n\t" + << "Please make sure to also report your used locale when " + "filing a bug report. Also note that changing the\n\t" + << "locale requires to completely rebuild the index\n"; } vocab_.setLocale(lang, country, ignorePunctuation); textVocab_.setLocale(lang, country, ignorePunctuation); @@ -1238,19 +1253,19 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { onlyAsciiTurtlePrefixes_ = static_cast(j["ascii-prefixes-only"]); } if (onlyAsciiTurtlePrefixes_) { - LOG(INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl; + AD_LOG(AD_INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl; } if (j.count("parallel-parsing")) { useParallelParser_ = static_cast(j["parallel-parsing"]); } if (useParallelParser_) { - LOG(INFO) << WARNING_PARALLEL_PARSING << std::endl; + AD_LOG(AD_INFO) << WARNING_PARALLEL_PARSING << std::endl; } if (j.count("num-triples-per-batch")) { numTriplesPerBatch_ = size_t{j["num-triples-per-batch"]}; - LOG(INFO) + AD_LOG(AD_INFO) << "You specified \"num-triples-per-batch = " << numTriplesPerBatch_ << "\", choose a lower value if the index builder runs out of memory" << std::endl; @@ -1258,9 +1273,10 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count("parser-batch-size")) { parserBatchSize_ = size_t{j["parser-batch-size"]}; - LOG(INFO) << "Overriding setting parser-batch-size to " << parserBatchSize_ - << " This might influence performance during index build." - << std::endl; + AD_LOG(AD_INFO) << "Overriding setting parser-batch-size to " + << parserBatchSize_ + << " This might influence performance during index build." + << std::endl; } std::string overflowingIntegersThrow = "overflowing-integers-throw"; @@ -1274,34 +1290,38 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count(key)) { auto value = static_cast(j[key]); if (value == overflowingIntegersThrow) { - LOG(INFO) << "Integers that cannot be represented by QLever will throw " - "an exception" - << std::endl; + AD_LOG(AD_INFO) + << "Integers that cannot be represented by QLever will throw " + "an exception" + << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; } else if (value == overflowingIntegersBecomeDoubles) { - LOG(INFO) << "Integers that cannot be represented by QLever will be " - "converted to doubles" - << std::endl; + AD_LOG(AD_INFO) + << "Integers that cannot be represented by QLever will be " + "converted to doubles" + << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else if (value == allIntegersBecomeDoubles) { - LOG(INFO) << "All integers will be converted to doubles" << std::endl; + AD_LOG(AD_INFO) << "All integers will be converted to doubles" + << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else { AD_CONTRACT_CHECK(std::ranges::find(allModes, value) == allModes.end()); - LOG(ERROR) << "Invalid value for " << key << std::endl; - LOG(INFO) << "The currently supported values are " - << absl::StrJoin(allModes, ",") << std::endl; + AD_LOG(AD_ERROR) << "Invalid value for " << key << std::endl; + AD_LOG(AD_INFO) << "The currently supported values are " + << absl::StrJoin(allModes, ",") << std::endl; } } else { turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; - LOG(INFO) << "By default, integers that cannot be represented by QLever " - "will throw an " - "exception" - << std::endl; + AD_LOG(AD_INFO) + << "By default, integers that cannot be represented by QLever " + "will throw an " + "exception" + << std::endl; } } @@ -1311,8 +1331,9 @@ std::future IndexImpl::writeNextPartialVocabulary( std::unique_ptr items, auto localIds, ad_utility::Synchronized>* globalWritePtr) { using namespace ad_utility::vocabulary_merger; - LOG(DEBUG) << "Input triples read in this section: " << numLines << std::endl; - LOG(DEBUG) + AD_LOG(AD_DEBUG) << "Input triples read in this section: " << numLines + << std::endl; + AD_LOG(AD_DEBUG) << "Triples processed, also counting internal triples added by QLever: " << actualCurrentPartialSize << std::endl; std::future resultFuture; @@ -1341,7 +1362,7 @@ std::future IndexImpl::writeNextPartialVocabulary( ad_utility::TimeBlockAndLog l{"creating internal mapping"}; return createInternalMapping(&vec); }(); - LOG(TRACE) << "Finished creating of Mapping vocabulary" << std::endl; + AD_LOG(AD_TRACE) << "Finished creating of Mapping vocabulary" << std::endl; // since now adjacent duplicates also have the same Ids, it suffices to // compare those { @@ -1368,7 +1389,7 @@ std::future IndexImpl::writeNextPartialVocabulary( ad_utility::TimeBlockAndLog l{"write partial vocabulary"}; writePartialVocabularyToFile(vec, partialFilename); } - LOG(TRACE) << "Finished writing the partial vocabulary" << std::endl; + AD_LOG(AD_TRACE) << "Finished writing the partial vocabulary" << std::endl; vec.clear(); { ad_utility::TimeBlockAndLog l{"writing to global file"}; diff --git a/src/util/Log.h b/src/util/Log.h index 1ba7277fb0..35a1a93822 100644 --- a/src/util/Log.h +++ b/src/util/Log.h @@ -21,6 +21,11 @@ #endif // Abseil does also define its own LOG macro, so we need to undefine it here. +// NOTE: In case you run into trouble with this conflict, in particular if you +// use the `LOG()` macro and you get compilation errors that mention `abseil`, +// use the (otherwise identical) `AD_LOG` macro below. Same goes for the +// loglevels, use for example `AD_DEBUG` instead of `DEBUG`. +// TODO Consistently replace the `LOG` macro by `AD_LOG`. #ifdef LOG #undef LOG #endif @@ -31,6 +36,12 @@ else \ ad_utility::Log::getLog() // NOLINT +#define AD_LOG(x) \ + if (x > LOGLEVEL) \ + ; \ + else \ + ad_utility::Log::getLog() // NOLINT + enum class LogLevel { FATAL = 0, ERROR = 1, @@ -41,6 +52,16 @@ enum class LogLevel { TRACE = 6 }; +// These should be used in new code to avoid clashes with `abseil` (similar to +// `AD_LOG` vs `LOG`), see above for details. +static constexpr auto AD_FATAL = LogLevel::FATAL; +static constexpr auto AD_ERROR = LogLevel::ERROR; +static constexpr auto AD_WARN = LogLevel::WARN; +static constexpr auto AD_INFO = LogLevel::INFO; +static constexpr auto AD_DEBUG = LogLevel::DEBUG; +static constexpr auto AD_TIMING = LogLevel::TIMING; +static constexpr auto AD_TRACE = LogLevel::TRACE; + using enum LogLevel; namespace ad_utility { From 3831da815ccca3a76b61d499422c667a91b70c96 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Wed, 20 Nov 2024 01:08:31 +0100 Subject: [PATCH 10/18] PR feedback --- src/engine/SpatialJoinAlgorithms.cpp | 19 ++++++++++++++++--- src/engine/SpatialJoinAlgorithms.h | 2 +- test/engine/SpatialJoinAlgorithmsTest.cpp | 2 +- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index d3df1b6140..15a6bb3f48 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -416,10 +416,23 @@ void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) { } } +std::array SpatialJoinAlgorithms::isAPoleTouched( + const double& latitude) { + bool northPoleReached = false; + bool southPoleReached = false; + if (latitude >= 90) { + northPoleReached = true; + } + if (latitude <= -90) { + southPoleReached = true; + } + return std::array{northPoleReached, southPoleReached}; +} + // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto printWarning = []() { - LOG(WARN) + AD_LOG(AD_WARN) << "expected a point here, but no point is given. Skipping this point" << std::endl; }; @@ -452,7 +465,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { Point p(geopoint.value().getLng(), geopoint.value().getLat()); // add every point together with the row number into the rtree - rtree.insert(std::make_pair(p, i)); + rtree.insert(std::make_pair(std::move(p), i)); } for (size_t i = 0; i < otherResult->numRows(); i++) { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); @@ -466,7 +479,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { // query the other rtree for every point using the following bounding box std::vector bbox = computeBoundingBox(p); - std::vector results; + std::vector> results; std::ranges::for_each(bbox, [&](const Box& bbox) { rtree.query(bgi::intersects(bbox), std::back_inserter(results)); diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 34c7ca95e6..c2a447c8b0 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -107,6 +107,6 @@ class SpatialJoinAlgorithms { // convert coordinates to the usual ranges (-180 to 180 and -90 to 90) void convertToNormalCoordinates(BoostGeometryNamespace::Point& point); - // return whether one of the poles is beeing touched + // return whether one of the poles is being touched std::array isAPoleTouched(const double& latitude); }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 95f4d774a6..4c9ab579e2 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -803,7 +803,7 @@ using BoostGeometryNamespace::Value; // function, the other time move it slightly outside of the bounding box and // give 'shouldBeTrue' = false to the function. Do this for all edges. Note // that this function is not taking a set of boxes, as neighboring boxes would -// not work with this approach (slighly outside of one box can be inside the +// not work with this approach (slightly outside of one box can be inside the // neighboring box. For a set of boxes, check each box separately) void testBounds(double x, double y, const Box& bbox, bool shouldBeWithin) { // correct lon bounds if necessary From 9ffaa0aa7c34190a483b930c3bf56db23c19690c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 20 Nov 2024 12:15:44 +0100 Subject: [PATCH 11/18] Apply the suggestion by Hannah. Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.cpp | 106 ++++++++++++++++++++-------------------- src/util/Log.h | 14 +++--- 2 files changed, 60 insertions(+), 60 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index c49419e197..2d8c0fef32 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -245,7 +245,7 @@ IndexImpl::buildOspWithPatterns( secondSorter->clear(); // Add the `ql:has-pattern` predicate to the sorter such that it will become // part of the PSO and POS permutation. - AD_LOG(AD_INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() + AD_LOG_INFO << "Adding " << hasPatternPredicateSortedByPSO->size() << " triples to the POS and PSO permutation for " "the internal `ql:has-pattern` ..." << std::endl; @@ -297,7 +297,7 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // for a single input stream and forbidden for multiple input streams. if (parallelParsingSpecifiedViaJson.has_value()) { if (spec.size() == 1) { - AD_LOG(AD_WARN) + AD_LOG_WARN << "Parallel parsing set in the `.settings.json` file; this is " "deprecated, " << pleaseUseParallelParsingOption << std::endl; @@ -313,7 +313,7 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // on the command line, we set if implicitly for backward compatibility. if (!parallelParsingSpecifiedViaJson.has_value() && spec.size() == 1 && !spec.at(0).parseInParallelSetExplicitly_) { - AD_LOG(AD_WARN) + AD_LOG_WARN << "Implicitly using the parallel parser for a single input file " "for reasons of backward compatibility; this is deprecated, " << pleaseUseParallelParsingOption << std::endl; @@ -322,12 +322,12 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // For a single input stream, show the name and whether we parse in parallel. // For multiple input streams, only show the number of streams. if (spec.size() == 1) { - AD_LOG(AD_INFO) << "Parsing triples from single input stream " + AD_LOG_INFO << "Parsing triples from single input stream " << spec.at(0).filename_ << " (parallel = " << (spec.at(0).parseInParallel_ ? "true" : "false") << ") ..." << std::endl; } else { - AD_LOG(AD_INFO) << "Processing triples from " << spec.size() + AD_LOG_INFO << "Processing triples from " << spec.size() << " input streams ..." << std::endl; } } @@ -414,7 +414,7 @@ void IndexImpl::createFromFiles( addInternalStatisticsToConfiguration(numTriplesInternal, numPredicatesInternal); - AD_LOG(AD_INFO) << "Index build completed" << std::endl; + AD_LOG_INFO << "Index build completed" << std::endl; } // _____________________________________________________________________________ @@ -441,7 +441,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( ad_utility::Synchronized> idTriples( std::make_unique(onDiskBase_ + ".unsorted-triples.dat", 1_GB, allocator_)); - AD_LOG(AD_INFO) + AD_LOG_INFO << "Parsing input triples and creating partial vocabularies, one " "per batch ..." << std::endl; @@ -495,12 +495,12 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( } numTriplesProcessed++; if (progressBar.update()) { - AD_LOG(AD_INFO) << progressBar.getProgressString() << std::flush; + AD_LOG_INFO << progressBar.getProgressString() << std::flush; } } - AD_LOG(AD_TIMING) << "WaitTimes for Pipeline in msecs\n"; + AD_LOG_TIMING << "WaitTimes for Pipeline in msecs\n"; for (const auto& t : p.getWaitingTime()) { - AD_LOG(AD_TIMING) + AD_LOG_TIMING << std::chrono::duration_cast(t).count() << " msecs" << std::endl; } @@ -517,7 +517,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( if (writePartialVocabularyFuture[0].valid()) { writePartialVocabularyFuture[0].get(); } - AD_LOG(AD_TIMING) + AD_LOG_TIMING << "Time spent waiting for the writing of a previous vocabulary: " << sortFutureTimer.msecs().count() << "ms." << std::endl; auto moveMap = [](std::optional&& el) { @@ -540,13 +540,13 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( // ids actualPartialSizes.push_back(actualCurrentPartialSize); } - AD_LOG(AD_INFO) << progressBar.getFinalProgressString() << std::flush; + AD_LOG_INFO << progressBar.getFinalProgressString() << std::flush; for (auto& future : writePartialVocabularyFuture) { if (future.valid()) { future.get(); } } - AD_LOG(AD_INFO) + AD_LOG_INFO << "Number of triples created (including QLever-internal ones): " << (*idTriples.wlock())->size() << " [may contain duplicates]" << std::endl; @@ -554,7 +554,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( size_t sizeInternalVocabulary = 0; std::vector prefixes; - AD_LOG(AD_INFO) << "Merging partial vocabularies ..." << std::endl; + AD_LOG_INFO << "Merging partial vocabularies ..." << std::endl; const ad_utility::vocabulary_merger::VocabularyMetaData mergeRes = [&]() { auto sortPred = [cmp = &(vocab_.getCaseComparator())](std::string_view a, std::string_view b) { @@ -566,14 +566,14 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( onDiskBase_, numFiles, sortPred, wordCallback, memoryLimitIndexBuilding()); }(); - AD_LOG(AD_DEBUG) << "Finished merging partial vocabularies" << std::endl; + AD_LOG_DEBUG << "Finished merging partial vocabularies" << std::endl; IndexBuilderDataAsStxxlVector res; res.vocabularyMetaData_ = mergeRes; idOfHasPatternDuringIndexBuilding_ = mergeRes.specialIdMapping().at(HAS_PATTERN_PREDICATE); idOfInternalGraphDuringIndexBuilding_ = mergeRes.specialIdMapping().at(QLEVER_INTERNAL_GRAPH_IRI); - AD_LOG(AD_INFO) << "Number of words in external vocabulary: " + AD_LOG_INFO << "Number of words in external vocabulary: " << res.vocabularyMetaData_.numWordsTotal() - sizeInternalVocabulary << std::endl; @@ -581,7 +581,7 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( res.idTriples = std::move(*idTriples.wlock()); res.actualPartialSizes = std::move(actualPartialSizes); - AD_LOG(AD_DEBUG) << "Removing temporary files ..." << std::endl; + AD_LOG_DEBUG << "Removing temporary files ..." << std::endl; for (size_t n = 0; n < numFiles; ++n) { deleteTemporaryFile(absl::StrCat(onDiskBase_, PARTIAL_VOCAB_FILE_NAME, n)); } @@ -594,9 +594,9 @@ auto IndexImpl::convertPartialToGlobalIds( TripleVec& data, const vector& actualLinesPerPartial, size_t linesPerPartial, auto isQLeverInternalTriple) -> FirstPermutationSorterAndInternalTriplesAsPso { - AD_LOG(AD_INFO) << "Converting triples from local IDs to global IDs ..." + AD_LOG_INFO << "Converting triples from local IDs to global IDs ..." << std::endl; - AD_LOG(AD_DEBUG) << "Triples per partial vocabulary: " << linesPerPartial + AD_LOG_DEBUG << "Triples per partial vocabulary: " << linesPerPartial << std::endl; // Iterate over all partial vocabularies. @@ -665,7 +665,7 @@ auto IndexImpl::convertPartialToGlobalIds( numTriplesConverted += triples->size(); numTriplesConverted += internalTriples->size(); if (progressBar.update()) { - AD_LOG(AD_INFO) << progressBar.getProgressString() << std::flush; + AD_LOG_INFO << progressBar.getProgressString() << std::flush; } }; }; @@ -756,7 +756,7 @@ auto IndexImpl::convertPartialToGlobalIds( } lookupQueue.finish(); writeQueue.finish(); - AD_LOG(AD_INFO) << progressBar.getFinalProgressString() << std::flush; + AD_LOG_INFO << progressBar.getFinalProgressString() << std::flush; return {std::move(resultPtr), std::move(internalTriplesPtr)}; } @@ -816,7 +816,7 @@ std::tuple> configurationJson_; if (configurationJson_.find("git-hash") != configurationJson_.end()) { - AD_LOG(AD_INFO) << "The git hash used to build this index was " + AD_LOG_INFO << "The git hash used to build this index was " << configurationJson_["git-hash"] << std::endl; } else { - AD_LOG(AD_INFO) + AD_LOG_INFO << "The index was built before git commit hashes were stored in " "the index meta data" << std::endl; @@ -1019,7 +1019,7 @@ void IndexImpl::readConfiguration() { const auto& currentVersion = qlever::indexFormatVersion; if (indexFormatVersion != currentVersion) { if (indexFormatVersion.date_.toBits() > currentVersion.date_.toBits()) { - AD_LOG(AD_ERROR) + AD_LOG_ERROR << "The version of QLever you are using is too old for this " "index. Please use a version of QLever that is " "compatible with this index" @@ -1028,7 +1028,7 @@ void IndexImpl::readConfiguration() { << ", Date = " << indexFormatVersion.date_.toStringAndType().first << ")." << std::endl; } else { - AD_LOG(AD_ERROR) + AD_LOG_ERROR << "The index is too old for this version of QLever. " "We recommend that you rebuild the index and start the " "server with the current master. Alternatively start the " @@ -1042,7 +1042,7 @@ void IndexImpl::readConfiguration() { "Incompatible index format, see log message for details"}; } } else { - AD_LOG(AD_ERROR) + AD_LOG_ERROR << "This index was built before versioning was introduced for " "QLever's index format. Please rebuild your index using the " "current version of QLever." @@ -1058,7 +1058,7 @@ void IndexImpl::readConfiguration() { } if (configurationJson_.count("ignore-case")) { - AD_LOG(AD_ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; + AD_LOG_ERROR << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; throw std::runtime_error("Deprecated key \"ignore-case\" in index build"); } @@ -1069,7 +1069,7 @@ void IndexImpl::readConfiguration() { vocab_.setLocale(lang, country, ignorePunctuation); textVocab_.setLocale(lang, country, ignorePunctuation); } else { - AD_LOG(AD_ERROR) + AD_LOG_ERROR << "Key \"locale\" is missing in the metadata. This is probably " "and old index build that is no longer supported by QLever. " "Please rebuild your index\n"; @@ -1198,7 +1198,7 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { } if (j.count("ignore-case")) { - AD_LOG(AD_ERROR) << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; + AD_LOG_ERROR << ERROR_IGNORE_CASE_UNSUPPORTED << '\n'; throw std::runtime_error("Deprecated key \"ignore-case\" in settings JSON"); } @@ -1218,18 +1218,18 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { country = std::string{j["locale"]["country"]}; ignorePunctuation = bool{j["locale"]["ignore-punctuation"]}; } else { - AD_LOG(AD_INFO) + AD_LOG_INFO << "Locale was not specified in settings file, default is " "en_US" << std::endl; } - AD_LOG(AD_INFO) << "You specified \"locale = " << lang << "_" << country + AD_LOG_INFO << "You specified \"locale = " << lang << "_" << country << "\" " << "and \"ignore-punctuation = " << ignorePunctuation << "\"" << std::endl; if (lang != LOCALE_DEFAULT_LANG || country != LOCALE_DEFAULT_COUNTRY) { - AD_LOG(AD_WARN) + AD_LOG_WARN << "You are using Locale settings that differ from the default " "language or country.\n\t" << "This should work but is untested by the QLever team. If " @@ -1253,19 +1253,19 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { onlyAsciiTurtlePrefixes_ = static_cast(j["ascii-prefixes-only"]); } if (onlyAsciiTurtlePrefixes_) { - AD_LOG(AD_INFO) << WARNING_ASCII_ONLY_PREFIXES << std::endl; + AD_LOG_INFO << WARNING_ASCII_ONLY_PREFIXES << std::endl; } if (j.count("parallel-parsing")) { useParallelParser_ = static_cast(j["parallel-parsing"]); } if (useParallelParser_) { - AD_LOG(AD_INFO) << WARNING_PARALLEL_PARSING << std::endl; + AD_LOG_INFO << WARNING_PARALLEL_PARSING << std::endl; } if (j.count("num-triples-per-batch")) { numTriplesPerBatch_ = size_t{j["num-triples-per-batch"]}; - AD_LOG(AD_INFO) + AD_LOG_INFO << "You specified \"num-triples-per-batch = " << numTriplesPerBatch_ << "\", choose a lower value if the index builder runs out of memory" << std::endl; @@ -1273,7 +1273,7 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count("parser-batch-size")) { parserBatchSize_ = size_t{j["parser-batch-size"]}; - AD_LOG(AD_INFO) << "Overriding setting parser-batch-size to " + AD_LOG_INFO << "Overriding setting parser-batch-size to " << parserBatchSize_ << " This might influence performance during index build." << std::endl; @@ -1290,34 +1290,34 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count(key)) { auto value = static_cast(j[key]); if (value == overflowingIntegersThrow) { - AD_LOG(AD_INFO) + AD_LOG_INFO << "Integers that cannot be represented by QLever will throw " "an exception" << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; } else if (value == overflowingIntegersBecomeDoubles) { - AD_LOG(AD_INFO) + AD_LOG_INFO << "Integers that cannot be represented by QLever will be " "converted to doubles" << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else if (value == allIntegersBecomeDoubles) { - AD_LOG(AD_INFO) << "All integers will be converted to doubles" + AD_LOG_INFO << "All integers will be converted to doubles" << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else { AD_CONTRACT_CHECK(std::ranges::find(allModes, value) == allModes.end()); - AD_LOG(AD_ERROR) << "Invalid value for " << key << std::endl; - AD_LOG(AD_INFO) << "The currently supported values are " + AD_LOG_ERROR << "Invalid value for " << key << std::endl; + AD_LOG_INFO << "The currently supported values are " << absl::StrJoin(allModes, ",") << std::endl; } } else { turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; - AD_LOG(AD_INFO) + AD_LOG_INFO << "By default, integers that cannot be represented by QLever " "will throw an " "exception" @@ -1331,9 +1331,9 @@ std::future IndexImpl::writeNextPartialVocabulary( std::unique_ptr items, auto localIds, ad_utility::Synchronized>* globalWritePtr) { using namespace ad_utility::vocabulary_merger; - AD_LOG(AD_DEBUG) << "Input triples read in this section: " << numLines + AD_LOG_DEBUG << "Input triples read in this section: " << numLines << std::endl; - AD_LOG(AD_DEBUG) + AD_LOG_DEBUG << "Triples processed, also counting internal triples added by QLever: " << actualCurrentPartialSize << std::endl; std::future resultFuture; @@ -1362,7 +1362,7 @@ std::future IndexImpl::writeNextPartialVocabulary( ad_utility::TimeBlockAndLog l{"creating internal mapping"}; return createInternalMapping(&vec); }(); - AD_LOG(AD_TRACE) << "Finished creating of Mapping vocabulary" << std::endl; + AD_LOG_TRACE << "Finished creating of Mapping vocabulary" << std::endl; // since now adjacent duplicates also have the same Ids, it suffices to // compare those { @@ -1389,7 +1389,7 @@ std::future IndexImpl::writeNextPartialVocabulary( ad_utility::TimeBlockAndLog l{"write partial vocabulary"}; writePartialVocabularyToFile(vec, partialFilename); } - AD_LOG(AD_TRACE) << "Finished writing the partial vocabulary" << std::endl; + AD_LOG_TRACE << "Finished writing the partial vocabulary" << std::endl; vec.clear(); { ad_utility::TimeBlockAndLog l{"writing to global file"}; diff --git a/src/util/Log.h b/src/util/Log.h index 35a1a93822..687fd7cdb6 100644 --- a/src/util/Log.h +++ b/src/util/Log.h @@ -54,13 +54,13 @@ enum class LogLevel { // These should be used in new code to avoid clashes with `abseil` (similar to // `AD_LOG` vs `LOG`), see above for details. -static constexpr auto AD_FATAL = LogLevel::FATAL; -static constexpr auto AD_ERROR = LogLevel::ERROR; -static constexpr auto AD_WARN = LogLevel::WARN; -static constexpr auto AD_INFO = LogLevel::INFO; -static constexpr auto AD_DEBUG = LogLevel::DEBUG; -static constexpr auto AD_TIMING = LogLevel::TIMING; -static constexpr auto AD_TRACE = LogLevel::TRACE; +#define AD_LOG_FATAL AD_LOG(LogLevel::FATAL) +#define AD_LOG_ERROR AD_LOG(LogLevel::ERROR) +#define AD_LOG_WARN AD_LOG(LogLevel::WARN) +#define AD_LOG_INFO AD_LOG(LogLevel::INFO) +#define AD_LOG_DEBUG AD_LOG(LogLevel::DEBUG) +#define AD_LOG_TIMING AD_LOG(LogLevel::TIMING) +#define AD_LOG_TRACE AD_LOG(LogLevel::TRACE) using enum LogLevel; From 032eadeb2c7ce4695ead029756b64dfb140056c8 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 20 Nov 2024 12:21:22 +0100 Subject: [PATCH 12/18] Clang-format. Signed-off-by: Johannes Kalmbach --- src/index/IndexImpl.cpp | 99 +++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 53 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 2d8c0fef32..27ae9491ff 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -246,9 +246,9 @@ IndexImpl::buildOspWithPatterns( // Add the `ql:has-pattern` predicate to the sorter such that it will become // part of the PSO and POS permutation. AD_LOG_INFO << "Adding " << hasPatternPredicateSortedByPSO->size() - << " triples to the POS and PSO permutation for " - "the internal `ql:has-pattern` ..." - << std::endl; + << " triples to the POS and PSO permutation for " + "the internal `ql:has-pattern` ..." + << std::endl; static_assert(NumColumnsIndexBuilding == 4, "When adding additional payload columns, the following code " "has to be changed"); @@ -323,12 +323,12 @@ void IndexImpl::updateInputFileSpecificationsAndLog( // For multiple input streams, only show the number of streams. if (spec.size() == 1) { AD_LOG_INFO << "Parsing triples from single input stream " - << spec.at(0).filename_ << " (parallel = " - << (spec.at(0).parseInParallel_ ? "true" : "false") - << ") ..." << std::endl; + << spec.at(0).filename_ << " (parallel = " + << (spec.at(0).parseInParallel_ ? "true" : "false") << ") ..." + << std::endl; } else { AD_LOG_INFO << "Processing triples from " << spec.size() - << " input streams ..." << std::endl; + << " input streams ..." << std::endl; } } @@ -441,10 +441,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( ad_utility::Synchronized> idTriples( std::make_unique(onDiskBase_ + ".unsorted-triples.dat", 1_GB, allocator_)); - AD_LOG_INFO - << "Parsing input triples and creating partial vocabularies, one " - "per batch ..." - << std::endl; + AD_LOG_INFO << "Parsing input triples and creating partial vocabularies, one " + "per batch ..." + << std::endl; bool parserExhausted = false; // already count the numbers of triples that will be used for the language @@ -546,10 +545,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( future.get(); } } - AD_LOG_INFO - << "Number of triples created (including QLever-internal ones): " - << (*idTriples.wlock())->size() << " [may contain duplicates]" - << std::endl; + AD_LOG_INFO << "Number of triples created (including QLever-internal ones): " + << (*idTriples.wlock())->size() << " [may contain duplicates]" + << std::endl; size_t sizeInternalVocabulary = 0; std::vector prefixes; @@ -574,9 +572,9 @@ IndexBuilderDataAsStxxlVector IndexImpl::passFileForVocabulary( idOfInternalGraphDuringIndexBuilding_ = mergeRes.specialIdMapping().at(QLEVER_INTERNAL_GRAPH_IRI); AD_LOG_INFO << "Number of words in external vocabulary: " - << res.vocabularyMetaData_.numWordsTotal() - - sizeInternalVocabulary - << std::endl; + << res.vocabularyMetaData_.numWordsTotal() - + sizeInternalVocabulary + << std::endl; res.idTriples = std::move(*idTriples.wlock()); res.actualPartialSizes = std::move(actualPartialSizes); @@ -595,9 +593,9 @@ auto IndexImpl::convertPartialToGlobalIds( size_t linesPerPartial, auto isQLeverInternalTriple) -> FirstPermutationSorterAndInternalTriplesAsPso { AD_LOG_INFO << "Converting triples from local IDs to global IDs ..." - << std::endl; + << std::endl; AD_LOG_DEBUG << "Triples per partial vocabulary: " << linesPerPartial - << std::endl; + << std::endl; // Iterate over all partial vocabularies. auto resultPtr = @@ -817,7 +815,7 @@ IndexImpl::createPermutations(size_t numColumns, auto&& sortedTriples, const Permutation& p1, const Permutation& p2, auto&&... perTripleCallbacks) { AD_LOG_INFO << "Creating permutations " << p1.readableName() << " and " - << p2.readableName() << " ..." << std::endl; + << p2.readableName() << " ..." << std::endl; auto metaData = createPermutationPairImpl( numColumns, onDiskBase_ + ".index" + p1.fileSuffix(), onDiskBase_ + ".index" + p2.fileSuffix(), AD_FWD(sortedTriples), @@ -827,9 +825,9 @@ IndexImpl::createPermutations(size_t numColumns, auto&& sortedTriples, meta1.calculateStatistics(numDistinctCol0); meta2.calculateStatistics(numDistinctCol0); AD_LOG_INFO << "Statistics for " << p1.readableName() << ": " - << meta1.statistics() << std::endl; + << meta1.statistics() << std::endl; AD_LOG_INFO << "Statistics for " << p2.readableName() << ": " - << meta2.statistics() << std::endl; + << meta2.statistics() << std::endl; return metaData; } @@ -854,7 +852,7 @@ size_t IndexImpl::createPermutationPair(size_t numColumns, metaData.appendToFile(&f); }; AD_LOG_DEBUG << "Writing meta data for " << p1.readableName() << " and " - << p2.readableName() << " ..." << std::endl; + << p2.readableName() << " ..." << std::endl; writeMetadata(metaData1, p1); writeMetadata(metaData2, p2); return numDistinctC0; @@ -868,7 +866,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { globalSingletonComparator_ = &vocab_.getCaseComparator(); AD_LOG_DEBUG << "Number of words in internal and external vocabulary: " - << vocab_.size() << std::endl; + << vocab_.size() << std::endl; auto range1 = vocab_.prefixRanges(QLEVER_INTERNAL_PREFIX_IRI_WITHOUT_CLOSING_BRACKET); @@ -1004,7 +1002,7 @@ void IndexImpl::readConfiguration() { f >> configurationJson_; if (configurationJson_.find("git-hash") != configurationJson_.end()) { AD_LOG_INFO << "The git hash used to build this index was " - << configurationJson_["git-hash"] << std::endl; + << configurationJson_["git-hash"] << std::endl; } else { AD_LOG_INFO << "The index was built before git commit hashes were stored in " @@ -1218,15 +1216,14 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { country = std::string{j["locale"]["country"]}; ignorePunctuation = bool{j["locale"]["ignore-punctuation"]}; } else { - AD_LOG_INFO - << "Locale was not specified in settings file, default is " - "en_US" - << std::endl; + AD_LOG_INFO << "Locale was not specified in settings file, default is " + "en_US" + << std::endl; } AD_LOG_INFO << "You specified \"locale = " << lang << "_" << country - << "\" " - << "and \"ignore-punctuation = " << ignorePunctuation - << "\"" << std::endl; + << "\" " + << "and \"ignore-punctuation = " << ignorePunctuation << "\"" + << std::endl; if (lang != LOCALE_DEFAULT_LANG || country != LOCALE_DEFAULT_COUNTRY) { AD_LOG_WARN @@ -1274,9 +1271,9 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count("parser-batch-size")) { parserBatchSize_ = size_t{j["parser-batch-size"]}; AD_LOG_INFO << "Overriding setting parser-batch-size to " - << parserBatchSize_ - << " This might influence performance during index build." - << std::endl; + << parserBatchSize_ + << " This might influence performance during index build." + << std::endl; } std::string overflowingIntegersThrow = "overflowing-integers-throw"; @@ -1290,38 +1287,34 @@ void IndexImpl::readIndexBuilderSettingsFromFile() { if (j.count(key)) { auto value = static_cast(j[key]); if (value == overflowingIntegersThrow) { - AD_LOG_INFO - << "Integers that cannot be represented by QLever will throw " - "an exception" - << std::endl; + AD_LOG_INFO << "Integers that cannot be represented by QLever will throw " + "an exception" + << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; } else if (value == overflowingIntegersBecomeDoubles) { - AD_LOG_INFO - << "Integers that cannot be represented by QLever will be " - "converted to doubles" - << std::endl; + AD_LOG_INFO << "Integers that cannot be represented by QLever will be " + "converted to doubles" + << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else if (value == allIntegersBecomeDoubles) { - AD_LOG_INFO << "All integers will be converted to doubles" - << std::endl; + AD_LOG_INFO << "All integers will be converted to doubles" << std::endl; turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::OverflowingToDouble; } else { AD_CONTRACT_CHECK(std::ranges::find(allModes, value) == allModes.end()); AD_LOG_ERROR << "Invalid value for " << key << std::endl; AD_LOG_INFO << "The currently supported values are " - << absl::StrJoin(allModes, ",") << std::endl; + << absl::StrJoin(allModes, ",") << std::endl; } } else { turtleParserIntegerOverflowBehavior_ = TurtleParserIntegerOverflowBehavior::Error; - AD_LOG_INFO - << "By default, integers that cannot be represented by QLever " - "will throw an " - "exception" - << std::endl; + AD_LOG_INFO << "By default, integers that cannot be represented by QLever " + "will throw an " + "exception" + << std::endl; } } @@ -1332,7 +1325,7 @@ std::future IndexImpl::writeNextPartialVocabulary( ad_utility::Synchronized>* globalWritePtr) { using namespace ad_utility::vocabulary_merger; AD_LOG_DEBUG << "Input triples read in this section: " << numLines - << std::endl; + << std::endl; AD_LOG_DEBUG << "Triples processed, also counting internal triples added by QLever: " << actualCurrentPartialSize << std::endl; From 8285355a149665499f5c9c6220f9f9bc409bcefb Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Wed, 20 Nov 2024 13:14:14 +0100 Subject: [PATCH 13/18] merge log change and fix allocator --- src/engine/SpatialJoinAlgorithms.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 15a6bb3f48..43c29844e4 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -432,7 +432,7 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto printWarning = []() { - AD_LOG(AD_WARN) + AD_LOG_WARN << "expected a point here, but no point is given. Skipping this point" << std::endl; }; @@ -479,7 +479,8 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { // query the other rtree for every point using the following bounding box std::vector bbox = computeBoundingBox(p); - std::vector> results; + std::vector> results{ + qec_->getAllocator()}; std::ranges::for_each(bbox, [&](const Box& bbox) { rtree.query(bgi::intersects(bbox), std::back_inserter(results)); From 56e16183f84fb33f145eb57c05ad92985a2f5184 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Wed, 20 Nov 2024 23:45:15 +0100 Subject: [PATCH 14/18] PR feedback --- src/engine/SpatialJoin.h | 2 +- src/engine/SpatialJoinAlgorithms.cpp | 47 +++++++++++++---------- src/engine/SpatialJoinAlgorithms.h | 18 ++++----- test/engine/SpatialJoinAlgorithmsTest.cpp | 2 +- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/src/engine/SpatialJoin.h b/src/engine/SpatialJoin.h index 35d2701379..b31b1b8866 100644 --- a/src/engine/SpatialJoin.h +++ b/src/engine/SpatialJoin.h @@ -141,5 +141,5 @@ class SpatialJoin : public Operation { // between the two objects bool addDistToResult_ = true; const string nameDistanceInternal_ = "?distOfTheTwoObjectsAddedInternally"; - Algorithm algorithm_ = Algorithm::BoundingBox; + Algorithm algorithm_ = Algorithm::S2Geometry; }; diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 43c29844e4..4345bd7805 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -6,7 +6,7 @@ #include #include -#include "math.h" +#include #include "util/GeoSparqlHelpers.h" using namespace BoostGeometryNamespace; @@ -232,9 +232,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( const Point& startPoint) { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; - if (!maxDist.has_value()) { - AD_THROW("Max distance must have a value for this operation"); - } + AD_CORRECTNESS_CHECK(maxDist.has_value(), "Max distance must have a value for this operation"); // haversine function auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; }; @@ -258,7 +256,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( // a single bounding box for the whole planet, do an optimized version if (static_cast(maxDist.value()) > circumferenceMax_ / 4.0 && static_cast(maxDist.value()) < circumferenceMax_ / 2.01) { - return computeUsingAntiBoundingBox(startPoint); + return computeBoundingBoxForLargeDistances(startPoint); } // compute latitude bound @@ -311,13 +309,11 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( } // ____________________________________________________________________________ -std::vector SpatialJoinAlgorithms::computeUsingAntiBoundingBox( +std::vector SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances( const Point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; - if (!maxDist.has_value()) { - AD_THROW("Max distance must have a value for this operation"); - } + AD_CORRECTNESS_CHECK(maxDist.has_value(), "Max distance must have a value for this operation"); // point on the opposite side of the globe Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); @@ -391,17 +387,17 @@ std::vector SpatialJoinAlgorithms::computeUsingAntiBoundingBox( } // ____________________________________________________________________________ -bool SpatialJoinAlgorithms::containedInBoundingBoxes( - const std::vector& bbox, Point point1) { - convertToNormalCoordinates(point1); +bool SpatialJoinAlgorithms::isContainedInBoundingBoxes( + const std::vector& boundingBox, Point point) { + convertToNormalCoordinates(point); - return std::ranges::any_of(bbox, [point1](const Box& aBox) { - return boost::geometry::covered_by(point1, aBox); + return std::ranges::any_of(boundingBox, [point](const Box& aBox) { + return boost::geometry::covered_by(point, aBox); }); } // ____________________________________________________________________________ -void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) { +void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) const { // correct lon and lat bounds if necessary while (point.get<0>() < -180) { point.set<0>(point.get<0>() + 360); @@ -416,8 +412,9 @@ void SpatialJoinAlgorithms::convertToNormalCoordinates(Point& point) { } } +// ____________________________________________________________________________ std::array SpatialJoinAlgorithms::isAPoleTouched( - const double& latitude) { + const double& latitude) const { bool northPoleReached = false; bool southPoleReached = false; if (latitude >= 90) { @@ -431,11 +428,17 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { - auto printWarning = []() { - AD_LOG_WARN - << "expected a point here, but no point is given. Skipping this point" - << std::endl; + bool alreadyWarned = false; + + auto printWarning = [&]() { + if (!alreadyWarned) { + AD_LOG_WARN + << "expected a point here, but no point is given. Skipping this point and future 'non points' of this query" + << std::endl; + alreadyWarned = true; + } }; + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; IdTable result{numColumns, qec_->getAllocator()}; @@ -453,6 +456,10 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { } bgi::rtree> rtree; + // bgi::rtree, index::indexable, index::equal_to, ad_utility::AllocatorWithLimit> rtree(parameters_type(), indexable_getter(), value_equal(), qec_->getAllocator()); + // bgi::rtree, ad_utility::AllocatorWithLimit> rtree{qec_->getAllocator()}; + // bgi::rtree> dummyRtree; + // bgi::rtree, ad_utility::AllocatorWithLimit> rtree(dummyRtree, qec_->getAllocator()); for (size_t i = 0; i < smallerResult->numRows(); i++) { // get point of row i auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index c2a447c8b0..886b1bc583 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -35,9 +35,9 @@ class SpatialJoinAlgorithms { } bool OnlyForTestingWrapperContainedInBoundingBoxes( - const std::vector& bbox, - const BoostGeometryNamespace::Point& point1) { - return containedInBoundingBoxes(bbox, point1); + const std::vector& boundingBox, + const BoostGeometryNamespace::Point& point) { + return isContainedInBoundingBoxes(boundingBox, point); } private: @@ -80,14 +80,14 @@ class SpatialJoinAlgorithms { // gets used, when the usual procedure, would just result in taking a big // bounding box, which covers the whole planet (so for extremely large max // distances) - std::vector computeUsingAntiBoundingBox( + std::vector computeBoundingBoxForLargeDistances( const BoostGeometryNamespace::Point& startPoint) const; // This function returns true, iff the given point is contained in any of the // bounding boxes - bool containedInBoundingBoxes( - const std::vector& bbox, - BoostGeometryNamespace::Point point1); + bool isContainedInBoundingBoxes( + const std::vector& boundingBox, + BoostGeometryNamespace::Point point); QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; @@ -105,8 +105,8 @@ class SpatialJoinAlgorithms { static constexpr double radius_ = 6'378'000; // convert coordinates to the usual ranges (-180 to 180 and -90 to 90) - void convertToNormalCoordinates(BoostGeometryNamespace::Point& point); + void convertToNormalCoordinates(BoostGeometryNamespace::Point& point) const; // return whether one of the poles is being touched - std::array isAPoleTouched(const double& latitude); + std::array isAPoleTouched(const double& latitude) const; }; diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 4c9ab579e2..4ddcc67b13 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -934,7 +934,7 @@ TEST(SpatialJoin, computeBoundingBox) { } } -TEST(SpatialJoin, containedInBoundingBoxes) { +TEST(SpatialJoin, isContainedInBoundingBoxes) { // build dummy join to access the containedInBoundingBox and // computeBoundingBox functions auto qec = buildTestQEC(); From 7d8b98b5e9827e69339cdbdd44ab7b053e3cf324 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Thu, 21 Nov 2024 19:10:29 +0100 Subject: [PATCH 15/18] pr feedback --- src/engine/SpatialJoinAlgorithms.cpp | 14 ++-- test/engine/SpatialJoinAlgorithmsTest.cpp | 81 +++++++++++++++++++++-- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 4345bd7805..d01ac1642c 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -455,11 +455,12 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { std::swap(smallerResJoinCol, otherResJoinCol); } - bgi::rtree> rtree; - // bgi::rtree, index::indexable, index::equal_to, ad_utility::AllocatorWithLimit> rtree(parameters_type(), indexable_getter(), value_equal(), qec_->getAllocator()); + // bgi::rtree> rtree; + bgi::rtree, bgi::indexable, bgi::equal_to, ad_utility::AllocatorWithLimit> + rtree(bgi::quadratic<16>{}, bgi::indexable{}, bgi::equal_to{}, qec_->getAllocator()); // bgi::rtree, ad_utility::AllocatorWithLimit> rtree{qec_->getAllocator()}; - // bgi::rtree> dummyRtree; - // bgi::rtree, ad_utility::AllocatorWithLimit> rtree(dummyRtree, qec_->getAllocator()); + //bgi::rtree> dummyRtree; + //bgi::rtree, ad_utility::AllocatorWithLimit> rtree(dummyRtree, qec_->getAllocator()); for (size_t i = 0; i < smallerResult->numRows(); i++) { // get point of row i auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); @@ -474,6 +475,8 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { // add every point together with the row number into the rtree rtree.insert(std::make_pair(std::move(p), i)); } + std::vector> results{ + qec_->getAllocator()}; for (size_t i = 0; i < otherResult->numRows(); i++) { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); @@ -486,8 +489,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { // query the other rtree for every point using the following bounding box std::vector bbox = computeBoundingBox(p); - std::vector> results{ - qec_->getAllocator()}; + results.clear(); std::ranges::for_each(bbox, [&](const Box& bbox) { rtree.query(bgi::intersects(bbox), std::back_inserter(results)); diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 4ddcc67b13..a203b2b849 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -193,15 +193,13 @@ class SpatialJoinParamTest auto qec = buildTestQEC(); auto numTriples = qec->getIndex().numTriples().normal; ASSERT_EQ(numTriples, 15); - // ========================= build big child - // ================================= + // ========================= build big child ============================= auto bigChild = buildMediumChild( qec, {"?obj1", std::string{""}, "?name1"}, {"?obj1", std::string{""}, "?geo1"}, {"?geo1", std::string{""}, "?point1"}, "?obj1", "?geo1"); - // ========================= build small child - // =============================== + // ========================= build small child =========================== TripleComponent point2{Variable{"?point2"}}; auto smallChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); @@ -219,6 +217,33 @@ class SpatialJoinParamTest columnNames); } + void testDiffSizeIdTables(std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames, bool bigChildLeft) { + auto qec = buildTestQEC(); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build small input ================================= + TripleComponent point1{Variable{"?point1"}}; + TripleComponent subject{ad_utility::triple_component::Iri::fromIriref("")}; + auto smallChild = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, SparqlTriple{subject, std::string{""}, point1}); + // ====================== build big input ================================== + auto bigChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + auto firstChild = bigChildLeft ? bigChild : smallChild; + auto secondChild = bigChildLeft ? smallChild : bigChild; + auto firstVariable = + bigChildLeft ? TripleComponent{Variable{"?point2"}} : point1; + auto secondVariable = + bigChildLeft ? point1 : TripleComponent{Variable{"?point2"}}; + + createAndTestSpatialJoin( + qec, SparqlTriple{firstVariable, specialPredicate, secondVariable}, + firstChild, secondChild, addLeftChildFirst, expectedOutput, + columnNames); +} + protected: bool useBaselineAlgorithm_; }; @@ -566,6 +591,27 @@ std::vector> expectedMaxDist10000000_rows_diff{ mergeToRow(Eif, sEye, expectedDistEyeEif), mergeToRow(Eif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) +}; + +std::vector> expectedMaxDist5000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun) +}; + +std::vector> expectedMaxDist500000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) +}; + +std::vector> expectedMaxDist1000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) +}; + +std::vector> expectedMaxDist10000000_rows_diffIDTable{ + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) +}; + std::vector> expectedNearestNeighbors1{ mergeToRow(TF, TF, expectedDistSelf), mergeToRow(Mun, Mun, expectedDistSelf), @@ -781,6 +827,33 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { expectedMaxDist10000000_rows_diff, columnNames, false); } +TEST_P(SpatialJoinParamTest, diffSizeIdTables) { + std::vector columnNames{"?point1", + "?obj2", + "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + /*testDiffSizeIdTables("", true, expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, expectedMaxDist5000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, expectedMaxDist5000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, expectedMaxDist500000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, expectedMaxDist500000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, expectedMaxDist500000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, expectedMaxDist500000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, expectedMaxDist1000000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, expectedMaxDist1000000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, expectedMaxDist1000000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, expectedMaxDist1000000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, expectedMaxDist10000000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, expectedMaxDist10000000_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, expectedMaxDist10000000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, expectedMaxDist10000000_rows_diffIDTable, columnNames, false);*/ +} + INSTANTIATE_TEST_SUITE_P( SpatialJoin, SpatialJoinParamTest, ::testing::Values(SpatialJoin::Algorithm::Baseline, From 89491a1fd5aef96b0d07638269750af6be10138a Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Thu, 21 Nov 2024 21:52:34 +0100 Subject: [PATCH 16/18] PR feedback --- src/engine/SpatialJoinAlgorithms.cpp | 29 ++--- test/engine/SpatialJoinAlgorithmsTest.cpp | 145 ++++++++++++++-------- 2 files changed, 108 insertions(+), 66 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index d01ac1642c..8f455156f3 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -7,6 +7,7 @@ #include #include + #include "util/GeoSparqlHelpers.h" using namespace BoostGeometryNamespace; @@ -232,7 +233,8 @@ std::vector SpatialJoinAlgorithms::computeBoundingBox( const Point& startPoint) { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; - AD_CORRECTNESS_CHECK(maxDist.has_value(), "Max distance must have a value for this operation"); + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); // haversine function auto haversine = [](double theta) { return (1 - std::cos(theta)) / 2; }; @@ -313,7 +315,8 @@ std::vector SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances( const Point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; - AD_CORRECTNESS_CHECK(maxDist.has_value(), "Max distance must have a value for this operation"); + AD_CORRECTNESS_CHECK(maxDist.has_value(), + "Max distance must have a value for this operation"); // point on the opposite side of the globe Point antiPoint(startPoint.get<0>() + 180, startPoint.get<1>() * -1); @@ -429,16 +432,16 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { bool alreadyWarned = false; - + auto printWarning = [&]() { if (!alreadyWarned) { - AD_LOG_WARN - << "expected a point here, but no point is given. Skipping this point and future 'non points' of this query" - << std::endl; + AD_LOG_WARN << "expected a point here, but no point is given. Skipping " + "this point and future 'non points' of this query" + << std::endl; alreadyWarned = true; } }; - + const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; IdTable result{numColumns, qec_->getAllocator()}; @@ -455,12 +458,10 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { std::swap(smallerResJoinCol, otherResJoinCol); } - // bgi::rtree> rtree; - bgi::rtree, bgi::indexable, bgi::equal_to, ad_utility::AllocatorWithLimit> - rtree(bgi::quadratic<16>{}, bgi::indexable{}, bgi::equal_to{}, qec_->getAllocator()); - // bgi::rtree, ad_utility::AllocatorWithLimit> rtree{qec_->getAllocator()}; - //bgi::rtree> dummyRtree; - //bgi::rtree, ad_utility::AllocatorWithLimit> rtree(dummyRtree, qec_->getAllocator()); + bgi::rtree, bgi::indexable, + bgi::equal_to, ad_utility::AllocatorWithLimit> + rtree(bgi::quadratic<16>{}, bgi::indexable{}, + bgi::equal_to{}, qec_->getAllocator()); for (size_t i = 0; i < smallerResult->numRows(); i++) { // get point of row i auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); @@ -476,7 +477,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { rtree.insert(std::make_pair(std::move(p), i)); } std::vector> results{ - qec_->getAllocator()}; + qec_->getAllocator()}; for (size_t i = 0; i < otherResult->numRows(); i++) { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index a203b2b849..02c88b70c0 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -217,32 +217,37 @@ class SpatialJoinParamTest columnNames); } - void testDiffSizeIdTables(std::string specialPredicate, bool addLeftChildFirst, + void testDiffSizeIdTables( + std::string specialPredicate, bool addLeftChildFirst, std::vector> expectedOutput, std::vector columnNames, bool bigChildLeft) { - auto qec = buildTestQEC(); - auto numTriples = qec->getIndex().numTriples().normal; - ASSERT_EQ(numTriples, 15); - // ====================== build small input ================================= - TripleComponent point1{Variable{"?point1"}}; - TripleComponent subject{ad_utility::triple_component::Iri::fromIriref("")}; - auto smallChild = ad_utility::makeExecutionTree( - qec, Permutation::Enum::PSO, SparqlTriple{subject, std::string{""}, point1}); - // ====================== build big input ================================== - auto bigChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); - - auto firstChild = bigChildLeft ? bigChild : smallChild; - auto secondChild = bigChildLeft ? smallChild : bigChild; - auto firstVariable = + auto qec = buildTestQEC(); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build small input + // ================================= + TripleComponent point1{Variable{"?point1"}}; + TripleComponent subject{ + ad_utility::triple_component::Iri::fromIriref("")}; + auto smallChild = ad_utility::makeExecutionTree( + qec, Permutation::Enum::PSO, + SparqlTriple{subject, std::string{""}, point1}); + // ====================== build big input ================================== + auto bigChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + auto firstChild = bigChildLeft ? bigChild : smallChild; + auto secondChild = bigChildLeft ? smallChild : bigChild; + auto firstVariable = bigChildLeft ? TripleComponent{Variable{"?point2"}} : point1; auto secondVariable = bigChildLeft ? point1 : TripleComponent{Variable{"?point2"}}; - - createAndTestSpatialJoin( + + createAndTestSpatialJoin( qec, SparqlTriple{firstVariable, specialPredicate, secondVariable}, firstChild, secondChild, addLeftChildFirst, expectedOutput, columnNames); -} + } protected: bool useBaselineAlgorithm_; @@ -592,25 +597,29 @@ std::vector> expectedMaxDist10000000_rows_diff{ mergeToRow(Eif, sLib, expectedDistEifLib)}; std::vector> expectedMaxDist1_rows_diffIDTable{ - mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) -}; + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf)}; std::vector> expectedMaxDist5000_rows_diffIDTable{ - mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), - mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun) -}; + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun)}; std::vector> expectedMaxDist500000_rows_diffIDTable{ - mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) -}; + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif)}; std::vector> expectedMaxDist1000000_rows_diffIDTable{ - mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) -}; + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye)}; std::vector> expectedMaxDist10000000_rows_diffIDTable{ - mergeToRow({sTF.at(1)}, sTF, expectedDistSelf) -}; + mergeToRow({sTF.at(1)}, sTF, expectedDistSelf), + mergeToRow({sTF.at(1)}, sMun, expectedDistUniMun), + mergeToRow({sTF.at(1)}, sEif, expectedDistUniEif), + mergeToRow({sTF.at(1)}, sEye, expectedDistUniEye), + mergeToRow({sTF.at(1)}, sLib, expectedDistUniLib)}; std::vector> expectedNearestNeighbors1{ mergeToRow(TF, TF, expectedDistSelf), @@ -828,30 +837,62 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { } TEST_P(SpatialJoinParamTest, diffSizeIdTables) { - std::vector columnNames{"?point1", - "?obj2", - "?point2", + std::vector columnNames{"?point1", "?obj2", "?point2", "?distOfTheTwoObjectsAddedInternally"}; - /*testDiffSizeIdTables("", true, expectedMaxDist1_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", true, expectedMaxDist1_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", false, expectedMaxDist1_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", false, expectedMaxDist1_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", true, expectedMaxDist5000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", true, expectedMaxDist5000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", false, expectedMaxDist5000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", false, expectedMaxDist5000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", true, expectedMaxDist500000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", true, expectedMaxDist500000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", false, expectedMaxDist500000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", false, expectedMaxDist500000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", true, expectedMaxDist1000000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", true, expectedMaxDist1000000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", false, expectedMaxDist1000000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", false, expectedMaxDist1000000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", true, expectedMaxDist10000000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", true, expectedMaxDist10000000_rows_diffIDTable, columnNames, false); - testDiffSizeIdTables("", false, expectedMaxDist10000000_rows_diffIDTable, columnNames, true); - testDiffSizeIdTables("", false, expectedMaxDist10000000_rows_diffIDTable, columnNames, false);*/ + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist1_rows_diffIDTable, columnNames, false); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", true, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, true); + testDiffSizeIdTables("", false, + expectedMaxDist5000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist500000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist1000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", true, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + true); + testDiffSizeIdTables("", false, + expectedMaxDist10000000_rows_diffIDTable, columnNames, + false); } INSTANTIATE_TEST_SUITE_P( From 23f2ed2ff9a11913a712ea5919ce69361be6dd64 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Fri, 22 Nov 2024 17:43:05 +0100 Subject: [PATCH 17/18] PR feedback --- src/engine/SpatialJoin.cpp | 7 +- src/engine/SpatialJoinAlgorithms.cpp | 43 ++++--- src/engine/SpatialJoinAlgorithms.h | 12 +- test/engine/SpatialJoinAlgorithmsTest.cpp | 143 +++++++++++++++++++++- 4 files changed, 174 insertions(+), 31 deletions(-) diff --git a/src/engine/SpatialJoin.cpp b/src/engine/SpatialJoin.cpp index 4bf1eb372d..1ad7d8179a 100644 --- a/src/engine/SpatialJoin.cpp +++ b/src/engine/SpatialJoin.cpp @@ -336,12 +336,13 @@ PreparedSpatialJoinParams SpatialJoin::prepareJoin() const { // ____________________________________________________________________________ Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) { SpatialJoinAlgorithms algorithms{_executionContext, prepareJoin(), - addDistToResult_, config_}; + addDistToResult_, config_, this}; if (algorithm_ == Algorithm::Baseline) { return algorithms.BaselineAlgorithm(); } else if (algorithm_ == Algorithm::S2Geometry) { return algorithms.S2geometryAlgorithm(); - } else if (algorithm_ == Algorithm::BoundingBox) { + } else { + AD_CORRECTNESS_CHECK(algorithm_ == Algorithm::BoundingBox); // as the BoundingBoxAlgorithms only works for max distance and not for // nearest neighbors, S2geometry gets called as a backup, if the query is // asking for the nearest neighbors @@ -350,8 +351,6 @@ Result SpatialJoin::computeResult([[maybe_unused]] bool requestLaziness) { } else { return algorithms.S2geometryAlgorithm(); } - } else { - AD_THROW("choose a valid Algorithm"); } } diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index 8f455156f3..f036be5e07 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -16,11 +16,13 @@ using namespace BoostGeometryNamespace; SpatialJoinAlgorithms::SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config) + std::variant config, + std::optional spatialJoin) : qec_{qec}, params_{std::move(params)}, addDistToResult_{addDistToResult}, - config_{std::move(config)} {} + config_{std::move(config)}, + spatialJoin_{spatialJoin} {} // ____________________________________________________________________________ std::optional SpatialJoinAlgorithms::getPoint(const IdTable* restable, @@ -230,7 +232,7 @@ Result SpatialJoinAlgorithms::S2geometryAlgorithm() { // ____________________________________________________________________________ std::vector SpatialJoinAlgorithms::computeBoundingBox( - const Point& startPoint) { + const Point& startPoint) const { const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; AD_CORRECTNESS_CHECK(maxDist.has_value(), @@ -391,7 +393,7 @@ std::vector SpatialJoinAlgorithms::computeBoundingBoxForLargeDistances( // ____________________________________________________________________________ bool SpatialJoinAlgorithms::isContainedInBoundingBoxes( - const std::vector& boundingBox, Point point) { + const std::vector& boundingBox, Point point) const { convertToNormalCoordinates(point); return std::ranges::any_of(boundingBox, [point](const Box& aBox) { @@ -431,16 +433,21 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { - bool alreadyWarned = false; - - auto printWarning = [&]() { - if (!alreadyWarned) { - AD_LOG_WARN << "expected a point here, but no point is given. Skipping " - "this point and future 'non points' of this query" - << std::endl; - alreadyWarned = true; - } - }; + auto printWarning = + [alreadyWarned = false](std::optional spatialJoin) mutable { + if (!alreadyWarned) { + std::string warning = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + AD_LOG_WARN << warning << std::endl; + alreadyWarned = true; + if (spatialJoin) { + spatialJoin.value()->addWarning(warning); + } + } + }; const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; @@ -467,7 +474,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); if (!geopoint) { - printWarning(); + printWarning(spatialJoin_); continue; } @@ -482,7 +489,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); if (!geopoint1) { - printWarning(); + printWarning(spatialJoin_); continue; } @@ -504,8 +511,8 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { } auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, leftJoinCol, rightJoinCol); - if (distance.getDatatype() == Datatype::Int && - distance.getInt() <= maxDist) { + if (distance.getInt() <= maxDist) { + AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int); addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, rowRight, distance); } diff --git a/src/engine/SpatialJoinAlgorithms.h b/src/engine/SpatialJoinAlgorithms.h index 886b1bc583..0719b769b4 100644 --- a/src/engine/SpatialJoinAlgorithms.h +++ b/src/engine/SpatialJoinAlgorithms.h @@ -23,20 +23,21 @@ class SpatialJoinAlgorithms { SpatialJoinAlgorithms( QueryExecutionContext* qec, PreparedSpatialJoinParams params, bool addDistToResult, - std::variant config); + std::variant config, + std::optional spatialJoin = std::nullopt); Result BaselineAlgorithm(); Result S2geometryAlgorithm(); Result BoundingBoxAlgorithm(); std::vector OnlyForTestingWrapperComputeBoundingBox( - const BoostGeometryNamespace::Point& startPoint) { + const BoostGeometryNamespace::Point& startPoint) const { return computeBoundingBox(startPoint); } bool OnlyForTestingWrapperContainedInBoundingBoxes( const std::vector& boundingBox, - const BoostGeometryNamespace::Point& point) { + const BoostGeometryNamespace::Point& point) const { return isContainedInBoundingBoxes(boundingBox, point); } @@ -70,7 +71,7 @@ class SpatialJoinAlgorithms { // the right, which when seen on the sphere look like a single box, but on the // map and in the internal representation it looks like two/more boxes) std::vector computeBoundingBox( - const BoostGeometryNamespace::Point& startPoint); + const BoostGeometryNamespace::Point& startPoint) const; // This helper function calculates the bounding boxes based on a box, where // definitely no match can occur. This means every element in the anti @@ -87,12 +88,13 @@ class SpatialJoinAlgorithms { // bounding boxes bool isContainedInBoundingBoxes( const std::vector& boundingBox, - BoostGeometryNamespace::Point point); + BoostGeometryNamespace::Point point) const; QueryExecutionContext* qec_; PreparedSpatialJoinParams params_; bool addDistToResult_; std::variant config_; + std::optional spatialJoin_; // circumference in meters at the equator (max) and the pole (min) (as the // earth is not exactly a sphere the circumference is different. Note that diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index 02c88b70c0..d8fdf2d219 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -33,7 +33,8 @@ class SpatialJoinParamTest std::shared_ptr leftChild, std::shared_ptr rightChild, bool addLeftChildFirst, std::vector> expectedOutputUnorderedRows, - std::vector columnNames) { + std::vector columnNames, + bool isWrongPointInputTest = false) { // this function is like transposing a matrix. An entry which has been // stored at (i, k) is now stored at (k, i). The reason this is needed is // the following: this function receives the input as a vector of vector of @@ -107,6 +108,23 @@ class SpatialJoinParamTest }*/ EXPECT_THAT(vec, ::testing::UnorderedElementsAreArray(expectedOutput)); + + if (isWrongPointInputTest && + GetParam() == SpatialJoin::Algorithm::BoundingBox) { + auto warnings = spatialJoin->collectWarnings(); + bool containsWrongPointWarning = false; + std::string warningMessage = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + for (const auto& warning : warnings) { + if (warning == warningMessage) { + containsWrongPointWarning = true; + } + } + ASSERT_TRUE(containsWrongPointWarning); + } } // build the test using the small dataset. Let the SpatialJoin operation be @@ -224,15 +242,14 @@ class SpatialJoinParamTest auto qec = buildTestQEC(); auto numTriples = qec->getIndex().numTriples().normal; ASSERT_EQ(numTriples, 15); - // ====================== build small input - // ================================= + // ====================== build small input ============================== TripleComponent point1{Variable{"?point1"}}; TripleComponent subject{ ad_utility::triple_component::Iri::fromIriref("")}; auto smallChild = ad_utility::makeExecutionTree( qec, Permutation::Enum::PSO, SparqlTriple{subject, std::string{""}, point1}); - // ====================== build big input ================================== + // ====================== build big input ================================ auto bigChild = buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); @@ -249,6 +266,33 @@ class SpatialJoinParamTest columnNames); } + void testWrongPointInInput( + std::string specialPredicate, bool addLeftChildFirst, + std::vector> expectedOutput, + std::vector columnNames) { + auto kg = createSmallDatasetWithPoints(); + // make first point wrong: + auto pos = kg.find("POINT("); + kg = kg.insert(pos + 7, "wrongStuff"); + + ad_utility::MemorySize blocksizePermutations = 128_MB; + auto qec = ad_utility::testing::getQec(kg, true, true, false, + blocksizePermutations, false); + auto numTriples = qec->getIndex().numTriples().normal; + ASSERT_EQ(numTriples, 15); + // ====================== build inputs ================================ + TripleComponent point1{Variable{"?point1"}}; + TripleComponent point2{Variable{"?point2"}}; + auto leftChild = + buildIndexScan(qec, {"?obj1", std::string{""}, "?point1"}); + auto rightChild = + buildIndexScan(qec, {"?obj2", std::string{""}, "?point2"}); + + createAndTestSpatialJoin( + qec, SparqlTriple{point1, specialPredicate, point2}, leftChild, + rightChild, addLeftChildFirst, expectedOutput, columnNames, true); + } + protected: bool useBaselineAlgorithm_; }; @@ -518,6 +562,63 @@ std::vector> expectedMaxDist10000000_rows_small{ mergeToRow(sEif, sEye, expectedDistEyeEif), mergeToRow(sEif, sLib, expectedDistEifLib)}; +std::vector> expectedMaxDist1_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), +}; + +std::vector> + expectedMaxDist5000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf)}; + +std::vector> + expectedMaxDist500000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist1000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif)}; + +std::vector> + expectedMaxDist10000000_rows_small_wrong_point{ + mergeToRow(sMun, sMun, expectedDistSelf), + mergeToRow(sMun, sEif, expectedDistMunEif), + mergeToRow(sMun, sEye, expectedDistMunEye), + mergeToRow(sMun, sLib, expectedDistMunLib), + mergeToRow(sEye, sEye, expectedDistSelf), + mergeToRow(sEye, sEif, expectedDistEyeEif), + mergeToRow(sEye, sMun, expectedDistMunEye), + mergeToRow(sEye, sLib, expectedDistEyeLib), + mergeToRow(sLib, sLib, expectedDistSelf), + mergeToRow(sLib, sMun, expectedDistMunLib), + mergeToRow(sLib, sEye, expectedDistEyeLib), + mergeToRow(sLib, sEif, expectedDistEifLib), + mergeToRow(sEif, sEif, expectedDistSelf), + mergeToRow(sEif, sMun, expectedDistMunEif), + mergeToRow(sEif, sEye, expectedDistEyeEif), + mergeToRow(sEif, sLib, expectedDistEifLib)}; + std::vector> expectedMaxDist1_rows_diff{ mergeToRow(TF, sTF, expectedDistSelf), mergeToRow(Mun, sMun, expectedDistSelf), @@ -895,6 +996,40 @@ TEST_P(SpatialJoinParamTest, diffSizeIdTables) { false); } +TEST_P(SpatialJoinParamTest, wrongPointInInput) { + // expected behavior: point is skipped + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + testWrongPointInInput("", true, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", false, + expectedMaxDist1_rows_small_wrong_point, columnNames); + testWrongPointInInput("", true, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist5000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist500000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist1000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", true, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); + testWrongPointInInput("", false, + expectedMaxDist10000000_rows_small_wrong_point, + columnNames); +} + INSTANTIATE_TEST_SUITE_P( SpatialJoin, SpatialJoinParamTest, ::testing::Values(SpatialJoin::Algorithm::Baseline, From ce5db4535915155db99975a69b27b615a2f77842 Mon Sep 17 00:00:00 2001 From: Jonathan Zeller Date: Sun, 24 Nov 2024 11:42:45 +0100 Subject: [PATCH 18/18] MR feedback --- src/engine/SpatialJoinAlgorithms.cpp | 37 ++++++++++++----------- test/engine/SpatialJoinAlgorithmsTest.cpp | 36 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/engine/SpatialJoinAlgorithms.cpp b/src/engine/SpatialJoinAlgorithms.cpp index f036be5e07..78eec1e2a2 100644 --- a/src/engine/SpatialJoinAlgorithms.cpp +++ b/src/engine/SpatialJoinAlgorithms.cpp @@ -433,21 +433,22 @@ std::array SpatialJoinAlgorithms::isAPoleTouched( // ____________________________________________________________________________ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { - auto printWarning = - [alreadyWarned = false](std::optional spatialJoin) mutable { - if (!alreadyWarned) { - std::string warning = - "The input to a spatial join contained at least one element, " - "that is not a point geometry and is thus skipped. Note that " - "QLever currently only accepts point geometries for the " - "spatial joins"; - AD_LOG_WARN << warning << std::endl; - alreadyWarned = true; - if (spatialJoin) { - spatialJoin.value()->addWarning(warning); - } - } - }; + auto printWarning = [alreadyWarned = false, + &spatialJoin = spatialJoin_]() mutable { + if (!alreadyWarned) { + std::string warning = + "The input to a spatial join contained at least one element, " + "that is not a point geometry and is thus skipped. Note that " + "QLever currently only accepts point geometries for the " + "spatial joins"; + AD_LOG_WARN << warning << std::endl; + alreadyWarned = true; + if (spatialJoin.has_value()) { + AD_CORRECTNESS_CHECK(spatialJoin.value() != nullptr); + spatialJoin.value()->addWarning(warning); + } + } + }; const auto [idTableLeft, resultLeft, idTableRight, resultRight, leftJoinCol, rightJoinCol, numColumns, maxDist, maxResults] = params_; @@ -474,7 +475,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto geopoint = getPoint(smallerResult, i, smallerResJoinCol); if (!geopoint) { - printWarning(spatialJoin_); + printWarning(); continue; } @@ -489,7 +490,7 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { auto geopoint1 = getPoint(otherResult, i, otherResJoinCol); if (!geopoint1) { - printWarning(spatialJoin_); + printWarning(); continue; } @@ -511,8 +512,8 @@ Result SpatialJoinAlgorithms::BoundingBoxAlgorithm() { } auto distance = computeDist(idTableLeft, idTableRight, rowLeft, rowRight, leftJoinCol, rightJoinCol); + AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int); if (distance.getInt() <= maxDist) { - AD_CORRECTNESS_CHECK(distance.getDatatype() == Datatype::Int); addResultTableEntry(&result, idTableLeft, idTableRight, rowLeft, rowRight, distance); } diff --git a/test/engine/SpatialJoinAlgorithmsTest.cpp b/test/engine/SpatialJoinAlgorithmsTest.cpp index d8fdf2d219..95fe8fa50f 100644 --- a/test/engine/SpatialJoinAlgorithmsTest.cpp +++ b/test/engine/SpatialJoinAlgorithmsTest.cpp @@ -937,6 +937,42 @@ TEST_P(SpatialJoinParamTest, computeResultSmallDatasetDifferentSizeChildren) { expectedMaxDist10000000_rows_diff, columnNames, false); } +TEST_P(SpatialJoinParamTest, maxSizeMaxDistanceTest) { + auto maxDist = std::numeric_limits::max(); + std::string maxDistStr = + absl::StrCat(""); + + // test small children + std::vector columnNames{"?obj1", "?point1", "?obj2", "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetSmallChildren( + maxDistStr, true, expectedMaxDist10000000_rows_small, columnNames); + buildAndTestSmallTestSetSmallChildren( + maxDistStr, false, expectedMaxDist10000000_rows_small, columnNames); + + // test diff size children + columnNames = {"?name1", + "?obj1", + "?geo1", + "?point1", + "?obj2", + "?point2", + "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, true, expectedMaxDist10000000_rows_diff, columnNames, false); + buildAndTestSmallTestSetDiffSizeChildren( + maxDistStr, false, expectedMaxDist10000000_rows_diff, columnNames, false); + + // test large size children + columnNames = {"?name1", "?obj1", "?geo1", + "?point1", "?name2", "?obj2", + "?geo2", "?point2", "?distOfTheTwoObjectsAddedInternally"}; + buildAndTestSmallTestSetLargeChildren( + maxDistStr, true, expectedMaxDist10000000_rows, columnNames); + buildAndTestSmallTestSetLargeChildren( + maxDistStr, false, expectedMaxDist10000000_rows, columnNames); +} + TEST_P(SpatialJoinParamTest, diffSizeIdTables) { std::vector columnNames{"?point1", "?obj2", "?point2", "?distOfTheTwoObjectsAddedInternally"};