diff --git a/src/engine/Service.cpp b/src/engine/Service.cpp index f261be660f..291fa6ae7f 100644 --- a/src/engine/Service.cpp +++ b/src/engine/Service.cpp @@ -143,19 +143,17 @@ ProtoResult Service::computeResultImpl([[maybe_unused]] bool requestLaziness) { serviceQuery, "application/sparql-query", "application/sparql-results+json"); - std::basic_string, - ad_utility::AllocatorWithLimit> - jsonStr(_executionContext->getAllocator()); - for (std::span bytes : response.body_) { - jsonStr.append(reinterpret_cast(bytes.data()), bytes.size()); - checkCancellation(); - } - - auto throwErrorWithContext = [&jsonStr, &serviceUrl](std::string_view sv) { - throw std::runtime_error(absl::StrCat( - "Error while executing a SERVICE request to <", serviceUrl.asString(), - ">: ", sv, ". First 100 bytes of the response: ", - std::string_view{jsonStr.data()}.substr(0, 100))); + auto throwErrorWithContext = [this, &response](std::string_view sv) { + std::string ctx; + ctx.reserve(100); + for (const auto& bytes : std::move(response.body_)) { + ctx += std::string(reinterpret_cast(bytes.data()), + bytes.size()); + if (ctx.size() >= 100) { + break; + } + } + this->throwErrorWithContext(sv, std::string_view(ctx).substr(0, 100)); }; // Verify status and content-type of the response. @@ -173,42 +171,24 @@ ProtoResult Service::computeResultImpl([[maybe_unused]] bool requestLaziness) { response.contentType_, "'")); } - // Parse the received result. - std::vector resVariables; - std::vector resBindings; - try { - auto jsonResult = nlohmann::json::parse(jsonStr); - - if (jsonResult.empty()) { - throwErrorWithContext("Response from SPARQL endpoint is empty"); - } - - resVariables = jsonResult["head"]["vars"].get>(); - resBindings = - jsonResult["results"]["bindings"].get>(); - } catch (const nlohmann::json::parse_error&) { - throwErrorWithContext("Failed to parse the SERVICE result as JSON"); - } catch (const nlohmann::json::type_error&) { - throwErrorWithContext("JSON result does not have the expected structure"); - } - - // Check if result header row is expected. - std::string headerRow = absl::StrCat("?", absl::StrJoin(resVariables, " ?")); - std::string expectedHeaderRow = absl::StrJoin( - parsedServiceClause_.visibleVariables_, " ", Variable::AbslFormatter); - if (headerRow != expectedHeaderRow) { - throwErrorWithContext(absl::StrCat( - "Header row of JSON result for SERVICE query is \"", headerRow, - "\", but expected \"", expectedHeaderRow, "\"")); - } + // Prepare the expected Variables as keys for the JSON-bindings. We can't wait + // for the variables sent in the response as they're maybe not read before + // the bindings. + std::vector expVariableKeys; + std::ranges::transform(parsedServiceClause_.visibleVariables_, + std::back_inserter(expVariableKeys), + [](const Variable& v) { return v.name().substr(1); }); // Set basic properties of the result table. IdTable idTable{getResultWidth(), getExecutionContext()->getAllocator()}; LocalVocab localVocab{}; + + auto body = ad_utility::LazyJsonParser::parse(std::move(response.body_), + {"results", "bindings"}); + // Fill the result table using the `writeJsonResult` method below. - size_t resWidth = getResultWidth(); - CALL_FIXED_SIZE(resWidth, &Service::writeJsonResult, this, resVariables, - resBindings, &idTable, &localVocab); + CALL_FIXED_SIZE(getResultWidth(), &Service::writeJsonResult, this, + expVariableKeys, body, &idTable, &localVocab); return {std::move(idTable), resultSortedOn(), std::move(localVocab)}; } @@ -216,28 +196,70 @@ ProtoResult Service::computeResultImpl([[maybe_unused]] bool requestLaziness) { // ____________________________________________________________________________ template void Service::writeJsonResult(const std::vector& vars, - const std::vector& bindings, + ad_utility::LazyJsonParser::Generator& body, IdTable* idTablePtr, LocalVocab* localVocab) { IdTableStatic idTable = std::move(*idTablePtr).toStatic(); checkCancellation(); std::vector numLocalVocabPerColumn(idTable.numColumns()); + auto writeBindings = [&](const nlohmann::json& bindings, size_t& rowIdx) { + for (const auto& binding : bindings) { + idTable.emplace_back(); + for (size_t colIdx = 0; colIdx < vars.size(); ++colIdx) { + TripleComponent tc = + binding.contains(vars[colIdx]) + ? bindingToTripleComponent(binding[vars[colIdx]]) + : TripleComponent::UNDEF(); + + Id id = std::move(tc).toValueId(getIndex().getVocab(), *localVocab); + idTable(rowIdx, colIdx) = id; + if (id.getDatatype() == Datatype::LocalVocabIndex) { + ++numLocalVocabPerColumn[colIdx]; + } + } + rowIdx++; + checkCancellation(); + } + }; + size_t rowIdx = 0; - for (const auto& binding : bindings) { - idTable.emplace_back(); - for (size_t colIdx = 0; colIdx < vars.size(); ++colIdx) { - TripleComponent tc = binding.contains(vars[colIdx]) - ? bindingToTripleComponent(binding[vars[colIdx]]) - : TripleComponent::UNDEF(); - - Id id = std::move(tc).toValueId(getIndex().getVocab(), *localVocab); - idTable(rowIdx, colIdx) = id; - if (id.getDatatype() == Datatype::LocalVocabIndex) { - ++numLocalVocabPerColumn[colIdx]; + bool resultExists{false}; + bool varsChecked{false}; + try { + for (const nlohmann::json& part : body) { + if (part.contains("head")) { + AD_CORRECTNESS_CHECK(!varsChecked); + verifyVariables(part["head"], body.details()); + varsChecked = true; } + // The LazyJsonParser only yields parts containing the "bindings" array, + // therefore we can assume its existence here. + AD_CORRECTNESS_CHECK(part.contains("results") && + part["results"].contains("bindings") && + part["results"]["bindings"].is_array()); + writeBindings(part["results"]["bindings"], rowIdx); + resultExists = true; } - rowIdx++; - checkCancellation(); + } catch (const ad_utility::LazyJsonParser::Error& e) { + throwErrorWithContext( + absl::StrCat("Parser failed with error: '", e.what(), "'"), + body.details().first100_, body.details().last100_); + } + + // As the LazyJsonParser only passes parts of the result that match the + // expected structure, no result implies an unexpected structure. + if (!resultExists) { + throwErrorWithContext( + "JSON result does not have the expected structure (results section " + "missing)", + body.details().first100_, body.details().last100_); + } + + if (!varsChecked) { + throwErrorWithContext( + "JSON result does not have the expected structure (head section " + "missing)", + body.details().first100_, body.details().last100_); } AD_CORRECTNESS_CHECK(rowIdx == idTable.size()); @@ -309,24 +331,27 @@ std::optional Service::getSiblingValuesClause() const { } // ____________________________________________________________________________ -TripleComponent Service::bindingToTripleComponent(const nlohmann::json& cell) { - if (!cell.contains("type") || !cell.contains("value")) { - throw std::runtime_error("Missing type or value field in binding."); +TripleComponent Service::bindingToTripleComponent( + const nlohmann::json& binding) { + if (!binding.contains("type") || !binding.contains("value")) { + throw std::runtime_error(absl::StrCat( + "Missing type or value field in binding. The binding is: '", + binding.dump(), "'")); } - const auto type = cell["type"].get(); - const auto value = cell["value"].get(); + const auto type = binding["type"].get(); + const auto value = binding["value"].get(); TripleComponent tc; if (type == "literal") { - if (cell.contains("datatype")) { + if (binding.contains("datatype")) { tc = TurtleParser::literalAndDatatypeToTripleComponent( value, TripleComponent::Iri::fromIrirefWithoutBrackets( - cell["datatype"].get())); - } else if (cell.contains("xml:lang")) { + binding["datatype"].get())); + } else if (binding.contains("xml:lang")) { tc = TripleComponent::Literal::literalWithNormalizedContent( asNormalizedStringViewUnsafe(value), - cell["xml:lang"].get()); + binding["xml:lang"].get()); } else { tc = TripleComponent::Literal::literalWithNormalizedContent( asNormalizedStringViewUnsafe(value)); @@ -339,7 +364,9 @@ TripleComponent Service::bindingToTripleComponent(const nlohmann::json& cell) { "For now, consider filtering them out using the ISBLANK function or " "converting them via the STR function."); } else { - throw std::runtime_error(absl::StrCat("Type ", type, " is undefined.")); + throw std::runtime_error(absl::StrCat("Type ", type, + " is undefined. The binding is: '", + binding.dump(), "'")); } return tc; } @@ -353,3 +380,53 @@ ProtoResult Service::makeNeutralElementResultForSilentFail() const { } return {std::move(idTable), resultSortedOn(), LocalVocab{}}; } + +// ____________________________________________________________________________ +void Service::verifyVariables( + const nlohmann::json& head, + const ad_utility::LazyJsonParser::Details& details) const { + std::vector vars; + try { + vars = head.at("vars").get>(); + } catch (...) { + throw std::runtime_error( + absl::StrCat("JSON result does not have the expected structure, as its " + "\"head\" section is not according to the SPARQL " + "standard. The \"head\" section is: '", + head.dump(), "'.")); + } + + ad_utility::HashSet responseVars; + for (const auto& v : vars) { + responseVars.emplace("?" + v); + } + ad_utility::HashSet expectedVars( + parsedServiceClause_.visibleVariables_.begin(), + parsedServiceClause_.visibleVariables_.end()); + + if (responseVars != expectedVars) { + throwErrorWithContext( + absl::StrCat("Header row of JSON result for SERVICE query is \"", + absl::StrCat("?", absl::StrJoin(vars, " ?")), + "\", but expected \"", + absl::StrJoin(parsedServiceClause_.visibleVariables_, " ", + Variable::AbslFormatter), + "\". Probable cause: The remote endpoint sent a JSON " + "response that is not according to the SPARQL Standard"), + details.first100_, details.last100_); + } +} + +// ____________________________________________________________________________ +void Service::throwErrorWithContext(std::string_view msg, + std::string_view first100, + std::string_view last100) const { + const ad_utility::httpUtils::Url serviceUrl{ + asStringViewUnsafe(parsedServiceClause_.serviceIri_.getContent())}; + + throw std::runtime_error(absl::StrCat( + "Error while executing a SERVICE request to <", serviceUrl.asString(), + ">: ", msg, ". First 100 bytes of the response: '", first100, + (last100.empty() ? "'" + : absl::StrCat(", last 100 bytes: '", last100, "'")))); +} diff --git a/src/engine/Service.h b/src/engine/Service.h index 3edf004057..865d20dd22 100644 --- a/src/engine/Service.h +++ b/src/engine/Service.h @@ -9,6 +9,7 @@ #include "engine/Operation.h" #include "engine/Values.h" #include "parser/ParsedQuery.h" +#include "util/LazyJsonParser.h" #include "util/http/HttpClient.h" // The SERVICE operation. Sends a query to the remote endpoint specified by the @@ -98,7 +99,8 @@ class Service : public Operation { vector getChildren() override { return {}; } // Convert the given binding to TripleComponent. - static TripleComponent bindingToTripleComponent(const nlohmann::json& cell); + static TripleComponent bindingToTripleComponent( + const nlohmann::json& binding); private: // The string returned by this function is used as cache key. @@ -116,6 +118,17 @@ class Service : public Operation { // Create result for silent fail. ProtoResult makeNeutralElementResultForSilentFail() const; + // Check that all visible variables of the SERVICE clause exist in the json + // object, otherwise throw an error. + void verifyVariables(const nlohmann::json& head, + const ad_utility::LazyJsonParser::Details& gen) const; + + // Throws an error message, providing the first 100 bytes of the result as + // context. + [[noreturn]] void throwErrorWithContext( + std::string_view msg, std::string_view first100, + std::string_view last100 = ""sv) const; + // Write the given JSON result to the given result object. The `I` is the // width of the result table. // @@ -123,6 +136,6 @@ class Service : public Operation { // parse JSON here and not a VALUES clause. template void writeJsonResult(const std::vector& vars, - const std::vector& bindings, + ad_utility::LazyJsonParser::Generator& response, IdTable* idTable, LocalVocab* localVocab); }; diff --git a/src/util/LazyJsonParser.cpp b/src/util/LazyJsonParser.cpp index 9c06c2f199..c5e7f0b4cd 100644 --- a/src/util/LazyJsonParser.cpp +++ b/src/util/LazyJsonParser.cpp @@ -15,11 +15,18 @@ namespace ad_utility { // ____________________________________________________________________________ -cppcoro::generator LazyJsonParser::parse( - cppcoro::generator partialJson, +LazyJsonParser::Generator LazyJsonParser::parse( + cppcoro::generator partialJson, std::vector arrayPath) { LazyJsonParser p(std::move(arrayPath)); + Details& details = co_await cppcoro::getDetails; for (const auto& chunk : partialJson) { + if (details.first100_.size() < 100) { + details.first100_ += chunk.substr(0, 100); + } + details.last100_ = + chunk.substr(std::max(0, static_cast(chunk.size() - 100)), 100); + if (auto res = p.parseChunk(chunk); res.has_value()) { co_yield res; if (p.endReached_) { @@ -29,6 +36,21 @@ cppcoro::generator LazyJsonParser::parse( } } +// ____________________________________________________________________________ +LazyJsonParser::Generator LazyJsonParser::parse( + cppcoro::generator> partialJson, + std::vector arrayPath) { + return parse( + [](cppcoro::generator> partialJson) + -> cppcoro::generator { + for (const auto& bytes : partialJson) { + co_yield std::string_view(reinterpret_cast(bytes.data()), + bytes.size()); + } + }(std::move(partialJson)), + std::move(arrayPath)); +} + // ____________________________________________________________________________ LazyJsonParser::LazyJsonParser(std::vector arrayPath) : arrayPath_(std::move(arrayPath)), @@ -230,7 +252,9 @@ std::optional LazyJsonParser::constructResultFromParsedChunk( size_t nextChunkStart = materializeEnd == 0 ? 0 : std::min(materializeEnd + 1, input_.size()); if (input_.size() - nextChunkStart >= 1'000'000) { - throw std::runtime_error("Ill formed Json."); + throw Error( + "QLever currently doesn't support SERVICE results where a single " + "result row is larger than 1MB"); } if (nextChunkStart == 0) { return std::nullopt; @@ -255,7 +279,11 @@ std::optional LazyJsonParser::constructResultFromParsedChunk( absl::StrAppend(&resStr, suffixInArray_); } - return nlohmann::json::parse(resStr); + try { + return nlohmann::json::parse(resStr); + } catch (const nlohmann::json::parse_error& e) { + throw Error(e.what()); + } } // ____________________________________________________________________________ diff --git a/src/util/LazyJsonParser.h b/src/util/LazyJsonParser.h index 24dae9000e..23e99fa2d8 100644 --- a/src/util/LazyJsonParser.h +++ b/src/util/LazyJsonParser.h @@ -20,10 +20,26 @@ namespace ad_utility { */ class LazyJsonParser { public: + // Generator detail, the first/last 100 input characters for better error + // context. + struct Details { + std::string first100_; + std::string last100_; + }; + using Generator = cppcoro::generator; + + class Error : public std::runtime_error { + public: + explicit Error(const std::string& msg) : std::runtime_error(msg) {} + }; + // Parse chunks of json-strings yielding them reconstructed. - static cppcoro::generator parse( - cppcoro::generator partialJson, - std::vector arrayPath); + static Generator parse(cppcoro::generator partialJson, + std::vector arrayPath); + + // Convenient alternative for the function above using bytes. + static Generator parse(cppcoro::generator> partialJson, + std::vector arrayPath); private: explicit LazyJsonParser(std::vector arrayPath); diff --git a/test/LazyJsonParserTest.cpp b/test/LazyJsonParserTest.cpp index 8a8ce2f5c0..4c676ad1cc 100644 --- a/test/LazyJsonParserTest.cpp +++ b/test/LazyJsonParserTest.cpp @@ -14,9 +14,9 @@ TEST(parseTest, parse) { const std::vector arrayPath = {"results", "bindings"}; auto yieldChars = - [](const std::string& s) -> cppcoro::generator { - for (auto& c : s) { - co_yield std::string(1, c); + [](const std::string_view s) -> cppcoro::generator { + for (size_t i = 0; i < s.size(); ++i) { + co_yield std::string_view(s.data() + i, 1); } }; // Check if the parser yields the expected results when parsing each char diff --git a/test/ServiceTest.cpp b/test/ServiceTest.cpp index 45ee1636c0..0476d8ab0a 100644 --- a/test/ServiceTest.cpp +++ b/test/ServiceTest.cpp @@ -208,34 +208,57 @@ TEST_F(ServiceTest, computeResult) { EXPECT_NO_THROW(runComputeResult(result, status, contentType, true)); }; - // CHECK 1: An exception shall be thrown, when - // status-code isn't ok, contentType doesn't match + // CHECK 1: An exception shall be thrown (and maybe silenced), when + // status-code isn't ok expectThrowOrSilence( - "", "SERVICE responded with HTTP status code: 400, Bad Request.", + genJsonResult({"x", "y"}, {{"bla", "bli"}, {"blu"}, {"bli", "blu"}}), + "SERVICE responded with HTTP status code: 400, Bad Request.", boost::beast::http::status::bad_request, "application/sparql-results+json"); - - expectThrowOrSilence("", - "QLever requires the endpoint of a SERVICE to send " - "the result as 'application/sparql-results+json' but " - "the endpoint sent 'wrong/type'.", - boost::beast::http::status::ok, "wrong/type"); - - // or Result is no JSON, empty or has invalid structure + // contentType doesn't match expectThrowOrSilence( - "", - "Failed to parse the SERVICE result as JSON."); - - expectThrowOrSilence("{}", "Response from SPARQL endpoint is empty."); + genJsonResult({"x", "y"}, {{"bla", "bli"}, {"blu"}, {"bli", "blu"}}), + "QLever requires the endpoint of a SERVICE to send " + "the result as 'application/sparql-results+json' but " + "the endpoint sent 'wrong/type'.", + boost::beast::http::status::ok, "wrong/type"); + + // or Result has invalid structure + // `results` missing + expectThrowOrSilence("{\"head\": {\"vars\": [\"x\", \"y\"]}}", + "results section missing"); + expectThrowOrSilence("", "results section missing"); + // `bindings` missing + expectThrowOrSilence( + "{\"head\": {\"vars\": [\"x\", \"y\"]}," + "\"results\": {}}", + "results section missing"); + // wrong `bindings` type (array expected) + expectThrowOrSilence( + "{\"head\": {\"vars\": [\"x\", \"y\"]}," + "\"results\": {\"bindings\": {}}}", + "results section missing"); - expectThrowOrSilence("{\"invalid\": \"structure\"}", - "JSON result does not have the expected structure."); + // `head`/`vars` missing + expectThrowOrSilence( + "{\"results\": {\"bindings\": [{\"x\": {\"type\": \"uri\", \"value\": " + "\"a\"}, \"y\": {\"type\": \"uri\", \"value\": \"b\"}}]}}", + "head section missing"); + expectThrowOrSilence( + "{\"head\": {}," + "\"results\": {\"bindings\": []}}", + "\"head\" section is not according to the SPARQL standard."); + // wrong variables type (array of strings expected) + expectThrowOrSilence( + "{\"head\": {\"vars\": [\"x\", \"y\", 3]}," + "\"results\": {\"bindings\": []}}", + "\"head\" section is not according to the SPARQL standard."); + // Internal parser errors. expectThrowOrSilence( - "{\"head\": {\"vars\": [1, 2, 3]}," - "\"results\": {\"bindings\": {}}}", - "JSON result does not have the expected structure."); + std::string(1'000'000, '0'), + "QLever currently doesn't support SERVICE results where a single " + "result row is larger than 1MB"); // CHECK 1b: Even if the SILENT-keyword is set, throw local errors. Service serviceSilent{ @@ -272,13 +295,11 @@ TEST_F(ServiceTest, computeResult) { boost::beast::http::status::ok, "APPLICATION/SPARQL-RESULTS+JSON;charset=utf-8")); - // CHECK 2: Header row of returned JSON is wrong (variables in wrong - // order) -> an exception should be thrown. - expectThrowOrSilence( - genJsonResult({"y", "x"}, - {{"bla", "bli"}, {"blu", "bla"}, {"bli", "blu"}}), - "Header row of JSON result for SERVICE query is " - "\"?y ?x\", but expected \"?x ?y\"."); + // CHECK 2: Header row of returned JSON is wrong (missing expected variables) + // -> an exception should be thrown. + expectThrowOrSilence(genJsonResult({"x"}, {{"bla"}, {"blu"}, {"bli"}}), + "Header row of JSON result for SERVICE query is " + "\"?x\", but expected \"?x ?y\"."); // CHECK 3: A result row of the returned JSON is missing a variable's // value -> undefined value @@ -358,7 +379,7 @@ TEST_F(ServiceTest, computeResult) { {"blu", "bla", "y"}, {"bli", "blu", "y"}})), siblingTree}; - EXPECT_NO_THROW(serviceOperation5.getResult()); + EXPECT_NO_THROW(serviceOperation5.computeResultOnlyForTesting()); // Check 6: SiblingTree's rows exceed maxValue const auto maxValueRowsDefault = @@ -377,7 +398,7 @@ TEST_F(ServiceTest, computeResult) { {"blue", "bla", "y"}, {"bli", "blu", "y"}})), siblingTree}; - EXPECT_NO_THROW(serviceOperation6.getResult()); + EXPECT_NO_THROW(serviceOperation6.computeResultOnlyForTesting()); RuntimeParameters().set<"service-max-value-rows">(maxValueRowsDefault); } @@ -454,8 +475,12 @@ TEST_F(ServiceTest, bindingToTripleComponent) { nlohmann::json binding; // Missing type or value. - EXPECT_ANY_THROW(Service::bindingToTripleComponent({{"type", "literal"}})); - EXPECT_ANY_THROW(Service::bindingToTripleComponent({{"value", "v"}})); + AD_EXPECT_THROW_WITH_MESSAGE( + Service::bindingToTripleComponent({{"type", "literal"}}), + ::testing::HasSubstr("Missing type or value")); + AD_EXPECT_THROW_WITH_MESSAGE( + Service::bindingToTripleComponent({{"value", "v"}}), + ::testing::HasSubstr("Missing type or value")); EXPECT_EQ( Service::bindingToTripleComponent( @@ -491,6 +516,8 @@ TEST_F(ServiceTest, bindingToTripleComponent) { EXPECT_ANY_THROW( Service::bindingToTripleComponent({{"type", "bnode"}, {"value", "b"}})); - EXPECT_ANY_THROW(Service::bindingToTripleComponent( - {{"type", "INVALID_TYPE"}, {"value", "v"}})); + AD_EXPECT_THROW_WITH_MESSAGE( + Service::bindingToTripleComponent( + {{"type", "INVALID_TYPE"}, {"value", "v"}}), + ::testing::HasSubstr("Type INVALID_TYPE is undefined")); }