Skip to content

Commit

Permalink
A lazy parser for the application/sparql-results+json format (#1412)
Browse files Browse the repository at this point in the history
This parser can be used to lazily parse JSON input where most of the data resides in a single JSON array whose path is known in advance, under the additional assumption that the individual entries of that array are small, although there may be many of them. This assumption holds for the JSON format of SPARQL query results, which consist of some (small) metadata, such as the contained variables, and a large array that contains the result rows, each of which is typically small (at most one entry per variable).
In the future this parser will be used to implement a lazy `SERVICE` operation.
  • Loading branch information
UNEXENU authored Aug 29, 2024
1 parent 34d59a5 commit 7dcfea2
Show file tree
Hide file tree
Showing 5 changed files with 470 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
add_subdirectory(ConfigManager)
add_subdirectory(MemorySize)
add_subdirectory(http)
# NOTE: this is a diff view. The first `add_library(util ...)` line below is
# the version before the change; the second one is its replacement, which
# additionally lists `LazyJsonParser.cpp` as a source of the `util` library.
add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp)
add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp LazyJsonParser.cpp)
qlever_target_link_libraries(util re2::re2)
270 changes: 270 additions & 0 deletions src/util/LazyJsonParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Moritz Dom ([email protected])

#include "util/LazyJsonParser.h"

#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>

#include <variant>

#include "util/Exception.h"

namespace ad_utility {

// ____________________________________________________________________________
// Lazily parses the JSON document that `partialJson` delivers piece by piece.
// `arrayPath` is the sequence of keys that leads to the (large) array inside
// the document. Every time a chunk allows a result to be materialized, the
// reconstructed JSON value is yielded. The generator stops as soon as the
// parser reports that the end of the document has been reached, or when
// `partialJson` is exhausted.
cppcoro::generator<nlohmann::json> LazyJsonParser::parse(
    cppcoro::generator<std::string> partialJson,
    std::vector<std::string> arrayPath) {
  LazyJsonParser p(std::move(arrayPath));
  for (const auto& chunk : partialJson) {
    auto res = p.parseChunk(chunk);
    if (!res.has_value()) {
      // Nothing could be materialized from the input seen so far.
      continue;
    }
    // Yield the contained JSON value itself. Yielding the `std::optional`
    // wrapper would need an implicit optional -> json conversion, which
    // creates a copy of the freshly parsed value.
    co_yield res.value();
    if (p.endReached_) {
      co_return;
    }
  }
}

// ____________________________________________________________________________
// Stores the key path to the array and precomputes the text that has to be
// put in front of / behind a slice of array elements to turn it into a
// complete JSON document again: for the path `{"a", "b"}` the prefix is
// `{"a": {"b": [` and the suffix is `]}}`.
// NOTE: `prefixInArray_` and `suffixInArray_` are computed from the member
// `arrayPath_`, so they rely on `arrayPath_` being initialized first.
LazyJsonParser::LazyJsonParser(std::vector<std::string> arrayPath)
    : arrayPath_(std::move(arrayPath)),
      prefixInArray_(absl::StrCat(
          absl::StrJoin(arrayPath_, "",
                        [](std::string* target, const std::string& key) {
                          absl::StrAppend(target, "{\"", key, "\": ");
                        }),
          "[")),
      suffixInArray_(
          absl::StrCat("]", std::string(arrayPath_.size(), '}'))) {}
// ____________________________________________________________________________
// Appends `inStr` to the buffered input and continues parsing at the first
// new character. Returns the reconstructed JSON for everything that can be
// materialized after this chunk, or `std::nullopt` if nothing can be
// materialized yet.
std::optional<nlohmann::json> LazyJsonParser::parseChunk(
    std::string_view inStr) {
  // Parsing resumes exactly where the previously buffered input ended.
  size_t pos = input_.size();
  input_.append(inStr);

  // A literal that was cut off by the previous chunk has to be finished
  // first; afterwards step over the position `parseLiteral` stopped at.
  if (inLiteral_) {
    parseLiteral(pos);
    ++pos;
  }

  // Exclusive end index of the part of `input_` that can be materialized.
  // The sections are checked one after the other on purpose: a single chunk
  // may move the parser through several of them.
  size_t cut = 0;
  if (std::holds_alternative<BeforeArrayPath>(state_)) {
    parseBeforeArrayPath(pos);
  }
  if (std::holds_alternative<InArrayPath>(state_)) {
    cut = parseInArrayPath(pos);
  }
  if (std::holds_alternative<AfterArrayPath>(state_)) {
    // Keep the previously computed value unless the end has been reached.
    cut = parseAfterArrayPath(pos).value_or(cut);
  }
  return constructResultFromParsedChunk(cut);
}
// ____________________________________________________________________________
// Consumes a string literal. Either `idx` points to the opening `"`, or the
// parser is already inside a literal that was started by an earlier call.
// On return `idx` is the position of the closing `"`; if the literal does not
// end within the buffered input, `idx == input_.size()` and `inLiteral_`
// stays `true`. While the parser is in the `BeforeArrayPath` section, start
// and length of the literal are recorded in `optLiteral_`.
void LazyJsonParser::parseLiteral(size_t& idx) {
  AD_CORRECTNESS_CHECK(inLiteral_ || input_[idx] == '"');

  // Literals are only recorded before the array path has been reached.
  auto* beforeState = std::get_if<BeforeArrayPath>(&state_);

  // Opening quote of a new literal: the content starts one character later.
  if (!inLiteral_ && input_[idx] == '"') {
    ++idx;
    if (beforeState != nullptr) {
      beforeState->optLiteral_ =
          BeforeArrayPath::LiteralView{.start_ = idx, .length_ = 0};
    }
    inLiteral_ = true;
  }

  while (idx < input_.size()) {
    const char current = input_[idx];
    if (isEscaped_) {
      // This character was escaped by a preceding backslash, ignore it.
      isEscaped_ = false;
    } else if (current == '\\') {
      isEscaped_ = true;
    } else if (current == '"') {
      // Unescaped quote: the literal ends here.
      if (beforeState != nullptr) {
        auto& literal = beforeState->optLiteral_.value();
        literal.length_ = idx - literal.start_;
      }
      inLiteral_ = false;
      return;
    }
    ++idx;
  }
}
// ____________________________________________________________________________
void LazyJsonParser::parseBeforeArrayPath(size_t& idx) {
AD_CORRECTNESS_CHECK(std::holds_alternative<BeforeArrayPath>(state_));
auto& state = std::get<BeforeArrayPath>(state_);
for (; idx < input_.size(); ++idx) {
switch (input_[idx]) {
case '{':
state.tryAddKeyToPath(input_);
break;
case '[':
if (state.openBrackets_ == 0) {
state.tryAddKeyToPath(input_);
}
++state.openBrackets_;
if (state.curPath_ == arrayPath_) {
// Reached arrayPath.
state_ = InArrayPath();
++idx;
return;
}
break;
case ']':
--state.openBrackets_;
if (state.openBrackets_ == 0 && !state.curPath_.empty()) {
state.curPath_.pop_back();
}
break;
case '}':
if (!state.curPath_.empty()) {
state.curPath_.pop_back();
}
break;
case '"':
parseLiteral(idx);
break;
default:
break;
}
}
}
// ____________________________________________________________________________
// Parses the elements of the array at the array path, starting at `idx`.
//
// Returns the exclusive end index up to which `input_` can be materialized:
// * the index of the last `,` between two top-level array elements seen in
//   this call, if the array is still open when the input ends,
// * the index right behind the closing `]`, if the array is closed and
//   `arrayPath_` is empty (the array is the whole document),
// * 0 if nothing can be materialized by this section.
//
// When the closing `]` of the array is found, the state switches to
// `AfterArrayPath` and `idx` is left on the character behind the `]`.
size_t LazyJsonParser::parseInArrayPath(size_t& idx) {
  AD_CORRECTNESS_CHECK(std::holds_alternative<InArrayPath>(state_));
  auto& state = std::get<InArrayPath>(state_);
  size_t materializeEnd = 0;

  // NOTE: the assignment to `state_` invalidates `state`; the result of this
  // lambda must be returned immediately.
  auto exitArrayPath = [&]() -> size_t {
    state_ = AfterArrayPath{.remainingBraces_ = arrayPath_.size()};
    ++idx;
    // For a non-empty array path, a `,` seen earlier in this call must NOT be
    // reported once the array is closed: results for the `AfterArrayPath`
    // state are built without the closing suffix and are only valid at the
    // very end of the document. Reporting the `,` made the correctness check
    // in `constructResultFromParsedChunk` fail whenever the closing `}` of
    // the document arrived in a later chunk than the `]`. Returning 0 defers
    // the materialization until `parseAfterArrayPath` finds the end.
    return arrayPath_.empty() ? idx : 0;
  };

  for (; idx < input_.size(); ++idx) {
    switch (input_[idx]) {
      case '{':
      case '[':
        ++state.openBracketsAndBraces_;
        break;
      case '}':
        --state.openBracketsAndBraces_;
        break;
      case ']':
        if (state.openBracketsAndBraces_ == 0) {
          // End of ArrayPath reached.
          return exitArrayPath();
        }
        --state.openBracketsAndBraces_;
        break;
      case ',':
        // Only a `,` on the top level of the array separates two elements.
        if (state.openBracketsAndBraces_ == 0) {
          materializeEnd = idx;
        }
        break;
      case '"':
        parseLiteral(idx);
        break;
      default:
        break;
    }
  }
  return materializeEnd;
}
// ____________________________________________________________________________
// Parses the section behind the array, starting at `idx`, and counts braces
// until the object that contains the array path is closed. Returns the index
// right behind the final `}` (and sets `endReached_`) once it has been read,
// `std::nullopt` as long as the end is not part of the buffered input.
std::optional<size_t> LazyJsonParser::parseAfterArrayPath(size_t& idx) {
  AD_CORRECTNESS_CHECK(std::holds_alternative<AfterArrayPath>(state_));
  auto& bracesLeft = std::get<AfterArrayPath>(state_).remainingBraces_;

  while (idx < input_.size()) {
    const char current = input_[idx];
    if (current == '"') {
      parseLiteral(idx);
    } else if (current == '{') {
      ++bracesLeft;
    } else if (current == '}') {
      --bracesLeft;
      if (bracesLeft == 0) {
        // The outermost object has been closed: end of the input.
        endReached_ = true;
        return idx + 1;
      }
    }
    ++idx;
  }
  return std::nullopt;
}
// ____________________________________________________________________________
// Builds the result for the part of `input_` in front of `materializeEnd`
// (exclusive) and removes the materialized part from `input_`. A
// `materializeEnd` of 0 means that nothing can be materialized, in which case
// `std::nullopt` is returned. Throws `std::runtime_error` if 1'000'000 or
// more characters would remain buffered. The reconstructed string is parsed
// via `nlohmann::json::parse`.
std::optional<nlohmann::json> LazyJsonParser::constructResultFromParsedChunk(
    size_t materializeEnd) {
  // The character at `materializeEnd` (a `,`) is skipped, if there is one.
  size_t nextChunkStart = 0;
  if (materializeEnd != 0) {
    nextChunkStart = std::min(materializeEnd + 1, input_.size());
  }

  const size_t numRemainingChars = input_.size() - nextChunkStart;
  if (numRemainingChars >= 1'000'000) {
    throw std::runtime_error("Ill formed Json.");
  }
  if (nextChunkStart == 0) {
    return std::nullopt;
  }

  // The first result still contains the original beginning of the input, all
  // later ones need the prefix to restore the structure around the array.
  std::string reconstructed;
  if (yieldCount_ > 0) {
    reconstructed = prefixInArray_;
  }
  ++yieldCount_;

  // materializeEnd either holds the index to a `,` between two elements in the
  // arrayPath or the (non-existent) first-character after the input.
  AD_CORRECTNESS_CHECK(
      (std::holds_alternative<InArrayPath>(state_) &&
       input_[materializeEnd] == ',') ||
      (std::holds_alternative<AfterArrayPath>(state_) &&
       std::get<AfterArrayPath>(state_).remainingBraces_ == 0 &&
       input_.size() == materializeEnd));

  // Move the materialized part of the buffer over into the result.
  reconstructed.append(input_, 0, materializeEnd);
  input_.erase(0, nextChunkStart);

  // Inside the array, the cut-off structure has to be closed again.
  if (std::holds_alternative<InArrayPath>(state_)) {
    reconstructed.append(suffixInArray_);
  }

  return nlohmann::json::parse(reconstructed);
}

// ____________________________________________________________________________
// If a literal has been recorded in `optLiteral_`, append the corresponding
// part of `input` as a key to `curPath_` and forget the literal. Does nothing
// if no literal is recorded.
void LazyJsonParser::BeforeArrayPath::tryAddKeyToPath(std::string_view input) {
  if (!optLiteral_.has_value()) {
    return;
  }
  const auto [keyStart, keyLength] = *optLiteral_;
  curPath_.emplace_back(input.substr(keyStart, keyLength));
  optLiteral_.reset();
}

} // namespace ad_utility
99 changes: 99 additions & 0 deletions src/util/LazyJsonParser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Moritz Dom ([email protected])

#pragma once

#include <optional>
#include <variant>

#include "util/Generator.h"
#include "util/json.h"

namespace ad_utility {
/*
* A simple parser for JSON data that arrives split up into chunks and whose
* structure is known in advance.
*
* Given the key path (`arrayPath`) to the array that contains the majority of
* the JSON object, the parser yields reconstructed parts of the object. The
* input is cut after completed elements of that array or after the entire
* object has been read.
*/
class LazyJsonParser {
public:
// Parse the JSON string that is delivered piece by piece by `partialJson` and
// yield the reconstructed parts as parsed JSON values. `arrayPath` is the
// sequence of keys that leads to the array containing most of the data.
static cppcoro::generator<nlohmann::json> parse(
cppcoro::generator<std::string> partialJson,
std::vector<std::string> arrayPath);

private:
// Stores `arrayPath` and precomputes `prefixInArray_` and `suffixInArray_`.
// Private: instances are only created inside `parse`.
explicit LazyJsonParser(std::vector<std::string> arrayPath);

// Appends `inStr` to the buffered input, parses it and returns the part that
// can be materialized with reconstructed structure, or `std::nullopt` if
// nothing can be materialized yet.
std::optional<nlohmann::json> parseChunk(std::string_view inStr);

// The following 3 methods parse the different sections before/in/after the
// arrayPath starting at the given index `idx` on the `input_` string. `idx`
// is advanced while parsing.
void parseBeforeArrayPath(size_t& idx);

// Returns the index up to which the input can be materialized, typically the
// index of the last `,` between array elements, or 0 if there is none.
size_t parseInArrayPath(size_t& idx);

// Returns the index after the input, when reading the input is complete, and
// `std::nullopt` otherwise.
std::optional<size_t> parseAfterArrayPath(size_t& idx);

// Parses literals in the input. `idx` either points to the opening `"` or the
// parser is already within a literal (`inLiteral_`). Stops at the closing `"`
// or at the end of the buffered input.
void parseLiteral(size_t& idx);

// Constructs the result to be returned after parsing a chunk and removes the
// materialized part from `input_`. A `materializeEnd` of 0 yields
// `std::nullopt`.
std::optional<nlohmann::json> constructResultFromParsedChunk(
size_t materializeEnd);

// Context for the 3 parsing sections.
struct BeforeArrayPath {
// Indices of the latest parsed literal, used to add keys to the curPath_.
struct LiteralView {
size_t start_{0};
size_t length_{0};
};
std::optional<LiteralView> optLiteral_;
// Keys of the currently open objects/arrays.
std::vector<std::string> curPath_;
// Open Brackets counter to track nested arrays.
int openBrackets_{0};

// If a literal is recorded in `optLiteral_`, appends the corresponding part
// of `input` as a key to `curPath_` and resets `optLiteral_`.
void tryAddKeyToPath(std::string_view input);
};
struct InArrayPath {
// Track brackets/braces to find the end of the array.
int openBracketsAndBraces_{0};
};
struct AfterArrayPath {
// Remaining braces until the end of the input-object. Has no default value,
// it is set when the state is created.
size_t remainingBraces_;
};
// The section the parser is currently in, together with its context.
std::variant<BeforeArrayPath, InArrayPath, AfterArrayPath> state_{
BeforeArrayPath()};

// Current (not yet materialized) input-string.
std::string input_;

// If the next character is escaped or not.
bool isEscaped_{false};

// If the parser is currently positioned within a literal.
bool inLiteral_{false};

// Indicates whether the end of the object has been reached.
bool endReached_{false};

// Counter for the so far returned results.
unsigned int yieldCount_{0};

// Key-path to the array containing many elements.
// NOTE: declared before `prefixInArray_`/`suffixInArray_`, which are computed
// from it in the constructor's initializer list.
const std::vector<std::string> arrayPath_;

// Precomputed prefix/suffix used to construct results.
const std::string prefixInArray_;
const std::string suffixInArray_;
};
} // namespace ad_utility
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,8 @@ addLinkAndDiscoverTest(FsstCompressorTest fsst)

addLinkAndDiscoverTest(CopyableSynchronizationTest)

# Added by this commit; presumably registers the unit test target for the new
# LazyJsonParser (confirm against the definition of `addLinkAndDiscoverTest`).
addLinkAndDiscoverTest(LazyJsonParserTest)

addLinkAndDiscoverTest(CacheableGeneratorTest)

addLinkAndDiscoverTest(FilterTest engine)
Expand Down
Loading

0 comments on commit 7dcfea2

Please sign in to comment.