Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A lazy parser for the application/sparql-results+json format #1412

Merged
merged 16 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Helper libraries that are configured in their own subdirectories.
add_subdirectory(ConfigManager)
add_subdirectory(MemorySize)
add_subdirectory(http)
# Diff rendering (1 addition & 1 deletion): the next line is the definition of
# `util` before the change ...
add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp)
# ... and this line replaces it; the only difference is the additional source
# file LazyJsonParser.cpp.
add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp LazyJsonParser.cpp)
qlever_target_link_libraries(util re2::re2)
270 changes: 270 additions & 0 deletions src/util/LazyJsonParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Moritz Dom ([email protected])

#include "util/LazyJsonParser.h"

#include <absl/strings/ascii.h>
#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>

#include <variant>

#include "util/Exception.h"

namespace ad_utility {

// ____________________________________________________________________________
// Lazily parses a JSON document that arrives in pieces.
// Every string produced by `partialJson` is fed into one `LazyJsonParser`
// constructed for `arrayPath`; each result that `parseChunk` returns is
// yielded. As soon as the parser has set `endReached_`, the coroutine returns
// and no further chunks are requested from `partialJson`.
cppcoro::generator<nlohmann::json> LazyJsonParser::parse(
cppcoro::generator<std::string> partialJson,
std::vector<std::string> arrayPath) {
LazyJsonParser p(std::move(arrayPath));
for (const auto& chunk : partialJson) {
// `parseChunk` returns an empty optional while nothing can be materialized.
if (auto res = p.parseChunk(chunk); res.has_value()) {
// NOTE(review): `res` is a `std::optional`; yielding it directly relies on a
// conversion to `nlohmann::json` -- confirm, or yield `res.value()` instead.
co_yield res;
// The end can only have been reached by a call that also produced a result,
// hence the check is nested inside the `has_value()` branch.
if (p.endReached_) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this is a (albeit static) member function of LazyJsonParser, endReached_ can (as all the other member variabls) be private, which then also gets rid of the Sonarcloud message.

co_return;
}
}
}
}

// ____________________________________________________________________________
// Stores the key path to the target array and precomputes the two strings that
// are wrapped around partial results: for the path {k1, k2} the prefix is
// `{"k1": {"k2": [` and the suffix is `]}}`.
LazyJsonParser::LazyJsonParser(std::vector<std::string> arrayPath)
    : arrayPath_(std::move(arrayPath)),
      // `arrayPath_` is declared before the prefix/suffix members, so it is
      // already initialized when the expressions below are evaluated.
      prefixInArray_([this] {
        std::string opening;
        for (const std::string& key : arrayPath_) {
          absl::StrAppend(&opening, "{\"", key, "\": ");
        }
        absl::StrAppend(&opening, "[");
        return opening;
      }()),
      suffixInArray_([this] {
        std::string closing = "]";
        closing.append(arrayPath_.size(), '}');
        return closing;
      }()) {}

// ____________________________________________________________________________
// Appends `inStr` to the buffered input, continues scanning at the first new
// character and returns the next materialized piece of JSON, or an empty
// optional if no complete piece is available yet.
std::optional<nlohmann::json> LazyJsonParser::parseChunk(
    std::string_view inStr) {
  // Everything before `idx` was already scanned by previous calls.
  size_t idx = input_.size();
  absl::StrAppend(&input_, inStr);

  // End-index (exclusive) of the current `input_` to construct a result.
  // 0 means that nothing can be materialized.
  size_t materializeEnd = 0;

  // If the previous chunk ended within a literal, finish parsing it.
  // `parseLiteral` stops on the closing quote, so step past it afterwards.
  if (inLiteral_) {
    parseLiteral(idx);
    ++idx;
  }

  // Resume parsing the current section. These are independent `if`s on
  // purpose: a section that ends inside this chunk switches `state_`, and the
  // next section then continues on the same chunk.
  if (std::holds_alternative<BeforeArrayPath>(state_)) {
    parseBeforeArrayPath(idx);
  }
  if (std::holds_alternative<InArrayPath>(state_)) {
    materializeEnd = parseInArrayPath(idx);
  }
  if (std::holds_alternative<AfterArrayPath>(state_)) {
    if (const std::optional<size_t> optEnd = parseAfterArrayPath(idx)) {
      materializeEnd = optEnd.value();
    } else if (std::get<AfterArrayPath>(state_).remainingBraces_ > 0) {
      // The array was closed but the enclosing objects are still open. A comma
      // index reported by `parseInArrayPath` for this very chunk is no longer
      // a valid split point: the state already left the array, so the result
      // would neither get the array suffix nor pass the consistency check in
      // `constructResultFromParsedChunk`. Keep everything buffered; it is
      // materialized as a whole once the closing braces arrive.
      materializeEnd = 0;
    }
  }

  return constructResultFromParsedChunk(materializeEnd);
}

// ____________________________________________________________________________
// Scans a string literal in `input_`, either starting a new one or resuming
// one that was left open by an earlier call.
// On entry, `inLiteral_` is set or `input_[idx]` is the opening double quote.
// On return, `idx` is the position of the closing double quote and
// `inLiteral_` is false; if the input ends before the literal is closed, `idx`
// equals `input_.size()` and `inLiteral_` stays true.
// While the parser is in state `BeforeArrayPath`, the start and length of the
// literal (without the quotes) are recorded in `optLiteral_`.
void LazyJsonParser::parseLiteral(size_t& idx) {
AD_CORRECTNESS_CHECK(inLiteral_ || input_[idx] == '"');
if (input_[idx] == '"' && !inLiteral_) {
// Skip the opening quote, so that the recorded start is the first character
// of the literal content.
++idx;
joka921 marked this conversation as resolved.
Show resolved Hide resolved
if (std::holds_alternative<BeforeArrayPath>(state_)) {
std::get<BeforeArrayPath>(state_).optLiteral_ =
BeforeArrayPath::LiteralView{.start_ = idx, .length_ = 0};
}
inLiteral_ = true;
}

for (; idx < input_.size(); ++idx) {
// The character after a backslash is skipped unconditionally, so an escaped
// quote does not terminate the literal. `isEscaped_` is a member and thus
// survives the end of the current input.
if (isEscaped_) {
isEscaped_ = false;
continue;
}
switch (input_[idx]) {
case '"':
// End of literal: the length is the distance from the recorded start to the
// closing quote.
if (std::holds_alternative<BeforeArrayPath>(state_)) {
std::get<BeforeArrayPath>(state_).optLiteral_.value().length_ =
idx -
std::get<BeforeArrayPath>(state_).optLiteral_.value().start_;
}
inLiteral_ = false;
return;
case '\\':
isEscaped_ = true;
break;
default:
break;
}
}
}

// ____________________________________________________________________________
// Scans the part of the input that precedes the target array.
// Maintains `curPath_`, the keys of the objects and arrays that are currently
// open. When a `[` is read and `curPath_` equals `arrayPath_`, the state
// switches to `InArrayPath` and `idx` is left on the first character after the
// bracket. Otherwise the function returns when the input is exhausted.
void LazyJsonParser::parseBeforeArrayPath(size_t& idx) {
AD_CORRECTNESS_CHECK(std::holds_alternative<BeforeArrayPath>(state_));
auto& state = std::get<BeforeArrayPath>(state_);

for (; idx < input_.size(); ++idx) {
switch (input_[idx]) {
case '{':
// The most recently read literal (if any) is the key of this object.
state.tryAddKeyToPath(input_);
break;
case '[':
// Only an outermost array takes the pending literal as its key; arrays
// nested inside another array are merely counted.
if (state.openBrackets_ == 0) {
state.tryAddKeyToPath(input_);
}
++state.openBrackets_;
if (state.curPath_ == arrayPath_) {
// Reached arrayPath. Assigning to `state_` replaces the alternative that
// `state` refers to, so `state` must not be used after this line.
state_ = InArrayPath();
++idx;
return;
}
break;
case ']':
--state.openBrackets_;
// Closing an outermost array removes its key from the path again.
if (state.openBrackets_ == 0 && !state.curPath_.empty()) {
state.curPath_.pop_back();
}
break;
case '}':
if (!state.curPath_.empty()) {
state.curPath_.pop_back();
}
break;
case '"':
// `parseLiteral` leaves `idx` on the closing quote (or at the end of the
// input); the increment of the loop then moves past it.
parseLiteral(idx);
break;
default:
break;
}
}
joka921 marked this conversation as resolved.
Show resolved Hide resolved
}

// ____________________________________________________________________________
// Scans the elements of the target array, starting at `idx`.
// Returns the index of the last `,` seen in this call that separates two
// elements of the target array, or 0 if there was none. If the closing `]` of
// the target array is read, the state switches to `AfterArrayPath` and `idx`
// is left on the first character after the bracket; for an empty `arrayPath_`
// that position is returned instead of a comma index.
size_t LazyJsonParser::parseInArrayPath(size_t& idx) {
AD_CORRECTNESS_CHECK(std::holds_alternative<InArrayPath>(state_));
auto& state = std::get<InArrayPath>(state_);
size_t materializeEnd = 0;

// Leaves the array. One closing brace per key of `arrayPath_` is still
// expected afterwards. For a non-empty `arrayPath_` the value returned is the
// comma index collected so far in this call (possibly 0), although the state
// is already `AfterArrayPath`. `state` must not be used after the assignment
// to `state_`.
auto exitArrayPath = [&]() {
state_ = AfterArrayPath{.remainingBraces_ = arrayPath_.size()};
++idx;
if (arrayPath_.empty()) {
materializeEnd = idx;
}
return materializeEnd;
};

for (; idx < input_.size(); ++idx) {
switch (input_[idx]) {
case '{':
case '[':
++state.openBracketsAndBraces_;
break;
case '}':
--state.openBracketsAndBraces_;
break;
case ']':
if (state.openBracketsAndBraces_ == 0) {
// End of ArrayPath reached.
return exitArrayPath();
}
--state.openBracketsAndBraces_;
break;
case ',':
// Only a comma at nesting depth 0 separates two elements of the target
// array; commas inside nested values are ignored.
if (state.openBracketsAndBraces_ == 0) {
materializeEnd = idx;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this materializeEnd is not read, but only written, it should be a return value of this function
(together with a comment, what this function returns).

}
break;
case '"':
// Skip the literal, so that brackets and commas inside strings are not
// counted.
parseLiteral(idx);
break;
default:
break;
}
}
return materializeEnd;
}

// ____________________________________________________________________________
// Scans the part of the input that follows the target array and counts braces
// until the objects that enclose the array are closed.
// Returns the index right after the `}` that brings `remainingBraces_` to 0
// and sets `endReached_`; returns an empty optional if the input is exhausted
// before that.
std::optional<size_t> LazyJsonParser::parseAfterArrayPath(size_t& idx) {
AD_CORRECTNESS_CHECK(std::holds_alternative<AfterArrayPath>(state_));
auto& state = std::get<AfterArrayPath>(state_);

for (; idx < input_.size(); ++idx) {
joka921 marked this conversation as resolved.
Show resolved Hide resolved
switch (input_[idx]) {
case '{':
// An object that is opened after the array has to be closed as well.
state.remainingBraces_ += 1;
break;
case '}':
// NOTE(review): `remainingBraces_` is unsigned; a `}` that is read while it is
// already 0 wraps around. Presumably this needs malformed input -- confirm.
state.remainingBraces_ -= 1;
if (state.remainingBraces_ == 0) {
// End reached.
endReached_ = true;
return idx + 1;
}
break;
case '"':
// Skip the literal, so that braces inside strings are not counted.
parseLiteral(idx);
break;
default:
break;
}
}
return std::nullopt;
}

// ____________________________________________________________________________
// Builds the next result from the first `materializeEnd` characters of
// `input_` and removes the consumed part from the buffer.
//
// `materializeEnd` is either 0 (nothing can be materialized), the index of a
// `,` between two elements of the target array, or `input_.size()` once the
// end of the document has been read.
//
// Returns an empty optional if `materializeEnd` is 0, otherwise the parsed
// JSON. Every result but the first is prefixed with `prefixInArray_`; while
// the parser is still inside the array, `suffixInArray_` is appended, such
// that each result is a complete JSON document.
//
// Throws `std::runtime_error` if 1'000'000 or more characters would remain
// buffered after this call. Failures of the consistency check or of
// `nlohmann::json::parse` propagate as raised by those.
std::optional<nlohmann::json> LazyJsonParser::constructResultFromParsedChunk(
    size_t materializeEnd) {
  // Upper bound for the number of buffered, not yet materialized characters.
  constexpr size_t maxPendingSize = 1'000'000;

  // The separating comma at `materializeEnd` is dropped, the next piece starts
  // right after it. At the end of the input there is no such character.
  const size_t nextChunkStart =
      materializeEnd == 0 ? 0 : std::min(materializeEnd + 1, input_.size());
  if (input_.size() - nextChunkStart >= maxPendingSize) {
    throw std::runtime_error("Ill formed Json.");
  }
  if (nextChunkStart == 0) {
    return std::nullopt;
  }

  // The first result still contains the original beginning of the document,
  // all later ones need the opening structure to be reconstructed.
  std::string resStr = yieldCount_ > 0 ? prefixInArray_ : "";
  ++yieldCount_;

  // materializeEnd either holds the index to a `,` between two elements in the
  // arrayPath or the (non-existent) first-character after the input.
  AD_CORRECTNESS_CHECK(
      (std::holds_alternative<InArrayPath>(state_) &&
       input_[materializeEnd] == ',') ||
      (std::holds_alternative<AfterArrayPath>(state_) &&
       std::get<AfterArrayPath>(state_).remainingBraces_ == 0 &&
       input_.size() == materializeEnd));

  // Append a view of the materialized part and shrink the buffer in place;
  // neither step needs a temporary copy of the buffer.
  absl::StrAppend(&resStr,
                  std::string_view(input_).substr(0, materializeEnd));
  input_.erase(0, nextChunkStart);

  if (std::holds_alternative<InArrayPath>(state_)) {
    absl::StrAppend(&resStr, suffixInArray_);
  }

  return nlohmann::json::parse(resStr);
}

// ____________________________________________________________________________
void LazyJsonParser::BeforeArrayPath::tryAddKeyToPath(std::string_view input) {
if (optLiteral_) {
curPath_.emplace_back(
input.substr(optLiteral_.value().start_, optLiteral_.value().length_));
optLiteral_ = std::nullopt;
}
}

} // namespace ad_utility
99 changes: 99 additions & 0 deletions src/util/LazyJsonParser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
joka921 marked this conversation as resolved.
Show resolved Hide resolved
// Author: Moritz Dom ([email protected])

#pragma once

#include <optional>
#include <variant>

#include "util/Generator.h"
#include "util/json.h"

namespace ad_utility {
/*
 * A simple parser for JSON data of known structure that arrives split into
 * chunks.
 *
 * Given the path of keys to an array that holds the majority of the JSON
 * object, the parser yields the object in pieces. The input is only split
 * after a completed element of that array or at the end of the whole object.
 * Every piece is itself a complete JSON document: the nesting given by the
 * array path is reconstructed around the elements that the piece contains.
 */
class LazyJsonParser {
public:
// Parses the chunks produced by `partialJson` and yields the reconstructed
// pieces. `arrayPath` is the sequence of keys that leads to the array at which
// the input may be split.
static cppcoro::generator<nlohmann::json> parse(
cppcoro::generator<std::string> partialJson,
std::vector<std::string> arrayPath);

private:
// Only `parse` creates instances; it precomputes the prefix and suffix for
// the given path.
explicit LazyJsonParser(std::vector<std::string> arrayPath);

// Appends a chunk of JSON data to the buffered input and returns the next
// piece with reconstructed structure, or an empty optional if nothing can be
// materialized yet.
std::optional<nlohmann::json> parseChunk(std::string_view inStr);

// The following 3 methods parse the different sections before/in/after the
// arrayPath starting at the given index `idx` on the `input_` string. `idx` is
// advanced to the position where parsing has to continue.
void parseBeforeArrayPath(size_t& idx);

// Returns the index of the last `,` between array elements or 0. For an empty
// arrayPath, the index after the closing `]` is returned once it is read.
size_t parseInArrayPath(size_t& idx);

// Returns the index after the input, when reading the input is complete, and
// an empty optional otherwise.
std::optional<size_t> parseAfterArrayPath(size_t& idx);

// Parses literals in the input. On return `idx` is on the closing quote, or at
// the end of the input if the literal is not closed yet.
void parseLiteral(size_t& idx);

// Constructs the result to be returned after parsing a chunk from the first
// `materializeEnd` characters of `input_`; returns an empty optional if
// `materializeEnd` is 0.
std::optional<nlohmann::json> constructResultFromParsedChunk(
size_t materializeEnd);

// Context for the 3 parsing sections.
struct BeforeArrayPath {
// Indices of the latest parsed literal, used to add keys to the curPath_.
// `start_` is an index into `input_`, `length_` excludes the quotes.
struct LiteralView {
size_t start_{0};
size_t length_{0};
};
std::optional<LiteralView> optLiteral_;
// Keys of the objects/arrays that are currently open.
std::vector<std::string> curPath_;
// Open Brackets counter to track nested arrays.
int openBrackets_{0};

// Attempts to add a key to the current path, based on `optLiteral_`. The
// literal is cleared afterwards; without a literal this is a no-op.
void tryAddKeyToPath(std::string_view input);
};
struct InArrayPath {
// Track brackets/braces to find the end of the array. 0 means that the
// parser is directly inside the target array.
int openBracketsAndBraces_{0};
};
struct AfterArrayPath {
// Remaining braces until the end of the input-object. Has no default value
// and is set explicitly wherever this state is created.
size_t remainingBraces_;
};
// The section of the input that the parser is currently in.
std::variant<BeforeArrayPath, InArrayPath, AfterArrayPath> state_{
BeforeArrayPath()};

// Current (not yet materialized) input-string.
std::string input_;

// If the next character is escaped or not.
bool isEscaped_{false};

// If the parser is currently positioned within a literal.
bool inLiteral_{false};

// Indicates whether the end of the object has been reached.
bool endReached_{false};

// Counter for the so far returned results.
unsigned int yieldCount_{0};

// Key-path to the array containing many elements.
// NOTE: the constructor initializes the prefix/suffix below from this member,
// so it has to be declared before them.
const std::vector<std::string> arrayPath_;

// Precomputed prefix/suffix used to construct results: the opening keys plus
// `[`, and `]` plus one closing brace per key of the path.
const std::string prefixInArray_;
const std::string suffixInArray_;
};
} // namespace ad_utility
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,8 @@ addLinkAndDiscoverTest(FsstCompressorTest fsst)

addLinkAndDiscoverTest(CopyableSynchronizationTest)

# Test target for src/util/LazyJsonParser.cpp; presumably built from
# LazyJsonParserTest.cpp -- confirm against the macro definition.
addLinkAndDiscoverTest(LazyJsonParserTest)

addLinkAndDiscoverTest(CacheableGeneratorTest)

addLinkAndDiscoverTest(FilterTest engine)
Expand Down
Loading
Loading