A lazy parser for the application/sparql-results+json format (#1412)

This parser can be used to lazily parse JSON input where most of the data resides in a single JSON array whose path is known in advance, under the additional assumption that the individual entries of that array are small, although there may be many of them. This assumption holds for the JSON format of SPARQL query results, which consist of some (small) metadata, such as the contained variables, and a large array that contains the result rows, each of which is typically small (at most one entry per variable). In the future, this parser will be used to implement a lazy `SERVICE` operation.
ad-freiburg · Aug 29, 2024 · 7dcfea2 · 7dcfea2
1 parent 34d59a5
commit 7dcfea2
Show file tree

Hide file tree

Showing 5 changed files with 470 additions and 1 deletion.
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(ConfigManager)
 add_subdirectory(MemorySize)
 add_subdirectory(http)
-add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp)
+add_library(util GeoSparqlHelpers.cpp antlr/ANTLRErrorHandling.cpp ParseException.cpp Conversions.cpp Date.cpp DateYearDuration.cpp Duration.cpp antlr/GenerateAntlrExceptionMetadata.cpp CancellationHandle.cpp StringUtils.cpp LazyJsonParser.cpp)
 qlever_target_link_libraries(util re2::re2)
diff --git a/src/util/LazyJsonParser.cpp b/src/util/LazyJsonParser.cpp
@@ -0,0 +1,270 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures.
+// Author: Moritz Dom ([email protected])
+
+#include "util/LazyJsonParser.h"
+
+#include <absl/strings/ascii.h>
+#include <absl/strings/str_cat.h>
+#include <absl/strings/str_join.h>
+
+#include <variant>
+
+#include "util/Exception.h"
+
+namespace ad_utility {
+
+// ____________________________________________________________________________
+cppcoro::generator<nlohmann::json> LazyJsonParser::parse(
+    cppcoro::generator<std::string> partialJson,
+    std::vector<std::string> arrayPath) {
+  // Feed the incoming string chunks one by one into a parser instance and
+  // yield every JSON value that the parser is able to reconstruct. The
+  // generator finishes as soon as the parser reports that the end of the
+  // object was reached, or when `partialJson` is exhausted.
+  LazyJsonParser p(std::move(arrayPath));
+  for (const auto& chunk : partialJson) {
+    std::optional<nlohmann::json> res = p.parseChunk(chunk);
+    if (!res.has_value()) {
+      // Nothing could be materialized from the input seen so far.
+      continue;
+    }
+    // Yield the contained JSON itself (as an rvalue) instead of the
+    // `std::optional` wrapper. This avoids relying on an implicit
+    // optional-to-json conversion and the copy it entails. `res` stays alive
+    // while the coroutine is suspended, so the yielded reference is valid.
+    co_yield std::move(res.value());
+    if (p.endReached_) {
+      co_return;
+    }
+  }
+}
+
+// ____________________________________________________________________________
+LazyJsonParser::LazyJsonParser(std::vector<std::string> arrayPath)
+    : arrayPath_(std::move(arrayPath)),
+      // Prefix that re-opens the nesting down to the array, i.e. one
+      // `{"<key>": ` per element of the path followed by `[`.
+      prefixInArray_([this]() {
+        std::string prefix;
+        for (const std::string& key : arrayPath_) {
+          absl::StrAppend(&prefix, "{\"", key, "\": ");
+        }
+        absl::StrAppend(&prefix, "[");
+        return prefix;
+      }()),
+      // Suffix that closes the array and one object per element of the path.
+      suffixInArray_([this]() {
+        std::string suffix{"]"};
+        suffix.append(arrayPath_.size(), '}');
+        return suffix;
+      }()) {}
+
+// ____________________________________________________________________________
+std::optional<nlohmann::json> LazyJsonParser::parseChunk(
+    std::string_view inStr) {
+  // Everything that is already stored in `input_` was scanned by previous
+  // calls, so scanning resumes at the first character of the new chunk.
+  size_t idx = input_.size();
+  absl::StrAppend(&input_, inStr);
+
+  // End-index (exclusive) of the current `input_` to construct a result. The
+  // value 0 means that nothing can be materialized yet.
+  size_t materializeEnd = 0;
+
+  // If the previous chunk ended within a Literal, finish parsing it.
+  if (inLiteral_) {
+    parseLiteral(idx);
+    ++idx;
+  }
+
+  // Resume parsing the current section. The checks are deliberately not
+  // chained with `else`, because a single chunk may move the parser through
+  // several sections.
+  if (std::holds_alternative<BeforeArrayPath>(state_)) {
+    parseBeforeArrayPath(idx);
+  }
+  if (std::holds_alternative<InArrayPath>(state_)) {
+    materializeEnd = parseInArrayPath(idx);
+  }
+  if (std::holds_alternative<AfterArrayPath>(state_)) {
+    std::optional<size_t> optEnd = parseAfterArrayPath(idx);
+    if (optEnd) {
+      materializeEnd = optEnd.value();
+    } else if (!arrayPath_.empty()) {
+      // The array was already closed, but the enclosing object is not yet
+      // complete. A `,` remembered while still inside the array must not be
+      // used as a cut position anymore: the text after it contains the
+      // closing `]`, so it cannot be combined with `prefixInArray_` and
+      // `suffixInArray_` into valid JSON, and the consistency check in
+      // `constructResultFromParsedChunk` would fail. Keep everything buffered
+      // until the end of the object has been read.
+      materializeEnd = 0;
+    }
+  }
+
+  return constructResultFromParsedChunk(materializeEnd);
+}
+
+// ____________________________________________________________________________
+void LazyJsonParser::parseLiteral(size_t& idx) {
+  AD_CORRECTNESS_CHECK(inLiteral_ || input_[idx] == '"');
+  // Only while searching for the array path the position of the literal is
+  // recorded, because it might be a key that belongs to the path.
+  auto* beforeArrayPath = std::get_if<BeforeArrayPath>(&state_);
+
+  // A fresh literal: step over the opening quote and remember where it starts.
+  if (!inLiteral_ && input_[idx] == '"') {
+    ++idx;
+    if (beforeArrayPath != nullptr) {
+      beforeArrayPath->optLiteral_ =
+          BeforeArrayPath::LiteralView{.start_ = idx, .length_ = 0};
+    }
+    inLiteral_ = true;
+  }
+
+  // Scan for the closing quote. On return `idx` either points at that quote,
+  // or equals `input_.size()` if the literal continues in the next chunk.
+  while (idx < input_.size()) {
+    const char current = input_[idx];
+    if (isEscaped_) {
+      // The previous character was a backslash, so this one has no meaning.
+      isEscaped_ = false;
+    } else if (current == '\\') {
+      isEscaped_ = true;
+    } else if (current == '"') {
+      // End of literal.
+      if (beforeArrayPath != nullptr) {
+        auto& literal = beforeArrayPath->optLiteral_.value();
+        literal.length_ = idx - literal.start_;
+      }
+      inLiteral_ = false;
+      return;
+    }
+    ++idx;
+  }
+}
+
+// ____________________________________________________________________________
+void LazyJsonParser::parseBeforeArrayPath(size_t& idx) {
+  AD_CORRECTNESS_CHECK(std::holds_alternative<BeforeArrayPath>(state_));
+  BeforeArrayPath& before = std::get<BeforeArrayPath>(state_);
+
+  // Walk over the input and keep track of the current key path until the
+  // opening bracket of the array at `arrayPath_` is found.
+  while (idx < input_.size()) {
+    const char current = input_[idx];
+    if (current == '"') {
+      parseLiteral(idx);
+    } else if (current == '{') {
+      before.tryAddKeyToPath(input_);
+    } else if (current == '[') {
+      // Only an outermost array contributes a key to the path.
+      if (before.openBrackets_ == 0) {
+        before.tryAddKeyToPath(input_);
+      }
+      ++before.openBrackets_;
+      if (before.curPath_ == arrayPath_) {
+        // Reached arrayPath. `before` is invalid after the next statement,
+        // hence the immediate return.
+        state_ = InArrayPath();
+        ++idx;
+        return;
+      }
+    } else if (current == ']') {
+      --before.openBrackets_;
+      if (before.openBrackets_ == 0 && !before.curPath_.empty()) {
+        before.curPath_.pop_back();
+      }
+    } else if (current == '}') {
+      if (!before.curPath_.empty()) {
+        before.curPath_.pop_back();
+      }
+    }
+    ++idx;
+  }
+}
+
+// ____________________________________________________________________________
+size_t LazyJsonParser::parseInArrayPath(size_t& idx) {
+  AD_CORRECTNESS_CHECK(std::holds_alternative<InArrayPath>(state_));
+  auto& inArray = std::get<InArrayPath>(state_);
+  // Index of the last `,` seen between two elements of the array (0 if none).
+  size_t lastSeparator = 0;
+
+  while (idx < input_.size()) {
+    const char current = input_[idx];
+    if (current == '"') {
+      parseLiteral(idx);
+    } else if (current == '{' || current == '[') {
+      ++inArray.openBracketsAndBraces_;
+    } else if (current == '}') {
+      --inArray.openBracketsAndBraces_;
+    } else if (current == ']') {
+      if (inArray.openBracketsAndBraces_ == 0) {
+        // End of ArrayPath reached. `inArray` is invalid after the state
+        // change, so nothing but `idx` is touched before returning.
+        state_ = AfterArrayPath{.remainingBraces_ = arrayPath_.size()};
+        ++idx;
+        // With an empty path the array is the whole input, which therefore
+        // ends right behind the closing bracket.
+        return arrayPath_.empty() ? idx : lastSeparator;
+      }
+      --inArray.openBracketsAndBraces_;
+    } else if (current == ',' && inArray.openBracketsAndBraces_ == 0) {
+      lastSeparator = idx;
+    }
+    ++idx;
+  }
+  return lastSeparator;
+}
+
+// ____________________________________________________________________________
+std::optional<size_t> LazyJsonParser::parseAfterArrayPath(size_t& idx) {
+  AD_CORRECTNESS_CHECK(std::holds_alternative<AfterArrayPath>(state_));
+  size_t& openBraces = std::get<AfterArrayPath>(state_).remainingBraces_;
+
+  // Count braces (outside of literals) until the outermost object is closed.
+  while (idx < input_.size()) {
+    const char current = input_[idx];
+    if (current == '"') {
+      parseLiteral(idx);
+    } else if (current == '{') {
+      ++openBraces;
+    } else if (current == '}') {
+      --openBraces;
+      if (openBraces == 0) {
+        // End reached, report the position right behind the closing brace.
+        endReached_ = true;
+        return idx + 1;
+      }
+    }
+    ++idx;
+  }
+  return std::nullopt;
+}
+
+// ____________________________________________________________________________
+std::optional<nlohmann::json> LazyJsonParser::constructResultFromParsedChunk(
+    size_t materializeEnd) {
+  // Upper bound for the number of bytes that may remain buffered without
+  // being materialized. Reaching it is reported as ill-formed input.
+  constexpr size_t maxUnmaterializedBytes = 1'000'000;
+
+  // The character at `materializeEnd` (a `,`) is dropped, so the remaining
+  // input starts one position later. A value of 0 means "nothing to yield".
+  const size_t nextChunkStart =
+      materializeEnd == 0 ? 0 : std::min(materializeEnd + 1, input_.size());
+  if (input_.size() - nextChunkStart >= maxUnmaterializedBytes) {
+    throw std::runtime_error("Ill formed Json.");
+  }
+  if (nextChunkStart == 0) {
+    return std::nullopt;
+  }
+
+  // Every result but the first has to re-open the path down to the array.
+  std::string resStr = yieldCount_ > 0 ? prefixInArray_ : "";
+  ++yieldCount_;
+
+  // materializeEnd either holds the index to a `,` between two elements in the
+  // arrayPath or the (non-existent) first-character after the input.
+  AD_CORRECTNESS_CHECK(
+      (std::holds_alternative<InArrayPath>(state_) &&
+       input_[materializeEnd] == ',') ||
+      (std::holds_alternative<AfterArrayPath>(state_) &&
+       std::get<AfterArrayPath>(state_).remainingBraces_ == 0 &&
+       input_.size() == materializeEnd));
+
+  // Move the materialized part into the result and drop it (including the
+  // separator) from the buffer. Both steps work in place and thus avoid the
+  // temporary strings that `substr` would create.
+  resStr.append(input_, 0, materializeEnd);
+  input_.erase(0, nextChunkStart);
+
+  // While still inside the array, it has to be closed together with all the
+  // objects on the path to obtain valid JSON.
+  if (std::holds_alternative<InArrayPath>(state_)) {
+    absl::StrAppend(&resStr, suffixInArray_);
+  }
+
+  return nlohmann::json::parse(resStr);
+}
+
+// ____________________________________________________________________________
+void LazyJsonParser::BeforeArrayPath::tryAddKeyToPath(std::string_view input) {
+  // Without a recorded literal there is no key that could be added.
+  if (!optLiteral_.has_value()) {
+    return;
+  }
+  // Append the recorded literal as the next key and forget it afterwards, so
+  // that it is not added a second time.
+  const auto [start, length] = *optLiteral_;
+  curPath_.emplace_back(input.substr(start, length));
+  optLiteral_.reset();
+}
+
+}  // namespace ad_utility
diff --git a/src/util/LazyJsonParser.h b/src/util/LazyJsonParser.h
@@ -0,0 +1,99 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures.
+// Author: Moritz Dom ([email protected])
+
+#pragma once
+
+#include <optional>
+#include <variant>
+
+#include "util/Generator.h"
+#include "util/json.h"
+
+namespace ad_utility {
+/*
+ * A simple parser for split up JSON-data with known structure.
+ *
+ * Given the path to an array containing the majority of the given JSON-object,
+ * the Parser will yield chunks of the object separated after completed elements
+ * in the arrayPath or after reading the entire object itself.
+ */
+class LazyJsonParser {
+ public:
+  // Parse the JSON arriving in string chunks and yield it as valid JSON pieces.
+  static cppcoro::generator<nlohmann::json> parse(
+      cppcoro::generator<std::string> partialJson,
+      std::vector<std::string> arrayPath);
+
+ private:
+  explicit LazyJsonParser(std::vector<std::string> arrayPath);
+
+  // Appends `inStr` to the buffered input and returns a result if one is ready.
+  std::optional<nlohmann::json> parseChunk(std::string_view inStr);
+
+  // The following 3 methods parse the different sections before/in/after the
+  // arrayPath starting at the given index `idx` on the `input_` string.
+  void parseBeforeArrayPath(size_t& idx);
+
+  // Returns the index of the last `,` between array elements or 0.
+  size_t parseInArrayPath(size_t& idx);
+
+  // Returns the index after the final `}` of the object, else `std::nullopt`.
+  std::optional<size_t> parseAfterArrayPath(size_t& idx);
+
+  // Advances `idx` to the closing `"` of a literal, or to the end of `input_`.
+  void parseLiteral(size_t& idx);
+
+  // Constructs the result to be returned after parsing a chunk.
+  std::optional<nlohmann::json> constructResultFromParsedChunk(
+      size_t materializeEnd);
+
+  // Context for the 3 parsing sections.
+  struct BeforeArrayPath {
+    // Indices of the latest parsed literal, used to add keys to the curPath_.
+    struct LiteralView {
+      size_t start_{0};
+      size_t length_{0};
+    };
+    std::optional<LiteralView> optLiteral_;
+    std::vector<std::string> curPath_;
+    // Open Brackets counter to track nested arrays.
+    int openBrackets_{0};
+
+    // Adds the literal in `optLiteral_` (if any) to `curPath_` and resets it.
+    void tryAddKeyToPath(std::string_view input);
+  };
+  struct InArrayPath {
+    // Track brackets/braces to find the end of the array.
+    int openBracketsAndBraces_{0};
+  };
+  struct AfterArrayPath {
+    // Remaining braces until the end of the input-object.
+    size_t remainingBraces_;
+  };
+  std::variant<BeforeArrayPath, InArrayPath, AfterArrayPath> state_{
+      BeforeArrayPath()};
+
+  // Current (not yet materialized) input-string.
+  std::string input_;
+
+  // Whether the next character inside a literal is escaped by a backslash.
+  bool isEscaped_{false};
+
+  // If the parser is currently positioned within a literal.
+  bool inLiteral_{false};
+
+  // Indicates whether the end of the object has been reached.
+  bool endReached_{false};
+
+  // Counter for the so far returned results.
+  unsigned int yieldCount_{0};
+
+  // Key-path to the array containing many elements.
+  const std::vector<std::string> arrayPath_;
+
+  // Precomputed prefix/suffix used to construct results.
+  const std::string prefixInArray_;
+  const std::string suffixInArray_;
+};
+}  // namespace ad_utility
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -395,6 +395,8 @@ addLinkAndDiscoverTest(FsstCompressorTest fsst)
 
 addLinkAndDiscoverTest(CopyableSynchronizationTest)
 
+addLinkAndDiscoverTest(LazyJsonParserTest)
+
 addLinkAndDiscoverTest(CacheableGeneratorTest)
 
 addLinkAndDiscoverTest(FilterTest engine)