Skip to content

Commit

Permalink
Port conversion from jsonPath to json pointer path
Browse files Browse the repository at this point in the history
  • Loading branch information
PHILO-HE committed Mar 11, 2024
1 parent c28902d commit 8eaa089
Showing 1 changed file with 61 additions and 20 deletions.
81 changes: 61 additions & 20 deletions velox/functions/sparksql/SIMDJsonFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ struct SIMDGetJsonObjectFunction {
const arg_type<Varchar>* /*json*/,
const arg_type<Varchar>* jsonPath) {
if (jsonPath != nullptr) {
formattedJsonPath_ = getFormattedJsonPath(*jsonPath);
if (jsonPath->size() > 1 && jsonPath->data()[0] == '$') {
formattedJsonPath_ = getJsonPointerPath(
std::string_view(jsonPath->data() + 1, jsonPath->size() - 1));
}
}
}

Expand All @@ -41,14 +44,17 @@ struct SIMDGetJsonObjectFunction {
const arg_type<Varchar>& json,
const arg_type<Varchar>& jsonPath) {
// Spark requires that the first char in jsonPath be '$'.
if (jsonPath.data()[0] != '$') {
if (jsonPath.size() < 2 || jsonPath.data()[0] != '$') {
return false;
}
ParserContext ctx(json.data(), json.size());
ctx.parseDocument();
auto rawResult = formattedJsonPath_.has_value()
? ctx.jsonDoc.at_pointer(formattedJsonPath_.value().data())
: ctx.jsonDoc.at_pointer(getFormattedJsonPath(jsonPath).data());
: ctx.jsonDoc.at_pointer(
getJsonPointerPath(
std::string_view(jsonPath.data() + 1, jsonPath.size() - 1))
.data());
if (rawResult.error()) {
return false;
}
Expand All @@ -63,28 +69,63 @@ struct SIMDGetJsonObjectFunction {
}

private:
// Makes a conversion from Spark's json path to json pointer, e.g., converts
// "$.a.b" to "/a/b".
// Converts Spark's JSON path to a JSON pointer path, e.g.,
// converts "$.a.b" to "/a/b".
// See simdjson link:
// https://github.com/simdjson/simdjson/blob/master/doc/dom.md#json-pointer
FOLLY_ALWAYS_INLINE std::string getFormattedJsonPath(
const arg_type<Varchar>& jsonPath) {
// Ignore '$'.
char formattedJsonPath[jsonPath.size()];
int j = 0;
for (int i = 1; i < jsonPath.size(); i++) {
if (jsonPath.data()[i] == ']' || jsonPath.data()[i] == '\'') {
continue;
} else if (jsonPath.data()[i] == '[' || jsonPath.data()[i] == '.') {
formattedJsonPath[j] = '/';
j++;
// Copied from:
// https://github.com/simdjson/simdjson/blob/master/include/simdjson/generic/ondemand/json_path_to_pointer_conversion-inl.h
FOLLY_ALWAYS_INLINE std::string getJsonPointerPath(
const std::string_view jsonPath) {
if (jsonPath.empty() ||
(jsonPath.front() != '.' && jsonPath.front() != '[')) {
return "-1"; // This is just a sentinel value, the caller should check for
// this and return an error.
}

std::string result;
// Reserve space to reduce allocations, adjusting for potential increases
// due to escaping.
result.reserve(jsonPath.size() * 2);

size_t i = 0;

while (i < jsonPath.length()) {
if (jsonPath[i] == '.') {
result += '/';
} else if (jsonPath[i] == '[') {
result += '/';
++i; // Move past the '['
while (i < jsonPath.length() && jsonPath[i] != ']') {
if (jsonPath[i] == '~') {
result += "~0";
} else if (jsonPath[i] == '/') {
result += "~1";
} else if (jsonPath[i] == '\'') {
++i;
continue;
} else {
result += jsonPath[i];
}
++i;
}
if (i == jsonPath.length() || jsonPath[i] != ']') {
return "-1"; // Using sentinel value that will be handled as an error
// by the caller.
}
} else {
formattedJsonPath[j] = jsonPath.data()[i];
j++;
if (jsonPath[i] == '~') {
result += "~0";
} else if (jsonPath[i] == '/') {
result += "~1";
} else {
result += jsonPath[i];
}
}
++i;
}
formattedJsonPath[j] = '\0';
return std::string(formattedJsonPath, j + 1);

return result;
}

FOLLY_ALWAYS_INLINE simdjson::error_code extractStringResult(
Expand Down

0 comments on commit 8eaa089

Please sign in to comment.