Skip to content

Commit

Permalink
Do not throw exception in SIMDJsonExtractor parse and extract (facebo…
Browse files Browse the repository at this point in the history
…okincubator#6776)

Summary:
Pull Request resolved: facebookincubator#6776

`SIMDJsonExtractor::parse` and `SIMDJsonExtractor::extract` is throwing exception when the input is malformed, which slows down the query a lot.  Optimize the exception away by returning the `simdjson_result` directly and inspect the error code from it.

Reviewed By: kevinwilfong

Differential Revision: D49670929

fbshipit-source-id: 6a1c2a21887be091dd25dffe60ca4926b8a304df
  • Loading branch information
Yuhta authored and facebook-github-bot committed Sep 28, 2023
1 parent 2a10b67 commit ab72f0d
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 80 deletions.
5 changes: 5 additions & 0 deletions velox/common/base/Macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@
#define VELOX_UNSUPPRESS_RETURN_LOCAL_ADDR_WARNING \
_Pragma("GCC diagnostic pop");
#endif

#define VELOX_CONCAT(x, y) x##y
// Need this extra layer to expand __COUNTER__.
#define VELOX_VARNAME_IMPL(x, y) VELOX_CONCAT(x, y)
#define VELOX_VARNAME(x) VELOX_VARNAME_IMPL(x, __COUNTER__)
6 changes: 2 additions & 4 deletions velox/common/testutil/TestValue.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <unordered_map>

#include "velox/common/base/Exceptions.h"
#include "velox/common/base/Macros.h"

namespace facebook::velox::common::testutil {

Expand Down Expand Up @@ -109,11 +110,8 @@ inline void TestValue::adjust(
void* testData) {}
#endif

#define VELOX_CONCAT(x, y) __##x##y
#define VELOX_VARNAME(x) VELOX_CONCAT(x, Obj)

#define SCOPED_TESTVALUE_SET(point, ...) \
::facebook::velox::common::testutil::ScopedTestValue VELOX_VARNAME( \
__LINE__)(point, ##__VA_ARGS__)
_scopedTestValue)(point, ##__VA_ARGS__)

} // namespace facebook::velox::common::testutil
60 changes: 40 additions & 20 deletions velox/functions/prestosql/SIMDJsonFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,26 +182,32 @@ struct SIMDJsonExtractScalarFunction {
// We should just get a single value, if we see multiple, it's an error
// and we should return null.
resultStr = std::nullopt;
return;
return true;
}

resultPopulated = true;

switch (v.type()) {
case simdjson::ondemand::json_type::boolean:
resultStr = v.get_bool().value() ? "true" : "false";
SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type());
switch (vtype) {
case simdjson::ondemand::json_type::boolean: {
SIMDJSON_ASSIGN_OR_RAISE(bool vbool, v.get_bool());
resultStr = vbool ? "true" : "false";
break;
case simdjson::ondemand::json_type::string:
resultStr = v.get_string().value();
}
case simdjson::ondemand::json_type::string: {
SIMDJSON_ASSIGN_OR_RAISE(resultStr, v.get_string());
break;
}
case simdjson::ondemand::json_type::object:
case simdjson::ondemand::json_type::array:
case simdjson::ondemand::json_type::null:
// Do nothing.
break;
default:
resultStr = simdjson::to_json_string(v).value();
default: {
SIMDJSON_ASSIGN_OR_RAISE(resultStr, simdjson::to_json_string(v));
}
}
return true;
};

if (!simdJsonExtract(json, jsonPath, consumer)) {
Expand Down Expand Up @@ -237,22 +243,32 @@ struct SIMDJsonExtractFunction {
// We could just convert v to a string using to_json_string directly, but
// in that case the JSON wouldn't be parsed (it would just return the
// contents directly) and we might miss invalid JSON.
switch (v.type()) {
case simdjson::ondemand::json_type::object:
results += simdjson::to_json_string(v.get_object()).value();
SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type());
switch (vtype) {
case simdjson::ondemand::json_type::object: {
SIMDJSON_ASSIGN_OR_RAISE(
auto jsonStr, simdjson::to_json_string(v.get_object()));
results += jsonStr;
break;
case simdjson::ondemand::json_type::array:
results += simdjson::to_json_string(v.get_array()).value();
}
case simdjson::ondemand::json_type::array: {
SIMDJSON_ASSIGN_OR_RAISE(
auto jsonStr, simdjson::to_json_string(v.get_array()));
results += jsonStr;
break;
}
case simdjson::ondemand::json_type::string:
case simdjson::ondemand::json_type::number:
case simdjson::ondemand::json_type::boolean:
results += simdjson::to_json_string(v).value();
case simdjson::ondemand::json_type::boolean: {
SIMDJSON_ASSIGN_OR_RAISE(auto jsonStr, simdjson::to_json_string(v));
results += jsonStr;
break;
}
case simdjson::ondemand::json_type::null:
results += kNullString;
break;
}
return true;
};

if (!simdJsonExtract(json, jsonPath, consumer)) {
Expand Down Expand Up @@ -294,13 +310,16 @@ struct SIMDJsonSizeFunction {
// We only need the size of the actual object if there's only one
// returned, if multiple are returned we use the number of objects
// returned instead.
switch (v.type()) {
case simdjson::ondemand::json_type::object:
singleResultSize = v.count_fields().value();
SIMDJSON_ASSIGN_OR_RAISE(auto vtype, v.type());
switch (vtype) {
case simdjson::ondemand::json_type::object: {
SIMDJSON_ASSIGN_OR_RAISE(singleResultSize, v.count_fields());
break;
case simdjson::ondemand::json_type::array:
singleResultSize = v.count_elements().value();
}
case simdjson::ondemand::json_type::array: {
SIMDJSON_ASSIGN_OR_RAISE(singleResultSize, v.count_elements());
break;
}
case simdjson::ondemand::json_type::string:
case simdjson::ondemand::json_type::number:
case simdjson::ondemand::json_type::boolean:
Expand All @@ -309,6 +328,7 @@ struct SIMDJsonSizeFunction {
break;
}
}
return true;
};

if (!simdJsonExtract(json, jsonPath, consumer)) {
Expand Down
22 changes: 13 additions & 9 deletions velox/functions/prestosql/json/SIMDJsonExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ namespace facebook::velox::functions::detail {
return *it.first->second;
}

simdjson::ondemand::document SIMDJsonExtractor::parse(
const simdjson::padded_string& json) {
simdjson::simdjson_result<simdjson::ondemand::document>
SIMDJsonExtractor::parse(const simdjson::padded_string& json) {
thread_local static simdjson::ondemand::parser parser;
return parser.iterate(json);
}
Expand Down Expand Up @@ -71,29 +71,33 @@ bool SIMDJsonExtractor::tokenize(const std::string& path) {
return true;
}

void extractObject(
simdjson::ondemand::value& jsonObj,
bool extractObject(
simdjson::ondemand::value& jsonValue,
const std::string& key,
std::optional<simdjson::ondemand::value>& ret) {
for (auto field : jsonObj.get_object()) {
if (field.unescaped_key().value() == key) {
SIMDJSON_ASSIGN_OR_RAISE(auto jsonObj, jsonValue.get_object());
for (auto field : jsonObj) {
SIMDJSON_ASSIGN_OR_RAISE(auto currentKey, field.unescaped_key());
if (currentKey == key) {
ret.emplace(field.value());
return;
return true;
}
}
return true;
}

void extractArray(
bool extractArray(
simdjson::ondemand::value& jsonValue,
const std::string& index,
std::optional<simdjson::ondemand::value>& ret) {
auto jsonArray = jsonValue.get_array();
SIMDJSON_ASSIGN_OR_RAISE(auto jsonArray, jsonValue.get_array());
auto rv = folly::tryTo<int32_t>(index);
if (rv.hasValue()) {
auto val = jsonArray.at(rv.value());
if (!val.error()) {
ret.emplace(std::move(val));
}
}
return true;
}
} // namespace facebook::velox::functions::detail
80 changes: 45 additions & 35 deletions velox/functions/prestosql/json/SIMDJsonExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,21 @@
#include "folly/Range.h"
#include "folly/dynamic.h"
#include "velox/common/base/Exceptions.h"
#include "velox/common/base/Macros.h"
#include "velox/functions/prestosql/json/JsonPathTokenizer.h"
#include "velox/functions/prestosql/json/SIMDJsonWrapper.h"
#include "velox/type/StringView.h"

#define SIMDJSON_ASSIGN_OR_RAISE_IMPL(_resultName, _lhs, _rexpr) \
auto&& _resultName = (_rexpr); \
if (_resultName.error() != ::simdjson::SUCCESS) { \
return false; \
} \
_lhs = std::move(_resultName).value_unsafe();

#define SIMDJSON_ASSIGN_OR_RAISE(_lhs, _rexpr) \
SIMDJSON_ASSIGN_OR_RAISE_IMPL(VELOX_VARNAME(_simdjsonResult), _lhs, _rexpr)

namespace facebook::velox::functions {

template <typename TConsumer>
Expand All @@ -40,7 +51,7 @@ using JsonVector = std::vector<simdjson::ondemand::value>;
class SIMDJsonExtractor {
public:
template <typename TConsumer>
void extract(
bool extract(
simdjson::ondemand::value& json,
TConsumer& consumer,
size_t tokenStartIndex = 0);
Expand All @@ -50,7 +61,8 @@ class SIMDJsonExtractor {
return tokens_.empty();
}

simdjson::ondemand::document parse(const simdjson::padded_string& json);
simdjson::simdjson_result<simdjson::ondemand::document> parse(
const simdjson::padded_string& json);

private:
// Use this method to get an instance of SIMDJsonExtractor given a JSON path.
Expand Down Expand Up @@ -79,18 +91,18 @@ class SIMDJsonExtractor {
TConsumer&& consumer);
};

void extractObject(
bool extractObject(
simdjson::ondemand::value& jsonObj,
const std::string& key,
std::optional<simdjson::ondemand::value>& ret);

void extractArray(
bool extractArray(
simdjson::ondemand::value& jsonValue,
const std::string& index,
std::optional<simdjson::ondemand::value>& ret);

template <typename TConsumer>
void SIMDJsonExtractor::extract(
bool SIMDJsonExtractor::extract(
simdjson::ondemand::value& json,
TConsumer& consumer,
size_t tokenStartIndex) {
Expand All @@ -102,35 +114,43 @@ void SIMDJsonExtractor::extract(
tokenIndex++) {
auto& token = tokens_[tokenIndex];
if (input.type() == simdjson::ondemand::json_type::object) {
extractObject(input, token, result);
if (!extractObject(input, token, result)) {
return false;
}
} else if (input.type() == simdjson::ondemand::json_type::array) {
if (token == "*") {
for (auto child : input.get_array()) {
if (tokenIndex == tokens_.size() - 1) {
// If this is the last token in the path, consume each element in
// the array.
consumer(child.value());
if (!consumer(child.value())) {
return false;
}
} else {
// If not, then recursively call the extract function on each
// element in the array.
extract(child.value(), consumer, tokenIndex + 1);
if (!extract(child.value(), consumer, tokenIndex + 1)) {
return false;
}
}
}

return;
return true;
} else {
extractArray(input, token, result);
if (!extractArray(input, token, result)) {
return false;
}
}
}
if (!result) {
return;
return true;
}

input = result.value();
result.reset();
}

consumer(input);
return consumer(input);
}
} // namespace detail

Expand Down Expand Up @@ -158,30 +178,20 @@ bool simdJsonExtract(
const velox::StringView& json,
const velox::StringView& path,
TConsumer&& consumer) {
try {
// If extractor fails to parse the path, this will throw a VeloxUserError,
// and we want to let this exception bubble up to the client. We only catch
// JSON parsing failures (in which cases we return folly::none instead of
// throw).
auto& extractor = detail::SIMDJsonExtractor::getInstance(path);
simdjson::padded_string paddedJson(json.data(), json.size());
simdjson::ondemand::document jsonDoc = extractor.parse(paddedJson);

if (extractor.isRootOnlyPath()) {
// If the path is just to return the original object, call consumer on the
// document. Note, we cannot convert this to a value as this is not
// supported if the object is a scalar.
consumer(jsonDoc);
} else {
auto value = jsonDoc.get_value().value();
extractor.extract(value, std::forward<TConsumer>(consumer));
}
} catch (const simdjson::simdjson_error&) {
// simdjson might throw a conversion error while parsing the input JSON.
return false;
// If extractor fails to parse the path, this will throw a VeloxUserError, and
// we want to let this exception bubble up to the client.
auto& extractor = detail::SIMDJsonExtractor::getInstance(path);
simdjson::padded_string paddedJson(json.data(), json.size());
SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, extractor.parse(paddedJson));

if (extractor.isRootOnlyPath()) {
// If the path is just to return the original object, call consumer on the
// document. Note, we cannot convert this to a value as this is not
// supported if the object is a scalar.
return consumer(jsonDoc);
}

return true;
SIMDJSON_ASSIGN_OR_RAISE(auto value, jsonDoc.get_value());
return extractor.extract(value, std::forward<TConsumer>(consumer));
}

template <typename TConsumer>
Expand Down
Loading

0 comments on commit ab72f0d

Please sign in to comment.