Skip to content

Commit cc5c25b

Browse files
Krishna Paifacebook-github-bot
authored andcommitted
Fix (json) : Json Parse validation to be more comprehensive in certain json functions (facebookincubator#14558)
Summary: Pull Request resolved: facebookincubator#14558 Certain json functions (json_extract_scalar, is_json_scalar, json_array_contains etc) relied on simdjson's parseDocument to validate that the json is valid. simdjson's parse is not fool proof and there are several cases it will return as if the document is valid when in fact it is not. This change does a more comprehensive parse that ensures the document is valid. NOTE: Previously invalid json's like [01], or [9.6E400] would give results, with this change though these jsons which are invalid will result in a null value. Reviewed By: kevinwilfong Differential Revision: D80502803
1 parent 37d26c0 commit cc5c25b

File tree

4 files changed

+135
-13
lines changed

4 files changed

+135
-13
lines changed

velox/functions/prestosql/JsonFunctions.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,8 +862,52 @@ class JsonInternalCastFunction : public exec::VectorFunction {
862862
mutable JsonCastOperator jsonCastOperator_;
863863
};
864864

865+
template <typename T>
866+
static simdjson::error_code validate(T value) {
867+
SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type());
868+
switch (type) {
869+
case simdjson::ondemand::json_type::array: {
870+
SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array());
871+
for (auto elementOrError : array) {
872+
SIMDJSON_ASSIGN_OR_RAISE(auto element, elementOrError);
873+
SIMDJSON_TRY(validate(element));
874+
}
875+
return simdjson::SUCCESS;
876+
}
877+
case simdjson::ondemand::json_type::object: {
878+
SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object());
879+
for (auto fieldOrError : object) {
880+
SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldOrError);
881+
SIMDJSON_TRY(validate(field.value()));
882+
}
883+
return simdjson::SUCCESS;
884+
}
885+
case simdjson::ondemand::json_type::number:
886+
return value.get_double().error();
887+
case simdjson::ondemand::json_type::string:
888+
return value.get_string().error();
889+
case simdjson::ondemand::json_type::boolean:
890+
return value.get_bool().error();
891+
case simdjson::ondemand::json_type::null: {
892+
SIMDJSON_ASSIGN_OR_RAISE(auto isNull, value.is_null());
893+
return isNull ? simdjson::SUCCESS : simdjson::N_ATOM_ERROR;
894+
}
895+
}
896+
VELOX_UNREACHABLE();
897+
}
898+
865899
} // namespace
866900

901+
simdjson::error_code jsonParsingError(const simdjson::padded_string& json) {
902+
SIMDJSON_ASSIGN_OR_RAISE(auto doc, simdjsonParse(json));
903+
SIMDJSON_TRY(validate<simdjson::ondemand::document&>(doc));
904+
if (!doc.at_end()) {
905+
return simdjson::TRAILING_CONTENT;
906+
}
907+
908+
return simdjson::SUCCESS;
909+
}
910+
867911
VELOX_DECLARE_VECTOR_FUNCTION(
868912
udf_json_format,
869913
JsonFormatFunction::signatures(),

velox/functions/prestosql/JsonFunctions.h

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@
2525

2626
namespace facebook::velox::functions {
2727

28+
/// Utility function that takes a padded string and determines whether
29+
/// the json is valid or not.
30+
/// @returns simdjson::SUCCESS if passed json is valid else returns a
31+
/// failure code.
32+
simdjson::error_code jsonParsingError(const simdjson::padded_string& json);
33+
2834
template <typename T>
2935
struct IsJsonScalarFunction {
3036
VELOX_DEFINE_FUNCTION_TYPES(T);
@@ -33,14 +39,17 @@ struct IsJsonScalarFunction {
3339
simdjson::ondemand::document jsonDoc;
3440

3541
simdjson::padded_string paddedJson(json.data(), json.size());
36-
if (auto errorCode = simdjsonParse(paddedJson).get(jsonDoc)) {
42+
if (auto errorCode = jsonParsingError(paddedJson)) {
3743
if (threadSkipErrorDetails()) {
3844
return Status::UserError();
3945
}
4046
return Status::UserError(
4147
"Failed to parse JSON: {}", simdjson::error_message(errorCode));
4248
}
4349

50+
if (simdjsonParse(paddedJson).get(jsonDoc)) {
51+
return Status::UserError("Failed to parse JSON");
52+
}
4453
result =
4554
(jsonDoc.type() == simdjson::ondemand::json_type::number ||
4655
jsonDoc.type() == simdjson::ondemand::json_type::string ||
@@ -68,9 +77,13 @@ struct JsonArrayContainsFunction {
6877
simdjson::ondemand::document jsonDoc;
6978

7079
simdjson::padded_string paddedJson(json.data(), json.size());
80+
if (jsonParsingError(paddedJson)) {
81+
return false;
82+
}
7183
if (simdjsonParse(paddedJson).get(jsonDoc)) {
7284
return false;
7385
}
86+
7487
if (jsonDoc.type().error()) {
7588
return false;
7689
}
@@ -133,6 +146,9 @@ struct JsonArrayGetFunction {
133146
simdjson::ondemand::document jsonDoc;
134147

135148
simdjson::padded_string paddedJson(jsonArray.data(), jsonArray.size());
149+
if (jsonParsingError(paddedJson)) {
150+
return false;
151+
}
136152
if (simdjsonParse(paddedJson).get(jsonDoc)) {
137153
return false;
138154
}
@@ -229,12 +245,8 @@ struct JsonExtractScalarFunction {
229245

230246
// Check for valid json
231247
simdjson::padded_string paddedJson(json.data(), json.size());
232-
{
233-
SIMDJSON_ASSIGN_OR_RAISE(auto jsonDoc, simdjsonParse(paddedJson));
234-
simdjson::ondemand::document parsedDoc;
235-
if (simdjsonParse(paddedJson).get(parsedDoc)) {
236-
return simdjson::TAPE_ERROR;
237-
}
248+
if (auto val = jsonParsingError(paddedJson)) {
249+
return val;
238250
}
239251

240252
bool isDefinitePath = true;
@@ -328,6 +340,10 @@ struct JsonArrayLengthFunction {
328340
simdjson::ondemand::document jsonDoc;
329341

330342
simdjson::padded_string paddedJson(json.data(), json.size());
343+
// Check for valid json
344+
if (jsonParsingError(paddedJson)) {
345+
return false;
346+
}
331347
if (simdjsonParse(paddedJson).get(jsonDoc)) {
332348
return false;
333349
}

velox/functions/prestosql/tests/JsonExtractScalarTest.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,18 @@ TEST_F(JsonExtractScalarTest, wildcardSelect) {
239239
std::nullopt);
240240
}
241241

242+
TEST_F(JsonExtractScalarTest, nanJson) {
243+
std::vector<std::string> badReplacements = {"garbage", "NaN", "}", "0/0"};
244+
245+
for (const auto& badReplacement : badReplacements) {
246+
std::string js = fmt::format(
247+
fmt::runtime(
248+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
249+
badReplacement);
250+
EXPECT_EQ(jsonExtractScalar(js, "$.over_occlusion_rate"), std::nullopt);
251+
}
252+
}
253+
242254
} // namespace
243255

244256
} // namespace facebook::velox::functions::prestosql

velox/functions/prestosql/tests/JsonFunctionsTest.cpp

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,12 @@ class JsonFunctionsTest : public functions::test::FunctionBaseTest {
175175
return evaluateJsonVectorFunction("json_array_get", json, index, wrapInTry);
176176
}
177177

178+
const std::vector<std::string> badReplacements_ = {
179+
"garbage",
180+
"NaN",
181+
"}",
182+
"0/0"};
183+
178184
private:
179185
// Utility function to evaluate a function both with and without constant
180186
// inputs. Ensures that the results are same in both cases. 'wrapInTry' is
@@ -684,6 +690,16 @@ TEST_F(JsonFunctionsTest, isJsonScalar) {
684690
EXPECT_EQ(isJsonScalar(R"({"k1":""})"), false);
685691
}
686692

693+
TEST_F(JsonFunctionsTest, isJsonScalarBadJsons) {
694+
for (const auto& badReplacement : badReplacements_) {
695+
std::string js = fmt::format(
696+
fmt::runtime(
697+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
698+
badReplacement);
699+
EXPECT_THROW(isJsonScalar(js), VeloxUserError);
700+
}
701+
}
702+
687703
TEST_F(JsonFunctionsTest, jsonArrayLength) {
688704
EXPECT_EQ(jsonArrayLength(R"([])"), 0);
689705
EXPECT_EQ(jsonArrayLength(R"([1])"), 1);
@@ -703,6 +719,14 @@ TEST_F(JsonFunctionsTest, jsonArrayLength) {
703719

704720
// Malformed Json.
705721
EXPECT_EQ(jsonArrayLength(R"((})"), std::nullopt);
722+
723+
for (const auto& badReplacement : badReplacements_) {
724+
std::string js = fmt::format(
725+
fmt::runtime(
726+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
727+
badReplacement);
728+
EXPECT_EQ(jsonArrayLength(js), std::nullopt);
729+
}
706730
}
707731

708732
TEST_F(JsonFunctionsTest, jsonArrayGet) {
@@ -753,6 +777,15 @@ TEST_F(JsonFunctionsTest, jsonArrayGet) {
753777
EXPECT_FALSE(arrayGet("[1, 2, ...", 1, type).has_value());
754778
EXPECT_FALSE(arrayGet("not json", 1, type).has_value());
755779
}
780+
781+
// Test it fails on bad jsons
782+
for (const auto& badReplacement : badReplacements_) {
783+
std::string js = fmt::format(
784+
fmt::runtime(
785+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
786+
badReplacement);
787+
EXPECT_EQ(arrayGet(js.data(), 1, JSON()), std::nullopt);
788+
}
756789
}
757790

758791
TEST_F(JsonFunctionsTest, jsonArrayContainsBool) {
@@ -792,8 +825,9 @@ true, true, true, true, true, true, true, true, true, true, true])",
792825

793826
// Test errors of getting the specified type of json value.
794827
// Error code is "INCORRECT_TYPE".
795-
EXPECT_EQ(jsonArrayContains<bool>(R"([truet])", false), false);
796-
EXPECT_EQ(jsonArrayContains<bool>(R"([truet, false])", false), true);
828+
// Bad jsons will return null.
829+
EXPECT_EQ(jsonArrayContains<bool>(R"([truet])", false), std::nullopt);
830+
EXPECT_EQ(jsonArrayContains<bool>(R"([truet, false])", false), std::nullopt);
797831
}
798832

799833
TEST_F(JsonFunctionsTest, jsonArrayContainsBigint) {
@@ -841,8 +875,8 @@ TEST_F(JsonFunctionsTest, jsonArrayContainsBigint) {
841875
EXPECT_EQ(
842876
jsonArrayContains<int64_t>(R"([-9223372036854775809,-9])", -9), true);
843877
// Error code is "NUMBER_ERROR".
844-
EXPECT_EQ(jsonArrayContains<int64_t>(R"([01])", 4), false);
845-
EXPECT_EQ(jsonArrayContains<int64_t>(R"([01, 4])", 4), true);
878+
EXPECT_EQ(jsonArrayContains<int64_t>(R"([01])", 4), std::nullopt);
879+
EXPECT_EQ(jsonArrayContains<int64_t>(R"([01, 4])", 4), std::nullopt);
846880
}
847881

848882
TEST_F(JsonFunctionsTest, jsonArrayContainsDouble) {
@@ -890,8 +924,8 @@ TEST_F(JsonFunctionsTest, jsonArrayContainsDouble) {
890924

891925
// Test errors of getting the specified type of json value.
892926
// Error code is "NUMBER_ERROR".
893-
EXPECT_EQ(jsonArrayContains<double>(R"([9.6E400])", 4.2), false);
894-
EXPECT_EQ(jsonArrayContains<double>(R"([9.6E400,4.2])", 4.2), true);
927+
EXPECT_EQ(jsonArrayContains<double>(R"([9.6E400])", 4.2), std::nullopt);
928+
EXPECT_EQ(jsonArrayContains<double>(R"([9.6E400,4.2])", 4.2), std::nullopt);
895929
}
896930

897931
TEST_F(JsonFunctionsTest, jsonArrayContainsString) {
@@ -959,6 +993,14 @@ TEST_F(JsonFunctionsTest, jsonArrayContainsMalformed) {
959993
evaluateOnce<bool>(
960994
"json_array_contains(c0, 'a')", makeRowVector({jsonVector})),
961995
std::nullopt);
996+
997+
for (const auto& badReplacement : badReplacements_) {
998+
std::string js = fmt::format(
999+
fmt::runtime(
1000+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
1001+
badReplacement);
1002+
EXPECT_EQ(jsonArrayContains<std::string>(js, {""}), std::nullopt);
1003+
}
9621004
}
9631005

9641006
TEST_F(JsonFunctionsTest, jsonSize) {
@@ -975,6 +1017,14 @@ TEST_F(JsonFunctionsTest, jsonSize) {
9751017
jsonSize(
9761018
R"({"k1":{"k2": 999, "k3": [{"k4": [1, 2, 3]}]}})", "$.k1.k3[0].k4"),
9771019
3);
1020+
1021+
for (const auto& badReplacement : badReplacements_) {
1022+
std::string js = fmt::format(
1023+
fmt::runtime(
1024+
"{{\"hands_v1\": {}, \"over_occlusion_rate\": 0.0358322490205352}}"),
1025+
badReplacement);
1026+
EXPECT_EQ(jsonSize(js, "$.k1"), std::nullopt);
1027+
}
9781028
}
9791029

9801030
TEST_F(JsonFunctionsTest, invalidPath) {

0 commit comments

Comments
 (0)