diff --git a/docs/mkdocs/docs/api/basic_json/dump.md b/docs/mkdocs/docs/api/basic_json/dump.md index 41adb154d6..7e1ca0f819 100644 --- a/docs/mkdocs/docs/api/basic_json/dump.md +++ b/docs/mkdocs/docs/api/basic_json/dump.md @@ -25,10 +25,11 @@ and `ensure_ascii` parameters. result consists of ASCII characters only. `error_handler` (in) -: how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md): - `strict` (throws and exception in case a decoding error occurs; default), `replace` (replace invalid UTF-8 sequences - with U+FFFD), and `ignore` (ignore invalid UTF-8 sequences during serialization; all bytes are copied to the output - unchanged)). +: how to react on decoding errors; there are four possible values (see [`error_handler_t`](error_handler_t.md)): +: - `strict`: throws a [`type_error`](../../home/exceptions.md#type-errors) exception in case a decoding error occurs (this is the default), + - `replace`: replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER), + - `ignore`: ignore invalid UTF-8 sequences during serialization (i.e., these bytes are skipped and not copied to the output), and + - `keep`: keep invalid UTF-8 sequences during serialization (i.e., all bytes are copied to the output unchanged) ## Return value @@ -77,3 +78,4 @@ Binary values are serialized as object containing two keys: - Indentation character `indent_char`, option `ensure_ascii` and exceptions added in version 3.0.0. - Error handlers added in version 3.4.0. - Serialization of binary values added in version 3.8.0. +- Added support for error handler value `keep` in version ???. diff --git a/docs/mkdocs/docs/api/basic_json/error_handler_t.md b/docs/mkdocs/docs/api/basic_json/error_handler_t.md index dc32ced9b9..437abdc618 100644 --- a/docs/mkdocs/docs/api/basic_json/error_handler_t.md +++ b/docs/mkdocs/docs/api/basic_json/error_handler_t.md @@ -12,13 +12,16 @@ This enumeration is used in the [`dump`](dump.md) function to choose how to trea `basic_json` value. Three values are differentiated: strict -: throw a `type_error` exception in case of invalid UTF-8 +: throw a [`type_error`](../../home/exceptions.md#type-errors) exception in case of invalid UTF-8 replace : replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER) ignore -: ignore invalid UTF-8 sequences; all bytes are copied to the output unchanged +: ignore invalid UTF-8 sequences; these bytes are skipped and not copied to the output + +keep +: keep invalid UTF-8 sequences; all bytes are copied to the output unchanged ## Examples @@ -40,3 +43,4 @@ ignore ## Version history - Added in version 3.4.0. +- Added value `keep` in version ???. diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index b42a310448..5a5aac65c1 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -44,7 +44,8 @@ enum class error_handler_t { strict, ///< throw a type_error exception in case of invalid UTF-8 replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences + ignore, ///< ignore invalid UTF-8 sequences + keep ///< keep invalid UTF-8 sequences }; template @@ -398,6 +399,13 @@ class serializer std::size_t bytes_after_last_accept = 0; std::size_t undumped_chars = 0; + // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII + if (error_handler == error_handler_t::keep && !ensure_ascii) + { + o->write_characters(s.data(), s.size()); + return; + } + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -567,7 +575,23 @@ class serializer break; } - default: // LCOV_EXCL_LINE + case error_handler_t::keep: + { + // copy undumped chars to string buffer + for (std::size_t j = 0; j < undumped_chars; ++j) + { + string_buffer[bytes++] = s[bytes_after_last_accept + j]; + } + + // add erroneous byte to string buffer + string_buffer[bytes++] = s[i]; + + // continue processing the string + state = UTF8_ACCEPT; + break; + } + + default: // LCOV_EXCL_LINE JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } break; @@ -605,6 +629,20 @@ class serializer JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast(s.back() | 0))), nullptr)); } + case error_handler_t::keep: + { + // copy undumped chars to string buffer + for (std::size_t j = 0; j < undumped_chars; ++j) + { + string_buffer[bytes++] = s[bytes_after_last_accept + j]; + } + undumped_chars = 0; + + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes); + break; + } + case error_handler_t::ignore: { // write all accepted bytes @@ -628,8 +666,8 @@ class serializer break; } - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } } } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index a13dd22fd5..2fe3325184 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -18606,7 +18606,8 @@ enum class error_handler_t { strict, ///< throw a type_error exception in case of invalid UTF-8 replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences + ignore, ///< ignore invalid UTF-8 sequences + keep ///< keep invalid UTF-8 sequences }; template @@ -18960,6 +18961,13 @@ class serializer std::size_t bytes_after_last_accept = 0; std::size_t undumped_chars = 0; + // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII + if (error_handler == error_handler_t::keep && !ensure_ascii) + { + o->write_characters(s.data(), s.size()); + return; + } + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -19129,7 +19137,23 @@ class serializer break; } - default: // LCOV_EXCL_LINE + case error_handler_t::keep: + { + // copy undumped chars to string buffer + for (std::size_t j = 0; j < undumped_chars; ++j) + { + string_buffer[bytes++] = s[bytes_after_last_accept + j]; + } + + // add erroneous byte to string buffer + string_buffer[bytes++] = s[i]; + + // continue processing the string + state = UTF8_ACCEPT; + break; + } + + default: // LCOV_EXCL_LINE JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } break; @@ -19167,6 +19191,20 @@ class serializer JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast(s.back() | 0))), nullptr)); } + case error_handler_t::keep: + { + // copy undumped chars to string buffer + for (std::size_t j = 0; j < undumped_chars; ++j) + { + string_buffer[bytes++] = s[bytes_after_last_accept + j]; + } + undumped_chars = 0; + + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes); + break; + } + case error_handler_t::ignore: { // write all accepted bytes @@ -19190,8 +19228,8 @@ class serializer break; } - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } } } diff --git a/tests/src/unit-regression2.cpp b/tests/src/unit-regression2.cpp index 88a64a7b48..6cde58a51f 100644 --- a/tests/src/unit-regression2.cpp +++ b/tests/src/unit-regression2.cpp @@ -995,6 +995,15 @@ TEST_CASE("regression tests 2") CHECK(p.x == 1); CHECK(p.y == 2); } + + SECTION("issue #4552 - UTF-8 invalid characters are not always ignored when dumping with error_handler_t::ignore") + { + nlohmann::json node; + node["test"] = "test\334\005"; + CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::ignore) == "{\"test\":\"test\\u0005\"}"); + CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}"); + CHECK(node.dump(-1, ' ', true, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}"); + } } DOCTEST_CLANG_SUPPRESS_WARNING_POP diff --git a/tests/src/unit-serialization.cpp b/tests/src/unit-serialization.cpp index 201e5724cc..ee8c7b43e9 100644 --- a/tests/src/unit-serialization.cpp +++ b/tests/src/unit-serialization.cpp @@ -86,8 +86,11 @@ TEST_CASE("serialization") CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&); CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&); CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\""); CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\""); CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\""); } SECTION("ending with incomplete character") @@ -97,8 +100,11 @@ TEST_CASE("serialization") CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&); CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\""); CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\""); CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\""); } SECTION("unexpected character") @@ -107,9 +113,14 @@ TEST_CASE("serialization") CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34", json::type_error&); CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); + + // see pending discussion at #4452 CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\""); CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\""); CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\""); } SECTION("U+FFFD Substitution of Maximal Subparts") diff --git a/tests/src/unit-unicode2.cpp b/tests/src/unit-unicode2.cpp index 606de12e24..eb0a5a3fda 100644 --- a/tests/src/unit-unicode2.cpp +++ b/tests/src/unit-unicode2.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode3.cpp b/tests/src/unit-unicode3.cpp index b060f090a0..db64a252da 100644 --- a/tests/src/unit-unicode3.cpp +++ b/tests/src/unit-unicode3.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode4.cpp b/tests/src/unit-unicode4.cpp index a6a67a0297..c5d0b70342 100644 --- a/tests/src/unit-unicode4.cpp +++ b/tests/src/unit-unicode4.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); diff --git a/tests/src/unit-unicode5.cpp b/tests/src/unit-unicode5.cpp index 7cf2fc8c00..d2a70ea0ca 100644 --- a/tests/src/unit-unicode5.cpp +++ b/tests/src/unit-unicode5.cpp @@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 static std::string s_replaced2; static std::string s_replaced_ascii; static std::string s_replaced2_ascii; + static std::string s_kept; // dumping with ignore/replace must not throw in any case s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); @@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); + s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep); if (success_expected) { @@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } + // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used + CHECK(json_string == s_kept.substr(1, json_string.size())); + // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");