From b4da6e1f3a18e3d72c27497895af56aaa2b233a0 Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Sun, 12 Jun 2022 10:00:44 +0300 Subject: [PATCH 1/5] document dependency on Boost.Endian --- README.md | 4 ++++ doc/qbk/overview.qbk | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/README.md b/README.md index fb7ac3453..ffed0b258 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,10 @@ building the library or including the function definitions: #define BOOST_JSON_STACK_BUFFER_SIZE 1024 #include ``` +### Endianness + +Boost.JSON uses [Boost.Endian](https://www.boost.org/doc/libs/release/libs/endian/doc/html/endian.html) +in order to support both little endian and big endian platforms. ### Supported Compilers diff --git a/doc/qbk/overview.qbk b/doc/qbk/overview.qbk index 9de3cd724..d4f564b8d 100644 --- a/doc/qbk/overview.qbk +++ b/doc/qbk/overview.qbk @@ -120,6 +120,12 @@ building the library or including the function definitions: #include ``` +[heading Endianness] + +Boost.JSON uses +[@https://www.boost.org/doc/libs/release/libs/endian/doc/html/endian.html +Boost.Endian] in order to support both little endian and big endian platforms. + [heading Supported Compilers] Boost.JSON has been tested with the following compilers: From a5ee70b5ed555a2d1a2e4700219956342f97eca7 Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Sun, 9 Jul 2023 14:45:40 +0300 Subject: [PATCH 2/5] utf-8 masks depend on endianness Rather than reversing 32 bit numbers, when checking if the analysed code units represent a valid utf-8 encoding, we change the constants based on the system's endianness. 
--- include/boost/json/detail/utf8.hpp | 63 +++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp index 74434773e..042e242fe 100644 --- a/include/boost/json/detail/utf8.hpp +++ b/include/boost/json/detail/utf8.hpp @@ -21,6 +21,30 @@ namespace boost { namespace json { namespace detail { +template +constexpr +std::uint32_t +make_u32_impl(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) +{ + return (b4 << 24) | (b3 << 16) | (b2 << 8) | b1; +} + +template<> +constexpr +std::uint32_t +make_u32_impl( + std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) +{ + return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4; +} + +constexpr +std::uint32_t +make_u32(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) +{ + return make_u32_impl(b4, b3, b2, b1); +} + template std::uint32_t load_little_endian(void const* p) @@ -70,7 +94,7 @@ inline bool is_valid_utf8(const char* p, uint16_t first) { - uint32_t v; + std::uint32_t v; switch(first >> 8) { default: @@ -78,38 +102,47 @@ is_valid_utf8(const char* p, uint16_t first) // 2 bytes, second byte [80, BF] case 1: - v = load_little_endian<2>(p); - return (v & 0xC000) == 0x8000; + std::memcpy(&v, p, 2); + return ( v & make_u32(0x00,0x00,0xC0,0x00) ) + == make_u32(0x00,0x00,0x80,0x00); // 3 bytes, second byte [A0, BF] case 2: - v = load_little_endian<3>(p); - return (v & 0xC0E000) == 0x80A000; + std::memcpy(&v, p, 3); + return ( v & make_u32(0x00,0xC0,0xE0,0x00) ) + == make_u32(0x00,0x80,0xA0,0x00); // 3 bytes, second byte [80, BF] case 3: - v = load_little_endian<3>(p); - return (v & 0xC0C000) == 0x808000; + std::memcpy(&v, p, 3); + return ( v & make_u32(0x00,0xC0,0xC0,0x00) ) + == make_u32(0x00,0x80,0x80,0x00); // 3 bytes, second byte [80, 9F] case 4: - v = load_little_endian<3>(p); - return (v & 0xC0E000) == 0x808000; + std::memcpy(&v, p, 3); + return ( v & 
make_u32(0x00,0xC0,0xE0,0x00) ) + == make_u32(0x00,0x80,0x80,0x00); // 4 bytes, second byte [90, BF] case 5: - v = load_little_endian<4>(p); - return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; + std::memcpy(&v, p, 4); + return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) + + make_u32(0x7F,0x7F,0x70,0xFF) ) + | make_u32(0x00,0x00,0x30,0xFF) ) + == make_u32(0x00,0x00,0x30,0xFF); // 4 bytes, second byte [80, BF] case 6: - v = load_little_endian<4>(p); - return (v & 0xC0C0C000) == 0x80808000; + std::memcpy(&v, p, 4); + return ( v & make_u32(0xC0,0xC0,0xC0,0x00) ) + == make_u32(0x80,0x80,0x80,0x00); // 4 bytes, second byte [80, 8F] case 7: - v = load_little_endian<4>(p); - return (v & 0xC0C0F000) == 0x80808000; + std::memcpy(&v, p, 4); + return ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) + == make_u32(0x80,0x80,0x80,0x00); } } From 3a3508d87e1308ab9cf8dd2ef3d39274c48a3eaf Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Wed, 21 Feb 2024 13:20:52 +0300 Subject: [PATCH 3/5] temp --- include/boost/json/detail/utf8.hpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp index 042e242fe..4b921861c 100644 --- a/include/boost/json/detail/utf8.hpp +++ b/include/boost/json/detail/utf8.hpp @@ -45,6 +45,22 @@ make_u32(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) return make_u32_impl(b4, b3, b2, b1); } +template +constexpr +std::uint32_t +utf8_case5_special_number() +{ + return make_u32(0x00,0x00,0x30,0xFF); +} + +template<> +constexpr +std::uint32_t +utf8_case5_special_number() +{ + return make_u32(0xFF,0xFF,0x30,0x00); +} + template std::uint32_t load_little_endian(void const* p) @@ -129,8 +145,8 @@ is_valid_utf8(const char* p, uint16_t first) std::memcpy(&v, p, 4); return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) + make_u32(0x7F,0x7F,0x70,0xFF) ) - | make_u32(0x00,0x00,0x30,0xFF) ) - == make_u32(0x00,0x00,0x30,0xFF); + | make_u32(0x00,0x00,0x30,0x00) ) + == 
utf8_case5_special_number(); // 4 bytes, second byte [80, BF] case 6: From a284d0d9ba4724eb7dc7349c8794ff92e358da19 Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Fri, 23 Feb 2024 20:10:13 +0300 Subject: [PATCH 4/5] temp --- include/boost/json/detail/utf8.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp index 4b921861c..c452a7879 100644 --- a/include/boost/json/detail/utf8.hpp +++ b/include/boost/json/detail/utf8.hpp @@ -47,18 +47,22 @@ make_u32(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) template constexpr -std::uint32_t -utf8_case5_special_number() +bool +utf8_case5(std::uint32_t v) { - return make_u32(0x00,0x00,0x30,0xFF); + return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) + + make_u32(0x7F,0x7F,0x70,0x00) ) | make_u32(0x00,0x00,0x30,0x00) ) + == make_u32(0x00,0x00,0x30,0x00); } template<> constexpr -std::uint32_t -utf8_case5_special_number() +bool +utf8_case5(std::uint32_t v) { - return make_u32(0xFF,0xFF,0x30,0x00); + return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) + + make_u32(0x00,0x00,0x70,0xFF) ) | make_u32(0x00,0x00,0x30,0x00) ) + == make_u32(0x80,0x80,0x30,0x00); } template @@ -143,10 +147,7 @@ is_valid_utf8(const char* p, uint16_t first) // 4 bytes, second byte [90, BF] case 5: std::memcpy(&v, p, 4); - return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) - + make_u32(0x7F,0x7F,0x70,0xFF) ) - | make_u32(0x00,0x00,0x30,0x00) ) - == utf8_case5_special_number(); + return utf8_case5(v); // 4 bytes, second byte [80, BF] case 6: From d79a5867adbac7fac79ef6a14a60779570e01d27 Mon Sep 17 00:00:00 2001 From: Dmitry Arkhipov Date: Wed, 6 Mar 2024 20:45:18 +0300 Subject: [PATCH 5/5] noinline version of is_valid_utf8 --- include/boost/json/detail/sse2.hpp | 4 +- include/boost/json/detail/utf8.hpp | 128 ++++++++++------------------- 2 files changed, 44 insertions(+), 88 deletions(-) diff --git 
a/include/boost/json/detail/sse2.hpp b/include/boost/json/detail/sse2.hpp index 06657bc18..3b979c467 100644 --- a/include/boost/json/detail/sse2.hpp +++ b/include/boost/json/detail/sse2.hpp @@ -136,7 +136,7 @@ count_valid( uint8_t len = first & 0xFF; if(BOOST_JSON_UNLIKELY(end - p < len)) break; - if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) + if(BOOST_JSON_UNLIKELY(! is_valid_utf8_no_inline(p, first))) break; p += len; } @@ -185,7 +185,7 @@ count_valid( uint8_t len = first & 0xFF; if(BOOST_JSON_UNLIKELY(end - p < len)) break; - if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) + if(BOOST_JSON_UNLIKELY(! is_valid_utf8_no_inline(p, first))) break; p += len; } diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp index c452a7879..0a0fea2b2 100644 --- a/include/boost/json/detail/utf8.hpp +++ b/include/boost/json/detail/utf8.hpp @@ -21,58 +21,12 @@ namespace boost { namespace json { namespace detail { -template -constexpr -std::uint32_t -make_u32_impl(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) -{ - return (b4 << 24) | (b3 << 16) | (b2 << 8) | b1; -} - -template<> -constexpr -std::uint32_t -make_u32_impl( - std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) -{ - return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4; -} - -constexpr -std::uint32_t -make_u32(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1) -{ - return make_u32_impl(b4, b3, b2, b1); -} - -template -constexpr -bool -utf8_case5(std::uint32_t v) -{ - return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) - + make_u32(0x7F,0x7F,0x70,0x00) ) | make_u32(0x00,0x00,0x30,0x00) ) - == make_u32(0x00,0x00,0x30,0x00); -} - -template<> -constexpr -bool -utf8_case5(std::uint32_t v) -{ - return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) - + make_u32(0x00,0x00,0x70,0xFF) ) | make_u32(0x00,0x00,0x30,0x00) ) - == make_u32(0x80,0x80,0x30,0x00); -} - template std::uint32_t load_little_endian(void const* p) { - std::uint32_t v = 0; - 
std::memcpy(&v, p, N); - endian::little_to_native_inplace(v); - return v; + auto const up = reinterpret_cast(p); + return endian::endian_load(up); } inline @@ -122,47 +76,49 @@ is_valid_utf8(const char* p, uint16_t first) // 2 bytes, second byte [80, BF] case 1: - std::memcpy(&v, p, 2); - return ( v & make_u32(0x00,0x00,0xC0,0x00) ) - == make_u32(0x00,0x00,0x80,0x00); - - // 3 bytes, second byte [A0, BF] - case 2: - std::memcpy(&v, p, 3); - return ( v & make_u32(0x00,0xC0,0xE0,0x00) ) - == make_u32(0x00,0x80,0xA0,0x00); - - // 3 bytes, second byte [80, BF] - case 3: - std::memcpy(&v, p, 3); - return ( v & make_u32(0x00,0xC0,0xC0,0x00) ) - == make_u32(0x00,0x80,0x80,0x00); - - // 3 bytes, second byte [80, 9F] - case 4: - std::memcpy(&v, p, 3); - return ( v & make_u32(0x00,0xC0,0xE0,0x00) ) - == make_u32(0x00,0x80,0x80,0x00); - - // 4 bytes, second byte [90, BF] - case 5: - std::memcpy(&v, p, 4); - return utf8_case5(v); - - // 4 bytes, second byte [80, BF] - case 6: - std::memcpy(&v, p, 4); - return ( v & make_u32(0xC0,0xC0,0xC0,0x00) ) - == make_u32(0x80,0x80,0x80,0x00); - - // 4 bytes, second byte [80, 8F] - case 7: - std::memcpy(&v, p, 4); - return ( v & make_u32(0xC0,0xC0,0xF0,0x00) ) - == make_u32(0x80,0x80,0x80,0x00); + v = load_little_endian<2>(p); + return (v & 0xC000) == 0x8000; + + // 3 bytes, second byte [A0, BF] + case 2: + v = load_little_endian<3>(p); + return (v & 0xC0E000) == 0x80A000; + + // 3 bytes, second byte [80, BF] + case 3: + v = load_little_endian<3>(p); + return (v & 0xC0C000) == 0x808000; + + // 3 bytes, second byte [80, 9F] + case 4: + v = load_little_endian<3>(p); + return (v & 0xC0E000) == 0x808000; + + // 4 bytes, second byte [90, BF] + case 5: + v = load_little_endian<4>(p); + return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; + + // 4 bytes, second byte [80, BF] + case 6: + v = load_little_endian<4>(p); + return (v & 0xC0C0C000) == 0x80808000; + + // 4 bytes, second byte [80, 8F] + case 7: + v = load_little_endian<4>(p); + return (v & 
0xC0C0F000) == 0x80808000; } } +BOOST_NOINLINE +inline +bool +is_valid_utf8_no_inline(const char* p, uint16_t first) +{ + return is_valid_utf8(p, first); +} + class utf8_sequence { char seq_[4];