Skip to content

Commit afef793

Browse files
committed
utf-8 masks depend on endianness
Rather than reversing 32-bit numbers when checking whether the analysed code units represent a valid UTF-8 encoding, we change the constants based on the system's endianness.
1 parent d718d86 commit afef793

File tree

1 file changed

+30
-15
lines changed

1 file changed

+30
-15
lines changed

include/boost/json/detail/utf8.hpp

+30-15
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@
1616
#include <cstring>
1717
#include <cstdint>
1818

19+
#ifdef BOOST_JSON_BIG_ENDIAN
20+
# define BOOST_JSON_MK_NUM4(b4, b3, b2, b1) 0x ## b1 ## b2 ## b3 ## b4
21+
#else
22+
# define BOOST_JSON_MK_NUM4(b4, b3, b2, b1) 0x ## b4 ## b3 ## b2 ## b1
23+
#endif
24+
1925
namespace boost {
2026
namespace json {
2127
namespace detail {
@@ -74,46 +80,55 @@ inline
7480
bool
7581
is_valid_utf8(const char* p, uint16_t first)
7682
{
77-
uint32_t v;
83+
std::uint32_t v = 0;
7884
switch(first >> 8)
7985
{
8086
default:
8187
return false;
8288

8389
// 2 bytes, second byte [80, BF]
8490
case 1:
85-
v = load_little_endian<2>(p);
86-
return (v & 0xC000) == 0x8000;
91+
std::memcpy(&v, p, 2);
92+
return ( v & BOOST_JSON_MK_NUM4(00,00,C0,00) )
93+
== BOOST_JSON_MK_NUM4(00,00,80,00);
8794

8895
// 3 bytes, second byte [A0, BF]
8996
case 2:
90-
v = load_little_endian<3>(p);
91-
return (v & 0xC0E000) == 0x80A000;
97+
std::memcpy(&v, p, 3);
98+
return ( v & BOOST_JSON_MK_NUM4(00,C0,E0,00) )
99+
== BOOST_JSON_MK_NUM4(00,80,A0,00);
92100

93101
// 3 bytes, second byte [80, BF]
94102
case 3:
95-
v = load_little_endian<3>(p);
96-
return (v & 0xC0C000) == 0x808000;
103+
std::memcpy(&v, p, 3);
104+
return ( v & BOOST_JSON_MK_NUM4(00,C0,C0,00) )
105+
== BOOST_JSON_MK_NUM4(00,80,80,00);
97106

98107
// 3 bytes, second byte [80, 9F]
99108
case 4:
100-
v = load_little_endian<3>(p);
101-
return (v & 0xC0E000) == 0x808000;
109+
std::memcpy(&v, p, 3);
110+
return ( v & BOOST_JSON_MK_NUM4(00,C0,E0,00) )
111+
== BOOST_JSON_MK_NUM4(00,80,80,00);
102112

103113
// 4 bytes, second byte [90, BF]
104114
case 5:
105-
v = load_little_endian<4>(p);
106-
return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
115+
std::memcpy(&v, p, 4);
116+
return (
117+
( v & BOOST_JSON_MK_NUM4(C0, C0, C0, 00) )
118+
== BOOST_JSON_MK_NUM4(80, 80, 80, 00) )
119+
&& ( v & BOOST_JSON_MK_NUM4(00, 00, 30, 00) );
107120

108121
// 4 bytes, second byte [80, BF]
109122
case 6:
110-
v = load_little_endian<4>(p);
111-
return (v & 0xC0C0C000) == 0x80808000;
123+
std::memcpy(&v, p, 4);
124+
return ( v & BOOST_JSON_MK_NUM4(C0,C0,C0,00) )
125+
== BOOST_JSON_MK_NUM4(80,80,80,00);
112126

113127
// 4 bytes, second byte [80, 8F]
114128
case 7:
115-
v = load_little_endian<4>(p);
116-
return (v & 0xC0C0F000) == 0x80808000;
129+
std::memcpy(&v, p, 4);
130+
return ( v & BOOST_JSON_MK_NUM4(C0,C0,F0,00) )
131+
== BOOST_JSON_MK_NUM4(80,80,80,00);
117132
}
118133
}
119134

0 commit comments

Comments
 (0)