diff --git a/include/glaze/json/write.hpp b/include/glaze/json/write.hpp
index 27d885b607..793b5d15fc 100644
--- a/include/glaze/json/write.hpp
+++ b/include/glaze/json/write.hpp
@@ -495,7 +495,40 @@ namespace glz
          data += 2;
          ++c;
       }*/
+#if defined(__AVX512BW__)
+#define ENABLE_GENERIC_VECTOR 1
+#endif
+
+#ifdef ENABLE_GENERIC_VECTOR
+      if (n > 63) {
+         for (const auto end_m63 = e - 63; c < end_m63;) {
+            std::memcpy(data, c, 64);
+            uint64x8_t swar;
+            std::memcpy(&swar, c, 64);
+
+            static constexpr uint64x8_t lo7_mask = repeat_byte64(0b01111111);
+            const uint64x8_t lo7 = swar & lo7_mask;
+            const uint64x8_t quote = (lo7 ^ repeat_byte64('"')) + lo7_mask;
+            const uint64x8_t backslash = (lo7 ^ repeat_byte64('\\')) + lo7_mask;
+            const uint64x8_t less_32 = (swar & repeat_byte64(0b01100000)) + lo7_mask;
+            uint64x8_t next = ~((quote & backslash & less_32) | swar);
+
+            uint64_t mask = movemask_64(next);
+            if (mask == 0) {
+               data += 64;
+               c += 64;
+               continue;
+            }
+
+            uint32_t length = countr_zero(mask);
+            c += length;
+            data += length;
-#if defined(__APPLE__)
+            std::memcpy(data, &char_escape_table[uint8_t(*c)], 2);
+            data += 2;
+            ++c;
+         }
+      }
+#elif defined(__APPLE__)
       // This approach is faster when strings don't contain many escapes
       // But, this is not faster in the general case
       /*if (n > 15) {
@@ -2081,3 +2114,4 @@ namespace glz
       return {buffer_to_file(buffer, file_name)};
    }
 }
+
diff --git a/include/glaze/util/parse.hpp b/include/glaze/util/parse.hpp
index 86c1361913..a4742a628a 100644
--- a/include/glaze/util/parse.hpp
+++ b/include/glaze/util/parse.hpp
@@ -3,6 +3,10 @@
 
 #pragma once
 
+#if defined(__AVX512BW__)
+#include <immintrin.h>
+#endif
+
 #include <bit>
 #include <charconv>
 #include <cstddef>
@@ -116,9 +120,38 @@ namespace glz::detail
          return t;
       }();
 
-      consteval uint32_t repeat_byte4(const auto repeat) { return uint32_t(0x01010101u) * uint8_t(repeat); }
+      [[nodiscard, gnu::always_inline, gnu::const]]
+      consteval uint32_t repeat_byte4(const auto repeat)
+      {
+         return uint32_t(0x01010101u) * uint8_t(repeat);
+      }
+
+      [[nodiscard, gnu::always_inline, gnu::const]]
+      consteval uint64_t repeat_byte8(const uint8_t repeat)
+      {
+         return 0x0101010101010101ull * repeat;
+      }
+      using uint64x4_t = uint64_t __attribute__((__vector_size__(32)));
+      using uint64x8_t = uint64_t __attribute__((__vector_size__(64)));
 
-      consteval uint64_t repeat_byte8(const uint8_t repeat) { return 0x0101010101010101ull * repeat; }
+      [[nodiscard, gnu::always_inline, gnu::const]]
+      consteval uint64x4_t repeat_byte32(const uint8_t repeat)
+      {
+         return uint64x4_t{repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat)};
+      }
+      [[nodiscard, gnu::always_inline, gnu::const]]
+      consteval uint64x8_t repeat_byte64(const uint8_t repeat)
+      {
+         return uint64x8_t{repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat),
+                           repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat), repeat_byte8(repeat)};
+      }
+#if defined(__AVX512BW__)
+      [[nodiscard, gnu::always_inline, gnu::const]]
+      inline auto movemask_64(const uint64x8_t v) noexcept -> uint64_t
+      {
+         return _mm512_movepi8_mask(std::bit_cast<__m512i>(v));
+      }
+#endif
 #if defined(__SIZEOF_INT128__)
       consteval __uint128_t repeat_byte16(const uint8_t repeat)
       {
@@ -479,8 +512,7 @@ namespace glz::detail
             ctx.error = error_code::unexpected_end;
          }
          else if (*it == '/') {
-            while (++it != end && *it != '\n')
-               ;
+            while (++it != end && *it != '\n');
          }
          else if (*it == '*') {
             while (++it != end) {