From 6b30cefc3b1f3355e7c9472956ef726e027dabab Mon Sep 17 00:00:00 2001 From: Timofey Date: Wed, 30 Oct 2024 00:09:01 +0300 Subject: [PATCH 01/22] scan_for_text_ascii: Added simple avx2 and avx512 detector. --- src/libunicode/CMakeLists.txt | 22 +++- src/libunicode/scan.cpp | 10 ++ src/libunicode/scan.h | 2 + src/libunicode/scan256.cpp | 67 ++++++++++++ src/libunicode/scan512.cpp | 67 ++++++++++++ src/libunicode/simd_detector.cpp | 170 +++++++++++++++++++++++++++++++ src/libunicode/simd_detector.h | 7 ++ 7 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 src/libunicode/scan256.cpp create mode 100644 src/libunicode/scan512.cpp create mode 100644 src/libunicode/simd_detector.cpp create mode 100644 src/libunicode/simd_detector.h diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index bb1a824..2eaad88 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -106,6 +106,9 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} script_segmenter.cpp utf8.cpp width.cpp + simd_detector.cpp + scan256.cpp + scan512.cpp # auto-generated by unicode_tablegen codepoint_properties_data.h @@ -113,6 +116,23 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} codepoint_properties_names.cpp ) +set_source_files_properties( + scan256.cpp + PROPERTIES + COMPILE_FLAGS + -mavx2 + ) +set_source_files_properties( + scan512.cpp + PROPERTIES + COMPILE_FLAGS + -mavx512f + COMPILE_FLAGS + -mavx512bw + COMPILE_FLAGS + -mavx512bitalg + ) + if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std") target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") @@ -234,7 +254,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index 8efbbad..3d3a1a6 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,15 @@ namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept { + static auto simd_size = max_simd_size(); + if (simd_size == 512) + { + return scan_for_text_ascii_512(text, maxColumnCount); + } + else if (simd_size == 256) + { + return scan_for_text_ascii_256(text, maxColumnCount); + } auto input = text.data(); auto const end = text.data() + min(text.size(), maxColumnCount); #if defined(USE_STD_SIMD) diff --git a/src/libunicode/scan.h b/src/libunicode/scan.h index 902cc1c..3abe674 100644 --- a/src/libunicode/scan.h +++ b/src/libunicode/scan.h @@ -79,6 +79,8 @@ class null_receiver final: public grapheme_cluster_receiver namespace detail { size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept; + size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept; + size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept; scan_result scan_for_text_nonascii(scan_state& state, std::string_view text, size_t maxColumnCount, diff --git a/src/libunicode/scan256.cpp b/src/libunicode/scan256.cpp new file mode 100644 index 0000000..340f88d --- /dev/null +++ b/src/libunicode/scan256.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include "scan.h" +#include + +namespace stdx = std::experimental; +using std::distance; +using std::get; +using std::holds_alternative; +using std::max; +using std::min; +using std::string_view; + +constexpr bool is_control(char ch) noexcept +{ + return static_cast(ch) < 0x20; +} + +// Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. +constexpr bool is_complex(char ch) noexcept +{ + return static_cast(ch) & 0x80; +} + +// Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. +constexpr bool is_ascii(char ch) noexcept +{ + return !is_control(ch) && !is_complex(ch); +} + +namespace unicode +{ +size_t detail::scan_for_text_ascii_256(string_view text, size_t maxColumnCount) noexcept +{ + auto input = text.data(); + auto const end = text.data() + min(text.size(), maxColumnCount); + constexpr int numberOfElements = 256 / 8; + stdx::fixed_size_simd simd_text {}; + while (input < end - numberOfElements) + { + simd_text.copy_from(input, stdx::element_aligned); + + // check for control + // TODO check for complex + auto const simd_mask_text = (simd_text < 0x20); + if (stdx::popcount(simd_mask_text) > 0) + { + input += stdx::find_first_set(simd_mask_text); + break; + } + input += numberOfElements; + } + while (input != end && is_ascii(*input)) + ++input; + + // if (static_cast(distance(text.data(), input))) + // std::print( + // "countAsciiTextChars: {} bytes: \"{}\"\n", + // static_cast(distance(text.data(), input)), + // (string_view(text.data(), static_cast(distance(text.data(), input))))); + + return static_cast(distance(text.data(), input)); +} +} // namespace unicode diff --git a/src/libunicode/scan512.cpp b/src/libunicode/scan512.cpp new file mode 100644 index 0000000..7c5a618 --- /dev/null +++ b/src/libunicode/scan512.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#include "scan.h" +#include + +namespace stdx = std::experimental; +using std::distance; +using std::get; +using std::holds_alternative; +using std::max; +using std::min; +using std::string_view; + +constexpr bool is_control(char ch) noexcept +{ + return static_cast(ch) < 0x20; +} + +// Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. +constexpr bool is_complex(char ch) noexcept +{ + return static_cast(ch) & 0x80; +} + +// Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. +constexpr bool is_ascii(char ch) noexcept +{ + return !is_control(ch) && !is_complex(ch); +} + +namespace unicode +{ +size_t detail::scan_for_text_ascii_512(string_view text, size_t maxColumnCount) noexcept +{ + auto input = text.data(); + auto const end = text.data() + min(text.size(), maxColumnCount); + constexpr int numberOfElements = 512 / 8; + stdx::fixed_size_simd simd_text {}; + while (input < end - numberOfElements) + { + simd_text.copy_from(input, stdx::element_aligned); + + // check for control + // TODO check for complex + auto const simd_mask_text = (simd_text < 0x20); + if (stdx::popcount(simd_mask_text) > 0) + { + input += stdx::find_first_set(simd_mask_text); + break; + } + input += numberOfElements; + } + while (input != end && is_ascii(*input)) + ++input; + + // if (static_cast(distance(text.data(), input))) + // std::print( + // "countAsciiTextChars: {} bytes: \"{}\"\n", + // static_cast(distance(text.data(), input)), + // (string_view(text.data(), static_cast(distance(text.data(), input))))); + + return static_cast(distance(text.data(), input)); +} +} // namespace unicode diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp new file mode 100644 index 0000000..ee60b4a --- /dev/null +++ b/src/libunicode/simd_detector.cpp @@ -0,0 +1,170 @@ +#include "simd_detector.h" + +#include + +// AVX512 required: +// AVX512_BITALG : popcnt +// AVX512_BW : compare greater (less is needed) +// AVX512_F : and +// +// auto max_simd_size() -> size_t; + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + #if _WIN32 + // clang-format off + #include + #include + void cpuid(int32_t out[4], int32_t eax, int32_t ecx) + { + __cpuidex(out, eax, ecx); + } + __int64 xgetbv(unsigned int x) + { + return _xgetbv(x); + } + // clang-format on + #elif defined(__GNUC__) || defined(__clang__) + // clang-format off + #include + void cpuid(int32_t out[4], int32_t eax, int32_t ecx) + { + __cpuid_count(eax, ecx, out[0], out[1], out[2], out[3]); + } + uint64_t xgetbv(unsigned int index) + { + uint32_t eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((uint64_t) edx << 32) | eax; + } + #define XCR_XFEATURE_ENABLED_MASK 0 + // clang-format on + #else + #error "No cpuid intrinsic defined for compiler." + #endif + +auto detect_os_avx() -> bool +{ + // Copied from: http://stackoverflow.com/a/22521619/922184 + bool avxSupported = false; + int32_t cpuInfo[4]; + cpuid(cpuInfo, 1, 0); + + bool osUsesXSAVE_XRSTORE = (cpuInfo[2] & (1 << 27)) != 0; + bool cpuAVXSuport = (cpuInfo[2] & (1 << 28)) != 0; + + if (osUsesXSAVE_XRSTORE && cpuAVXSuport) + { + uint64_t xcrFeatureMask = xgetbv(XCR_XFEATURE_ENABLED_MASK); + avxSupported = (xcrFeatureMask & 0x6) == 0x6; + } + + return avxSupported; +} + +auto detect_os_avx512() -> bool +{ + if (!detect_os_avx()) + return false; + uint64_t xcrFeatureMask = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 0xe6) == 0xe6; +} + +auto unicode::detail::max_simd_size() -> size_t +{ + if (!detect_os_avx()) + return 128; + + int32_t info[4]; + cpuid(info, 0, 0); + int nIds = info[0]; + + // cpuid(info, 0x80000000, 0); + // uint32_t nExIds = info[0]; + + // Detect Features + // if (nIds >= 0x00000001) + // { + // cpuid(info, 0x00000001, 0); + // bool HW_MMX = (info[3] & ((int) 1 << 23)) != 0; + // bool HW_SSE = (info[3] & ((int) 1 << 25)) != 0; + // bool HW_SSE2 = (info[3] & ((int) 1 << 26)) != 0; + // bool HW_SSE3 = (info[2] & ((int) 1 << 0)) != 0; + // + // bool HW_SSSE3 = (info[2] & ((int) 1 << 9)) != 0; + // bool HW_SSE41 = (info[2] & ((int) 1 << 19)) != 0; + // bool HW_SSE42 = (info[2] & ((int) 1 << 20)) != 0; + // bool HW_AES = (info[2] & ((int) 1 << 25)) != 0; + // + // bool HW_AVX = (info[2] & ((int) 1 << 28)) != 0; + // bool HW_FMA3 = (info[2] & ((int) 1 << 12)) != 0; + // + // bool HW_RDRAND = (info[2] & ((int) 1 << 30)) != 0; + // } + if (nIds >= 0x00000007) + { + cpuid(info, 0x00000007, 0); + bool HW_AVX2 = (info[1] & ((int) 1 << 5)) != 0; + if (!HW_AVX2) + return 128; + + // bool HW_BMI1 = (info[1] & ((int) 1 << 3)) != 0; + // bool HW_BMI2 = (info[1] & ((int) 1 << 8)) != 0; + // bool HW_ADX = (info[1] & ((int) 1 << 19)) != 0; + // bool HW_MPX = (info[1] & ((int) 1 << 14)) != 0; + // bool HW_SHA = (info[1] & ((int) 1 << 29)) != 0; + // bool HW_RDSEED = (info[1] & ((int) 1 << 18)) != 0; + // bool HW_PREFETCHWT1 = (info[2] & ((int) 1 << 0)) != 0; + // bool HW_RDPID = (info[2] & ((int) 1 << 22)) != 0; + + bool HW_AVX512_F = (info[1] & ((int) 1 << 16)) != 0; + // bool HW_AVX512_CD = (info[1] & ((int) 1 << 28)) != 0; + // bool HW_AVX512_PF = (info[1] & ((int) 1 << 26)) != 0; + // bool HW_AVX512_ER = (info[1] & ((int) 1 << 27)) != 0; + + // bool HW_AVX512_VL = (info[1] & ((int) 1 << 31)) != 0; + bool HW_AVX512_BW = (info[1] & ((int) 1 << 30)) != 0; + // bool HW_AVX512_DQ = (info[1] & ((int) 1 << 17)) != 0; + + // bool HW_AVX512_IFMA = (info[1] & ((int) 1 << 21)) != 0; + // bool HW_AVX512_VBMI = (info[2] & ((int) 1 << 1)) != 0; + + // bool HW_AVX512_VPOPCNTDQ = (info[2] & ((int) 1 << 14)) != 0; + // bool HW_AVX512_4VNNIW = (info[3] & ((int) 1 << 2)) != 0; + // bool HW_AVX512_4FMAPS = (info[3] & ((int) 1 << 3)) != 0; + + // bool HW_AVX512_VNNI = (info[2] & ((int) 1 << 11)) != 0; + + // bool HW_AVX512_VBMI2 = (info[2] & ((int) 1 << 6)) != 0; + // bool HW_GFNI = (info[2] & ((int) 1 << 8)) != 0; + // bool HW_VAES = (info[2] & ((int) 1 << 9)) != 0; + // bool HW_AVX512_VPCLMUL = (info[2] & ((int) 1 << 10)) != 0; + bool HW_AVX512_BITALG = (info[2] & ((int) 1 << 12)) != 0; + + bool use512 = detect_os_avx512() && HW_AVX512_F && HW_AVX512_BW && HW_AVX512_BITALG; + if (!use512) + return 256; + else + return 512; + + // cpuid(info, 0x00000007, 1); + // bool HW_AVX512_BF16 = (info[0] & ((int) 1 << 5)) != 0; + } + return 128; + // if (nExIds >= 0x80000001) + // { + // cpuid(info, 0x80000001, 0); + // bool HW_x64 = (info[3] & ((int) 1 << 29)) != 0; + // bool HW_ABM = (info[2] & ((int) 1 << 5)) != 0; + // bool HW_SSE4a = (info[2] & ((int) 1 << 6)) != 0; + // bool HW_FMA4 = (info[2] & ((int) 1 << 16)) != 0; + // bool HW_XOP = (info[2] & ((int) 1 << 11)) != 0; + // bool HW_PREFETCHW = (info[2] & ((int) 1 << 8)) != 0; + // } +} + +#else +auto unicode::detail::max_simd_size() -> size_t +{ + return 128; +} +#endif diff --git a/src/libunicode/simd_detector.h b/src/libunicode/simd_detector.h new file mode 100644 index 0000000..33de202 --- /dev/null +++ b/src/libunicode/simd_detector.h @@ -0,0 +1,7 @@ +#pragma once + +#include +namespace unicode::detail +{ +auto max_simd_size() -> size_t; +} From bdf4c5184990ab360485720f924d641ff994c36d Mon Sep 17 00:00:00 2001 From: Timofey Date: Sat, 2 Nov 2024 22:20:04 +0300 Subject: [PATCH 02/22] WIP(simd scan_for_text_ascii): added intrinsics and cleanup. Added x86-64 intrinsics implementation. Moved implemenation into a separate header. Added macro checks for x86. non x86 targets use 128 bit simd max. Changed intrinsic macro definition to LIBUNICODE_USE_INTRINSICS. --- src/libunicode/CMakeLists.txt | 2 +- src/libunicode/intrinsics.h | 139 ++++++++++++++++++++++++++++++ src/libunicode/scan.cpp | 65 +------------- src/libunicode/scan.h | 3 + src/libunicode/scan256.cpp | 68 ++------------- src/libunicode/scan512.cpp | 68 ++------------- src/libunicode/scan_simd_impl.hpp | 108 +++++++++++++++++++++++ 7 files changed, 264 insertions(+), 189 deletions(-) create mode 100644 src/libunicode/scan_simd_impl.hpp diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 2eaad88..0e082ce 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -136,7 +136,7 @@ set_source_files_properties( if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std") target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") - target_compile_definitions(unicode PRIVATE USE_INTRINSICS) + target_compile_definitions(unicode PRIVATE LIBUNICDE_USE_INTRINSICS) endif() set(public_headers diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 6e4f9c5..5027068 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -13,6 +13,7 @@ */ #pragma once +#include #if defined(__x86_64__) || defined(_M_AMD64) #include // AVX, AVX2, FMP #include // SSE2 @@ -72,6 +73,144 @@ struct platform_intrinsics<__m128i> using intrinsics = platform_intrinsics<__m128i>; +template +struct intrin +{ + using vec_t = void*; + + using mask_t = int; + + static inline vec_t setzero() noexcept; + + static inline vec_t set1_epi8(signed char w) noexcept; + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept; + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept; + + static inline vec_t or_vec(vec_t a, vec_t b) noexcept; + + static inline vec_t load(const char* p) noexcept; + + static inline bool equal(vec_t a, vec_t b) noexcept; + + static inline mask_t less(vec_t a, vec_t b) noexcept; + + static inline mask_t greater(vec_t a, vec_t b) noexcept; + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept; + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept; + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept; + + static inline auto to_underlying(mask_t a) noexcept; +}; + +template +struct intrin<128, T> +{ + using vec_t = __m128i; + + using mask_t = int; + + static inline vec_t setzero() noexcept { return _mm_setzero_si128(); } + + static inline vec_t set1_epi8(signed char w) { return _mm_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm_xor_si128(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm_and_si128(a, b); } + + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm_or_si128(a, b); } + + static inline vec_t load(const char* p) noexcept { return _mm_loadu_si128(reinterpret_cast(p)); } + + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; } + + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmplt_epi8(a, b)); } + + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpgt_epi8(a, b)); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_underlying(mask_t a) noexcept { return static_cast(a); } +}; + +template +struct intrin<256, T> +{ + using vec_t = __m256i; + + using mask_t = int; + + static inline vec_t setzero() noexcept { return _mm256_setzero_si256(); } + + static inline vec_t set1_epi8(signed char w) { return _mm256_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm256_xor_si256(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm256_and_si256(a, b); } + + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm256_or_si256(a, b); } + + static inline vec_t load(const char* p) noexcept { return _mm256_loadu_si256(reinterpret_cast(p)); } + + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == 0xFFFF; } + + static inline auto less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); } + + static inline auto greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); } + + static inline auto movemask_epi8(vec_t a) noexcept { return _mm256_movemask_epi8(a); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_underlying(mask_t a) noexcept { return static_cast(a); } +}; + +template +struct intrin<512, T> +{ + using vec_t = __m512i; + + using mask_t = __mmask64; + + static inline vec_t setzero() noexcept { return _mm512_setzero_si512(); } + + static inline vec_t set1_epi8(signed char w) { return _mm512_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm512_xor_si512(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm512_and_si512(a, b); } + + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm512_or_si512(a, b); } + + static inline vec_t load(const char* p) noexcept { return _mm512_loadu_si512(reinterpret_cast(p)); } + + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm512_cmpeq_epi8_mask(a, b) == 0xFFFFFFFF; } + + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); } + + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return _kand_mask64(a, b); } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return _kor_mask64(a, b); } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return _kxor_mask64(a, b); } + + static inline uint64_t to_underlying(mask_t a) noexcept { return static_cast(a); } +}; + #endif // }}} diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index 3d3a1a6..bd601bb 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -23,19 +23,7 @@ #include #include -// clang-format off -#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) - #define USE_STD_SIMD - #include - namespace stdx = std::experimental; -#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) - #define USE_STD_SIMD - #include - namespace stdx = std; -#elif defined(__SSE2__) - #include -#endif -// clang-format on +#include "scan_simd_impl.hpp" using std::distance; using std::get; @@ -88,6 +76,7 @@ namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept { +#if defined(__x86_64__) || defined(_M_AMD64) static auto simd_size = max_simd_size(); if (simd_size == 512) { @@ -97,56 +86,8 @@ size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noex { return scan_for_text_ascii_256(text, maxColumnCount); } - auto input = text.data(); - auto const end = text.data() + min(text.size(), maxColumnCount); -#if defined(USE_STD_SIMD) - constexpr int numberOfElements = stdx::simd_abi::max_fixed_size; - stdx::fixed_size_simd simd_text {}; - while (input < end - numberOfElements) - { - simd_text.copy_from(input, stdx::element_aligned); - - // check for control - // TODO check for complex - auto const simd_mask_text = (simd_text < 0x20); - if (stdx::popcount(simd_mask_text) > 0) - { - input += stdx::find_first_set(simd_mask_text); - break; - } - input += numberOfElements; - } -#elif defined(USE_INTRINSICS) - intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F - intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) - - while (input < end - sizeof(intrinsics::m128i)) - { - intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input); - intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax); - intrinsics::m128i isComplex = intrinsics::and128(batch, Complex); - // intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex); - intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex); - if (int const check = intrinsics::movemask_epi8(testPack); check != 0) - { - int advance = countTrailingZeroBits(static_cast(check)); - input += advance; - break; - } - input += sizeof(intrinsics::m128i); - } #endif - - while (input != end && is_ascii(*input)) - ++input; - - // if (static_cast(distance(text.data(), input))) - // std::print( - // "countAsciiTextChars: {} bytes: \"{}\"\n", - // static_cast(distance(text.data(), input)), - // (string_view(text.data(), static_cast(distance(text.data(), input))))); - - return static_cast(distance(text.data(), input)); + return scan_for_text_ascii_simd<128>(text, maxColumnCount); } scan_result detail::scan_for_text_nonascii(scan_state& state, diff --git a/src/libunicode/scan.h b/src/libunicode/scan.h index 3abe674..7048cb5 100644 --- a/src/libunicode/scan.h +++ b/src/libunicode/scan.h @@ -79,6 +79,9 @@ class null_receiver final: public grapheme_cluster_receiver namespace detail { size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept; + + template + size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept; size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept; size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept; scan_result scan_for_text_nonascii(scan_state& state, diff --git a/src/libunicode/scan256.cpp b/src/libunicode/scan256.cpp index 340f88d..1dcc4ac 100644 --- a/src/libunicode/scan256.cpp +++ b/src/libunicode/scan256.cpp @@ -1,67 +1,9 @@ -#include -#include -#include -#include +#include "scan_simd_impl.hpp" -#include "scan.h" -#include - -namespace stdx = std::experimental; -using std::distance; -using std::get; -using std::holds_alternative; -using std::max; -using std::min; -using std::string_view; - -constexpr bool is_control(char ch) noexcept -{ - return static_cast(ch) < 0x20; -} - -// Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. -constexpr bool is_complex(char ch) noexcept -{ - return static_cast(ch) & 0x80; -} - -// Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. -constexpr bool is_ascii(char ch) noexcept -{ - return !is_control(ch) && !is_complex(ch); -} - -namespace unicode +namespace unicode::detail { -size_t detail::scan_for_text_ascii_256(string_view text, size_t maxColumnCount) noexcept +size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept { - auto input = text.data(); - auto const end = text.data() + min(text.size(), maxColumnCount); - constexpr int numberOfElements = 256 / 8; - stdx::fixed_size_simd simd_text {}; - while (input < end - numberOfElements) - { - simd_text.copy_from(input, stdx::element_aligned); - - // check for control - // TODO check for complex - auto const simd_mask_text = (simd_text < 0x20); - if (stdx::popcount(simd_mask_text) > 0) - { - input += stdx::find_first_set(simd_mask_text); - break; - } - input += numberOfElements; - } - while (input != end && is_ascii(*input)) - ++input; - - // if (static_cast(distance(text.data(), input))) - // std::print( - // "countAsciiTextChars: {} bytes: \"{}\"\n", - // static_cast(distance(text.data(), input)), - // (string_view(text.data(), static_cast(distance(text.data(), input))))); - - return static_cast(distance(text.data(), input)); + return scan_for_text_ascii_simd<256>(text, maxColumnCount); } -} // namespace unicode +} // namespace unicode::detail diff --git a/src/libunicode/scan512.cpp b/src/libunicode/scan512.cpp index 7c5a618..fcd8026 100644 --- a/src/libunicode/scan512.cpp +++ b/src/libunicode/scan512.cpp @@ -1,67 +1,9 @@ -#include -#include -#include -#include +#include "scan_simd_impl.hpp" -#include "scan.h" -#include - -namespace stdx = std::experimental; -using std::distance; -using std::get; -using std::holds_alternative; -using std::max; -using std::min; -using std::string_view; - -constexpr bool is_control(char ch) noexcept -{ - return static_cast(ch) < 0x20; -} - -// Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7E. -constexpr bool is_complex(char ch) noexcept -{ - return static_cast(ch) & 0x80; -} - -// Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. -constexpr bool is_ascii(char ch) noexcept -{ - return !is_control(ch) && !is_complex(ch); -} - -namespace unicode +namespace unicode::detail { -size_t detail::scan_for_text_ascii_512(string_view text, size_t maxColumnCount) noexcept +size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept { - auto input = text.data(); - auto const end = text.data() + min(text.size(), maxColumnCount); - constexpr int numberOfElements = 512 / 8; - stdx::fixed_size_simd simd_text {}; - while (input < end - numberOfElements) - { - simd_text.copy_from(input, stdx::element_aligned); - - // check for control - // TODO check for complex - auto const simd_mask_text = (simd_text < 0x20); - if (stdx::popcount(simd_mask_text) > 0) - { - input += stdx::find_first_set(simd_mask_text); - break; - } - input += numberOfElements; - } - while (input != end && is_ascii(*input)) - ++input; - - // if (static_cast(distance(text.data(), input))) - // std::print( - // "countAsciiTextChars: {} bytes: \"{}\"\n", - // static_cast(distance(text.data(), input)), - // (string_view(text.data(), static_cast(distance(text.data(), input))))); - - return static_cast(distance(text.data(), input)); + return scan_for_text_ascii_simd<512>(text, maxColumnCount); } -} // namespace unicode +} // namespace unicode::detail diff --git a/src/libunicode/scan_simd_impl.hpp b/src/libunicode/scan_simd_impl.hpp new file mode 100644 index 0000000..3cafb79 --- /dev/null +++ b/src/libunicode/scan_simd_impl.hpp @@ -0,0 +1,108 @@ +#pragma once +#include +#include +#include +#include +#include + +#include "intrinsics.h" + +// clang-format off +#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) + #define USE_STD_SIMD + #include + namespace stdx = std::experimental; +#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) + #define USE_STD_SIMD + #include + namespace stdx = std; + #include "intrinsics.h" +#endif +// clang-format on +namespace unicode::detail +{ +template +size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept +{ + [[maybe_unused]] constexpr int simd_size = SimdBitWidth / 8; + auto input = text.data(); + auto const end = text.data() + std::min(text.size(), maxColumnCount); + +#if defined(USE_STD_SIMD) + auto simd_text = stdx::fixed_size_simd {}; + while (input < end - simd_size) + { + simd_text.copy_from(input, stdx::element_aligned); + auto const is_control_mask = simd_text < 0x20; + auto const is_complex_mask = (simd_text & 0x80) == 0x80; + auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask; + if (stdx::any_of(ctrl_or_complex_mask)) + { + input += stdx::find_first_set(ctrl_or_complex_mask); + break; + } + input += simd_size; + } +#elif defined(LIBUNICDE_USE_INTRINSICS) + + [[maybe_unused]] constexpr auto countTrailingZeroBits = [](T value) noexcept { + // clang-format off + if constexpr (std::same_as) + { + #if defined(_WIN32) + // return _tzcnt_u32(value); + // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. + unsigned long r = 0; + _BitScanForward(&r, value); + return r; + #else + return __builtin_ctz(value); + #endif + } + else if constexpr (std::same_as) + { + #if defined(_WIN32) + unsigned long r = 0; + _BitScanForward64(&r, value); + return r; + #else + return __builtin_ctzl(value); + #endif + } + else + { + static_assert(false); + } + // clang-format on + }; + using intrin = intrin; + auto const vec_control = intrin::set1_epi8(0x20); // 0..0x1F + auto const vec_complex = intrin::set1_epi8(-128); // equals to 0x80 (0b1000'0000) + + while (input < end - simd_size) + { + auto batch = intrin::load(input); + auto is_control_mask = intrin::less(batch, vec_control); + auto is_complex_mask = intrin::equal(intrin::and_vec(batch, vec_complex), vec_complex); + auto ctrl_or_complex_mask = intrin::or_mask(is_control_mask, is_complex_mask); + if (ctrl_or_complex_mask) + { + int advance = countTrailingZeroBits(intrin::to_underlying(ctrl_or_complex_mask)); + input += advance; + break; + } + input += sizeof(simd_size); + } +#endif + + constexpr auto is_ascii = [](char ch) noexcept { + auto is_control = static_cast(ch) > 0x20; + auto is_complex = static_cast(ch) & 0x80; + return !is_control && !is_complex; + }; + while (input != end && is_ascii(*input)) + ++input; + + return static_cast(std::distance(text.data(), input)); +} +} // namespace unicode::detail From b145efe6ab070439656b84636859b3783a6f5054 Mon Sep 17 00:00:00 2001 From: Timofey Date: Sat, 2 Nov 2024 22:46:36 +0300 Subject: [PATCH 03/22] WIP(simd scan_for_text_ascii): cleanup --- src/libunicode/scan.cpp | 19 ------------------- src/libunicode/scan_simd_impl.hpp | 15 ++++++--------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index bd601bb..69d1976 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -37,19 +37,6 @@ namespace unicode namespace { - [[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept - { -#if defined(_WIN32) - // return _tzcnt_u32(value); - // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. - unsigned long r = 0; - _BitScanForward(&r, value); - return r; -#else - return __builtin_ctz(value); -#endif - } - template constexpr bool ascending(T low, T val, T high) noexcept { @@ -66,12 +53,6 @@ namespace { return static_cast(ch) & 0x80; } - - // Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. - constexpr bool is_ascii(char ch) noexcept - { - return !is_control(ch) && !is_complex(ch); - } } // namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept diff --git a/src/libunicode/scan_simd_impl.hpp b/src/libunicode/scan_simd_impl.hpp index 3cafb79..d78d1a7 100644 --- a/src/libunicode/scan_simd_impl.hpp +++ b/src/libunicode/scan_simd_impl.hpp @@ -1,12 +1,9 @@ #pragma once #include #include -#include #include #include -#include "intrinsics.h" - // clang-format off #if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) #define USE_STD_SIMD @@ -16,6 +13,7 @@ #define USE_STD_SIMD #include namespace stdx = std; +#elif defined(LIBUNICDE_USE_INTRINSICS) #include "intrinsics.h" #endif // clang-format on @@ -33,9 +31,9 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no while (input < end - simd_size) { simd_text.copy_from(input, stdx::element_aligned); - auto const is_control_mask = simd_text < 0x20; - auto const is_complex_mask = (simd_text & 0x80) == 0x80; - auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask; + auto is_control_mask = simd_text < 0x20; + auto is_complex_mask = (simd_text & 0x80) == 0x80; + auto ctrl_or_complex_mask = is_control_mask || is_complex_mask; if (stdx::any_of(ctrl_or_complex_mask)) { input += stdx::find_first_set(ctrl_or_complex_mask); @@ -44,8 +42,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no input += simd_size; } #elif defined(LIBUNICDE_USE_INTRINSICS) - - [[maybe_unused]] constexpr auto countTrailingZeroBits = [](T value) noexcept { + constexpr auto trailing_zero_count = [](T value) noexcept { // clang-format off if constexpr (std::same_as) { @@ -87,7 +84,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no auto ctrl_or_complex_mask = intrin::or_mask(is_control_mask, is_complex_mask); if (ctrl_or_complex_mask) { - int advance = countTrailingZeroBits(intrin::to_underlying(ctrl_or_complex_mask)); + int advance = trailing_zero_count(intrin::to_underlying(ctrl_or_complex_mask)); input += advance; break; } From b0f27e824dc2255601e5fb3a742a86f1f29636f7 Mon Sep 17 00:00:00 2001 From: Timofey Date: Sun, 3 Nov 2024 15:42:36 +0300 Subject: [PATCH 04/22] Implement comments from @Yaraslaut. --- src/libunicode/CMakeLists.txt | 2 +- src/libunicode/intrinsics.h | 36 ++----------------- src/libunicode/scan.cpp | 4 +-- src/libunicode/scan256.cpp | 3 +- src/libunicode/scan512.cpp | 3 +- .../{scan_simd_impl.hpp => scan_simd_impl.h} | 2 +- 6 files changed, 9 insertions(+), 41 deletions(-) rename src/libunicode/{scan_simd_impl.hpp => scan_simd_impl.h} (99%) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 0e082ce..84ded8f 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -254,7 +254,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 5027068..c6cb924 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -15,8 +15,7 @@ #include #if defined(__x86_64__) || defined(_M_AMD64) - #include // AVX, AVX2, FMP - #include // SSE2 + #include #endif #if defined(__aarch64__) || defined(_M_ARM64) @@ -74,38 +73,7 @@ struct platform_intrinsics<__m128i> using intrinsics = platform_intrinsics<__m128i>; template -struct intrin -{ - using vec_t = void*; - - using mask_t = int; - - static inline vec_t setzero() noexcept; - - static inline vec_t set1_epi8(signed char w) noexcept; - - static inline vec_t xor_vec(vec_t a, vec_t b) noexcept; - - static inline vec_t and_vec(vec_t a, vec_t b) noexcept; - - static inline vec_t or_vec(vec_t a, vec_t b) noexcept; - - static inline vec_t load(const char* p) noexcept; - - static inline bool equal(vec_t a, vec_t b) noexcept; - - static inline mask_t less(vec_t a, vec_t b) noexcept; - - static inline mask_t greater(vec_t a, vec_t b) noexcept; - - static inline mask_t and_mask(mask_t a, mask_t b) noexcept; - - static inline mask_t or_mask(mask_t a, mask_t b) noexcept; - - static inline mask_t xor_mask(mask_t a, mask_t b) noexcept; - - static inline auto to_underlying(mask_t a) noexcept; -}; +struct intrin; template struct intrin<128, T> diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index 69d1976..8048c0e 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -12,8 +12,8 @@ * limitations under the License. */ #include -#include #include +#include #include #include #include @@ -23,8 +23,6 @@ #include #include -#include "scan_simd_impl.hpp" - using std::distance; using std::get; using std::holds_alternative; diff --git a/src/libunicode/scan256.cpp b/src/libunicode/scan256.cpp index 1dcc4ac..015b8f7 100644 --- a/src/libunicode/scan256.cpp +++ b/src/libunicode/scan256.cpp @@ -1,4 +1,5 @@ -#include "scan_simd_impl.hpp" +#include +#include namespace unicode::detail { diff --git a/src/libunicode/scan512.cpp b/src/libunicode/scan512.cpp index fcd8026..fa81034 100644 --- a/src/libunicode/scan512.cpp +++ b/src/libunicode/scan512.cpp @@ -1,4 +1,5 @@ -#include "scan_simd_impl.hpp" +#include +#include namespace unicode::detail { diff --git a/src/libunicode/scan_simd_impl.hpp b/src/libunicode/scan_simd_impl.h similarity index 99% rename from src/libunicode/scan_simd_impl.hpp rename to src/libunicode/scan_simd_impl.h index d78d1a7..33135f7 100644 --- a/src/libunicode/scan_simd_impl.hpp +++ b/src/libunicode/scan_simd_impl.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include #include From 209a4c29dc60866590f4f305602b693470c35780 Mon Sep 17 00:00:00 2001 From: Timofey Date: Sun, 3 Nov 2024 16:28:34 +0300 Subject: [PATCH 05/22] WIP(simd scan_for_text_ascii) Arm intrinsics. --- src/libunicode/intrinsics.h | 77 ++++++++++++++++++++++++++++++--- src/libunicode/scan_simd_impl.h | 2 +- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index c6cb924..71b20cb 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -28,6 +28,9 @@ namespace unicode template struct platform_intrinsics; +template +struct intrin; + #if defined(__GNUC__) && defined(__x86_64__) // For some reason, GCC associates attributes with __m128i that are not obvious (alignment), // and then complains about it when used below. @@ -72,9 +75,6 @@ struct platform_intrinsics<__m128i> using intrinsics = platform_intrinsics<__m128i>; -template -struct intrin; - template struct intrin<128, T> { @@ -106,7 +106,7 @@ struct intrin<128, T> static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } - static inline uint32_t to_underlying(mask_t a) noexcept { return static_cast(a); } + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } }; template @@ -142,7 +142,7 @@ struct intrin<256, T> static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } - static inline uint32_t to_underlying(mask_t a) noexcept { return static_cast(a); } + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } }; template @@ -176,7 +176,7 @@ struct intrin<512, T> static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return _kxor_mask64(a, b); } - static inline uint64_t to_underlying(mask_t a) noexcept { return static_cast(a); } + static inline uint64_t to_unsigned(mask_t a) noexcept { return static_cast(a); } }; #endif @@ -327,6 +327,71 @@ struct platform_intrinsics }; using intrinsics = platform_intrinsics; + +template +struct intrin<128, T> +{ + // The following inline functions (in its initial version) were borrowed from: + // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h + + using vec_t = int64x2_t; + + using mask_t = int; + + static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } + + static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept + { + return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept + { + return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + } + + static inline vec_t or_vec(vec_t a, vec_t b) + { + return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + } + + static inline vec_t load(const char* p) noexcept + { + return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast(p))); + } + + static inline bool equal(vec_t a, vec_t b) noexcept + { + return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF; + } + + static inline vec_t less(vec_t a, vec_t b) noexcept + { + return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))); + } + + static inline vec_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } + + static inline mask_t movemask_epi8(vec_t a) + { + uint8x16_t input = vreinterpretq_u8_s64(a); + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); + } +}; #endif // }}} diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index 33135f7..2591721 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -84,7 +84,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no auto ctrl_or_complex_mask = intrin::or_mask(is_control_mask, is_complex_mask); if (ctrl_or_complex_mask) { - int advance = trailing_zero_count(intrin::to_underlying(ctrl_or_complex_mask)); + int advance = trailing_zero_count(intrin::to_unsigned(ctrl_or_complex_mask)); input += advance; break; } From 8a192dc0d9c487605ba747d028846be721c0b47a Mon Sep 17 00:00:00 2001 From: Timofey Date: Sun, 3 Nov 2024 17:24:20 +0300 Subject: [PATCH 06/22] Fix(scan_for_text_asii): Fixed error and expanded tests with wider strings. --- src/libunicode/scan.cpp | 2 +- src/libunicode/scan_simd_impl.h | 2 +- src/libunicode/scan_test.cpp | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index 8048c0e..cbfb05b 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -55,7 +55,7 @@ namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept { -#if defined(__x86_64__) || defined(_M_AMD64) +#if (defined(LIBUNICODE_USE_STD_SIMD) || defined(LIBUNICODE_USE_INTRINSICS)) && defined(__x86_64__) || defined(_M_AMD64) static auto simd_size = max_simd_size(); if (simd_size == 512) { diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index 2591721..a01ddc5 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -93,7 +93,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no #endif constexpr auto is_ascii = [](char ch) noexcept { - auto is_control = static_cast(ch) > 0x20; + auto is_control = static_cast(ch) < 0x20; auto is_complex = static_cast(ch) & 0x80; return !is_control && !is_complex; }; diff --git a/src/libunicode/scan_test.cpp b/src/libunicode/scan_test.cpp index 240e754..cfdbb7c 100644 --- a/src/libunicode/scan_test.cpp +++ b/src/libunicode/scan_test.cpp @@ -109,7 +109,12 @@ TEST_CASE("scan.ascii.empty") TEST_CASE("scan.ascii.32") { - auto const text = "0123456789ABCDEF0123456789ABCDEF"sv; + auto const text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 128) == 128); + CHECK(scan_for_text_ascii(text, 64) == 64); CHECK(scan_for_text_ascii(text, 32) == 32); CHECK(scan_for_text_ascii(text, 16) == 16); CHECK(scan_for_text_ascii(text, 8) == 8); @@ -123,12 +128,22 @@ TEST_CASE("scan.ascii.mixed_with_controls") CHECK(scan_for_text_ascii("12345678\033", 80) == 8); CHECK(scan_for_text_ascii("0123456789ABCDEF\033", 80) == 16); CHECK(scan_for_text_ascii("0123456789ABCDEF1\033", 80) == 17); + constexpr auto text = "0123456789ABCDEF0\033123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 80) == 17); } TEST_CASE("scan.ascii.until_complex") { CHECK(scan_for_text_ascii("1234\x80", 80) == 4); CHECK(scan_for_text_ascii("0123456789{\xE2\x94\x80}ABCDEF", 80) == 11); + constexpr auto text = "0123456789{\xE2\x94\x80}ABCDEF0323456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 80) == 11); } TEST_CASE("scan.complex.grapheme_cluster.1") From 37439effbe5089ff327e1db5aac365c3aaa45b91 Mon Sep 17 00:00:00 2001 From: Yaraslau Tamashevich Date: Mon, 4 Nov 2024 09:53:37 +0200 Subject: [PATCH 07/22] Use same structure across all platforms --- src/libunicode/intrinsics.h | 176 ++++++-------------------------- src/libunicode/scan_simd_impl.h | 16 +-- 2 files changed, 39 insertions(+), 153 deletions(-) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 71b20cb..b753b47 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -25,11 +25,8 @@ namespace unicode { -template -struct platform_intrinsics; - template -struct intrin; +struct intrinsics; #if defined(__GNUC__) && defined(__x86_64__) // For some reason, GCC associates attributes with __m128i that are not obvious (alignment), @@ -39,44 +36,8 @@ struct intrin; #if defined(__x86_64__) || defined(_M_AMD64) // {{{ -template <> -struct platform_intrinsics<__m128i> -{ - using m128i = __m128i; - - static inline m128i setzero() noexcept { return _mm_setzero_si128(); } - - static inline m128i set1_epi8(signed char w) { return _mm_set1_epi8(w); } - - static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept - { - return _mm_set_epi32(static_cast(a), static_cast(b), static_cast(c), static_cast(d)); - } - - static inline m128i xor128(m128i a, m128i b) noexcept { return _mm_xor_si128(a, b); } - - static inline m128i and128(m128i a, m128i b) noexcept { return _mm_and_si128(a, b); } - - // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. - static inline m128i or128(m128i a, m128i b) { return _mm_or_si128(a, b); } - - static inline m128i load_unaligned(m128i const* p) noexcept { return _mm_loadu_si128(static_cast(p)); } - - static inline int32_t to_i32(m128i a) { return _mm_cvtsi128_si32(a); } - - static inline bool compare(m128i a, m128i b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; } - - static inline m128i compare_less(m128i a, m128i b) noexcept { return _mm_cmplt_epi8(a, b); } - - static inline int movemask_epi8(m128i a) { return _mm_movemask_epi8(a); } - - static inline m128i cvtsi64_si128(int64_t a) { return _mm_cvtsi64_si128(a); } -}; - -using intrinsics = platform_intrinsics<__m128i>; - template -struct intrin<128, T> +struct intrinsics<128, T> { using vec_t = __m128i; @@ -110,7 +71,7 @@ struct intrin<128, T> }; template -struct intrin<256, T> +struct intrinsics<256, T> { using vec_t = __m256i; @@ -146,7 +107,7 @@ struct intrin<256, T> }; template -struct intrin<512, T> +struct intrinsics<512, T> { using vec_t = __m512i; @@ -183,72 +144,64 @@ struct intrin<512, T> // }}} #if defined(__aarch64__) || defined(_M_ARM64) // {{{ -template <> -struct platform_intrinsics + + template +struct intrinsics<128, T> { // The following inline functions (in its initial version) were borrowed from: // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h - using m128i = int64x2_t; + using vec_t = int64x2_t; - static inline m128i setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } + using mask_t = int; - static inline m128i set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } + static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } - static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept - { - alignas(16) int32_t data[4] = { - static_cast(a), - static_cast(b), - static_cast(c), - static_cast(d), - }; - return vreinterpretq_s64_s32(vld1q_s32(data)); - } + static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } - static inline m128i xor128(m128i a, m128i b) noexcept + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - static inline m128i and128(m128i a, m128i b) noexcept + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. - static inline m128i or128(m128i a, m128i b) + static inline vec_t or_vec(vec_t a, vec_t b) { return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - // Loads 128-bit value. : - // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx - static inline m128i load_unaligned(m128i const* p) noexcept { return vreinterpretq_s64_s32(vld1q_s32((int32_t const*) p)); } - - // Copy the lower 32-bit integer in a to dst. - // - // dst[31:0] := a[31:0] - // - // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 - static inline int32_t to_i32(m128i a) { return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); } + static inline vec_t load(const char* p) noexcept + { + return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast(p))); + } - static inline bool compare(m128i a, m128i b) noexcept + static inline bool equal(vec_t a, vec_t b) noexcept { return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF; } - static inline m128i compare_less(m128i a, m128i b) noexcept + static inline vec_t less(vec_t a, vec_t b) noexcept { - // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers - // in b for lesser than. - // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))); } - static inline int movemask_epi8(m128i a) + static inline vec_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } + + static inline mask_t movemask_epi8(vec_t a) { // Use increasingly wide shifts+adds to collect the sign bits // together. @@ -325,73 +278,6 @@ struct platform_intrinsics return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } }; - -using intrinsics = platform_intrinsics; - -template -struct intrin<128, T> -{ - // The following inline functions (in its initial version) were borrowed from: - // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h - - using vec_t = int64x2_t; - - using mask_t = int; - - static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } - - static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } - - static inline vec_t xor_vec(vec_t a, vec_t b) noexcept - { - return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - } - - static inline vec_t and_vec(vec_t a, vec_t b) noexcept - { - return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - } - - static inline vec_t or_vec(vec_t a, vec_t b) - { - return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - } - - static inline vec_t load(const char* p) noexcept - { - return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast(p))); - } - - static inline bool equal(vec_t a, vec_t b) noexcept - { - return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF; - } - - static inline vec_t less(vec_t a, vec_t b) noexcept - { - return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))); - } - - static inline vec_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } - - static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } - - static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } - - static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } - - static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } - - static inline mask_t movemask_epi8(vec_t a) - { - uint8x16_t input = vreinterpretq_u8_s64(a); - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); - } -}; #endif // }}} diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index a01ddc5..c8981a7 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -72,19 +72,19 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no } // clang-format on }; - using intrin = intrin; - auto const vec_control = intrin::set1_epi8(0x20); // 0..0x1F - auto const vec_complex = intrin::set1_epi8(-128); // equals to 0x80 (0b1000'0000) + using intrinsics = intrinsics; + auto const vec_control = intrinsics::set1_epi8(0x20); // 0..0x1F + auto const vec_complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) while (input < end - simd_size) { - auto batch = intrin::load(input); - auto is_control_mask = intrin::less(batch, vec_control); - auto is_complex_mask = intrin::equal(intrin::and_vec(batch, vec_complex), vec_complex); - auto ctrl_or_complex_mask = intrin::or_mask(is_control_mask, is_complex_mask); + auto batch = intrinsics::load(input); + auto is_control_mask = intrinsics::less(batch, vec_control); + auto is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); + auto ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); if (ctrl_or_complex_mask) { - int advance = trailing_zero_count(intrin::to_unsigned(ctrl_or_complex_mask)); + int advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); input += advance; break; } From ea5164080707c1b884547a0275ff6ac5a7021bb5 Mon Sep 17 00:00:00 2001 From: Yaraslau Tamashevich Date: Mon, 4 Nov 2024 10:57:24 +0200 Subject: [PATCH 08/22] fix macos --- src/libunicode/intrinsics.h | 4 ++-- src/libunicode/scan_simd_impl.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index b753b47..c770a67 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -145,8 +145,8 @@ struct intrinsics<512, T> #if defined(__aarch64__) || defined(_M_ARM64) // {{{ - template -struct intrinsics<128, T> +template +struct intrinsics { // The following inline functions (in its initial version) were borrowed from: // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index c8981a7..8bc72c3 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -68,7 +68,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no } else { - static_assert(false); + return __builtin_ctzl(value); } // clang-format on }; @@ -80,11 +80,11 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no { auto batch = intrinsics::load(input); auto is_control_mask = intrinsics::less(batch, vec_control); - auto is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); - auto ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); - if (ctrl_or_complex_mask) + auto is_complex_mask = intrinsics::and_vec(batch, vec_complex); + auto ctrl_or_complex_mask = intrinsics::or_vec(is_control_mask, is_complex_mask); + if (int const check = intrinsics::movemask_epi8(ctrl_or_complex_mask)) { - int advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); + int advance = trailing_zero_count(static_cast(check)); input += advance; break; } From 572972fcac7ced644ee40c87f7a41dc8e16d3b7c Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 18:37:06 +0300 Subject: [PATCH 09/22] fix(scan_for_text_ascii): revert logic change and fix non-x86 builds. --- src/libunicode/CMakeLists.txt | 48 +++++++++++++++++++-------------- src/libunicode/intrinsics.h | 4 +-- src/libunicode/scan_simd_impl.h | 10 +++---- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 84ded8f..4988e3e 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -97,6 +97,32 @@ target_link_libraries(unicode_loader PUBLIC unicode::ucd) # ========================================================================================================= + +if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(LIBUNICODE_SIMD_SOURCES + scan256.cpp + scan512.cpp + ) + set_source_files_properties( + scan256.cpp + PROPERTIES + COMPILE_FLAGS + -mavx2 + ) + set_source_files_properties( + scan512.cpp + PROPERTIES + COMPILE_FLAGS + -mavx512f + COMPILE_FLAGS + -mavx512bw + COMPILE_FLAGS + -mavx512bitalg + ) + endif() +endif() + add_library(unicode ${LIBUNICODE_LIB_MODE} capi.cpp codepoint_properties.cpp @@ -107,8 +133,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} utf8.cpp width.cpp simd_detector.cpp - scan256.cpp - scan512.cpp + ${LIBUNICODE_SIMD_SOURCES} # auto-generated by unicode_tablegen codepoint_properties_data.h @@ -116,23 +141,6 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} codepoint_properties_names.cpp ) -set_source_files_properties( - scan256.cpp - PROPERTIES - COMPILE_FLAGS - -mavx2 - ) -set_source_files_properties( - scan512.cpp - PROPERTIES - COMPILE_FLAGS - -mavx512f - COMPILE_FLAGS - -mavx512bw - COMPILE_FLAGS - -mavx512bitalg - ) - if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std") target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") @@ -254,7 +262,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index c770a67..01c2686 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -145,8 +145,8 @@ struct intrinsics<512, T> #if defined(__aarch64__) || defined(_M_ARM64) // {{{ -template -struct intrinsics +template +struct intrinsics<128, T> { // The following inline functions (in its initial version) were borrowed from: // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index 8bc72c3..c8981a7 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -68,7 +68,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no } else { - return __builtin_ctzl(value); + static_assert(false); } // clang-format on }; @@ -80,11 +80,11 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no { auto batch = intrinsics::load(input); auto is_control_mask = intrinsics::less(batch, vec_control); - auto is_complex_mask = intrinsics::and_vec(batch, vec_complex); - auto ctrl_or_complex_mask = intrinsics::or_vec(is_control_mask, is_complex_mask); - if (int const check = intrinsics::movemask_epi8(ctrl_or_complex_mask)) + auto is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); + auto ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); + if (ctrl_or_complex_mask) { - int advance = trailing_zero_count(static_cast(check)); + int advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); input += advance; break; } From dc7ed7b86b37ba151cc85df5fd24bbd26570ad20 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 18:42:04 +0300 Subject: [PATCH 10/22] fix cmake autoformat. --- src/libunicode/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 4988e3e..0bef004 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -262,7 +262,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) From 5c0c0d6eb7a9f5334814912bdb47536ac38716d5 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 18:46:33 +0300 Subject: [PATCH 11/22] Moved `simd_detector.cpp` to x86 sources only. --- src/libunicode/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 0bef004..a853332 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -101,6 +101,7 @@ target_link_libraries(unicode_loader PUBLIC unicode::ucd) if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") set(LIBUNICODE_SIMD_SOURCES + simd_detector.cpp scan256.cpp scan512.cpp ) @@ -132,7 +133,6 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} script_segmenter.cpp utf8.cpp width.cpp - simd_detector.cpp ${LIBUNICODE_SIMD_SOURCES} # auto-generated by unicode_tablegen @@ -262,7 +262,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) From 1e81e3d37dc574515b251c8928ed80d73d0e72d5 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 18:48:30 +0300 Subject: [PATCH 12/22] Fixed cmake autoformat again. --- src/libunicode/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index a853332..3b49983 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -262,7 +262,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) From 10cb062579f7af0263cb6fb9e010d818107a9606 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 19:02:04 +0300 Subject: [PATCH 13/22] Fix arm comparison intrinsics. --- src/libunicode/intrinsics.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 01c2686..99a6252 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -91,9 +91,9 @@ struct intrinsics<256, T> static inline bool equal(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == 0xFFFF; } - static inline auto less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); } + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); } - static inline auto greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); } + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); } static inline auto movemask_epi8(vec_t a) noexcept { return _mm256_movemask_epi8(a); } @@ -186,12 +186,12 @@ struct intrinsics<128, T> return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF; } - static inline vec_t less(vec_t a, vec_t b) noexcept + static inline mask_t less(vec_t a, vec_t b) noexcept { - return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))); + return movemask_epi8(vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b)))); } - static inline vec_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } + static inline mask_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } From 8c04467d66895ffbde92b36f484abf6880aea11d Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 19:49:51 +0300 Subject: [PATCH 14/22] Fixed avx512 comparison and simd detection. --- src/libunicode/CMakeLists.txt | 4 ++-- src/libunicode/intrinsics.h | 4 ++-- src/libunicode/scan.cpp | 2 +- src/libunicode/scan_simd_impl.h | 42 ++++++++++++++++----------------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 3b49983..200aa97 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -144,7 +144,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std") target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") - target_compile_definitions(unicode PRIVATE LIBUNICDE_USE_INTRINSICS) + target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_INTRINSICS) endif() set(public_headers @@ -262,7 +262,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 99a6252..03bbd37 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -127,9 +127,9 @@ struct intrinsics<512, T> static inline bool equal(vec_t a, vec_t b) noexcept { return _mm512_cmpeq_epi8_mask(a, b) == 0xFFFFFFFF; } - static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); } + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); } - static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); } + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); } static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return _kand_mask64(a, b); } diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index cbfb05b..1fce7a3 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -55,7 +55,7 @@ namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept { -#if (defined(LIBUNICODE_USE_STD_SIMD) || defined(LIBUNICODE_USE_INTRINSICS)) && defined(__x86_64__) || defined(_M_AMD64) +#if (defined(LIBUNICODE_USE_STD_SIMD) || defined(LIBUNICODE_USE_INTRINSICS)) && (defined(__x86_64__) || defined(_M_AMD64)) static auto simd_size = max_simd_size(); if (simd_size == 512) { diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index c8981a7..fa56815 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -13,7 +13,7 @@ #define USE_STD_SIMD #include namespace stdx = std; -#elif defined(LIBUNICDE_USE_INTRINSICS) +#elif defined(LIBUNICODE_USE_INTRINSICS) #include "intrinsics.h" #endif // clang-format on @@ -41,30 +41,30 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no } input += simd_size; } -#elif defined(LIBUNICDE_USE_INTRINSICS) +#elif defined(LIBUNICODE_USE_INTRINSICS) constexpr auto trailing_zero_count = [](T value) noexcept { // clang-format off - if constexpr (std::same_as) + if constexpr (std::same_as, uint32_t>) { - #if defined(_WIN32) - // return _tzcnt_u32(value); - // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. - unsigned long r = 0; - _BitScanForward(&r, value); - return r; - #else - return __builtin_ctz(value); - #endif + #if defined(_WIN32) + // return _tzcnt_u32(value); + // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. + unsigned long r = 0; + _BitScanForward(&r, value); + return r; + #else + return __builtin_ctz(value); + #endif } - else if constexpr (std::same_as) + else if constexpr (std::same_as, uint64_t>) { - #if defined(_WIN32) - unsigned long r = 0; - _BitScanForward64(&r, value); - return r; - #else - return __builtin_ctzl(value); - #endif + #if defined(_WIN32) + unsigned long r = 0; + _BitScanForward64(&r, value); + return r; + #else + return __builtin_ctzl(value); + #endif } else { @@ -82,7 +82,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no auto is_control_mask = intrinsics::less(batch, vec_control); auto is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); auto ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); - if (ctrl_or_complex_mask) + if (is_control_mask) { int advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); input += advance; From d358053846738d3a678229bc9cb12365838c454f Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 20:06:59 +0300 Subject: [PATCH 15/22] Fix gcc13 compilation. --- src/libunicode/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 200aa97..982493e 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -118,8 +118,6 @@ if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATI -mavx512f COMPILE_FLAGS -mavx512bw - COMPILE_FLAGS - -mavx512bitalg ) endif() endif() @@ -262,7 +260,7 @@ if(LIBUNICODE_TESTING) if(NOT Catch2_FOUND) # supress conversion warnings for Catch2 # https://github.com/catchorg/Catch2/issues/2583 - # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt #L18-L22 + # https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22 set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF) set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF) get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES) From e9bf7b69ca4167e2614b943e95021ce4cceee656 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 20:12:48 +0300 Subject: [PATCH 16/22] Fix macOS compilation. --- src/libunicode/scan_simd_impl.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index fa56815..2b52e3a 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -56,7 +56,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no return __builtin_ctz(value); #endif } - else if constexpr (std::same_as, uint64_t>) + else { #if defined(_WIN32) unsigned long r = 0; @@ -66,10 +66,6 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no return __builtin_ctzl(value); #endif } - else - { - static_assert(false); - } // clang-format on }; using intrinsics = intrinsics; From 889b95bf67dcf80761d9efd5e14e0f0c631cafc4 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 20:40:54 +0300 Subject: [PATCH 17/22] Added widows version of system_processor. --- src/libunicode/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index 982493e..1899422 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -99,7 +99,7 @@ target_link_libraries(unicode_loader PUBLIC unicode::ucd) if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") - if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") set(LIBUNICODE_SIMD_SOURCES simd_detector.cpp scan256.cpp From 2370befec3116f71182418f14bf2801658ec3826 Mon Sep 17 00:00:00 2001 From: Timofey Date: Mon, 4 Nov 2024 21:31:06 +0300 Subject: [PATCH 18/22] Fix windows simd detector. --- src/libunicode/simd_detector.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp index ee60b4a..2b39323 100644 --- a/src/libunicode/simd_detector.cpp +++ b/src/libunicode/simd_detector.cpp @@ -36,7 +36,7 @@ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); return ((uint64_t) edx << 32) | eax; } - #define XCR_XFEATURE_ENABLED_MASK 0 + #define _XCR_XFEATURE_ENABLED_MASK 0 // clang-format on #else #error "No cpuid intrinsic defined for compiler." @@ -54,7 +54,7 @@ auto detect_os_avx() -> bool if (osUsesXSAVE_XRSTORE && cpuAVXSuport) { - uint64_t xcrFeatureMask = xgetbv(XCR_XFEATURE_ENABLED_MASK); + uint64_t xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); avxSupported = (xcrFeatureMask & 0x6) == 0x6; } @@ -65,7 +65,7 @@ auto detect_os_avx512() -> bool { if (!detect_os_avx()) return false; - uint64_t xcrFeatureMask = xgetbv(XCR_XFEATURE_ENABLED_MASK); + uint64_t xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); return (xcrFeatureMask & 0xe6) == 0xe6; } From 875660691ad31020e3eded96b60afd334749d8dc Mon Sep 17 00:00:00 2001 From: Timofey Date: Tue, 5 Nov 2024 18:39:40 +0300 Subject: [PATCH 19/22] Fix missing declataion warnings. --- src/libunicode/simd_detector.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp index 2b39323..e83c3f1 100644 --- a/src/libunicode/simd_detector.cpp +++ b/src/libunicode/simd_detector.cpp @@ -9,6 +9,11 @@ // // auto max_simd_size() -> size_t; +void cpuid(int32_t out[4], int32_t eax, int32_t ecx); +uint64_t xgetbv(unsigned int index); +auto detect_os_avx() -> bool; +auto detect_os_avx512() -> bool; + #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) #if _WIN32 // clang-format off From aadecbb3435d15bd54fd6e9db9cfefd82642a85c Mon Sep 17 00:00:00 2001 From: Yaraslau Tamashevich Date: Sun, 10 Nov 2024 16:43:18 +0200 Subject: [PATCH 20/22] small changes --- src/libunicode/scan256.cpp | 1 + src/libunicode/scan512.cpp | 1 + src/libunicode/scan_simd_impl.h | 21 +++++++++++---------- src/libunicode/simd_detector.cpp | 26 ++++++++++++++++---------- src/libunicode/simd_detector.h | 1 + 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/libunicode/scan256.cpp b/src/libunicode/scan256.cpp index 015b8f7..3cbead3 100644 --- a/src/libunicode/scan256.cpp +++ b/src/libunicode/scan256.cpp @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 #include #include diff --git a/src/libunicode/scan512.cpp b/src/libunicode/scan512.cpp index fa81034..c96374f 100644 --- a/src/libunicode/scan512.cpp +++ b/src/libunicode/scan512.cpp @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 #include #include diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index 2b52e3a..ebd1e55 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 #pragma once #include #include @@ -31,9 +32,9 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no while (input < end - simd_size) { simd_text.copy_from(input, stdx::element_aligned); - auto is_control_mask = simd_text < 0x20; - auto is_complex_mask = (simd_text & 0x80) == 0x80; - auto ctrl_or_complex_mask = is_control_mask || is_complex_mask; + auto const is_control_mask = simd_text < 0x20; + auto const is_complex_mask = (simd_text & 0x80) == 0x80; + auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask; if (stdx::any_of(ctrl_or_complex_mask)) { input += stdx::find_first_set(ctrl_or_complex_mask); @@ -74,13 +75,13 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no while (input < end - simd_size) { - auto batch = intrinsics::load(input); - auto is_control_mask = intrinsics::less(batch, vec_control); - auto is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); - auto ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); + auto const batch = intrinsics::load(input); + auto const is_control_mask = intrinsics::less(batch, vec_control); + auto const is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); + auto const ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); if (is_control_mask) { - int advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); + int const advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); input += advance; break; } @@ -89,8 +90,8 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no #endif constexpr auto is_ascii = [](char ch) noexcept { - auto is_control = static_cast(ch) < 0x20; - auto is_complex = static_cast(ch) & 0x80; + auto const is_control = static_cast(ch) < 0x20; + auto const is_complex = static_cast(ch) & 0x80; return !is_control && !is_complex; }; while (input != end && is_ascii(*input)) diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp index e83c3f1..c3153f1 100644 --- a/src/libunicode/simd_detector.cpp +++ b/src/libunicode/simd_detector.cpp @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 #include "simd_detector.h" #include @@ -11,6 +12,11 @@ void cpuid(int32_t out[4], int32_t eax, int32_t ecx); uint64_t xgetbv(unsigned int index); +enum class Simd_Size +{ + sse2, + avx, +}; auto detect_os_avx() -> bool; auto detect_os_avx512() -> bool; @@ -54,12 +60,12 @@ auto detect_os_avx() -> bool int32_t cpuInfo[4]; cpuid(cpuInfo, 1, 0); - bool osUsesXSAVE_XRSTORE = (cpuInfo[2] & (1 << 27)) != 0; - bool cpuAVXSuport = (cpuInfo[2] & (1 << 28)) != 0; + bool const osUsesXSAVE_XRSTORE = (cpuInfo[2] & (1 << 27)); + bool const cpuAVXSuport = (cpuInfo[2] & (1 << 28)) != 0; if (osUsesXSAVE_XRSTORE && cpuAVXSuport) { - uint64_t xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); + uint64_t const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); avxSupported = (xcrFeatureMask & 0x6) == 0x6; } @@ -70,7 +76,7 @@ auto detect_os_avx512() -> bool { if (!detect_os_avx()) return false; - uint64_t xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); + uint64_t const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); return (xcrFeatureMask & 0xe6) == 0xe6; } @@ -81,7 +87,7 @@ auto unicode::detail::max_simd_size() -> size_t int32_t info[4]; cpuid(info, 0, 0); - int nIds = info[0]; + int const nIds = info[0]; // cpuid(info, 0x80000000, 0); // uint32_t nExIds = info[0]; @@ -108,7 +114,7 @@ auto unicode::detail::max_simd_size() -> size_t if (nIds >= 0x00000007) { cpuid(info, 0x00000007, 0); - bool HW_AVX2 = (info[1] & ((int) 1 << 5)) != 0; + bool const HW_AVX2 = (info[1] & ((int) 1 << 5)) != 0; if (!HW_AVX2) return 128; @@ -121,13 +127,13 @@ auto unicode::detail::max_simd_size() -> size_t // bool HW_PREFETCHWT1 = (info[2] & ((int) 1 << 0)) != 0; // bool HW_RDPID = (info[2] & ((int) 1 << 22)) != 0; - bool HW_AVX512_F = (info[1] & ((int) 1 << 16)) != 0; + bool const HW_AVX512_F = (info[1] & ((int) 1 << 16)); // bool HW_AVX512_CD = (info[1] & ((int) 1 << 28)) != 0; // bool HW_AVX512_PF = (info[1] & ((int) 1 << 26)) != 0; // bool HW_AVX512_ER = (info[1] & ((int) 1 << 27)) != 0; // bool HW_AVX512_VL = (info[1] & ((int) 1 << 31)) != 0; - bool HW_AVX512_BW = (info[1] & ((int) 1 << 30)) != 0; + bool const HW_AVX512_BW = (info[1] & ((int) 1 << 30)); // bool HW_AVX512_DQ = (info[1] & ((int) 1 << 17)) != 0; // bool HW_AVX512_IFMA = (info[1] & ((int) 1 << 21)) != 0; @@ -143,9 +149,9 @@ auto unicode::detail::max_simd_size() -> size_t // bool HW_GFNI = (info[2] & ((int) 1 << 8)) != 0; // bool HW_VAES = (info[2] & ((int) 1 << 9)) != 0; // bool HW_AVX512_VPCLMUL = (info[2] & ((int) 1 << 10)) != 0; - bool HW_AVX512_BITALG = (info[2] & ((int) 1 << 12)) != 0; + bool const HW_AVX512_BITALG = (info[2] & ((int) 1 << 12)); - bool use512 = detect_os_avx512() && HW_AVX512_F && HW_AVX512_BW && HW_AVX512_BITALG; + bool const use512 = detect_os_avx512() && HW_AVX512_F && HW_AVX512_BW && HW_AVX512_BITALG; if (!use512) return 256; else diff --git a/src/libunicode/simd_detector.h b/src/libunicode/simd_detector.h index 33de202..5fbb546 100644 --- a/src/libunicode/simd_detector.h +++ b/src/libunicode/simd_detector.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: Apache-2.0 #pragma once #include From bae48fc281d315ad39bccfcde464f1fc91a3824d Mon Sep 17 00:00:00 2001 From: Yaraslau Tamashevich Date: Sun, 10 Nov 2024 16:52:04 +0200 Subject: [PATCH 21/22] Add tests and small fix --- src/libunicode/scan_simd_impl.h | 2 +- src/libunicode/scan_test.cpp | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h index ebd1e55..2e0b88e 100644 --- a/src/libunicode/scan_simd_impl.h +++ b/src/libunicode/scan_simd_impl.h @@ -79,7 +79,7 @@ size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) no auto const is_control_mask = intrinsics::less(batch, vec_control); auto const is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); auto const ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); - if (is_control_mask) + if (ctrl_or_complex_mask) { int const advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); input += advance; diff --git a/src/libunicode/scan_test.cpp b/src/libunicode/scan_test.cpp index cfdbb7c..555c3e9 100644 --- a/src/libunicode/scan_test.cpp +++ b/src/libunicode/scan_test.cpp @@ -108,6 +108,26 @@ TEST_CASE("scan.ascii.empty") } TEST_CASE("scan.ascii.32") +{ + auto const text = "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 32) == 32); + CHECK(scan_for_text_ascii(text, 16) == 16); + CHECK(scan_for_text_ascii(text, 8) == 8); + CHECK(scan_for_text_ascii(text, 1) == 1); +} + +TEST_CASE("scan.ascii.64") +{ + auto const text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 64) == 64); + CHECK(scan_for_text_ascii(text, 32) == 32); + CHECK(scan_for_text_ascii(text, 16) == 16); + CHECK(scan_for_text_ascii(text, 8) == 8); + CHECK(scan_for_text_ascii(text, 1) == 1); +} + +TEST_CASE("scan.ascii.128") { auto const text = "0123456789ABCDEF0123456789ABCDEF" "0123456789ABCDEF0123456789ABCDEF" @@ -128,11 +148,16 @@ TEST_CASE("scan.ascii.mixed_with_controls") CHECK(scan_for_text_ascii("12345678\033", 80) == 8); CHECK(scan_for_text_ascii("0123456789ABCDEF\033", 80) == 16); CHECK(scan_for_text_ascii("0123456789ABCDEF1\033", 80) == 17); - constexpr auto text = "0123456789ABCDEF0\033123456789ABCDEF" - "0123456789ABCDEF0123456789ABCDEF" - "0123456789ABCDEF0123456789ABCDEF" - "0123456789ABCDEF0123456789ABCDEF"sv; + auto text = "0123456789ABCDEF0\033123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; CHECK(scan_for_text_ascii(text, 80) == 17); + text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF\0330123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 128) == 112); } TEST_CASE("scan.ascii.until_complex") From 16ff3f39e6f34c4ac04bd55351514baaebd403d0 Mon Sep 17 00:00:00 2001 From: Yaraslau Tamashevich Date: Sun, 10 Nov 2024 16:57:59 +0200 Subject: [PATCH 22/22] Fix --- src/libunicode/simd_detector.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp index c3153f1..2293b9b 100644 --- a/src/libunicode/simd_detector.cpp +++ b/src/libunicode/simd_detector.cpp @@ -11,12 +11,13 @@ // auto max_simd_size() -> size_t; void cpuid(int32_t out[4], int32_t eax, int32_t ecx); +#if _WIN32 +__int64 xgetbv(unsigned int x); +#elif defined(__GNUC__) || defined(__clang__) uint64_t xgetbv(unsigned int index); -enum class Simd_Size -{ - sse2, - avx, -}; +#else +#endif + auto detect_os_avx() -> bool; auto detect_os_avx512() -> bool; @@ -65,7 +66,7 @@ auto detect_os_avx() -> bool if (osUsesXSAVE_XRSTORE && cpuAVXSuport) { - uint64_t const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); + auto const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); avxSupported = (xcrFeatureMask & 0x6) == 0x6; } @@ -114,7 +115,7 @@ auto unicode::detail::max_simd_size() -> size_t if (nIds >= 0x00000007) { cpuid(info, 0x00000007, 0); - bool const HW_AVX2 = (info[1] & ((int) 1 << 5)) != 0; + bool const HW_AVX2 = (info[1] & ((int) 1 << 5)); if (!HW_AVX2) return 128;