diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt index bb1a824..1899422 100644 --- a/src/libunicode/CMakeLists.txt +++ b/src/libunicode/CMakeLists.txt @@ -97,6 +97,31 @@ target_link_libraries(unicode_loader PUBLIC unicode::ucd) # ========================================================================================================= + +if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64") + set(LIBUNICODE_SIMD_SOURCES + simd_detector.cpp + scan256.cpp + scan512.cpp + ) + set_source_files_properties( + scan256.cpp + PROPERTIES + COMPILE_FLAGS + -mavx2 + ) + set_source_files_properties( + scan512.cpp + PROPERTIES + COMPILE_FLAGS + -mavx512f + COMPILE_FLAGS + -mavx512bw + ) + endif() +endif() + add_library(unicode ${LIBUNICODE_LIB_MODE} capi.cpp codepoint_properties.cpp @@ -106,6 +131,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} script_segmenter.cpp utf8.cpp width.cpp + ${LIBUNICODE_SIMD_SOURCES} # auto-generated by unicode_tablegen codepoint_properties_data.h @@ -116,7 +142,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE} if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std") target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD) elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics") - target_compile_definitions(unicode PRIVATE USE_INTRINSICS) + target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_INTRINSICS) endif() set(public_headers diff --git a/src/libunicode/intrinsics.h b/src/libunicode/intrinsics.h index 6e4f9c5..03bbd37 100644 --- a/src/libunicode/intrinsics.h +++ b/src/libunicode/intrinsics.h @@ -13,9 +13,9 @@ */ #pragma once +#include #if defined(__x86_64__) || defined(_M_AMD64) - #include // AVX, AVX2, FMP - #include // SSE2 + #include #endif #if defined(__aarch64__) || defined(_M_ARM64) @@ -25,8 +25,8 @@ namespace unicode { -template -struct platform_intrinsics; +template +struct intrinsics; #if defined(__GNUC__) && defined(__x86_64__) // For some reason, GCC associates attributes with __m128i that are not obvious (alignment), @@ -36,112 +36,172 @@ struct platform_intrinsics; #if defined(__x86_64__) || defined(_M_AMD64) // {{{ -template <> -struct platform_intrinsics<__m128i> +template +struct intrinsics<128, T> { - using m128i = __m128i; + using vec_t = __m128i; - static inline m128i setzero() noexcept { return _mm_setzero_si128(); } + using mask_t = int; - static inline m128i set1_epi8(signed char w) { return _mm_set1_epi8(w); } + static inline vec_t setzero() noexcept { return _mm_setzero_si128(); } - static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept - { - return _mm_set_epi32(static_cast(a), static_cast(b), static_cast(c), static_cast(d)); - } + static inline vec_t set1_epi8(signed char w) { return _mm_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm_xor_si128(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm_and_si128(a, b); } - static inline m128i xor128(m128i a, m128i b) noexcept { return _mm_xor_si128(a, b); } + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm_or_si128(a, b); } - static inline m128i and128(m128i a, m128i b) noexcept { return _mm_and_si128(a, b); } + static inline vec_t load(const char* p) noexcept { return _mm_loadu_si128(reinterpret_cast(p)); } - // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. - static inline m128i or128(m128i a, m128i b) { return _mm_or_si128(a, b); } + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; } - static inline m128i load_unaligned(m128i const* p) noexcept { return _mm_loadu_si128(static_cast(p)); } + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmplt_epi8(a, b)); } - static inline int32_t to_i32(m128i a) { return _mm_cvtsi128_si32(a); } + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpgt_epi8(a, b)); } - static inline bool compare(m128i a, m128i b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; } + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } - static inline m128i compare_less(m128i a, m128i b) noexcept { return _mm_cmplt_epi8(a, b); } + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } - static inline int movemask_epi8(m128i a) { return _mm_movemask_epi8(a); } + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } - static inline m128i cvtsi64_si128(int64_t a) { return _mm_cvtsi64_si128(a); } + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } }; -using intrinsics = platform_intrinsics<__m128i>; +template +struct intrinsics<256, T> +{ + using vec_t = __m256i; + + using mask_t = int; + + static inline vec_t setzero() noexcept { return _mm256_setzero_si256(); } + + static inline vec_t set1_epi8(signed char w) { return _mm256_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm256_xor_si256(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm256_and_si256(a, b); } + + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm256_or_si256(a, b); } + + static inline vec_t load(const char* p) noexcept { return _mm256_loadu_si256(reinterpret_cast(p)); } + + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == 0xFFFF; } + + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); } + + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); } + + static inline auto movemask_epi8(vec_t a) noexcept { return _mm256_movemask_epi8(a); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } +}; + +template +struct intrinsics<512, T> +{ + using vec_t = __m512i; + + using mask_t = __mmask64; + + static inline vec_t setzero() noexcept { return _mm512_setzero_si512(); } + + static inline vec_t set1_epi8(signed char w) { return _mm512_set1_epi8(w); } + + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm512_xor_si512(a, b); } + + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm512_and_si512(a, b); } + + static inline vec_t or_vec(vec_t a, vec_t b) { return _mm512_or_si512(a, b); } + + static inline vec_t load(const char* p) noexcept { return _mm512_loadu_si512(reinterpret_cast(p)); } + + static inline bool equal(vec_t a, vec_t b) noexcept { return _mm512_cmpeq_epi8_mask(a, b) == 0xFFFFFFFF; } + + static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); } + + static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return _kand_mask64(a, b); } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return _kor_mask64(a, b); } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return _kxor_mask64(a, b); } + + static inline uint64_t to_unsigned(mask_t a) noexcept { return static_cast(a); } +}; #endif // }}} #if defined(__aarch64__) || defined(_M_ARM64) // {{{ -template <> -struct platform_intrinsics + +template +struct intrinsics<128, T> { // The following inline functions (in its initial version) were borrowed from: // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h - using m128i = int64x2_t; + using vec_t = int64x2_t; - static inline m128i setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } + using mask_t = int; - static inline m128i set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } + static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); } - static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept - { - alignas(16) int32_t data[4] = { - static_cast(a), - static_cast(b), - static_cast(c), - static_cast(d), - }; - return vreinterpretq_s64_s32(vld1q_s32(data)); - } + static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); } - static inline m128i xor128(m128i a, m128i b) noexcept + static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - static inline m128i and128(m128i a, m128i b) noexcept + static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. - static inline m128i or128(m128i a, m128i b) + static inline vec_t or_vec(vec_t a, vec_t b) { return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); } - // Loads 128-bit value. : - // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx - static inline m128i load_unaligned(m128i const* p) noexcept { return vreinterpretq_s64_s32(vld1q_s32((int32_t const*) p)); } - - // Copy the lower 32-bit integer in a to dst. - // - // dst[31:0] := a[31:0] - // - // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 - static inline int32_t to_i32(m128i a) { return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); } + static inline vec_t load(const char* p) noexcept + { + return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast(p))); + } - static inline bool compare(m128i a, m128i b) noexcept + static inline bool equal(vec_t a, vec_t b) noexcept { return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF; } - static inline m128i compare_less(m128i a, m128i b) noexcept + static inline mask_t less(vec_t a, vec_t b) noexcept { - // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers - // in b for lesser than. - // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx - return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))); + return movemask_epi8(vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b)))); } - static inline int movemask_epi8(m128i a) + static inline mask_t greater(vec_t a, vec_t b) noexcept { return less(b, a); } + + static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; } + + static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; } + + static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; } + + static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast(a); } + + static inline mask_t movemask_epi8(vec_t a) { // Use increasingly wide shifts+adds to collect the sign bits // together. @@ -218,8 +278,6 @@ struct platform_intrinsics return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } }; - -using intrinsics = platform_intrinsics; #endif // }}} diff --git a/src/libunicode/scan.cpp b/src/libunicode/scan.cpp index 8efbbad..1fce7a3 100644 --- a/src/libunicode/scan.cpp +++ b/src/libunicode/scan.cpp @@ -12,8 +12,9 @@ * limitations under the License. */ #include -#include #include +#include +#include #include #include @@ -22,20 +23,6 @@ #include #include -// clang-format off -#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) - #define USE_STD_SIMD - #include - namespace stdx = std::experimental; -#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) - #define USE_STD_SIMD - #include - namespace stdx = std; -#elif defined(__SSE2__) - #include -#endif -// clang-format on - using std::distance; using std::get; using std::holds_alternative; @@ -48,19 +35,6 @@ namespace unicode namespace { - [[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept - { -#if defined(_WIN32) - // return _tzcnt_u32(value); - // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. - unsigned long r = 0; - _BitScanForward(&r, value); - return r; -#else - return __builtin_ctz(value); -#endif - } - template constexpr bool ascending(T low, T val, T high) noexcept { @@ -77,66 +51,22 @@ namespace { return static_cast(ch) & 0x80; } - - // Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters. - constexpr bool is_ascii(char ch) noexcept - { - return !is_control(ch) && !is_complex(ch); - } } // namespace size_t detail::scan_for_text_ascii(string_view text, size_t maxColumnCount) noexcept { - auto input = text.data(); - auto const end = text.data() + min(text.size(), maxColumnCount); -#if defined(USE_STD_SIMD) - constexpr int numberOfElements = stdx::simd_abi::max_fixed_size; - stdx::fixed_size_simd simd_text {}; - while (input < end - numberOfElements) +#if (defined(LIBUNICODE_USE_STD_SIMD) || defined(LIBUNICODE_USE_INTRINSICS)) && (defined(__x86_64__) || defined(_M_AMD64)) + static auto simd_size = max_simd_size(); + if (simd_size == 512) { - simd_text.copy_from(input, stdx::element_aligned); - - // check for control - // TODO check for complex - auto const simd_mask_text = (simd_text < 0x20); - if (stdx::popcount(simd_mask_text) > 0) - { - input += stdx::find_first_set(simd_mask_text); - break; - } - input += numberOfElements; + return scan_for_text_ascii_512(text, maxColumnCount); } -#elif defined(USE_INTRINSICS) - intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F - intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) - - while (input < end - sizeof(intrinsics::m128i)) + else if (simd_size == 256) { - intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input); - intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax); - intrinsics::m128i isComplex = intrinsics::and128(batch, Complex); - // intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex); - intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex); - if (int const check = intrinsics::movemask_epi8(testPack); check != 0) - { - int advance = countTrailingZeroBits(static_cast(check)); - input += advance; - break; - } - input += sizeof(intrinsics::m128i); + return scan_for_text_ascii_256(text, maxColumnCount); } #endif - - while (input != end && is_ascii(*input)) - ++input; - - // if (static_cast(distance(text.data(), input))) - // std::print( - // "countAsciiTextChars: {} bytes: \"{}\"\n", - // static_cast(distance(text.data(), input)), - // (string_view(text.data(), static_cast(distance(text.data(), input))))); - - return static_cast(distance(text.data(), input)); + return scan_for_text_ascii_simd<128>(text, maxColumnCount); } scan_result detail::scan_for_text_nonascii(scan_state& state, diff --git a/src/libunicode/scan.h b/src/libunicode/scan.h index 902cc1c..7048cb5 100644 --- a/src/libunicode/scan.h +++ b/src/libunicode/scan.h @@ -79,6 +79,11 @@ class null_receiver final: public grapheme_cluster_receiver namespace detail { size_t scan_for_text_ascii(std::string_view text, size_t maxColumnCount) noexcept; + + template + size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept; + size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept; + size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept; scan_result scan_for_text_nonascii(scan_state& state, std::string_view text, size_t maxColumnCount, diff --git a/src/libunicode/scan256.cpp b/src/libunicode/scan256.cpp new file mode 100644 index 0000000..3cbead3 --- /dev/null +++ b/src/libunicode/scan256.cpp @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +#include +#include + +namespace unicode::detail +{ +size_t scan_for_text_ascii_256(std::string_view text, size_t maxColumnCount) noexcept +{ + return scan_for_text_ascii_simd<256>(text, maxColumnCount); +} +} // namespace unicode::detail diff --git a/src/libunicode/scan512.cpp b/src/libunicode/scan512.cpp new file mode 100644 index 0000000..c96374f --- /dev/null +++ b/src/libunicode/scan512.cpp @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: Apache-2.0 +#include +#include + +namespace unicode::detail +{ +size_t scan_for_text_ascii_512(std::string_view text, size_t maxColumnCount) noexcept +{ + return scan_for_text_ascii_simd<512>(text, maxColumnCount); +} +} // namespace unicode::detail diff --git a/src/libunicode/scan_simd_impl.h b/src/libunicode/scan_simd_impl.h new file mode 100644 index 0000000..2e0b88e --- /dev/null +++ b/src/libunicode/scan_simd_impl.h @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once +#include +#include +#include +#include + +// clang-format off +#if __has_include() && defined(LIBUNICODE_USE_STD_SIMD) && !defined(__APPLE__) && !defined(__FreeBSD__) + #define USE_STD_SIMD + #include + namespace stdx = std::experimental; +#elif __has_include() && defined(LIBUNICODE_USE_STD_SIMD) + #define USE_STD_SIMD + #include + namespace stdx = std; +#elif defined(LIBUNICODE_USE_INTRINSICS) + #include "intrinsics.h" +#endif +// clang-format on +namespace unicode::detail +{ +template +size_t scan_for_text_ascii_simd(std::string_view text, size_t maxColumnCount) noexcept +{ + [[maybe_unused]] constexpr int simd_size = SimdBitWidth / 8; + auto input = text.data(); + auto const end = text.data() + std::min(text.size(), maxColumnCount); + +#if defined(USE_STD_SIMD) + auto simd_text = stdx::fixed_size_simd {}; + while (input < end - simd_size) + { + simd_text.copy_from(input, stdx::element_aligned); + auto const is_control_mask = simd_text < 0x20; + auto const is_complex_mask = (simd_text & 0x80) == 0x80; + auto const ctrl_or_complex_mask = is_control_mask || is_complex_mask; + if (stdx::any_of(ctrl_or_complex_mask)) + { + input += stdx::find_first_set(ctrl_or_complex_mask); + break; + } + input += simd_size; + } +#elif defined(LIBUNICODE_USE_INTRINSICS) + constexpr auto trailing_zero_count = [](T value) noexcept { + // clang-format off + if constexpr (std::same_as, uint32_t>) + { + #if defined(_WIN32) + // return _tzcnt_u32(value); + // Don't do _tzcnt_u32, because that's only available on x86-64, but not on ARM64. + unsigned long r = 0; + _BitScanForward(&r, value); + return r; + #else + return __builtin_ctz(value); + #endif + } + else + { + #if defined(_WIN32) + unsigned long r = 0; + _BitScanForward64(&r, value); + return r; + #else + return __builtin_ctzl(value); + #endif + } + // clang-format on + }; + using intrinsics = intrinsics; + auto const vec_control = intrinsics::set1_epi8(0x20); // 0..0x1F + auto const vec_complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000) + + while (input < end - simd_size) + { + auto const batch = intrinsics::load(input); + auto const is_control_mask = intrinsics::less(batch, vec_control); + auto const is_complex_mask = intrinsics::equal(intrinsics::and_vec(batch, vec_complex), vec_complex); + auto const ctrl_or_complex_mask = intrinsics::or_mask(is_control_mask, is_complex_mask); + if (ctrl_or_complex_mask) + { + int const advance = trailing_zero_count(intrinsics::to_unsigned(ctrl_or_complex_mask)); + input += advance; + break; + } + input += sizeof(simd_size); + } +#endif + + constexpr auto is_ascii = [](char ch) noexcept { + auto const is_control = static_cast(ch) < 0x20; + auto const is_complex = static_cast(ch) & 0x80; + return !is_control && !is_complex; + }; + while (input != end && is_ascii(*input)) + ++input; + + return static_cast(std::distance(text.data(), input)); +} +} // namespace unicode::detail diff --git a/src/libunicode/scan_test.cpp b/src/libunicode/scan_test.cpp index 240e754..555c3e9 100644 --- a/src/libunicode/scan_test.cpp +++ b/src/libunicode/scan_test.cpp @@ -116,6 +116,31 @@ TEST_CASE("scan.ascii.32") CHECK(scan_for_text_ascii(text, 1) == 1); } +TEST_CASE("scan.ascii.64") +{ + auto const text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 64) == 64); + CHECK(scan_for_text_ascii(text, 32) == 32); + CHECK(scan_for_text_ascii(text, 16) == 16); + CHECK(scan_for_text_ascii(text, 8) == 8); + CHECK(scan_for_text_ascii(text, 1) == 1); +} + +TEST_CASE("scan.ascii.128") +{ + auto const text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 128) == 128); + CHECK(scan_for_text_ascii(text, 64) == 64); + CHECK(scan_for_text_ascii(text, 32) == 32); + CHECK(scan_for_text_ascii(text, 16) == 16); + CHECK(scan_for_text_ascii(text, 8) == 8); + CHECK(scan_for_text_ascii(text, 1) == 1); +} + TEST_CASE("scan.ascii.mixed_with_controls") { CHECK(scan_for_text_ascii("\0331234", 80) == 0); @@ -123,12 +148,27 @@ TEST_CASE("scan.ascii.mixed_with_controls") CHECK(scan_for_text_ascii("12345678\033", 80) == 8); CHECK(scan_for_text_ascii("0123456789ABCDEF\033", 80) == 16); CHECK(scan_for_text_ascii("0123456789ABCDEF1\033", 80) == 17); + auto text = "0123456789ABCDEF0\033123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 80) == 17); + text = "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF\0330123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 128) == 112); } TEST_CASE("scan.ascii.until_complex") { CHECK(scan_for_text_ascii("1234\x80", 80) == 4); CHECK(scan_for_text_ascii("0123456789{\xE2\x94\x80}ABCDEF", 80) == 11); + constexpr auto text = "0123456789{\xE2\x94\x80}ABCDEF0323456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF" + "0123456789ABCDEF0123456789ABCDEF"sv; + CHECK(scan_for_text_ascii(text, 80) == 11); } TEST_CASE("scan.complex.grapheme_cluster.1") diff --git a/src/libunicode/simd_detector.cpp b/src/libunicode/simd_detector.cpp new file mode 100644 index 0000000..2293b9b --- /dev/null +++ b/src/libunicode/simd_detector.cpp @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: Apache-2.0 +#include "simd_detector.h" + +#include + +// AVX512 required: +// AVX512_BITALG : popcnt +// AVX512_BW : compare greater (less is needed) +// AVX512_F : and +// +// auto max_simd_size() -> size_t; + +void cpuid(int32_t out[4], int32_t eax, int32_t ecx); +#if _WIN32 +__int64 xgetbv(unsigned int x); +#elif defined(__GNUC__) || defined(__clang__) +uint64_t xgetbv(unsigned int index); +#else +#endif + +auto detect_os_avx() -> bool; +auto detect_os_avx512() -> bool; + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + #if _WIN32 + // clang-format off + #include + #include + void cpuid(int32_t out[4], int32_t eax, int32_t ecx) + { + __cpuidex(out, eax, ecx); + } + __int64 xgetbv(unsigned int x) + { + return _xgetbv(x); + } + // clang-format on + #elif defined(__GNUC__) || defined(__clang__) + // clang-format off + #include + void cpuid(int32_t out[4], int32_t eax, int32_t ecx) + { + __cpuid_count(eax, ecx, out[0], out[1], out[2], out[3]); + } + uint64_t xgetbv(unsigned int index) + { + uint32_t eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((uint64_t) edx << 32) | eax; + } + #define _XCR_XFEATURE_ENABLED_MASK 0 + // clang-format on + #else + #error "No cpuid intrinsic defined for compiler." + #endif + +auto detect_os_avx() -> bool +{ + // Copied from: http://stackoverflow.com/a/22521619/922184 + bool avxSupported = false; + int32_t cpuInfo[4]; + cpuid(cpuInfo, 1, 0); + + bool const osUsesXSAVE_XRSTORE = (cpuInfo[2] & (1 << 27)); + bool const cpuAVXSuport = (cpuInfo[2] & (1 << 28)) != 0; + + if (osUsesXSAVE_XRSTORE && cpuAVXSuport) + { + auto const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); + avxSupported = (xcrFeatureMask & 0x6) == 0x6; + } + + return avxSupported; +} + +auto detect_os_avx512() -> bool +{ + if (!detect_os_avx()) + return false; + uint64_t const xcrFeatureMask = xgetbv(_XCR_XFEATURE_ENABLED_MASK); + return (xcrFeatureMask & 0xe6) == 0xe6; +} + +auto unicode::detail::max_simd_size() -> size_t +{ + if (!detect_os_avx()) + return 128; + + int32_t info[4]; + cpuid(info, 0, 0); + int const nIds = info[0]; + + // cpuid(info, 0x80000000, 0); + // uint32_t nExIds = info[0]; + + // Detect Features + // if (nIds >= 0x00000001) + // { + // cpuid(info, 0x00000001, 0); + // bool HW_MMX = (info[3] & ((int) 1 << 23)) != 0; + // bool HW_SSE = (info[3] & ((int) 1 << 25)) != 0; + // bool HW_SSE2 = (info[3] & ((int) 1 << 26)) != 0; + // bool HW_SSE3 = (info[2] & ((int) 1 << 0)) != 0; + // + // bool HW_SSSE3 = (info[2] & ((int) 1 << 9)) != 0; + // bool HW_SSE41 = (info[2] & ((int) 1 << 19)) != 0; + // bool HW_SSE42 = (info[2] & ((int) 1 << 20)) != 0; + // bool HW_AES = (info[2] & ((int) 1 << 25)) != 0; + // + // bool HW_AVX = (info[2] & ((int) 1 << 28)) != 0; + // bool HW_FMA3 = (info[2] & ((int) 1 << 12)) != 0; + // + // bool HW_RDRAND = (info[2] & ((int) 1 << 30)) != 0; + // } + if (nIds >= 0x00000007) + { + cpuid(info, 0x00000007, 0); + bool const HW_AVX2 = (info[1] & ((int) 1 << 5)); + if (!HW_AVX2) + return 128; + + // bool HW_BMI1 = (info[1] & ((int) 1 << 3)) != 0; + // bool HW_BMI2 = (info[1] & ((int) 1 << 8)) != 0; + // bool HW_ADX = (info[1] & ((int) 1 << 19)) != 0; + // bool HW_MPX = (info[1] & ((int) 1 << 14)) != 0; + // bool HW_SHA = (info[1] & ((int) 1 << 29)) != 0; + // bool HW_RDSEED = (info[1] & ((int) 1 << 18)) != 0; + // bool HW_PREFETCHWT1 = (info[2] & ((int) 1 << 0)) != 0; + // bool HW_RDPID = (info[2] & ((int) 1 << 22)) != 0; + + bool const HW_AVX512_F = (info[1] & ((int) 1 << 16)); + // bool HW_AVX512_CD = (info[1] & ((int) 1 << 28)) != 0; + // bool HW_AVX512_PF = (info[1] & ((int) 1 << 26)) != 0; + // bool HW_AVX512_ER = (info[1] & ((int) 1 << 27)) != 0; + + // bool HW_AVX512_VL = (info[1] & ((int) 1 << 31)) != 0; + bool const HW_AVX512_BW = (info[1] & ((int) 1 << 30)); + // bool HW_AVX512_DQ = (info[1] & ((int) 1 << 17)) != 0; + + // bool HW_AVX512_IFMA = (info[1] & ((int) 1 << 21)) != 0; + // bool HW_AVX512_VBMI = (info[2] & ((int) 1 << 1)) != 0; + + // bool HW_AVX512_VPOPCNTDQ = (info[2] & ((int) 1 << 14)) != 0; + // bool HW_AVX512_4VNNIW = (info[3] & ((int) 1 << 2)) != 0; + // bool HW_AVX512_4FMAPS = (info[3] & ((int) 1 << 3)) != 0; + + // bool HW_AVX512_VNNI = (info[2] & ((int) 1 << 11)) != 0; + + // bool HW_AVX512_VBMI2 = (info[2] & ((int) 1 << 6)) != 0; + // bool HW_GFNI = (info[2] & ((int) 1 << 8)) != 0; + // bool HW_VAES = (info[2] & ((int) 1 << 9)) != 0; + // bool HW_AVX512_VPCLMUL = (info[2] & ((int) 1 << 10)) != 0; + bool const HW_AVX512_BITALG = (info[2] & ((int) 1 << 12)); + + bool const use512 = detect_os_avx512() && HW_AVX512_F && HW_AVX512_BW && HW_AVX512_BITALG; + if (!use512) + return 256; + else + return 512; + + // cpuid(info, 0x00000007, 1); + // bool HW_AVX512_BF16 = (info[0] & ((int) 1 << 5)) != 0; + } + return 128; + // if (nExIds >= 0x80000001) + // { + // cpuid(info, 0x80000001, 0); + // bool HW_x64 = (info[3] & ((int) 1 << 29)) != 0; + // bool HW_ABM = (info[2] & ((int) 1 << 5)) != 0; + // bool HW_SSE4a = (info[2] & ((int) 1 << 6)) != 0; + // bool HW_FMA4 = (info[2] & ((int) 1 << 16)) != 0; + // bool HW_XOP = (info[2] & ((int) 1 << 11)) != 0; + // bool HW_PREFETCHW = (info[2] & ((int) 1 << 8)) != 0; + // } +} + +#else +auto unicode::detail::max_simd_size() -> size_t +{ + return 128; +} +#endif diff --git a/src/libunicode/simd_detector.h b/src/libunicode/simd_detector.h new file mode 100644 index 0000000..5fbb546 --- /dev/null +++ b/src/libunicode/simd_detector.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include +namespace unicode::detail +{ +auto max_simd_size() -> size_t; +}