Commit 5aa90fd

Dooez and Yaraslaut authored

WIP: Use avx when available. (#108)

* scan_for_text_ascii: Added simple avx2 and avx512 detector.
* WIP(simd scan_for_text_ascii): Added intrinsics and cleanup. Added x86-64 intrinsics implementation. Moved implementation into a separate header. Added macro checks for x86; non-x86 targets use 128-bit SIMD max. Changed intrinsic macro definition to LIBUNICODE_USE_INTRINSICS.
* WIP(simd scan_for_text_ascii): Cleanup.
* Implement comments from @Yaraslaut.
* WIP(simd scan_for_text_ascii): Arm intrinsics.
* Fix(scan_for_text_ascii): Fixed error and expanded tests with wider strings.
* Use same structure across all platforms.
* Fix macOS.
* Fix(scan_for_text_ascii): Revert logic change and fix non-x86 builds.
* Fix cmake autoformat.
* Moved `simd_detector.cpp` to x86 sources only.
* Fixed cmake autoformat again.
* Fix arm comparison intrinsics.
* Fixed avx512 comparison and simd detection.
* Fix gcc13 compilation.
* Fix macOS compilation.
* Added Windows version of system_processor.
* Fix Windows simd detector.
* Fix missing-declaration warnings.
* Small changes.
* Add tests and small fix.
* Fix

Co-authored-by: Yaraslau Tamashevich <[email protected]>

1 parent 817cb59 · commit 5aa90fd

10 files changed: +515 -142 lines
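The commit message mentions a runtime AVX2/AVX-512 detector (simd_detector.cpp, built for x86-64 only), whose source is not shown in this diff. Below is a minimal sketch of what such a detector could look like, assuming the GCC/Clang __builtin_cpu_supports builtin and a hypothetical max_simd_bit_width() helper; neither is taken from this commit.

// Hypothetical sketch only -- not the contents of simd_detector.cpp.
// Reports the widest usable SIMD register width, in bits, at runtime.
#include <cstddef>

inline std::size_t max_simd_bit_width() noexcept
{
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
    if (__builtin_cpu_supports("avx512bw")) // scan512.cpp needs AVX-512 F and BW
        return 512;
    if (__builtin_cpu_supports("avx2"))
        return 256;
#endif
    return 128; // non-x86 targets (and baseline x86-64) stay at 128-bit SIMD
}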

src/libunicode/CMakeLists.txt (+27 -1)
@@ -97,6 +97,31 @@ target_link_libraries(unicode_loader PUBLIC unicode::ucd)
 
 # =========================================================================================================
 
+
+if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std" OR LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics")
+    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
+        set(LIBUNICODE_SIMD_SOURCES
+            simd_detector.cpp
+            scan256.cpp
+            scan512.cpp
+        )
+        set_source_files_properties(
+            scan256.cpp
+            PROPERTIES
+            COMPILE_FLAGS
+            -mavx2
+        )
+        set_source_files_properties(
+            scan512.cpp
+            PROPERTIES
+            COMPILE_FLAGS
+            -mavx512f
+            COMPILE_FLAGS
+            -mavx512bw
+        )
+    endif()
+endif()
+
 add_library(unicode ${LIBUNICODE_LIB_MODE}
     capi.cpp
     codepoint_properties.cpp
@@ -106,6 +131,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
     script_segmenter.cpp
     utf8.cpp
     width.cpp
+    ${LIBUNICODE_SIMD_SOURCES}
 
     # auto-generated by unicode_tablegen
     codepoint_properties_data.h
@@ -116,7 +142,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
 if(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "std")
     target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD)
 elseif(LIBUNICODE_SIMD_IMPLEMENTATION STREQUAL "intrinsics")
-    target_compile_definitions(unicode PRIVATE USE_INTRINSICS)
+    target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_INTRINSICS)
 endif()
 
 set(public_headers
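The hunks above isolate the wider SIMD code into dedicated translation units and apply -mavx2 and -mavx512f/-mavx512bw only to scan256.cpp and scan512.cpp, so the rest of the library keeps the baseline x86-64 ISA; LIBUNICODE_SIMD_IMPLEMENTATION (presumably passed at configure time, e.g. -DLIBUNICODE_SIMD_IMPLEMENTATION=intrinsics) selects between the std-simd and intrinsics back ends. A rough sketch of what a per-TU AVX2 entry point along the lines of scan256.cpp might look like follows; the function name and the exact byte classification are illustrative, not taken from this commit.

// Illustrative only: because just this file is compiled with -mavx2, AVX2
// intrinsics may appear here without raising the ISA baseline of the library.
// Callers must dispatch to it only after runtime detection confirmed AVX2.
#include <immintrin.h>
#include <cstddef>

std::size_t scan_ascii_avx2(const char* data, std::size_t size) noexcept
{
    std::size_t i = 0;
    for (; i + 32 <= size; i += 32)
    {
        __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data + i));
        // Signed byte compare: a byte passes only if it is > 0x1F, which rules out
        // C0 control characters and any byte >= 0x80 (negative when read as signed).
        __m256i printable = _mm256_cmpgt_epi8(chunk, _mm256_set1_epi8(0x1F));
        if (_mm256_movemask_epi8(printable) != -1) // not all 32 lanes passed
            break;
    }
    return i; // number of leading bytes verified as plain ASCII text
}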

src/libunicode/intrinsics.h (+120 -62)
@@ -13,9 +13,9 @@
  */
 #pragma once
 
+#include <cstdint>
 #if defined(__x86_64__) || defined(_M_AMD64)
-    #include <emmintrin.h> // AVX, AVX2, FMP
-    #include <immintrin.h> // SSE2
+    #include <immintrin.h>
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64)
@@ -25,8 +25,8 @@
 namespace unicode
 {
 
-template <typename>
-struct platform_intrinsics;
+template <size_t SimdBitWidth, typename = void>
+struct intrinsics;
 
 #if defined(__GNUC__) && defined(__x86_64__)
 // For some reason, GCC associates attributes with __m128i that are not obvious (alignment),
@@ -36,112 +36,172 @@ struct platform_intrinsics;
 
 #if defined(__x86_64__) || defined(_M_AMD64) // {{{
 
-template <>
-struct platform_intrinsics<__m128i>
+template <typename T>
+struct intrinsics<128, T>
 {
-    using m128i = __m128i;
+    using vec_t = __m128i;
 
-    static inline m128i setzero() noexcept { return _mm_setzero_si128(); }
+    using mask_t = int;
 
-    static inline m128i set1_epi8(signed char w) { return _mm_set1_epi8(w); }
+    static inline vec_t setzero() noexcept { return _mm_setzero_si128(); }
 
-    static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
-    {
-        return _mm_set_epi32(static_cast<int>(a), static_cast<int>(b), static_cast<int>(c), static_cast<int>(d));
-    }
+    static inline vec_t set1_epi8(signed char w) { return _mm_set1_epi8(w); }
+
+    static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm_xor_si128(a, b); }
+
+    static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm_and_si128(a, b); }
 
-    static inline m128i xor128(m128i a, m128i b) noexcept { return _mm_xor_si128(a, b); }
+    static inline vec_t or_vec(vec_t a, vec_t b) { return _mm_or_si128(a, b); }
 
-    static inline m128i and128(m128i a, m128i b) noexcept { return _mm_and_si128(a, b); }
+    static inline vec_t load(const char* p) noexcept { return _mm_loadu_si128(reinterpret_cast<const vec_t*>(p)); }
 
-    // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-    static inline m128i or128(m128i a, m128i b) { return _mm_or_si128(a, b); }
+    static inline bool equal(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; }
 
-    static inline m128i load_unaligned(m128i const* p) noexcept { return _mm_loadu_si128(static_cast<m128i const*>(p)); }
+    static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmplt_epi8(a, b)); }
 
-    static inline int32_t to_i32(m128i a) { return _mm_cvtsi128_si32(a); }
+    static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpgt_epi8(a, b)); }
 
-    static inline bool compare(m128i a, m128i b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; }
+    static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }
 
-    static inline m128i compare_less(m128i a, m128i b) noexcept { return _mm_cmplt_epi8(a, b); }
+    static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }
 
-    static inline int movemask_epi8(m128i a) { return _mm_movemask_epi8(a); }
+    static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }
 
-    static inline m128i cvtsi64_si128(int64_t a) { return _mm_cvtsi64_si128(a); }
+    static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
 };
 
-using intrinsics = platform_intrinsics<__m128i>;
+template <typename T>
+struct intrinsics<256, T>
+{
+    using vec_t = __m256i;
+
+    using mask_t = int;
+
+    static inline vec_t setzero() noexcept { return _mm256_setzero_si256(); }
+
+    static inline vec_t set1_epi8(signed char w) { return _mm256_set1_epi8(w); }
+
+    static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm256_xor_si256(a, b); }
+
+    static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm256_and_si256(a, b); }
+
+    static inline vec_t or_vec(vec_t a, vec_t b) { return _mm256_or_si256(a, b); }
+
+    static inline vec_t load(const char* p) noexcept { return _mm256_loadu_si256(reinterpret_cast<const vec_t*>(p)); }
+
+    static inline bool equal(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == 0xFFFF; }
+
+    static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); }
+
+    static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); }
+
+    static inline auto movemask_epi8(vec_t a) noexcept { return _mm256_movemask_epi8(a); }
+
+    static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }
+
+    static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }
+
+    static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }
+
+    static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
+};
+
+template <typename T>
+struct intrinsics<512, T>
+{
+    using vec_t = __m512i;
+
+    using mask_t = __mmask64;
+
+    static inline vec_t setzero() noexcept { return _mm512_setzero_si512(); }
+
+    static inline vec_t set1_epi8(signed char w) { return _mm512_set1_epi8(w); }
+
+    static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm512_xor_si512(a, b); }
+
+    static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm512_and_si512(a, b); }
+
+    static inline vec_t or_vec(vec_t a, vec_t b) { return _mm512_or_si512(a, b); }
+
+    static inline vec_t load(const char* p) noexcept { return _mm512_loadu_si512(reinterpret_cast<const vec_t*>(p)); }
+
+    static inline bool equal(vec_t a, vec_t b) noexcept { return _mm512_cmpeq_epi8_mask(a, b) == 0xFFFFFFFF; }
+
+    static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); }
+
+    static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); }
+
+    static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return _kand_mask64(a, b); }
+
+    static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return _kor_mask64(a, b); }
+
+    static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return _kxor_mask64(a, b); }
+
+    static inline uint64_t to_unsigned(mask_t a) noexcept { return static_cast<uint64_t>(a); }
+};
 
 #endif
 // }}}
 
 #if defined(__aarch64__) || defined(_M_ARM64) // {{{
-template <>
-struct platform_intrinsics<int64x2_t>
+
+template <typename T>
+struct intrinsics<128, T>
 {
     // The following inline functions (in its initial version) were borrowed from:
     // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h
 
-    using m128i = int64x2_t;
+    using vec_t = int64x2_t;
 
-    static inline m128i setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); }
+    using mask_t = int;
 
-    static inline m128i set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); }
+    static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); }
 
-    static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
-    {
-        alignas(16) int32_t data[4] = {
-            static_cast<int>(a),
-            static_cast<int>(b),
-            static_cast<int>(c),
-            static_cast<int>(d),
-        };
-        return vreinterpretq_s64_s32(vld1q_s32(data));
-    }
+    static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); }
 
-    static inline m128i xor128(m128i a, m128i b) noexcept
+    static inline vec_t xor_vec(vec_t a, vec_t b) noexcept
     {
         // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
         // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
         return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
     }
 
-    static inline m128i and128(m128i a, m128i b) noexcept
+    static inline vec_t and_vec(vec_t a, vec_t b) noexcept
     {
         return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
     }
 
-    // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-    static inline m128i or128(m128i a, m128i b)
+    static inline vec_t or_vec(vec_t a, vec_t b)
     {
         return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
     }
 
-    // Loads 128-bit value. :
-    // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-    static inline m128i load_unaligned(m128i const* p) noexcept { return vreinterpretq_s64_s32(vld1q_s32((int32_t const*) p)); }
-
-    // Copy the lower 32-bit integer in a to dst.
-    //
-    // dst[31:0] := a[31:0]
-    //
-    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-    static inline int32_t to_i32(m128i a) { return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); }
+    static inline vec_t load(const char* p) noexcept
+    {
+        return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast<const int32_t*>(p)));
+    }
 
-    static inline bool compare(m128i a, m128i b) noexcept
+    static inline bool equal(vec_t a, vec_t b) noexcept
     {
         return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF;
     }
 
-    static inline m128i compare_less(m128i a, m128i b) noexcept
+    static inline mask_t less(vec_t a, vec_t b) noexcept
     {
-        // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-        // in b for lesser than.
-        // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-        return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b)));
+        return movemask_epi8(vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))));
     }
 
-    static inline int movemask_epi8(m128i a)
+    static inline mask_t greater(vec_t a, vec_t b) noexcept { return less(b, a); }
+
+    static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }
+
+    static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }
+
+    static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }
+
+    static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
+
+    static inline mask_t movemask_epi8(vec_t a)
     {
         // Use increasingly wide shifts+adds to collect the sign bits
         // together.
@@ -218,8 +278,6 @@ struct platform_intrinsics<int64x2_t>
         return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
     }
 };
-
-using intrinsics = platform_intrinsics<int64x2_t>;
 #endif
 // }}}