*/

#pragma once

+ #include <cstdint>

#if defined(__x86_64__) || defined(_M_AMD64)
- #include <emmintrin.h> // AVX, AVX2, FMP
- #include <immintrin.h> // SSE2
+ #include <immintrin.h>
#endif

#if defined(__aarch64__) || defined(_M_ARM64)

namespace unicode
{

- template <typename>
- struct platform_intrinsics;
+ template <size_t SimdBitWidth, typename = void>
+ struct intrinsics;

#if defined(__GNUC__) && defined(__x86_64__)
// For some reason, GCC associates attributes with __m128i that are not obvious (alignment),
@@ -36,112 +36,172 @@ struct platform_intrinsics;

#if defined(__x86_64__) || defined(_M_AMD64) // {{{

- template <>
- struct platform_intrinsics<__m128i>
+ template <typename T>
+ struct intrinsics<128, T>
{
-     using m128i = __m128i;
+     using vec_t = __m128i;

-     static inline m128i setzero() noexcept { return _mm_setzero_si128(); }
+     using mask_t = int;

-     static inline m128i set1_epi8(signed char w) { return _mm_set1_epi8(w); }
+     static inline vec_t setzero() noexcept { return _mm_setzero_si128(); }

-     static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
-     {
-         return _mm_set_epi32(static_cast<int>(a), static_cast<int>(b), static_cast<int>(c), static_cast<int>(d));
-     }
+     static inline vec_t set1_epi8(signed char w) { return _mm_set1_epi8(w); }
+
+     static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm_xor_si128(a, b); }
+
+     static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm_and_si128(a, b); }

-     static inline m128i xor128(m128i a, m128i b) noexcept { return _mm_xor_si128(a, b); }
+     static inline vec_t or_vec(vec_t a, vec_t b) { return _mm_or_si128(a, b); }

-     static inline m128i and128(m128i a, m128i b) noexcept { return _mm_and_si128(a, b); }
+     static inline vec_t load(const char* p) noexcept { return _mm_loadu_si128(reinterpret_cast<const vec_t*>(p)); }

-     // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-     static inline m128i or128(m128i a, m128i b) { return _mm_or_si128(a, b); }
+     static inline bool equal(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; }

-     static inline m128i load_unaligned(m128i const* p) noexcept { return _mm_loadu_si128(static_cast<m128i const*>(p)); }
+     static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmplt_epi8(a, b)); }

-     static inline int32_t to_i32(m128i a) { return _mm_cvtsi128_si32(a); }
+     static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm_movemask_epi8(_mm_cmpgt_epi8(a, b)); }

-     static inline bool compare(m128i a, m128i b) noexcept { return _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xFFFF; }
+     static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }

-     static inline m128i compare_less(m128i a, m128i b) noexcept { return _mm_cmplt_epi8(a, b); }
+     static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }

-     static inline int movemask_epi8(m128i a) { return _mm_movemask_epi8(a); }
+     static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }

-     static inline m128i cvtsi64_si128(int64_t a) { return _mm_cvtsi64_si128(a); }
+     static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
};

- using intrinsics = platform_intrinsics<__m128i>;
+ template <typename T>
+ struct intrinsics<256, T>
+ {
+     using vec_t = __m256i;
+
+     using mask_t = int;
+
+     static inline vec_t setzero() noexcept { return _mm256_setzero_si256(); }
+
+     static inline vec_t set1_epi8(signed char w) { return _mm256_set1_epi8(w); }
+
+     static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm256_xor_si256(a, b); }
+
+     static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm256_and_si256(a, b); }
+
+     static inline vec_t or_vec(vec_t a, vec_t b) { return _mm256_or_si256(a, b); }
+
+     static inline vec_t load(const char* p) noexcept { return _mm256_loadu_si256(reinterpret_cast<const vec_t*>(p)); }
+
+     static inline bool equal(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi32(a, b)) == 0xFFFFFFFF; }
+
+     static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(b, a)); }
+
+     static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm256_movemask_epi8(_mm256_cmpgt_epi8(a, b)); }
+
+     static inline auto movemask_epi8(vec_t a) noexcept { return _mm256_movemask_epi8(a); }
+
+     static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }
+
+     static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }
+
+     static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }
+
+     static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
+ };
+
+ template <typename T>
+ struct intrinsics<512, T>
+ {
+     using vec_t = __m512i;
+
+     using mask_t = __mmask64;
+
+     static inline vec_t setzero() noexcept { return _mm512_setzero_si512(); }
+
+     static inline vec_t set1_epi8(signed char w) { return _mm512_set1_epi8(w); }
+
+     static inline vec_t xor_vec(vec_t a, vec_t b) noexcept { return _mm512_xor_si512(a, b); }
+
+     static inline vec_t and_vec(vec_t a, vec_t b) noexcept { return _mm512_and_si512(a, b); }
+
+     static inline vec_t or_vec(vec_t a, vec_t b) { return _mm512_or_si512(a, b); }
+
+     static inline vec_t load(const char* p) noexcept { return _mm512_loadu_si512(reinterpret_cast<const vec_t*>(p)); }
+
+     static inline bool equal(vec_t a, vec_t b) noexcept { return _mm512_cmpeq_epi8_mask(a, b) == 0xFFFFFFFFFFFFFFFFull; }
+
+     static inline mask_t less(vec_t a, vec_t b) noexcept { return _mm512_cmplt_epi8_mask(a, b); }
+
+     static inline mask_t greater(vec_t a, vec_t b) noexcept { return _mm512_cmpgt_epi8_mask(a, b); }
+
+     static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return _kand_mask64(a, b); }
+
+     static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return _kor_mask64(a, b); }
+
+     static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return _kxor_mask64(a, b); }
+
+     static inline uint64_t to_unsigned(mask_t a) noexcept { return static_cast<uint64_t>(a); }
+ };

#endif
// }}}
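
As a rough usage sketch (not part of this commit, and assuming the header above is included): the 128-bit specialization can scan one 16-byte chunk for C0 control characters, which is the kind of text-scanning loop such a wrapper typically serves. The function name and the 0x20 threshold below are illustrative only.

    // Illustrative only: scan one 16-byte chunk with the 128-bit specialization.
    // `has_ctrl_or_non_ascii` and the 0x20 boundary are hypothetical, not from this commit.
    inline bool has_ctrl_or_non_ascii(const char* p) noexcept
    {
        using simd = unicode::intrinsics<128>;
        auto const chunk = simd::load(p);                        // unaligned 16-byte load
        auto const limit = simd::set1_epi8(0x20);                // broadcast the C0 boundary
        auto const mask  = simd::to_unsigned(simd::less(chunk, limit));
        // The comparison is on signed bytes, so values >= 0x80 are negative and
        // also report as "less than 0x20"; any set bit flags either case.
        return mask != 0;
    }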

#if defined(__aarch64__) || defined(_M_ARM64) // {{{

- template <>
- struct platform_intrinsics<int64x2_t>
+
+ template <typename T>
+ struct intrinsics<128, T>
{
    // The following inline functions (in their initial version) were borrowed from:
    // https://github.com/f1ed/emp/blob/master/emp-tool/utils/block.h

-     using m128i = int64x2_t;
+     using vec_t = int64x2_t;

-     static inline m128i setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); }
+     using mask_t = int;

-     static inline m128i set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); }
+     static inline vec_t setzero() noexcept { return vreinterpretq_s64_s32(vdupq_n_s32(0)); }

-     static inline m128i load32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
-     {
-         alignas(16) int32_t data[4] = {
-             static_cast<int>(a),
-             static_cast<int>(b),
-             static_cast<int>(c),
-             static_cast<int>(d),
-         };
-         return vreinterpretq_s64_s32(vld1q_s32(data));
-     }
+     static inline vec_t set1_epi8(signed char w) { return vreinterpretq_s64_s8(vdupq_n_s8(w)); }

-     static inline m128i xor128(m128i a, m128i b) noexcept
+     static inline vec_t xor_vec(vec_t a, vec_t b) noexcept
    {
        // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
        // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
        return vreinterpretq_s64_s32(veorq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

-     static inline m128i and128(m128i a, m128i b) noexcept
+     static inline vec_t and_vec(vec_t a, vec_t b) noexcept
    {
        return vreinterpretq_s64_s32(vandq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

-     // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-     static inline m128i or128(m128i a, m128i b)
+     static inline vec_t or_vec(vec_t a, vec_t b)
    {
        return vreinterpretq_s64_s32(vorrq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
    }

-     // Loads 128-bit value. :
-     // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-     static inline m128i load_unaligned(m128i const* p) noexcept { return vreinterpretq_s64_s32(vld1q_s32((int32_t const*) p)); }
-
-     // Copy the lower 32-bit integer in a to dst.
-     //
-     // dst[31:0] := a[31:0]
-     //
-     // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-     static inline int32_t to_i32(m128i a) { return vgetq_lane_s32(vreinterpretq_s32_s64(a), 0); }
+     static inline vec_t load(const char* p) noexcept
+     {
+         return vreinterpretq_s64_s32(vld1q_s32(reinterpret_cast<const int32_t*>(p)));
+     }

-     static inline bool compare(m128i a, m128i b) noexcept
+     static inline bool equal(vec_t a, vec_t b) noexcept
    {
        return movemask_epi8(vreinterpretq_s64_u32(vceqq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)))) == 0xFFFF;
    }

-     static inline m128i compare_less(m128i a, m128i b) noexcept
+     static inline mask_t less(vec_t a, vec_t b) noexcept
    {
-         // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-         // in b for lesser than.
-         // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-         return vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b)));
+         return movemask_epi8(vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vreinterpretq_s8_s64(b))));
    }

-     static inline int movemask_epi8(m128i a)
+     static inline mask_t greater(vec_t a, vec_t b) noexcept { return less(b, a); }
+
+     static inline mask_t and_mask(mask_t a, mask_t b) noexcept { return a & b; }
+
+     static inline mask_t or_mask(mask_t a, mask_t b) noexcept { return a | b; }
+
+     static inline mask_t xor_mask(mask_t a, mask_t b) noexcept { return a ^ b; }
+
+     static inline uint32_t to_unsigned(mask_t a) noexcept { return static_cast<uint32_t>(a); }
+
+     static inline mask_t movemask_epi8(vec_t a)
    {
        // Use increasingly wide shifts+adds to collect the sign bits
        // together.
@@ -218,8 +278,6 @@ struct platform_intrinsics<int64x2_t>
        return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
    }
};

-
- using intrinsics = platform_intrinsics<int64x2_t>;
#endif
// }}}
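
One hedged illustration of what the SimdBitWidth parameter buys (again, not from this commit; how the library actually selects a width is not shown here): callers can pick the widest specialization their build enables and keep the rest of their code width-agnostic. The alias name `platform_simd` is hypothetical.

    // Illustrative only: choose a specialization from compiler feature macros.
    #if defined(__AVX512BW__)
    using platform_simd = unicode::intrinsics<512>;
    #elif defined(__AVX2__)
    using platform_simd = unicode::intrinsics<256>;
    #else
    using platform_simd = unicode::intrinsics<128>; // SSE2 or NEON
    #endif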