|
2 | 2 | #include <immintrin.h> |
3 | 3 | #include <smmintrin.h> |
4 | 4 | #include <emmintrin.h> |
| 5 | +#include <wmmintrin.h> |
5 | 6 |
|
/*
 * Multiply two elements of GF(2^128) using the PCLMULQDQ instruction.
 *
 * Representation: each __m128i is a polynomial over GF(2) in which bit i
 * (bit 0 = least-significant bit of the low 64-bit lane) is the coefficient
 * of x^i.  The reduction polynomial is x^128 + x^7 + x^2 + x + 1, i.e. the
 * 0x87 constant that the earlier, commented-out draft of this function used.
 * NOTE(review): callers that need the bit-reflected GHASH/GCM convention
 * must reflect their operands themselves -- confirm which convention the
 * rest of the project expects.
 *
 * a, b: the two field elements to multiply.
 * Returns (a * b) mod (x^128 + x^7 + x^2 + x + 1).
 *
 * Requires a CPU (and compiler flags) with PCLMUL and SSE2 support.
 */
__m128i gf128_mul(__m128i a, __m128i b)
{
    /* Low 64 bits hold x^7 + x^2 + x + 1, which is congruent to x^128. */
    const __m128i poly = _mm_set_epi64x(0, 0x87);

    /* Schoolbook 128x128 -> 256-bit carry-less product from four 64x64 ones. */
    __m128i lo = _mm_clmulepi64_si128(a, b, 0x00);      /* a.lo * b.lo */
    __m128i hi = _mm_clmulepi64_si128(a, b, 0x11);      /* a.hi * b.hi */
    __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x01),  /* a.hi * b.lo */
                                _mm_clmulepi64_si128(a, b, 0x10)); /* a.lo * b.hi */

    /* The middle term sits at x^64: split it across the two halves. */
    lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
    hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

    /*
     * Reduction, step 1: fold the upper lane of hi (coefficients of
     * x^192..x^255).  hi.hi * x^192 == (hi.hi * 0x87) * x^64; the product
     * is at most 71 bits wide, so its low 64 bits land in lo's upper lane
     * and the few overflow bits are merged into hi's lower lane.
     */
    __m128i fold = _mm_clmulepi64_si128(hi, poly, 0x01);
    lo = _mm_xor_si128(lo, _mm_slli_si128(fold, 8));
    hi = _mm_xor_si128(hi, _mm_srli_si128(fold, 8));

    /*
     * Reduction, step 2: fold the lower lane of hi (coefficients of
     * x^128..x^191 plus the overflow from step 1).  The product is again at
     * most 71 bits wide, so it fits entirely inside the 128-bit result.
     */
    fold = _mm_clmulepi64_si128(hi, poly, 0x00);
    return _mm_xor_si128(lo, fold);
}
0 commit comments