@@ -1104,7 +1104,7 @@ static SIMD_INLINE Vec<Long, 16> min(const Vec<Long, 16> &a,
11041104 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
11051105 const __m128i diff = _mm_sub_epi64 (b, a);
11061106#if 1 // TODO: check which is faster
1107- const __m128i res = _mm_xor_si128 (
1107+ const __m128i res = _mm_xor_si128 (
11081108 diff, _mm_and_si128 (_mm_xor_si128 (b, a), _mm_xor_si128 (diff, b)));
11091109#else
11101110 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
@@ -1209,7 +1209,7 @@ static SIMD_INLINE Vec<Long, 16> max(const Vec<Long, 16> &a,
12091209 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
12101210 const __m128i diff = _mm_sub_epi64 (b, a);
12111211#if 1 // TODO: check which is faster
1212- const __m128i res = _mm_xor_si128 (
1212+ const __m128i res = _mm_xor_si128 (
12131213 diff, _mm_and_si128 (_mm_xor_si128 (b, a), _mm_xor_si128 (diff, b)));
12141214#else
12151215 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
@@ -2188,10 +2188,10 @@ static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
21882188 const __m128i vInPos = _mm_max_epi8 (vIn, _mm_setzero_si128 ());
21892189#else
21902190 // from Agner Fog's VCL vectori128.h
2191- const __m128i signbit = _mm_set1_epi32 (0x80808080 );
2192- const __m128i a1 = _mm_xor_si128 (vIn, signbit); // add 0x80
2193- const __m128i m1 = _mm_max_epu8 (a1, signbit); // unsigned max
2194- const __m128i vInPos = _mm_xor_si128 (m1, signbit); // sub 0x80
2191+ const __m128i signbit = _mm_set1_epi32 (0x80808080 );
2192+ const __m128i a1 = _mm_xor_si128 (vIn, signbit); // add 0x80
2193+ const __m128i m1 = _mm_max_epu8 (a1, signbit); // unsigned max
2194+ const __m128i vInPos = _mm_xor_si128 (m1, signbit); // sub 0x80
21952195#endif
21962196 vOut[0 ] = _mm_unpacklo_epi8 (vInPos, _mm_setzero_si128 ());
21972197 vOut[1 ] = _mm_unpackhi_epi8 (vInPos, _mm_setzero_si128 ());
@@ -2212,16 +2212,16 @@ static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
22122212 vOut[2 ] = _mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 8 ));
22132213 vOut[3 ] = _mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 12 ));
22142214#else
2215- const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2216- const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2217- const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2218- const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2219- const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2220- const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2221- vOut[0 ] = _mm_srai_epi32 (lolo16, 24 );
2222- vOut[1 ] = _mm_srai_epi32 (lohi16, 24 );
2223- vOut[2 ] = _mm_srai_epi32 (hilo16, 24 );
2224- vOut[3 ] = _mm_srai_epi32 (hihi16, 24 );
2215+ const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2216+ const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2217+ const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2218+ const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2219+ const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2220+ const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2221+ vOut[0 ] = _mm_srai_epi32 (lolo16, 24 );
2222+ vOut[1 ] = _mm_srai_epi32 (lohi16, 24 );
2223+ vOut[2 ] = _mm_srai_epi32 (hilo16, 24 );
2224+ vOut[3 ] = _mm_srai_epi32 (hihi16, 24 );
22252225#endif
22262226}
22272227
@@ -2234,16 +2234,16 @@ static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
22342234 vOut[2 ] = _mm_cvtepi32_ps (_mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 8 )));
22352235 vOut[3 ] = _mm_cvtepi32_ps (_mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 12 )));
22362236#else
2237- const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2238- const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2239- const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2240- const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2241- const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2242- const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2243- vOut[0 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (lolo16, 24 ));
2244- vOut[1 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (lohi16, 24 ));
2245- vOut[2 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (hilo16, 24 ));
2246- vOut[3 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (hihi16, 24 ));
2237+ const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2238+ const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2239+ const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2240+ const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2241+ const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2242+ const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2243+ vOut[0 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (lolo16, 24 ));
2244+ vOut[1 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (lohi16, 24 ));
2245+ vOut[2 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (hilo16, 24 ));
2246+ vOut[3 ] = _mm_cvtepi32_ps (_mm_srai_epi32 (hihi16, 24 ));
22472247#endif
22482248}
22492249
@@ -2281,10 +2281,10 @@ static SIMD_INLINE void extend(const Vec<Short, 16> &vIn,
22812281 _mm_srai_epi32 (_mm_unpacklo_epi16 (_mm_undefined_si128 (), vIn), 16 );
22822282 const __m128i hi16 =
22832283 _mm_srai_epi32 (_mm_unpackhi_epi16 (_mm_undefined_si128 (), vIn), 16 );
2284- vOut[0 ] = _mm_cvtepi32_pd (lo16);
2285- vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lo16, 8 ));
2286- vOut[2 ] = _mm_cvtepi32_pd (hi16);
2287- vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hi16, 8 ));
2284+ vOut[0 ] = _mm_cvtepi32_pd (lo16);
2285+ vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lo16, 8 ));
2286+ vOut[2 ] = _mm_cvtepi32_pd (hi16);
2287+ vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hi16, 8 ));
22882288#endif
22892289}
22902290
@@ -2332,12 +2332,12 @@ static SIMD_INLINE void extend(const Vec<Word, 16> &vIn, Vec<Long, 16> vOut[4])
23322332 vOut[2 ] = _mm_cvtepu16_epi64 (_mm_srli_si128 (vIn, 8 ));
23332333 vOut[3 ] = _mm_cvtepu16_epi64 (_mm_srli_si128 (vIn, 12 ));
23342334#else
2335- const __m128i lo16 = _mm_unpacklo_epi16 (vIn, _mm_setzero_si128 ());
2336- const __m128i hi16 = _mm_unpackhi_epi16 (vIn, _mm_setzero_si128 ());
2337- vOut[0 ] = _mm_unpacklo_epi32 (lo16, _mm_setzero_si128 ());
2338- vOut[1 ] = _mm_unpackhi_epi32 (lo16, _mm_setzero_si128 ());
2339- vOut[2 ] = _mm_unpacklo_epi32 (hi16, _mm_setzero_si128 ());
2340- vOut[3 ] = _mm_unpackhi_epi32 (hi16, _mm_setzero_si128 ());
2335+ const __m128i lo16 = _mm_unpacklo_epi16 (vIn, _mm_setzero_si128 ());
2336+ const __m128i hi16 = _mm_unpackhi_epi16 (vIn, _mm_setzero_si128 ());
2337+ vOut[0 ] = _mm_unpacklo_epi32 (lo16, _mm_setzero_si128 ());
2338+ vOut[1 ] = _mm_unpackhi_epi32 (lo16, _mm_setzero_si128 ());
2339+ vOut[2 ] = _mm_unpacklo_epi32 (hi16, _mm_setzero_si128 ());
2340+ vOut[3 ] = _mm_unpackhi_epi32 (hi16, _mm_setzero_si128 ());
23412341#endif
23422342}
23432343
@@ -2350,12 +2350,12 @@ static SIMD_INLINE void extend(const Vec<Word, 16> &vIn,
23502350 vOut[2 ] = _mm_cvtepi32_pd (_mm_cvtepu16_epi32 (_mm_srli_si128 (vIn, 8 )));
23512351 vOut[3 ] = _mm_cvtepi32_pd (_mm_cvtepu16_epi32 (_mm_srli_si128 (vIn, 12 )));
23522352#else
2353- const __m128i lo16 = _mm_unpacklo_epi16 (vIn, _mm_setzero_si128 ());
2354- const __m128i hi16 = _mm_unpackhi_epi16 (vIn, _mm_setzero_si128 ());
2355- vOut[0 ] = _mm_cvtepi32_pd (lo16);
2356- vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lo16, 8 ));
2357- vOut[2 ] = _mm_cvtepi32_pd (hi16);
2358- vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hi16, 8 ));
2353+ const __m128i lo16 = _mm_unpacklo_epi16 (vIn, _mm_setzero_si128 ());
2354+ const __m128i hi16 = _mm_unpackhi_epi16 (vIn, _mm_setzero_si128 ());
2355+ vOut[0 ] = _mm_cvtepi32_pd (lo16);
2356+ vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lo16, 8 ));
2357+ vOut[2 ] = _mm_cvtepi32_pd (hi16);
2358+ vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hi16, 8 ));
23592359#endif
23602360}
23612361
@@ -2416,24 +2416,24 @@ static SIMD_INLINE void extend(const Vec<SignedByte, 16> &vIn,
24162416 vOut[6 ] = _mm_cvtepi32_pd (_mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 12 )));
24172417 vOut[7 ] = _mm_cvtepi32_pd (_mm_cvtepi8_epi32 (_mm_srli_si128 (vIn, 14 )));
24182418#else
2419- const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2420- const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2421- const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2422- const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2423- const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2424- const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2425- const __m128i lolo16ext = _mm_srai_epi32 (lolo16, 24 );
2426- const __m128i lohi16ext = _mm_srai_epi32 (lohi16, 24 );
2427- const __m128i hilo16ext = _mm_srai_epi32 (hilo16, 24 );
2428- const __m128i hihi16ext = _mm_srai_epi32 (hihi16, 24 );
2429- vOut[0 ] = _mm_cvtepi32_pd (lolo16ext);
2430- vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lolo16ext, 8 ));
2431- vOut[2 ] = _mm_cvtepi32_pd (lohi16ext);
2432- vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lohi16ext, 8 ));
2433- vOut[4 ] = _mm_cvtepi32_pd (hilo16ext);
2434- vOut[5 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hilo16ext, 8 ));
2435- vOut[6 ] = _mm_cvtepi32_pd (hihi16ext);
2436- vOut[7 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hihi16ext, 8 ));
2419+ const __m128i lo8 = _mm_unpacklo_epi8 (_mm_undefined_si128 (), vIn);
2420+ const __m128i hi8 = _mm_unpackhi_epi8 (_mm_undefined_si128 (), vIn);
2421+ const __m128i lolo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), lo8);
2422+ const __m128i lohi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), lo8);
2423+ const __m128i hilo16 = _mm_unpacklo_epi16 (_mm_undefined_si128 (), hi8);
2424+ const __m128i hihi16 = _mm_unpackhi_epi16 (_mm_undefined_si128 (), hi8);
2425+ const __m128i lolo16ext = _mm_srai_epi32 (lolo16, 24 );
2426+ const __m128i lohi16ext = _mm_srai_epi32 (lohi16, 24 );
2427+ const __m128i hilo16ext = _mm_srai_epi32 (hilo16, 24 );
2428+ const __m128i hihi16ext = _mm_srai_epi32 (hihi16, 24 );
2429+ vOut[0 ] = _mm_cvtepi32_pd (lolo16ext);
2430+ vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lolo16ext, 8 ));
2431+ vOut[2 ] = _mm_cvtepi32_pd (lohi16ext);
2432+ vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lohi16ext, 8 ));
2433+ vOut[4 ] = _mm_cvtepi32_pd (hilo16ext);
2434+ vOut[5 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hilo16ext, 8 ));
2435+ vOut[6 ] = _mm_cvtepi32_pd (hihi16ext);
2436+ vOut[7 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hihi16ext, 8 ));
24372437#endif
24382438}
24392439
@@ -2451,20 +2451,20 @@ static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn, Vec<Long, 16> vOut[8])
24512451 vOut[6 ] = _mm_cvtepu8_epi64 (_mm_srli_si128 (vIn, 12 ));
24522452 vOut[7 ] = _mm_cvtepu8_epi64 (_mm_srli_si128 (vIn, 14 ));
24532453#else
2454- const __m128i lo8 = _mm_unpacklo_epi8 (vIn, _mm_setzero_si128 ());
2455- const __m128i hi8 = _mm_unpackhi_epi8 (vIn, _mm_setzero_si128 ());
2456- const __m128i lolo16 = _mm_unpacklo_epi16 (lo8, _mm_setzero_si128 ());
2457- const __m128i lohi16 = _mm_unpackhi_epi16 (lo8, _mm_setzero_si128 ());
2458- const __m128i hilo16 = _mm_unpacklo_epi16 (hi8, _mm_setzero_si128 ());
2459- const __m128i hihi16 = _mm_unpackhi_epi16 (hi8, _mm_setzero_si128 ());
2460- vOut[0 ] = _mm_unpacklo_epi32 (lolo16, _mm_setzero_si128 ());
2461- vOut[1 ] = _mm_unpackhi_epi32 (lolo16, _mm_setzero_si128 ());
2462- vOut[2 ] = _mm_unpacklo_epi32 (lohi16, _mm_setzero_si128 ());
2463- vOut[3 ] = _mm_unpackhi_epi32 (lohi16, _mm_setzero_si128 ());
2464- vOut[4 ] = _mm_unpacklo_epi32 (hilo16, _mm_setzero_si128 ());
2465- vOut[5 ] = _mm_unpackhi_epi32 (hilo16, _mm_setzero_si128 ());
2466- vOut[6 ] = _mm_unpacklo_epi32 (hihi16, _mm_setzero_si128 ());
2467- vOut[7 ] = _mm_unpackhi_epi32 (hihi16, _mm_setzero_si128 ());
2454+ const __m128i lo8 = _mm_unpacklo_epi8 (vIn, _mm_setzero_si128 ());
2455+ const __m128i hi8 = _mm_unpackhi_epi8 (vIn, _mm_setzero_si128 ());
2456+ const __m128i lolo16 = _mm_unpacklo_epi16 (lo8, _mm_setzero_si128 ());
2457+ const __m128i lohi16 = _mm_unpackhi_epi16 (lo8, _mm_setzero_si128 ());
2458+ const __m128i hilo16 = _mm_unpacklo_epi16 (hi8, _mm_setzero_si128 ());
2459+ const __m128i hihi16 = _mm_unpackhi_epi16 (hi8, _mm_setzero_si128 ());
2460+ vOut[0 ] = _mm_unpacklo_epi32 (lolo16, _mm_setzero_si128 ());
2461+ vOut[1 ] = _mm_unpackhi_epi32 (lolo16, _mm_setzero_si128 ());
2462+ vOut[2 ] = _mm_unpacklo_epi32 (lohi16, _mm_setzero_si128 ());
2463+ vOut[3 ] = _mm_unpackhi_epi32 (lohi16, _mm_setzero_si128 ());
2464+ vOut[4 ] = _mm_unpacklo_epi32 (hilo16, _mm_setzero_si128 ());
2465+ vOut[5 ] = _mm_unpackhi_epi32 (hilo16, _mm_setzero_si128 ());
2466+ vOut[6 ] = _mm_unpacklo_epi32 (hihi16, _mm_setzero_si128 ());
2467+ vOut[7 ] = _mm_unpackhi_epi32 (hihi16, _mm_setzero_si128 ());
24682468#endif
24692469}
24702470
@@ -2481,20 +2481,20 @@ static SIMD_INLINE void extend(const Vec<Byte, 16> &vIn,
24812481 vOut[6 ] = _mm_cvtepi32_pd (_mm_cvtepu8_epi32 (_mm_srli_si128 (vIn, 12 )));
24822482 vOut[7 ] = _mm_cvtepi32_pd (_mm_cvtepu8_epi32 (_mm_srli_si128 (vIn, 14 )));
24832483#else
2484- const __m128i lo8 = _mm_unpacklo_epi8 (vIn, _mm_setzero_si128 ());
2485- const __m128i hi8 = _mm_unpackhi_epi8 (vIn, _mm_setzero_si128 ());
2486- const __m128i lolo16 = _mm_unpacklo_epi16 (lo8, _mm_setzero_si128 ());
2487- const __m128i lohi16 = _mm_unpackhi_epi16 (lo8, _mm_setzero_si128 ());
2488- const __m128i hilo16 = _mm_unpacklo_epi16 (hi8, _mm_setzero_si128 ());
2489- const __m128i hihi16 = _mm_unpackhi_epi16 (hi8, _mm_setzero_si128 ());
2490- vOut[0 ] = _mm_cvtepi32_pd (lolo16);
2491- vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lolo16, 8 ));
2492- vOut[2 ] = _mm_cvtepi32_pd (lohi16);
2493- vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lohi16, 8 ));
2494- vOut[4 ] = _mm_cvtepi32_pd (hilo16);
2495- vOut[5 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hilo16, 8 ));
2496- vOut[6 ] = _mm_cvtepi32_pd (hihi16);
2497- vOut[7 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hihi16, 8 ));
2484+ const __m128i lo8 = _mm_unpacklo_epi8 (vIn, _mm_setzero_si128 ());
2485+ const __m128i hi8 = _mm_unpackhi_epi8 (vIn, _mm_setzero_si128 ());
2486+ const __m128i lolo16 = _mm_unpacklo_epi16 (lo8, _mm_setzero_si128 ());
2487+ const __m128i lohi16 = _mm_unpackhi_epi16 (lo8, _mm_setzero_si128 ());
2488+ const __m128i hilo16 = _mm_unpacklo_epi16 (hi8, _mm_setzero_si128 ());
2489+ const __m128i hihi16 = _mm_unpackhi_epi16 (hi8, _mm_setzero_si128 ());
2490+ vOut[0 ] = _mm_cvtepi32_pd (lolo16);
2491+ vOut[1 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lolo16, 8 ));
2492+ vOut[2 ] = _mm_cvtepi32_pd (lohi16);
2493+ vOut[3 ] = _mm_cvtepi32_pd (_mm_srli_si128 (lohi16, 8 ));
2494+ vOut[4 ] = _mm_cvtepi32_pd (hilo16);
2495+ vOut[5 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hilo16, 8 ));
2496+ vOut[6 ] = _mm_cvtepi32_pd (hihi16);
2497+ vOut[7 ] = _mm_cvtepi32_pd (_mm_srli_si128 (hihi16, 8 ));
24982498#endif
24992499}
25002500
@@ -3572,7 +3572,7 @@ static SIMD_INLINE Vec<Long, 16> cmplt(const Vec<Long, 16> &a,
35723572 // from Hacker's Delight, 2-12 Comparison Predicates:
35733573 const __m128i diff = _mm_sub_epi64 (a, b);
35743574#if 1 // TODO: check which is faster
3575- const __m128i res = _mm_xor_si128 (
3575+ const __m128i res = _mm_xor_si128 (
35763576 diff, _mm_and_si128 (_mm_xor_si128 (a, b), _mm_xor_si128 (diff, a)));
35773577#else
35783578 const __m128i res = _mm_or_si128(_mm_andnot_si128(b, a),
@@ -3777,7 +3777,7 @@ static SIMD_INLINE Vec<Long, 16> cmpgt(const Vec<Long, 16> &a,
37773777 // from Hacker's Delight, 2-12 Comparison Predicates: (swapped lt)
37783778 const __m128i diff = _mm_sub_epi64 (b, a);
37793779#if 1 // TODO: check which is faster
3780- const __m128i res = _mm_xor_si128 (
3780+ const __m128i res = _mm_xor_si128 (
37813781 diff, _mm_and_si128 (_mm_xor_si128 (b, a), _mm_xor_si128 (diff, b)));
37823782#else
37833783 const __m128i res = _mm_or_si128(_mm_andnot_si128(a, b),
0 commit comments