Skip to content

Commit ac60194

Browse files
committed
Improve: Overflow clipping on Skylake
1 parent a4fce6d commit ac60194

File tree

1 file changed

+82
-12
lines changed

1 file changed

+82
-12
lines changed

include/simsimd/elementwise.h

Lines changed: 82 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,9 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_skylake(simsimd_i8_t const *a, simsimd_size
20312031
__m128i a_i8_vec, sum_i8_vec;
20322032
__m512 a_vec, sum_vec;
20332033
__mmask16 mask = 0xFFFF;
2034+
__m512i sum_i32_vec;
2035+
__m512i min_i32_vec = _mm512_set1_epi32(-128);
2036+
__m512i max_i32_vec = _mm512_set1_epi32(127);
20342037

20352038
simsimd_scale_i8_skylake_cycle:
20362039
if (n < 16) {
@@ -2044,7 +2047,10 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_skylake(simsimd_i8_t const *a, simsimd_size
20442047
}
20452048
a_vec = _mm512_cvtepi32_ps(_mm512_cvtepi8_epi32(a_i8_vec));
20462049
sum_vec = _mm512_fmadd_ps(a_vec, alpha_vec, beta_vec);
2047-
sum_i8_vec = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(sum_vec));
2050+
sum_i32_vec = _mm512_cvtps_epi32(sum_vec);
2051+
sum_i32_vec = _mm512_max_epi32(sum_i32_vec, min_i32_vec);
2052+
sum_i32_vec = _mm512_min_epi32(sum_i32_vec, max_i32_vec);
2053+
sum_i8_vec = _mm512_cvtepi32_epi8(sum_i32_vec);
20482054
_mm_mask_storeu_epi8(result, mask, sum_i8_vec);
20492055
result += 16;
20502056
if (n) goto simsimd_scale_i8_skylake_cycle;
@@ -2058,6 +2064,10 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_skylake(
20582064
__m128i a_i8_vec, b_i8_vec, c_i8_vec, sum_i8_vec;
20592065
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
20602066
__mmask16 mask = 0xFFFF;
2067+
__m512i sum_i32_vec;
2068+
__m512i min_i32_vec = _mm512_set1_epi32(-128);
2069+
__m512i max_i32_vec = _mm512_set1_epi32(127);
2070+
20612071
simsimd_fma_i8_skylake_cycle:
20622072
if (n < 16) {
20632073
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -2078,7 +2088,10 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_skylake(
20782088
ab_vec = _mm512_mul_ps(a_vec, b_vec);
20792089
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
20802090
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
2081-
sum_i8_vec = _mm512_cvtepi32_epi8(_mm512_cvtps_epi32(sum_vec));
2091+
sum_i32_vec = _mm512_cvtps_epi32(sum_vec);
2092+
sum_i32_vec = _mm512_max_epi32(sum_i32_vec, min_i32_vec);
2093+
sum_i32_vec = _mm512_min_epi32(sum_i32_vec, max_i32_vec);
2094+
sum_i8_vec = _mm512_cvtepi32_epi8(sum_i32_vec);
20822095
_mm_mask_storeu_epi8(result, mask, sum_i8_vec);
20832096
result += 16;
20842097
if (n) goto simsimd_fma_i8_skylake_cycle;
@@ -2091,6 +2104,9 @@ SIMSIMD_PUBLIC void simsimd_scale_u8_skylake(simsimd_u8_t const *a, simsimd_size
20912104
__m128i a_u8_vec, sum_u8_vec;
20922105
__m512 a_vec, sum_vec;
20932106
__mmask16 mask = 0xFFFF;
2107+
__m512i sum_u32_vec;
2108+
__m512i min_u32_vec = _mm512_set1_epi32(0);
2109+
__m512i max_u32_vec = _mm512_set1_epi32(255);
20942110

20952111
simsimd_scale_u8_skylake_cycle:
20962112
if (n < 16) {
@@ -2104,7 +2120,10 @@ SIMSIMD_PUBLIC void simsimd_scale_u8_skylake(simsimd_u8_t const *a, simsimd_size
21042120
}
21052121
a_vec = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(a_u8_vec));
21062122
sum_vec = _mm512_fmadd_ps(a_vec, alpha_vec, beta_vec);
2107-
sum_u8_vec = _mm512_cvtepi32_epi8(_mm512_cvtps_epu32(sum_vec));
2123+
sum_u32_vec = _mm512_cvtps_epu32(sum_vec);
2124+
sum_u32_vec = _mm512_max_epu32(sum_u32_vec, min_u32_vec);
2125+
sum_u32_vec = _mm512_min_epu32(sum_u32_vec, max_u32_vec);
2126+
sum_u8_vec = _mm512_cvtepi32_epi8(sum_u32_vec);
21082127
_mm_mask_storeu_epi8(result, mask, sum_u8_vec);
21092128
result += 16;
21102129
if (n) goto simsimd_scale_u8_skylake_cycle;
@@ -2118,6 +2137,10 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_skylake(
21182137
__m128i a_u8_vec, b_u8_vec, c_u8_vec, sum_u8_vec;
21192138
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
21202139
__mmask16 mask = 0xFFFF;
2140+
__m512i sum_u32_vec;
2141+
__m512i min_u32_vec = _mm512_set1_epi32(0);
2142+
__m512i max_u32_vec = _mm512_set1_epi32(255);
2143+
21212144
simsimd_fma_u8_skylake_cycle:
21222145
if (n < 16) {
21232146
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -2138,7 +2161,10 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_skylake(
21382161
ab_vec = _mm512_mul_ps(a_vec, b_vec);
21392162
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
21402163
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
2141-
sum_u8_vec = _mm512_cvtepi32_epi8(_mm512_cvtps_epu32(sum_vec));
2164+
sum_u32_vec = _mm512_cvtps_epu32(sum_vec);
2165+
sum_u32_vec = _mm512_max_epu32(sum_u32_vec, min_u32_vec);
2166+
sum_u32_vec = _mm512_min_epu32(sum_u32_vec, max_u32_vec);
2167+
sum_u8_vec = _mm512_cvtepi32_epi8(sum_u32_vec);
21422168
_mm_mask_storeu_epi8(result, mask, sum_u8_vec);
21432169
result += 16;
21442170
if (n) goto simsimd_fma_u8_skylake_cycle;
@@ -2151,6 +2177,9 @@ SIMSIMD_PUBLIC void simsimd_scale_i16_skylake(simsimd_i16_t const *a, simsimd_si
21512177
__m256i a_i16_vec, sum_i16_vec;
21522178
__m512 a_vec, sum_vec;
21532179
__mmask16 mask = 0xFFFF;
2180+
__m512i sum_i32_vec;
2181+
__m512i min_i32_vec = _mm512_set1_epi32(-32768);
2182+
__m512i max_i32_vec = _mm512_set1_epi32(32767);
21542183

21552184
simsimd_scale_i16_skylake_cycle:
21562185
if (n < 16) {
@@ -2164,7 +2193,10 @@ SIMSIMD_PUBLIC void simsimd_scale_i16_skylake(simsimd_i16_t const *a, simsimd_si
21642193
}
21652194
a_vec = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(a_i16_vec));
21662195
sum_vec = _mm512_fmadd_ps(a_vec, alpha_vec, beta_vec);
2167-
sum_i16_vec = _mm512_cvtepi32_epi16(_mm512_cvtps_epi32(sum_vec));
2196+
sum_i32_vec = _mm512_cvtps_epi32(sum_vec);
2197+
sum_i32_vec = _mm512_max_epi32(sum_i32_vec, min_i32_vec);
2198+
sum_i32_vec = _mm512_min_epi32(sum_i32_vec, max_i32_vec);
2199+
sum_i16_vec = _mm512_cvtepi32_epi16(sum_i32_vec);
21682200
_mm256_mask_storeu_epi16(result, mask, sum_i16_vec);
21692201
result += 16;
21702202
if (n) goto simsimd_scale_i16_skylake_cycle;
@@ -2178,6 +2210,10 @@ SIMSIMD_PUBLIC void simsimd_fma_i16_skylake(
21782210
__m256i a_i16_vec, b_i16_vec, c_i16_vec, sum_i16_vec;
21792211
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
21802212
__mmask16 mask = 0xFFFF;
2213+
__m512i sum_i32_vec;
2214+
__m512i min_i32_vec = _mm512_set1_epi32(-32768);
2215+
__m512i max_i32_vec = _mm512_set1_epi32(32767);
2216+
21812217
simsimd_fma_i16_skylake_cycle:
21822218
if (n < 16) {
21832219
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -2198,7 +2234,10 @@ SIMSIMD_PUBLIC void simsimd_fma_i16_skylake(
21982234
ab_vec = _mm512_mul_ps(a_vec, b_vec);
21992235
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
22002236
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
2201-
sum_i16_vec = _mm512_cvtepi32_epi16(_mm512_cvtps_epi32(sum_vec));
2237+
sum_i32_vec = _mm512_cvtps_epi32(sum_vec);
2238+
sum_i32_vec = _mm512_max_epi32(sum_i32_vec, min_i32_vec);
2239+
sum_i32_vec = _mm512_min_epi32(sum_i32_vec, max_i32_vec);
2240+
sum_i16_vec = _mm512_cvtepi32_epi16(sum_i32_vec);
22022241
_mm256_mask_storeu_epi16(result, mask, sum_i16_vec);
22032242
result += 16;
22042243
if (n) goto simsimd_fma_i16_skylake_cycle;
@@ -2211,6 +2250,9 @@ SIMSIMD_PUBLIC void simsimd_scale_u16_skylake(simsimd_u16_t const *a, simsimd_si
22112250
__m256i a_u16_vec, sum_u16_vec;
22122251
__m512 a_vec, sum_vec;
22132252
__mmask16 mask = 0xFFFF;
2253+
__m512i sum_u32_vec;
2254+
__m512i min_u32_vec = _mm512_set1_epi32(0);
2255+
__m512i max_u32_vec = _mm512_set1_epi32(65535);
22142256

22152257
simsimd_scale_u16_skylake_cycle:
22162258
if (n < 16) {
@@ -2224,7 +2266,10 @@ SIMSIMD_PUBLIC void simsimd_scale_u16_skylake(simsimd_u16_t const *a, simsimd_si
22242266
}
22252267
a_vec = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(a_u16_vec));
22262268
sum_vec = _mm512_fmadd_ps(a_vec, alpha_vec, beta_vec);
2227-
sum_u16_vec = _mm512_cvtepi32_epi16(_mm512_cvtps_epu32(sum_vec));
2269+
sum_u32_vec = _mm512_cvtps_epu32(sum_vec);
2270+
sum_u32_vec = _mm512_max_epu32(sum_u32_vec, min_u32_vec);
2271+
sum_u32_vec = _mm512_min_epu32(sum_u32_vec, max_u32_vec);
2272+
sum_u16_vec = _mm512_cvtepi32_epi16(sum_u32_vec);
22282273
_mm256_mask_storeu_epi16(result, mask, sum_u16_vec);
22292274
result += 16;
22302275
if (n) goto simsimd_scale_u16_skylake_cycle;
@@ -2238,6 +2283,10 @@ SIMSIMD_PUBLIC void simsimd_fma_u16_skylake(
22382283
__m256i a_u16_vec, b_u16_vec, c_u16_vec, sum_u16_vec;
22392284
__m512 a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
22402285
__mmask16 mask = 0xFFFF;
2286+
__m512i sum_u32_vec;
2287+
__m512i min_u32_vec = _mm512_set1_epi32(0);
2288+
__m512i max_u32_vec = _mm512_set1_epi32(65535);
2289+
22412290
simsimd_fma_u16_skylake_cycle:
22422291
if (n < 16) {
22432292
mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -2258,7 +2307,10 @@ SIMSIMD_PUBLIC void simsimd_fma_u16_skylake(
22582307
ab_vec = _mm512_mul_ps(a_vec, b_vec);
22592308
ab_scaled_vec = _mm512_mul_ps(ab_vec, alpha_vec);
22602309
sum_vec = _mm512_fmadd_ps(c_vec, beta_vec, ab_scaled_vec);
2261-
sum_u16_vec = _mm512_cvtepi32_epi16(_mm512_cvtps_epu32(sum_vec));
2310+
sum_u32_vec = _mm512_cvtps_epu32(sum_vec);
2311+
sum_u32_vec = _mm512_max_epu32(sum_u32_vec, min_u32_vec);
2312+
sum_u32_vec = _mm512_min_epu32(sum_u32_vec, max_u32_vec);
2313+
sum_u16_vec = _mm512_cvtepi32_epi16(sum_u32_vec);
22622314
_mm256_mask_storeu_epi16(result, mask, sum_u16_vec);
22632315
result += 16;
22642316
if (n) goto simsimd_fma_u16_skylake_cycle;
@@ -2271,6 +2323,8 @@ SIMSIMD_PUBLIC void simsimd_scale_i32_skylake(simsimd_i32_t const *a, simsimd_si
22712323
__m256i a_i32_vec, sum_i32_vec;
22722324
__m512d a_vec, sum_vec;
22732325
__mmask8 mask = 0xFF;
2326+
__m512d min_vec = _mm512_set1_pd(-2147483648.0);
2327+
__m512d max_vec = _mm512_set1_pd(2147483647.0);
22742328

22752329
simsimd_scale_i32_skylake_cycle:
22762330
if (n < 8) {
@@ -2284,7 +2338,9 @@ SIMSIMD_PUBLIC void simsimd_scale_i32_skylake(simsimd_i32_t const *a, simsimd_si
22842338
}
22852339
a_vec = _mm512_cvtepi32_pd(a_i32_vec);
22862340
sum_vec = _mm512_fmadd_pd(a_vec, alpha_vec, beta_vec);
2287-
sum_i32_vec = _mm512_cvtpd_epi32(sum_vec);
2341+
sum_vec = _mm512_max_pd(sum_vec, min_vec);
2342+
sum_vec = _mm512_min_pd(sum_vec, max_vec);
2343+
sum_i32_vec = _mm512_cvttpd_epi32(sum_vec);
22882344
_mm256_mask_storeu_epi32(result, mask, sum_i32_vec);
22892345
result += 8;
22902346
if (n) goto simsimd_scale_i32_skylake_cycle;
@@ -2298,6 +2354,9 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_skylake(
22982354
__m256i a_i32_vec, b_i32_vec, c_i32_vec, sum_i32_vec;
22992355
__m512d a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
23002356
__mmask8 mask = 0xFF;
2357+
__m512d min_vec = _mm512_set1_pd(-2147483648.0);
2358+
__m512d max_vec = _mm512_set1_pd(2147483647.0);
2359+
23012360
simsimd_fma_i32_skylake_cycle:
23022361
if (n < 8) {
23032362
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
@@ -2318,7 +2377,9 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_skylake(
23182377
ab_vec = _mm512_mul_pd(a_vec, b_vec);
23192378
ab_scaled_vec = _mm512_mul_pd(ab_vec, alpha_vec);
23202379
sum_vec = _mm512_fmadd_pd(c_vec, beta_vec, ab_scaled_vec);
2321-
sum_i32_vec = _mm512_cvtpd_epi32(sum_vec);
2380+
sum_vec = _mm512_max_pd(sum_vec, min_vec);
2381+
sum_vec = _mm512_min_pd(sum_vec, max_vec);
2382+
sum_i32_vec = _mm512_cvttpd_epi32(sum_vec);
23222383
_mm256_mask_storeu_epi32(result, mask, sum_i32_vec);
23232384
result += 8;
23242385
if (n) goto simsimd_fma_i32_skylake_cycle;
@@ -2331,6 +2392,8 @@ SIMSIMD_PUBLIC void simsimd_scale_u32_skylake(simsimd_u32_t const *a, simsimd_si
23312392
__m256i a_u32_vec, sum_u32_vec;
23322393
__m512d a_vec, sum_vec;
23332394
__mmask8 mask = 0xFF;
2395+
__m512d min_vec = _mm512_set1_pd(0.0);
2396+
__m512d max_vec = _mm512_set1_pd(4294967295.0);
23342397

23352398
simsimd_scale_u32_skylake_cycle:
23362399
if (n < 8) {
@@ -2344,7 +2407,9 @@ SIMSIMD_PUBLIC void simsimd_scale_u32_skylake(simsimd_u32_t const *a, simsimd_si
23442407
}
23452408
a_vec = _mm512_cvtepu32_pd(a_u32_vec);
23462409
sum_vec = _mm512_fmadd_pd(a_vec, alpha_vec, beta_vec);
2347-
sum_u32_vec = _mm512_cvtpd_epu32(sum_vec);
2410+
sum_vec = _mm512_max_pd(sum_vec, min_vec);
2411+
sum_vec = _mm512_min_pd(sum_vec, max_vec);
2412+
sum_u32_vec = _mm512_cvttpd_epu32(sum_vec);
23482413
_mm256_mask_storeu_epi32(result, mask, sum_u32_vec);
23492414
result += 8;
23502415
if (n) goto simsimd_scale_u32_skylake_cycle;
@@ -2358,6 +2423,9 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_skylake(
23582423
__m256i a_u32_vec, b_u32_vec, c_u32_vec, sum_u32_vec;
23592424
__m512d a_vec, b_vec, c_vec, ab_vec, ab_scaled_vec, sum_vec;
23602425
__mmask8 mask = 0xFF;
2426+
__m512d min_vec = _mm512_set1_pd(0.0);
2427+
__m512d max_vec = _mm512_set1_pd(4294967295.0);
2428+
23612429
simsimd_fma_u32_skylake_cycle:
23622430
if (n < 8) {
23632431
mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
@@ -2378,7 +2446,9 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_skylake(
23782446
ab_vec = _mm512_mul_pd(a_vec, b_vec);
23792447
ab_scaled_vec = _mm512_mul_pd(ab_vec, alpha_vec);
23802448
sum_vec = _mm512_fmadd_pd(c_vec, beta_vec, ab_scaled_vec);
2381-
sum_u32_vec = _mm512_cvtpd_epu32(sum_vec);
2449+
sum_vec = _mm512_max_pd(sum_vec, min_vec);
2450+
sum_vec = _mm512_min_pd(sum_vec, max_vec);
2451+
sum_u32_vec = _mm512_cvttpd_epu32(sum_vec);
23822452
_mm256_mask_storeu_epi32(result, mask, sum_u32_vec);
23832453
result += 8;
23842454
if (n) goto simsimd_fma_u32_skylake_cycle;

0 commit comments

Comments
 (0)