@@ -1450,12 +1450,17 @@ SIMSIMD_PUBLIC void simsimd_scale_i32_haswell(simsimd_i32_t const *a, simsimd_si
14501450 simsimd_distance_t beta , simsimd_i32_t * result ) {
14511451 __m256d alpha_vec = _mm256_set1_pd (alpha );
14521452 __m256d beta_vec = _mm256_set1_pd (beta );
1453+ __m256d min_vec = _mm256_set1_pd (-2147483648.0 );
1454+ __m256d max_vec = _mm256_set1_pd (2147483647.0 );
14531455
14541456 // The main loop:
14551457 simsimd_size_t i = 0 ;
14561458 for (; i + 4 <= n ; i += 4 ) {
14571459 __m256d a_vec = _mm256_cvtepi32_pd (_mm_lddqu_si128 ((__m128i * )(a + i )));
14581460 __m256d sum_vec = _mm256_fmadd_pd (a_vec , alpha_vec , beta_vec );
1461+ // Clip to the largest values representable by 32-bit integers.
1462+ sum_vec = _mm256_max_pd (sum_vec , min_vec );
1463+ sum_vec = _mm256_min_pd (sum_vec , max_vec );
14591464 __m128i sum_i32_vec = _mm256_cvtpd_epi32 (sum_vec );
14601465 _mm_storeu_si128 ((__m128i * )(result + i ), sum_i32_vec );
14611466 }
@@ -1473,6 +1478,8 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_haswell(
14731478 simsimd_distance_t alpha , simsimd_distance_t beta , simsimd_i32_t * result ) {
14741479 __m256d alpha_vec = _mm256_set1_pd (alpha );
14751480 __m256d beta_vec = _mm256_set1_pd (beta );
1481+ __m256d min_vec = _mm256_set1_pd (-2147483648.0 );
1482+ __m256d max_vec = _mm256_set1_pd (2147483647.0 );
14761483
14771484 // The main loop:
14781485 simsimd_size_t i = 0 ;
@@ -1483,6 +1490,9 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_haswell(
14831490 __m256d ab_vec = _mm256_mul_pd (a_vec , b_vec );
14841491 __m256d ab_scaled_vec = _mm256_mul_pd (ab_vec , alpha_vec );
14851492 __m256d sum_vec = _mm256_fmadd_pd (c_vec , beta_vec , ab_scaled_vec );
1493+ // Clip to the largest values representable by 32-bit integers.
1494+ sum_vec = _mm256_max_pd (sum_vec , min_vec );
1495+ sum_vec = _mm256_min_pd (sum_vec , max_vec );
14861496 __m128i sum_i32_vec = _mm256_cvtpd_epi32 (sum_vec );
14871497 _mm_storeu_si128 ((__m128i * )(result + i ), sum_i32_vec );
14881498 }
@@ -1581,12 +1591,17 @@ SIMSIMD_PUBLIC void simsimd_scale_u32_haswell(simsimd_u32_t const *a, simsimd_si
15811591 simsimd_distance_t beta , simsimd_u32_t * result ) {
15821592 __m256d alpha_vec = _mm256_set1_pd (alpha );
15831593 __m256d beta_vec = _mm256_set1_pd (beta );
1594+ __m256d min_vec = _mm256_set1_pd (0 );
1595+ __m256d max_vec = _mm256_set1_pd (4294967295.0 );
15841596
15851597 // The main loop:
15861598 simsimd_size_t i = 0 ;
15871599 for (; i + 4 <= n ; i += 4 ) {
15881600 __m256d a_vec = _mm256_cvtepu32_pd_haswell (_mm_lddqu_si128 ((__m128i * )(a + i )));
15891601 __m256d sum_vec = _mm256_fmadd_pd (a_vec , alpha_vec , beta_vec );
1602+ // Clip to the largest values representable by 32-bit integers.
1603+ sum_vec = _mm256_max_pd (sum_vec , min_vec );
1604+ sum_vec = _mm256_min_pd (sum_vec , max_vec );
15901605 __m128i sum_u32_vec = _mm256_cvtpd_epu32_haswell (sum_vec );
15911606 _mm_storeu_si128 ((__m128i * )(result + i ), sum_u32_vec );
15921607 }
@@ -1604,6 +1619,8 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_haswell(
16041619 simsimd_distance_t alpha , simsimd_distance_t beta , simsimd_u32_t * result ) {
16051620 __m256d alpha_vec = _mm256_set1_pd (alpha );
16061621 __m256d beta_vec = _mm256_set1_pd (beta );
1622+ __m256d min_vec = _mm256_set1_pd (0 );
1623+ __m256d max_vec = _mm256_set1_pd (4294967295.0 );
16071624
16081625 // The main loop:
16091626 simsimd_size_t i = 0 ;
@@ -1614,6 +1631,9 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_haswell(
16141631 __m256d ab_vec = _mm256_mul_pd (a_vec , b_vec );
16151632 __m256d ab_scaled_vec = _mm256_mul_pd (ab_vec , alpha_vec );
16161633 __m256d sum_vec = _mm256_fmadd_pd (c_vec , beta_vec , ab_scaled_vec );
1634+ // Clip to the largest values representable by 32-bit integers.
1635+ sum_vec = _mm256_max_pd (sum_vec , min_vec );
1636+ sum_vec = _mm256_min_pd (sum_vec , max_vec );
16171637 __m128i sum_u32_vec = _mm256_cvtpd_epu32_haswell (sum_vec );
16181638 _mm_storeu_si128 ((__m128i * )(result + i ), sum_u32_vec );
16191639 }
0 commit comments