Skip to content

Commit 3f48285

Browse files
committed
Improve: Clipping doubles on Haswell
1 parent 79c4552 commit 3f48285

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

include/simsimd/elementwise.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1450,12 +1450,17 @@ SIMSIMD_PUBLIC void simsimd_scale_i32_haswell(simsimd_i32_t const *a, simsimd_si
14501450
simsimd_distance_t beta, simsimd_i32_t *result) {
14511451
__m256d alpha_vec = _mm256_set1_pd(alpha);
14521452
__m256d beta_vec = _mm256_set1_pd(beta);
1453+
__m256d min_vec = _mm256_set1_pd(-2147483648.0);
1454+
__m256d max_vec = _mm256_set1_pd(2147483647.0);
14531455

14541456
// The main loop:
14551457
simsimd_size_t i = 0;
14561458
for (; i + 4 <= n; i += 4) {
14571459
__m256d a_vec = _mm256_cvtepi32_pd(_mm_lddqu_si128((__m128i *)(a + i)));
14581460
__m256d sum_vec = _mm256_fmadd_pd(a_vec, alpha_vec, beta_vec);
1461+
// Clamp to the range representable by signed 32-bit integers before converting.
1462+
sum_vec = _mm256_max_pd(sum_vec, min_vec);
1463+
sum_vec = _mm256_min_pd(sum_vec, max_vec);
14591464
__m128i sum_i32_vec = _mm256_cvtpd_epi32(sum_vec);
14601465
_mm_storeu_si128((__m128i *)(result + i), sum_i32_vec);
14611466
}
@@ -1473,6 +1478,8 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_haswell(
14731478
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i32_t *result) {
14741479
__m256d alpha_vec = _mm256_set1_pd(alpha);
14751480
__m256d beta_vec = _mm256_set1_pd(beta);
1481+
__m256d min_vec = _mm256_set1_pd(-2147483648.0);
1482+
__m256d max_vec = _mm256_set1_pd(2147483647.0);
14761483

14771484
// The main loop:
14781485
simsimd_size_t i = 0;
@@ -1483,6 +1490,9 @@ SIMSIMD_PUBLIC void simsimd_fma_i32_haswell(
14831490
__m256d ab_vec = _mm256_mul_pd(a_vec, b_vec);
14841491
__m256d ab_scaled_vec = _mm256_mul_pd(ab_vec, alpha_vec);
14851492
__m256d sum_vec = _mm256_fmadd_pd(c_vec, beta_vec, ab_scaled_vec);
1493+
// Clamp to the range representable by signed 32-bit integers before converting.
1494+
sum_vec = _mm256_max_pd(sum_vec, min_vec);
1495+
sum_vec = _mm256_min_pd(sum_vec, max_vec);
14861496
__m128i sum_i32_vec = _mm256_cvtpd_epi32(sum_vec);
14871497
_mm_storeu_si128((__m128i *)(result + i), sum_i32_vec);
14881498
}
@@ -1581,12 +1591,17 @@ SIMSIMD_PUBLIC void simsimd_scale_u32_haswell(simsimd_u32_t const *a, simsimd_si
15811591
simsimd_distance_t beta, simsimd_u32_t *result) {
15821592
__m256d alpha_vec = _mm256_set1_pd(alpha);
15831593
__m256d beta_vec = _mm256_set1_pd(beta);
1594+
__m256d min_vec = _mm256_set1_pd(0);
1595+
__m256d max_vec = _mm256_set1_pd(4294967295.0);
15841596

15851597
// The main loop:
15861598
simsimd_size_t i = 0;
15871599
for (; i + 4 <= n; i += 4) {
15881600
__m256d a_vec = _mm256_cvtepu32_pd_haswell(_mm_lddqu_si128((__m128i *)(a + i)));
15891601
__m256d sum_vec = _mm256_fmadd_pd(a_vec, alpha_vec, beta_vec);
1602+
// Clamp to the range representable by unsigned 32-bit integers before converting.
1603+
sum_vec = _mm256_max_pd(sum_vec, min_vec);
1604+
sum_vec = _mm256_min_pd(sum_vec, max_vec);
15901605
__m128i sum_u32_vec = _mm256_cvtpd_epu32_haswell(sum_vec);
15911606
_mm_storeu_si128((__m128i *)(result + i), sum_u32_vec);
15921607
}
@@ -1604,6 +1619,8 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_haswell(
16041619
simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u32_t *result) {
16051620
__m256d alpha_vec = _mm256_set1_pd(alpha);
16061621
__m256d beta_vec = _mm256_set1_pd(beta);
1622+
__m256d min_vec = _mm256_set1_pd(0);
1623+
__m256d max_vec = _mm256_set1_pd(4294967295.0);
16071624

16081625
// The main loop:
16091626
simsimd_size_t i = 0;
@@ -1614,6 +1631,9 @@ SIMSIMD_PUBLIC void simsimd_fma_u32_haswell(
16141631
__m256d ab_vec = _mm256_mul_pd(a_vec, b_vec);
16151632
__m256d ab_scaled_vec = _mm256_mul_pd(ab_vec, alpha_vec);
16161633
__m256d sum_vec = _mm256_fmadd_pd(c_vec, beta_vec, ab_scaled_vec);
1634+
// Clamp to the range representable by unsigned 32-bit integers before converting.
1635+
sum_vec = _mm256_max_pd(sum_vec, min_vec);
1636+
sum_vec = _mm256_min_pd(sum_vec, max_vec);
16171637
__m128i sum_u32_vec = _mm256_cvtpd_epu32_haswell(sum_vec);
16181638
_mm_storeu_si128((__m128i *)(result + i), sum_u32_vec);
16191639
}

0 commit comments

Comments
 (0)