@@ -3356,55 +3356,6 @@ SIMSIMD_INTERNAL void simsimd_dot_e5m2x64_finalize_skylake(
33563356 (simsimd_dot_f32x16_state_skylake_t const * )s2 , (simsimd_dot_f32x16_state_skylake_t const * )s3 , results );
33573357}
33583358
3359- /* Unified Outer-Product API: f32x2
3360- * ================================
3361- * Processes 2 k-elements per update, outputs 1×16 partial products.
3362- * This is the minimum efficient granularity for F32 (2 FMA ops per call).
3363- *
3364- * B packing layout (transposed, k-major):
3365- * b_packed[k_offset * 16 + col] = B[col][k_base + k_offset]
3366- * where k_offset ∈ {0, 1}, col ∈ {0..15}
3367- *
3368- * Usage in cache-blocked GEMM:
3369- * for k_block in 0..K by K_TILE:
3370- * simsimd_dot_outer_f32x2_update_1x16_skylake(&state, &A[k_block], &B_packed[k_block * 16])
3371- */
3372-
3373- /** @brief State for 1×16 f32 outer-product (2 k-elements per update). */
3374- typedef struct simsimd_dot_outer_f32x2_state_1x16_skylake_t {
3375- __m512 accumulator ;
3376- } simsimd_dot_outer_f32x2_state_1x16_skylake_t ;
3377-
3378- SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_init_1x16_skylake (simsimd_dot_outer_f32x2_state_1x16_skylake_t * state ) {
3379- state -> accumulator = _mm512_setzero_ps ();
3380- }
3381-
3382- /**
3383- * @brief Update 1×16 f32 state with 2 k-elements (2 FMA ops).
3384- *
3385- * @param state Pointer to accumulator state.
3386- * @param a_slice Pointer to 2 f32 values from A: a[k], a[k+1]
3387- * @param b_transposed Pointer to 32 f32 values from B (transposed):
3388- * b[0..15] = B[0..15][k], b[16..31] = B[0..15][k+1]
3389- */
3390- SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_update_1x16_skylake (simsimd_dot_outer_f32x2_state_1x16_skylake_t * state ,
3391- simsimd_f32_t const * a_slice ,
3392- simsimd_f32_t const * b_transposed ) {
3393-
3394- __m512 a_broadcast_0 = _mm512_set1_ps (a_slice [0 ]);
3395- __m512 b_column_0 = _mm512_loadu_ps (b_transposed );
3396- state -> accumulator = _mm512_fmadd_ps (a_broadcast_0 , b_column_0 , state -> accumulator );
3397-
3398- __m512 a_broadcast_1 = _mm512_set1_ps (a_slice [1 ]);
3399- __m512 b_column_1 = _mm512_loadu_ps (b_transposed + 16 );
3400- state -> accumulator = _mm512_fmadd_ps (a_broadcast_1 , b_column_1 , state -> accumulator );
3401- }
3402-
3403- SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_finalize_1x16_skylake (
3404- simsimd_dot_outer_f32x2_state_1x16_skylake_t const * state , simsimd_f32_t * result_row ) {
3405- _mm512_storeu_ps (result_row , state -> accumulator );
3406- }
3407-
34083359#pragma clang attribute pop
34093360#pragma GCC pop_options
34103361#endif // SIMSIMD_TARGET_SKYLAKE
@@ -3521,7 +3472,9 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16c_t const *a_pairs, sim
35213472 results [1 ] = _simsimd_reduce_f32x16_skylake (ab_imag_vec );
35223473}
35233474
3524- /* Convert 32x E4M3 values to 32x BF16 values.
3475+ /**
3476+ * @brief Convert 32x E4M3 values to 32x BF16 values.
3477+ *
35253478 * Uses optimized path with fused exp+mant extraction.
35263479 * Denormals (exp=0, mant!=0) are flushed to zero (DAZ behavior).
35273480 *
@@ -3542,7 +3495,9 @@ SIMSIMD_INTERNAL __m512i _simsimd_e4m3_to_bf16_genoa(__m256i fp8) {
35423495 return _mm512_or_si512 (sign , masked_exp_mant );
35433496}
35443497
3545- /* Convert 32x E5M2 values to 32x BF16 values.
3498+ /**
3499+ * @brief Convert 32x E5M2 values to 32x BF16 values.
3500+ *
35463501 * Uses optimized path with fused exp+mant extraction.
35473502 * Denormals (exp=0, mant!=0) are flushed to zero (DAZ behavior).
35483503 *
@@ -3635,10 +3590,20 @@ SIMSIMD_INTERNAL void simsimd_dot_bf16x32_finalize_genoa(
36353590 simsimd_dot_bf16x32_state_genoa_t const * s0 , simsimd_dot_bf16x32_state_genoa_t const * s1 , //
36363591 simsimd_dot_bf16x32_state_genoa_t const * s2 , simsimd_dot_bf16x32_state_genoa_t const * s3 , //
36373592 simsimd_f32_t * results ) {
3638- // State is layout-compatible with f32x16 (both contain just __m512 sum)
3639- simsimd_dot_f32x16_finalize_skylake ( //
3640- (simsimd_dot_f32x16_state_skylake_t const * )s0 , (simsimd_dot_f32x16_state_skylake_t const * )s1 , //
3641- (simsimd_dot_f32x16_state_skylake_t const * )s2 , (simsimd_dot_f32x16_state_skylake_t const * )s3 , results );
3593+ // ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3594+ __m256 a0 = _mm256_add_ps (_mm512_castps512_ps256 (s0 -> sum ), _mm512_extractf32x8_ps (s0 -> sum , 1 ));
3595+ __m256 a1 = _mm256_add_ps (_mm512_castps512_ps256 (s1 -> sum ), _mm512_extractf32x8_ps (s1 -> sum , 1 ));
3596+ __m256 a2 = _mm256_add_ps (_mm512_castps512_ps256 (s2 -> sum ), _mm512_extractf32x8_ps (s2 -> sum , 1 ));
3597+ __m256 a3 = _mm256_add_ps (_mm512_castps512_ps256 (s3 -> sum ), _mm512_extractf32x8_ps (s3 -> sum , 1 ));
3598+ __m128 b0 = _mm_add_ps (_mm256_castps256_ps128 (a0 ), _mm256_extractf128_ps (a0 , 1 ));
3599+ __m128 b1 = _mm_add_ps (_mm256_castps256_ps128 (a1 ), _mm256_extractf128_ps (a1 , 1 ));
3600+ __m128 b2 = _mm_add_ps (_mm256_castps256_ps128 (a2 ), _mm256_extractf128_ps (a2 , 1 ));
3601+ __m128 b3 = _mm_add_ps (_mm256_castps256_ps128 (a3 ), _mm256_extractf128_ps (a3 , 1 ));
3602+ __m128 t01_lo = _mm_unpacklo_ps (b0 , b1 ), t23_lo = _mm_unpacklo_ps (b2 , b3 );
3603+ __m128 t01_hi = _mm_unpackhi_ps (b0 , b1 ), t23_hi = _mm_unpackhi_ps (b2 , b3 );
3604+ __m128 row0 = _mm_movelh_ps (t01_lo , t23_lo ), row1 = _mm_movehl_ps (t23_lo , t01_lo );
3605+ __m128 row2 = _mm_movelh_ps (t01_hi , t23_hi ), row3 = _mm_movehl_ps (t23_hi , t01_hi );
3606+ _mm_storeu_ps (results , _mm_add_ps (_mm_add_ps (row0 , row1 ), _mm_add_ps (row2 , row3 )));
36423607}
36433608
36443609/**
@@ -3671,10 +3636,20 @@ SIMSIMD_INTERNAL void simsimd_dot_e4m3x64_finalize_genoa(
36713636 simsimd_dot_e4m3x64_state_genoa_t const * s0 , simsimd_dot_e4m3x64_state_genoa_t const * s1 , //
36723637 simsimd_dot_e4m3x64_state_genoa_t const * s2 , simsimd_dot_e4m3x64_state_genoa_t const * s3 , //
36733638 simsimd_f32_t * results ) {
3674- // State is layout-compatible with f32x16 (both contain just __m512 sum)
3675- simsimd_dot_f32x16_finalize_skylake ( //
3676- (simsimd_dot_f32x16_state_skylake_t const * )s0 , (simsimd_dot_f32x16_state_skylake_t const * )s1 , //
3677- (simsimd_dot_f32x16_state_skylake_t const * )s2 , (simsimd_dot_f32x16_state_skylake_t const * )s3 , results );
3639+ // ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3640+ __m256 a0 = _mm256_add_ps (_mm512_castps512_ps256 (s0 -> sum ), _mm512_extractf32x8_ps (s0 -> sum , 1 ));
3641+ __m256 a1 = _mm256_add_ps (_mm512_castps512_ps256 (s1 -> sum ), _mm512_extractf32x8_ps (s1 -> sum , 1 ));
3642+ __m256 a2 = _mm256_add_ps (_mm512_castps512_ps256 (s2 -> sum ), _mm512_extractf32x8_ps (s2 -> sum , 1 ));
3643+ __m256 a3 = _mm256_add_ps (_mm512_castps512_ps256 (s3 -> sum ), _mm512_extractf32x8_ps (s3 -> sum , 1 ));
3644+ __m128 b0 = _mm_add_ps (_mm256_castps256_ps128 (a0 ), _mm256_extractf128_ps (a0 , 1 ));
3645+ __m128 b1 = _mm_add_ps (_mm256_castps256_ps128 (a1 ), _mm256_extractf128_ps (a1 , 1 ));
3646+ __m128 b2 = _mm_add_ps (_mm256_castps256_ps128 (a2 ), _mm256_extractf128_ps (a2 , 1 ));
3647+ __m128 b3 = _mm_add_ps (_mm256_castps256_ps128 (a3 ), _mm256_extractf128_ps (a3 , 1 ));
3648+ __m128 t01_lo = _mm_unpacklo_ps (b0 , b1 ), t23_lo = _mm_unpacklo_ps (b2 , b3 );
3649+ __m128 t01_hi = _mm_unpackhi_ps (b0 , b1 ), t23_hi = _mm_unpackhi_ps (b2 , b3 );
3650+ __m128 row0 = _mm_movelh_ps (t01_lo , t23_lo ), row1 = _mm_movehl_ps (t23_lo , t01_lo );
3651+ __m128 row2 = _mm_movelh_ps (t01_hi , t23_hi ), row3 = _mm_movehl_ps (t23_hi , t01_hi );
3652+ _mm_storeu_ps (results , _mm_add_ps (_mm_add_ps (row0 , row1 ), _mm_add_ps (row2 , row3 )));
36783653}
36793654
36803655/**
@@ -3707,72 +3682,20 @@ SIMSIMD_INTERNAL void simsimd_dot_e5m2x64_finalize_genoa(
37073682 simsimd_dot_e5m2x64_state_genoa_t const * s0 , simsimd_dot_e5m2x64_state_genoa_t const * s1 , //
37083683 simsimd_dot_e5m2x64_state_genoa_t const * s2 , simsimd_dot_e5m2x64_state_genoa_t const * s3 , //
37093684 simsimd_f32_t * results ) {
3710- // State is layout-compatible with f32x16 (both contain just __m512 sum)
3711- simsimd_dot_f32x16_finalize_skylake ( //
3712- (simsimd_dot_f32x16_state_skylake_t const * )s0 , (simsimd_dot_f32x16_state_skylake_t const * )s1 , //
3713- (simsimd_dot_f32x16_state_skylake_t const * )s2 , (simsimd_dot_f32x16_state_skylake_t const * )s3 , results );
3714- }
3715-
3716- /* BF16x2 Outer-Product API (1×16, using VDPBF16PS with pair broadcast)
3717- * =====================================================================
3718- *
3719- * For computing 16 dot products simultaneously using VDPBF16PS instruction.
3720- * Each update processes 2 k-elements (1 bf16 pair) and outputs to 16 columns.
3721- *
3722- * VDPBF16PS: acc[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1]
3723- *
3724- * By broadcasting the same pair {a0, a1} to all 16 lanes and loading
3725- * one pair per column from B, we compute 16 partial dot products per call.
3726- *
3727- * B packing layout (interleaved pairs):
3728- * b_packed[pair_idx * 32 + col * 2 + 0] = B[col][2*pair_idx]
3729- * b_packed[pair_idx * 32 + col * 2 + 1] = B[col][2*pair_idx + 1]
3730- *
3731- * A single _mm512_loadu_si512(b_packed + pair_idx * 32) loads:
3732- * {b[0][2p], b[0][2p+1], b[1][2p], b[1][2p+1], ..., b[15][2p], b[15][2p+1]}
3733- */
3734-
3735- /** @brief State for 1×16 bf16 outer-product using VDPBF16PS. */
3736- typedef struct simsimd_dot_outer_bf16x4_state_1x16_genoa_t {
3737- __m512 acc ; // 16 f32 accumulators (one per output column)
3738- } simsimd_dot_outer_bf16x4_state_1x16_genoa_t ;
3739-
3740- SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_init_1x16_genoa (simsimd_dot_outer_bf16x4_state_1x16_genoa_t * state ) {
3741- state -> acc = _mm512_setzero_ps ();
3742- }
3743-
3744- /**
3745- * @brief Update 1×16 bf16 state with 4 k-elements (2 pairs) using VDPBF16PS.
3746- *
3747- * @param state Pointer to 1×16 accumulator state.
3748- * @param a Pointer to 4 bf16 values from query row (a[k:k+4]).
3749- * @param b_packed Pointer to 2 blocks of 16 interleaved bf16 pairs (2 × 64 bytes).
3750- * Layout: For pair p in 0..1: b_packed[p*32..p*32+31] contains
3751- * {b[0][2p], b[0][2p+1], b[1][2p], b[1][2p+1], ..., b[15][2p], b[15][2p+1]}
3752- */
3753- SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_update_1x16_genoa (simsimd_dot_outer_bf16x4_state_1x16_genoa_t * state ,
3754- simsimd_bf16_t const * a ,
3755- simsimd_bf16_t const * b_packed ) {
3756- __m512 acc = state -> acc ;
3757-
3758- // Process 2 pairs (4 k-elements)
3759- __m512i a_bc_0 = _mm512_set1_epi32 (* (simsimd_i32_t const * )(a + 0 ));
3760- __m512i b_vec_0 = _mm512_loadu_si512 (b_packed + 0 );
3761- acc = _mm512_dpbf16_ps (acc , (__m512bh )a_bc_0 , (__m512bh )b_vec_0 );
3762-
3763- __m512i a_bc_1 = _mm512_set1_epi32 (* (simsimd_i32_t const * )(a + 2 ));
3764- __m512i b_vec_1 = _mm512_loadu_si512 (b_packed + 32 );
3765- acc = _mm512_dpbf16_ps (acc , (__m512bh )a_bc_1 , (__m512bh )b_vec_1 );
3766-
3767- state -> acc = acc ;
3768- }
3769-
3770- /**
3771- * @brief Finalize: Store 16 f32 results directly (NO horizontal reduction).
3772- */
3773- SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_finalize_1x16_genoa (
3774- simsimd_dot_outer_bf16x4_state_1x16_genoa_t const * state , simsimd_f32_t * c ) {
3775- _mm512_storeu_ps (c , state -> acc );
3685+ // ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3686+ __m256 a0 = _mm256_add_ps (_mm512_castps512_ps256 (s0 -> sum ), _mm512_extractf32x8_ps (s0 -> sum , 1 ));
3687+ __m256 a1 = _mm256_add_ps (_mm512_castps512_ps256 (s1 -> sum ), _mm512_extractf32x8_ps (s1 -> sum , 1 ));
3688+ __m256 a2 = _mm256_add_ps (_mm512_castps512_ps256 (s2 -> sum ), _mm512_extractf32x8_ps (s2 -> sum , 1 ));
3689+ __m256 a3 = _mm256_add_ps (_mm512_castps512_ps256 (s3 -> sum ), _mm512_extractf32x8_ps (s3 -> sum , 1 ));
3690+ __m128 b0 = _mm_add_ps (_mm256_castps256_ps128 (a0 ), _mm256_extractf128_ps (a0 , 1 ));
3691+ __m128 b1 = _mm_add_ps (_mm256_castps256_ps128 (a1 ), _mm256_extractf128_ps (a1 , 1 ));
3692+ __m128 b2 = _mm_add_ps (_mm256_castps256_ps128 (a2 ), _mm256_extractf128_ps (a2 , 1 ));
3693+ __m128 b3 = _mm_add_ps (_mm256_castps256_ps128 (a3 ), _mm256_extractf128_ps (a3 , 1 ));
3694+ __m128 t01_lo = _mm_unpacklo_ps (b0 , b1 ), t23_lo = _mm_unpacklo_ps (b2 , b3 );
3695+ __m128 t01_hi = _mm_unpackhi_ps (b0 , b1 ), t23_hi = _mm_unpackhi_ps (b2 , b3 );
3696+ __m128 row0 = _mm_movelh_ps (t01_lo , t23_lo ), row1 = _mm_movehl_ps (t23_lo , t01_lo );
3697+ __m128 row2 = _mm_movelh_ps (t01_hi , t23_hi ), row3 = _mm_movehl_ps (t23_hi , t01_hi );
3698+ _mm_storeu_ps (results , _mm_add_ps (_mm_add_ps (row0 , row1 ), _mm_add_ps (row2 , row3 )));
37763699}
37773700
37783701#pragma clang attribute pop
@@ -4315,74 +4238,6 @@ SIMSIMD_INTERNAL void simsimd_dot_u8x64_finalize_ice(
43154238 _mm_storeu_ps (results , _mm_cvtepi32_ps (sum_i32 ));
43164239}
43174240
4318- /* Unified Outer-Product API: i8x8
4319- * ================================
4320- * Processes 8 k-elements per update, outputs 1×16 partial products.
4321- * Uses 4 VPDPWSSD instructions (each processes 2 i16 = 2 sign-extended i8).
4322- *
4323- * B packing layout (pair-interleaved, i8 sign-extended to i16 on load):
4324- * b_packed[pair_index * 32 + col * 2 + offset] = B[col][pair_index * 2 + offset]
4325- * where pair_index ∈ {0..3}, col ∈ {0..15}, offset ∈ {0, 1}
4326- *
4327- * Note: VPDPWSSD operates on i16 pairs. We sign-extend i8→i16 before use.
4328- * This handles signed i8 × signed i8 correctly (unlike VPDPBUSD which is u8×i8).
4329- */
4330-
4331- /** @brief State for 1×16 i8 outer-product (8 k-elements per update). */
4332- typedef struct simsimd_dot_outer_i8x8_state_1x16_ice_t {
4333- __m512i accumulator ;
4334- } simsimd_dot_outer_i8x8_state_1x16_ice_t ;
4335-
4336- SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_init_1x16_ice (simsimd_dot_outer_i8x8_state_1x16_ice_t * state ) {
4337- state -> accumulator = _mm512_setzero_si512 ();
4338- }
4339-
4340- /**
4341- * @brief Update 1×16 i8 state with 8 k-elements (4 VPDPWSSD ops).
4342- *
4343- * @param state Pointer to accumulator state.
4344- * @param a_slice Pointer to 8 i8 values from A: a[k..k+7]
4345- * @param b_pairs_interleaved Pointer to 128 i8 values (4 pair-blocks of 32 each):
4346- * Block p: {B[0][2p], B[0][2p+1], B[1][2p], B[1][2p+1], ...}
4347- */
4348- SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_update_1x16_ice (simsimd_dot_outer_i8x8_state_1x16_ice_t * state ,
4349- simsimd_i8_t const * a_slice ,
4350- simsimd_i8_t const * b_pairs_interleaved ) {
4351-
4352- __m512i accumulator = state -> accumulator ;
4353-
4354- // Process 4 pairs of i8 values (8 total k-elements)
4355- #define SIMSIMD_I8X8_PROCESS_PAIR (pair_index ) \
4356- do { \
4357- /* Sign-extend a[2p] and a[2p+1] to i16, pack into i32, broadcast */ \
4358- simsimd_i16_t a_lo = (simsimd_i16_t )a_slice [pair_index * 2 ]; \
4359- simsimd_i16_t a_hi = (simsimd_i16_t )a_slice [pair_index * 2 + 1 ]; \
4360- simsimd_i32_t a_pair_packed = ((simsimd_i32_t )(a_hi & 0xFFFF ) << 16 ) | (a_lo & 0xFFFF ); \
4361- __m512i a_pair_broadcast = _mm512_set1_epi32 (a_pair_packed ); \
4362- \
4363- /* Load 32 i8 from B, sign-extend to 32 i16 */ \
4364- __m256i b_i8_chunk = _mm256_loadu_si256 ((__m256i const * )(b_pairs_interleaved + pair_index * 32 )); \
4365- __m512i b_i16_extended = _mm512_cvtepi8_epi16 (b_i8_chunk ); \
4366- \
4367- /* VPDPWSSD: acc[i] += a_lo * b[2i] + a_hi * b[2i+1] */ \
4368- accumulator = _mm512_dpwssd_epi32 (accumulator , a_pair_broadcast , b_i16_extended ); \
4369- } while (0 )
4370-
4371- SIMSIMD_I8X8_PROCESS_PAIR (0 );
4372- SIMSIMD_I8X8_PROCESS_PAIR (1 );
4373- SIMSIMD_I8X8_PROCESS_PAIR (2 );
4374- SIMSIMD_I8X8_PROCESS_PAIR (3 );
4375-
4376- #undef SIMSIMD_I8X8_PROCESS_PAIR
4377-
4378- state -> accumulator = accumulator ;
4379- }
4380-
4381- SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_finalize_1x16_ice (simsimd_dot_outer_i8x8_state_1x16_ice_t const * state ,
4382- simsimd_i32_t * result_row ) {
4383- _mm512_storeu_si512 (result_row , state -> accumulator );
4384- }
4385-
43864241#pragma clang attribute pop
43874242#pragma GCC pop_options
43884243#endif // SIMSIMD_TARGET_ICE
0 commit comments