Skip to content

Commit 6265b0a

Browse files
committed
Chore: Remove legace matmul tiling
1 parent d3760b1 commit 6265b0a

File tree

2 files changed

+284
-571
lines changed

2 files changed

+284
-571
lines changed

include/simsimd/dot.h

Lines changed: 48 additions & 193 deletions
Original file line numberDiff line numberDiff line change
@@ -3356,55 +3356,6 @@ SIMSIMD_INTERNAL void simsimd_dot_e5m2x64_finalize_skylake(
33563356
(simsimd_dot_f32x16_state_skylake_t const *)s2, (simsimd_dot_f32x16_state_skylake_t const *)s3, results);
33573357
}
33583358

3359-
/* Unified Outer-Product API: f32x2
3360-
* ================================
3361-
* Processes 2 k-elements per update, outputs 1×16 partial products.
3362-
* This is the minimum efficient granularity for F32 (2 FMA ops per call).
3363-
*
3364-
* B packing layout (transposed, k-major):
3365-
* b_packed[k_offset * 16 + col] = B[col][k_base + k_offset]
3366-
* where k_offset ∈ {0, 1}, col ∈ {0..15}
3367-
*
3368-
* Usage in cache-blocked GEMM:
3369-
* for k_block in 0..K by K_TILE:
3370-
* simsimd_dot_outer_f32x2_update_1x16_skylake(&state, &A[k_block], &B_packed[k_block * 16])
3371-
*/
3372-
3373-
/** @brief State for 1×16 f32 outer-product (2 k-elements per update). */
3374-
typedef struct simsimd_dot_outer_f32x2_state_1x16_skylake_t {
3375-
__m512 accumulator;
3376-
} simsimd_dot_outer_f32x2_state_1x16_skylake_t;
3377-
3378-
SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_init_1x16_skylake(simsimd_dot_outer_f32x2_state_1x16_skylake_t *state) {
3379-
state->accumulator = _mm512_setzero_ps();
3380-
}
3381-
3382-
/**
3383-
* @brief Update 1×16 f32 state with 2 k-elements (2 FMA ops).
3384-
*
3385-
* @param state Pointer to accumulator state.
3386-
* @param a_slice Pointer to 2 f32 values from A: a[k], a[k+1]
3387-
* @param b_transposed Pointer to 32 f32 values from B (transposed):
3388-
* b[0..15] = B[0..15][k], b[16..31] = B[0..15][k+1]
3389-
*/
3390-
SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_update_1x16_skylake(simsimd_dot_outer_f32x2_state_1x16_skylake_t *state,
3391-
simsimd_f32_t const *a_slice,
3392-
simsimd_f32_t const *b_transposed) {
3393-
3394-
__m512 a_broadcast_0 = _mm512_set1_ps(a_slice[0]);
3395-
__m512 b_column_0 = _mm512_loadu_ps(b_transposed);
3396-
state->accumulator = _mm512_fmadd_ps(a_broadcast_0, b_column_0, state->accumulator);
3397-
3398-
__m512 a_broadcast_1 = _mm512_set1_ps(a_slice[1]);
3399-
__m512 b_column_1 = _mm512_loadu_ps(b_transposed + 16);
3400-
state->accumulator = _mm512_fmadd_ps(a_broadcast_1, b_column_1, state->accumulator);
3401-
}
3402-
3403-
SIMSIMD_INTERNAL void simsimd_dot_outer_f32x2_finalize_1x16_skylake(
3404-
simsimd_dot_outer_f32x2_state_1x16_skylake_t const *state, simsimd_f32_t *result_row) {
3405-
_mm512_storeu_ps(result_row, state->accumulator);
3406-
}
3407-
34083359
#pragma clang attribute pop
34093360
#pragma GCC pop_options
34103361
#endif // SIMSIMD_TARGET_SKYLAKE
@@ -3521,7 +3472,9 @@ SIMSIMD_PUBLIC void simsimd_vdot_bf16c_genoa(simsimd_bf16c_t const *a_pairs, sim
35213472
results[1] = _simsimd_reduce_f32x16_skylake(ab_imag_vec);
35223473
}
35233474

3524-
/* Convert 32x E4M3 values to 32x BF16 values.
3475+
/**
3476+
* @brief Convert 32x E4M3 values to 32x BF16 values.
3477+
*
35253478
* Uses optimized path with fused exp+mant extraction.
35263479
* Denormals (exp=0, mant!=0) are flushed to zero (DAZ behavior).
35273480
*
@@ -3542,7 +3495,9 @@ SIMSIMD_INTERNAL __m512i _simsimd_e4m3_to_bf16_genoa(__m256i fp8) {
35423495
return _mm512_or_si512(sign, masked_exp_mant);
35433496
}
35443497

3545-
/* Convert 32x E5M2 values to 32x BF16 values.
3498+
/**
3499+
* @brief Convert 32x E5M2 values to 32x BF16 values.
3500+
*
35463501
* Uses optimized path with fused exp+mant extraction.
35473502
* Denormals (exp=0, mant!=0) are flushed to zero (DAZ behavior).
35483503
*
@@ -3635,10 +3590,20 @@ SIMSIMD_INTERNAL void simsimd_dot_bf16x32_finalize_genoa(
36353590
simsimd_dot_bf16x32_state_genoa_t const *s0, simsimd_dot_bf16x32_state_genoa_t const *s1, //
36363591
simsimd_dot_bf16x32_state_genoa_t const *s2, simsimd_dot_bf16x32_state_genoa_t const *s3, //
36373592
simsimd_f32_t *results) {
3638-
// State is layout-compatible with f32x16 (both contain just __m512 sum)
3639-
simsimd_dot_f32x16_finalize_skylake( //
3640-
(simsimd_dot_f32x16_state_skylake_t const *)s0, (simsimd_dot_f32x16_state_skylake_t const *)s1, //
3641-
(simsimd_dot_f32x16_state_skylake_t const *)s2, (simsimd_dot_f32x16_state_skylake_t const *)s3, results);
3593+
// ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3594+
__m256 a0 = _mm256_add_ps(_mm512_castps512_ps256(s0->sum), _mm512_extractf32x8_ps(s0->sum, 1));
3595+
__m256 a1 = _mm256_add_ps(_mm512_castps512_ps256(s1->sum), _mm512_extractf32x8_ps(s1->sum, 1));
3596+
__m256 a2 = _mm256_add_ps(_mm512_castps512_ps256(s2->sum), _mm512_extractf32x8_ps(s2->sum, 1));
3597+
__m256 a3 = _mm256_add_ps(_mm512_castps512_ps256(s3->sum), _mm512_extractf32x8_ps(s3->sum, 1));
3598+
__m128 b0 = _mm_add_ps(_mm256_castps256_ps128(a0), _mm256_extractf128_ps(a0, 1));
3599+
__m128 b1 = _mm_add_ps(_mm256_castps256_ps128(a1), _mm256_extractf128_ps(a1, 1));
3600+
__m128 b2 = _mm_add_ps(_mm256_castps256_ps128(a2), _mm256_extractf128_ps(a2, 1));
3601+
__m128 b3 = _mm_add_ps(_mm256_castps256_ps128(a3), _mm256_extractf128_ps(a3, 1));
3602+
__m128 t01_lo = _mm_unpacklo_ps(b0, b1), t23_lo = _mm_unpacklo_ps(b2, b3);
3603+
__m128 t01_hi = _mm_unpackhi_ps(b0, b1), t23_hi = _mm_unpackhi_ps(b2, b3);
3604+
__m128 row0 = _mm_movelh_ps(t01_lo, t23_lo), row1 = _mm_movehl_ps(t23_lo, t01_lo);
3605+
__m128 row2 = _mm_movelh_ps(t01_hi, t23_hi), row3 = _mm_movehl_ps(t23_hi, t01_hi);
3606+
_mm_storeu_ps(results, _mm_add_ps(_mm_add_ps(row0, row1), _mm_add_ps(row2, row3)));
36423607
}
36433608

36443609
/**
@@ -3671,10 +3636,20 @@ SIMSIMD_INTERNAL void simsimd_dot_e4m3x64_finalize_genoa(
36713636
simsimd_dot_e4m3x64_state_genoa_t const *s0, simsimd_dot_e4m3x64_state_genoa_t const *s1, //
36723637
simsimd_dot_e4m3x64_state_genoa_t const *s2, simsimd_dot_e4m3x64_state_genoa_t const *s3, //
36733638
simsimd_f32_t *results) {
3674-
// State is layout-compatible with f32x16 (both contain just __m512 sum)
3675-
simsimd_dot_f32x16_finalize_skylake( //
3676-
(simsimd_dot_f32x16_state_skylake_t const *)s0, (simsimd_dot_f32x16_state_skylake_t const *)s1, //
3677-
(simsimd_dot_f32x16_state_skylake_t const *)s2, (simsimd_dot_f32x16_state_skylake_t const *)s3, results);
3639+
// ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3640+
__m256 a0 = _mm256_add_ps(_mm512_castps512_ps256(s0->sum), _mm512_extractf32x8_ps(s0->sum, 1));
3641+
__m256 a1 = _mm256_add_ps(_mm512_castps512_ps256(s1->sum), _mm512_extractf32x8_ps(s1->sum, 1));
3642+
__m256 a2 = _mm256_add_ps(_mm512_castps512_ps256(s2->sum), _mm512_extractf32x8_ps(s2->sum, 1));
3643+
__m256 a3 = _mm256_add_ps(_mm512_castps512_ps256(s3->sum), _mm512_extractf32x8_ps(s3->sum, 1));
3644+
__m128 b0 = _mm_add_ps(_mm256_castps256_ps128(a0), _mm256_extractf128_ps(a0, 1));
3645+
__m128 b1 = _mm_add_ps(_mm256_castps256_ps128(a1), _mm256_extractf128_ps(a1, 1));
3646+
__m128 b2 = _mm_add_ps(_mm256_castps256_ps128(a2), _mm256_extractf128_ps(a2, 1));
3647+
__m128 b3 = _mm_add_ps(_mm256_castps256_ps128(a3), _mm256_extractf128_ps(a3, 1));
3648+
__m128 t01_lo = _mm_unpacklo_ps(b0, b1), t23_lo = _mm_unpacklo_ps(b2, b3);
3649+
__m128 t01_hi = _mm_unpackhi_ps(b0, b1), t23_hi = _mm_unpackhi_ps(b2, b3);
3650+
__m128 row0 = _mm_movelh_ps(t01_lo, t23_lo), row1 = _mm_movehl_ps(t23_lo, t01_lo);
3651+
__m128 row2 = _mm_movelh_ps(t01_hi, t23_hi), row3 = _mm_movehl_ps(t23_hi, t01_hi);
3652+
_mm_storeu_ps(results, _mm_add_ps(_mm_add_ps(row0, row1), _mm_add_ps(row2, row3)));
36783653
}
36793654

36803655
/**
@@ -3707,72 +3682,20 @@ SIMSIMD_INTERNAL void simsimd_dot_e5m2x64_finalize_genoa(
37073682
simsimd_dot_e5m2x64_state_genoa_t const *s0, simsimd_dot_e5m2x64_state_genoa_t const *s1, //
37083683
simsimd_dot_e5m2x64_state_genoa_t const *s2, simsimd_dot_e5m2x64_state_genoa_t const *s3, //
37093684
simsimd_f32_t *results) {
3710-
// State is layout-compatible with f32x16 (both contain just __m512 sum)
3711-
simsimd_dot_f32x16_finalize_skylake( //
3712-
(simsimd_dot_f32x16_state_skylake_t const *)s0, (simsimd_dot_f32x16_state_skylake_t const *)s1, //
3713-
(simsimd_dot_f32x16_state_skylake_t const *)s2, (simsimd_dot_f32x16_state_skylake_t const *)s3, results);
3714-
}
3715-
3716-
/* BF16x2 Outer-Product API (1×16, using VDPBF16PS with pair broadcast)
3717-
* =====================================================================
3718-
*
3719-
* For computing 16 dot products simultaneously using VDPBF16PS instruction.
3720-
* Each update processes 2 k-elements (1 bf16 pair) and outputs to 16 columns.
3721-
*
3722-
* VDPBF16PS: acc[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1]
3723-
*
3724-
* By broadcasting the same pair {a0, a1} to all 16 lanes and loading
3725-
* one pair per column from B, we compute 16 partial dot products per call.
3726-
*
3727-
* B packing layout (interleaved pairs):
3728-
* b_packed[pair_idx * 32 + col * 2 + 0] = B[col][2*pair_idx]
3729-
* b_packed[pair_idx * 32 + col * 2 + 1] = B[col][2*pair_idx + 1]
3730-
*
3731-
* A single _mm512_loadu_si512(b_packed + pair_idx * 32) loads:
3732-
* {b[0][2p], b[0][2p+1], b[1][2p], b[1][2p+1], ..., b[15][2p], b[15][2p+1]}
3733-
*/
3734-
3735-
/** @brief State for 1×16 bf16 outer-product using VDPBF16PS. */
3736-
typedef struct simsimd_dot_outer_bf16x4_state_1x16_genoa_t {
3737-
__m512 acc; // 16 f32 accumulators (one per output column)
3738-
} simsimd_dot_outer_bf16x4_state_1x16_genoa_t;
3739-
3740-
SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_init_1x16_genoa(simsimd_dot_outer_bf16x4_state_1x16_genoa_t *state) {
3741-
state->acc = _mm512_setzero_ps();
3742-
}
3743-
3744-
/**
3745-
* @brief Update 1×16 bf16 state with 4 k-elements (2 pairs) using VDPBF16PS.
3746-
*
3747-
* @param state Pointer to 1×16 accumulator state.
3748-
* @param a Pointer to 4 bf16 values from query row (a[k:k+4]).
3749-
* @param b_packed Pointer to 2 blocks of 16 interleaved bf16 pairs (2 × 64 bytes).
3750-
* Layout: For pair p in 0..1: b_packed[p*32..p*32+31] contains
3751-
* {b[0][2p], b[0][2p+1], b[1][2p], b[1][2p+1], ..., b[15][2p], b[15][2p+1]}
3752-
*/
3753-
SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_update_1x16_genoa(simsimd_dot_outer_bf16x4_state_1x16_genoa_t *state,
3754-
simsimd_bf16_t const *a,
3755-
simsimd_bf16_t const *b_packed) {
3756-
__m512 acc = state->acc;
3757-
3758-
// Process 2 pairs (4 k-elements)
3759-
__m512i a_bc_0 = _mm512_set1_epi32(*(simsimd_i32_t const *)(a + 0));
3760-
__m512i b_vec_0 = _mm512_loadu_si512(b_packed + 0);
3761-
acc = _mm512_dpbf16_ps(acc, (__m512bh)a_bc_0, (__m512bh)b_vec_0);
3762-
3763-
__m512i a_bc_1 = _mm512_set1_epi32(*(simsimd_i32_t const *)(a + 2));
3764-
__m512i b_vec_1 = _mm512_loadu_si512(b_packed + 32);
3765-
acc = _mm512_dpbf16_ps(acc, (__m512bh)a_bc_1, (__m512bh)b_vec_1);
3766-
3767-
state->acc = acc;
3768-
}
3769-
3770-
/**
3771-
* @brief Finalize: Store 16 f32 results directly (NO horizontal reduction).
3772-
*/
3773-
SIMSIMD_INTERNAL void simsimd_dot_outer_bf16x4_finalize_1x16_genoa(
3774-
simsimd_dot_outer_bf16x4_state_1x16_genoa_t const *state, simsimd_f32_t *c) {
3775-
_mm512_storeu_ps(c, state->acc);
3685+
// ILP-optimized 4-way horizontal reduction (same logic as Skylake f32x16)
3686+
__m256 a0 = _mm256_add_ps(_mm512_castps512_ps256(s0->sum), _mm512_extractf32x8_ps(s0->sum, 1));
3687+
__m256 a1 = _mm256_add_ps(_mm512_castps512_ps256(s1->sum), _mm512_extractf32x8_ps(s1->sum, 1));
3688+
__m256 a2 = _mm256_add_ps(_mm512_castps512_ps256(s2->sum), _mm512_extractf32x8_ps(s2->sum, 1));
3689+
__m256 a3 = _mm256_add_ps(_mm512_castps512_ps256(s3->sum), _mm512_extractf32x8_ps(s3->sum, 1));
3690+
__m128 b0 = _mm_add_ps(_mm256_castps256_ps128(a0), _mm256_extractf128_ps(a0, 1));
3691+
__m128 b1 = _mm_add_ps(_mm256_castps256_ps128(a1), _mm256_extractf128_ps(a1, 1));
3692+
__m128 b2 = _mm_add_ps(_mm256_castps256_ps128(a2), _mm256_extractf128_ps(a2, 1));
3693+
__m128 b3 = _mm_add_ps(_mm256_castps256_ps128(a3), _mm256_extractf128_ps(a3, 1));
3694+
__m128 t01_lo = _mm_unpacklo_ps(b0, b1), t23_lo = _mm_unpacklo_ps(b2, b3);
3695+
__m128 t01_hi = _mm_unpackhi_ps(b0, b1), t23_hi = _mm_unpackhi_ps(b2, b3);
3696+
__m128 row0 = _mm_movelh_ps(t01_lo, t23_lo), row1 = _mm_movehl_ps(t23_lo, t01_lo);
3697+
__m128 row2 = _mm_movelh_ps(t01_hi, t23_hi), row3 = _mm_movehl_ps(t23_hi, t01_hi);
3698+
_mm_storeu_ps(results, _mm_add_ps(_mm_add_ps(row0, row1), _mm_add_ps(row2, row3)));
37763699
}
37773700

37783701
#pragma clang attribute pop
@@ -4315,74 +4238,6 @@ SIMSIMD_INTERNAL void simsimd_dot_u8x64_finalize_ice(
43154238
_mm_storeu_ps(results, _mm_cvtepi32_ps(sum_i32));
43164239
}
43174240

4318-
/* Unified Outer-Product API: i8x8
4319-
* ================================
4320-
* Processes 8 k-elements per update, outputs 1×16 partial products.
4321-
* Uses 4 VPDPWSSD instructions (each processes 2 i16 = 2 sign-extended i8).
4322-
*
4323-
* B packing layout (pair-interleaved, i8 sign-extended to i16 on load):
4324-
* b_packed[pair_index * 32 + col * 2 + offset] = B[col][pair_index * 2 + offset]
4325-
* where pair_index ∈ {0..3}, col ∈ {0..15}, offset ∈ {0, 1}
4326-
*
4327-
* Note: VPDPWSSD operates on i16 pairs. We sign-extend i8→i16 before use.
4328-
* This handles signed i8 × signed i8 correctly (unlike VPDPBUSD which is u8×i8).
4329-
*/
4330-
4331-
/** @brief State for 1×16 i8 outer-product (8 k-elements per update). */
4332-
typedef struct simsimd_dot_outer_i8x8_state_1x16_ice_t {
4333-
__m512i accumulator;
4334-
} simsimd_dot_outer_i8x8_state_1x16_ice_t;
4335-
4336-
SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_init_1x16_ice(simsimd_dot_outer_i8x8_state_1x16_ice_t *state) {
4337-
state->accumulator = _mm512_setzero_si512();
4338-
}
4339-
4340-
/**
4341-
* @brief Update 1×16 i8 state with 8 k-elements (4 VPDPWSSD ops).
4342-
*
4343-
* @param state Pointer to accumulator state.
4344-
* @param a_slice Pointer to 8 i8 values from A: a[k..k+7]
4345-
* @param b_pairs_interleaved Pointer to 128 i8 values (4 pair-blocks of 32 each):
4346-
* Block p: {B[0][2p], B[0][2p+1], B[1][2p], B[1][2p+1], ...}
4347-
*/
4348-
SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_update_1x16_ice(simsimd_dot_outer_i8x8_state_1x16_ice_t *state,
4349-
simsimd_i8_t const *a_slice,
4350-
simsimd_i8_t const *b_pairs_interleaved) {
4351-
4352-
__m512i accumulator = state->accumulator;
4353-
4354-
// Process 4 pairs of i8 values (8 total k-elements)
4355-
#define SIMSIMD_I8X8_PROCESS_PAIR(pair_index) \
4356-
do { \
4357-
/* Sign-extend a[2p] and a[2p+1] to i16, pack into i32, broadcast */ \
4358-
simsimd_i16_t a_lo = (simsimd_i16_t)a_slice[pair_index * 2]; \
4359-
simsimd_i16_t a_hi = (simsimd_i16_t)a_slice[pair_index * 2 + 1]; \
4360-
simsimd_i32_t a_pair_packed = ((simsimd_i32_t)(a_hi & 0xFFFF) << 16) | (a_lo & 0xFFFF); \
4361-
__m512i a_pair_broadcast = _mm512_set1_epi32(a_pair_packed); \
4362-
\
4363-
/* Load 32 i8 from B, sign-extend to 32 i16 */ \
4364-
__m256i b_i8_chunk = _mm256_loadu_si256((__m256i const *)(b_pairs_interleaved + pair_index * 32)); \
4365-
__m512i b_i16_extended = _mm512_cvtepi8_epi16(b_i8_chunk); \
4366-
\
4367-
/* VPDPWSSD: acc[i] += a_lo * b[2i] + a_hi * b[2i+1] */ \
4368-
accumulator = _mm512_dpwssd_epi32(accumulator, a_pair_broadcast, b_i16_extended); \
4369-
} while (0)
4370-
4371-
SIMSIMD_I8X8_PROCESS_PAIR(0);
4372-
SIMSIMD_I8X8_PROCESS_PAIR(1);
4373-
SIMSIMD_I8X8_PROCESS_PAIR(2);
4374-
SIMSIMD_I8X8_PROCESS_PAIR(3);
4375-
4376-
#undef SIMSIMD_I8X8_PROCESS_PAIR
4377-
4378-
state->accumulator = accumulator;
4379-
}
4380-
4381-
SIMSIMD_INTERNAL void simsimd_dot_outer_i8x8_finalize_1x16_ice(simsimd_dot_outer_i8x8_state_1x16_ice_t const *state,
4382-
simsimd_i32_t *result_row) {
4383-
_mm512_storeu_si512(result_row, state->accumulator);
4384-
}
4385-
43864241
#pragma clang attribute pop
43874242
#pragma GCC pop_options
43884243
#endif // SIMSIMD_TARGET_ICE

0 commit comments

Comments
 (0)