
Commit c19bec9

Improve: FP8 GEMM throughput on Skylake/Haswell + Granite Rapids E5M2 kernel
E5M2 cast (`cast/{skylake,haswell}.h`):
- Rewrite `nk_e5m2x{16,8}_to_f32x{16,8}_*` as 3-op (cvtepu8 + slli 8 + cvtph_ps). E5M2 shares F16 bias, so `byte << 8` is the matching F16 encoding.

Pairwise dot (`dot/{skylake,haswell}.h`):
- E5M2 widened to 64/16-lane multi-chain inline unpack: Skylake 3.17-3.94×, Haswell 2.07-5.05× across D=100..768.
- E4M3 unchanged (16-lane single-chain was already at the cast cost limit).

Stateful GEMM (`dot/`, `dots/{skylake,haswell}.h`):
- E5M2: byte-pack + new dtype-specific `nk_dot_e5m2x{64,32}_update_*` with two independent FMA chains folding into the single state accumulator. Skylake dots/angulars/euclideans 1.42-2.52×, Haswell up to 3.17×.
- E4M3 Skylake: asymmetric F16-pack (A streamed as F32, B pre-cast and stored as F16, widened to F32 on load). 50% memory savings for the packed B; compute ~0.89-1.16× of baseline (accepted tradeoff).
- E4M3 Haswell: byte-pack at depth=32, neutral 0.95-1.03×.

New Granite Rapids E5M2 GEMM (`dots/graniteamx.h`, `spatials/graniteamx.h`):
- Pack E5M2 → F16 via `byte << 8`, run TDPFP16PS over F16 tiles to F32. Beats Sapphire AMX BF16 path on E5M2 inputs (better intermediate precision, same throughput). Wired through `dispatch_e5m2.c` ahead of Sapphire AMX.

Sapphire AMX (`dots/sapphireamx.h`): keeps icelake LUT cast helpers (empirical eval showed Genoa-Giesen path regressed; revert preserved).

Spatials (`spatials/skylake.h`): E4M3 normalize_packed packed_value_type follows dots packed_value_type (f16) for matching norm offset.

Bonus: collapse split string literals across JS/Python error messages (`javascript/numkong.c`, `python/{each,matrix,tensor}.c`).

ULP: dots/spatials max_ulp ≤ 1 vs scalar reference (threshold 32).
1 parent 679f55f commit c19bec9

22 files changed

Lines changed: 921 additions & 135 deletions

bench/bench_cross_amx.cpp

Lines changed: 12 additions & 0 deletions
@@ -58,5 +58,17 @@ void bench_cross_amx() {
     run_euclideans_packed<f16_k>("euclideans_packed_f16_graniteamx", nk_dots_packed_size_f16_graniteamx,
                                  nk_dots_pack_f16_graniteamx, nk_euclideans_packed_f16_graniteamx);
     run_euclideans_symmetric<f16_k>("euclideans_symmetric_f16_graniteamx", nk_euclideans_symmetric_f16_graniteamx);
+
+    run_dots_packed<e5m2_k>("dots_packed_e5m2_graniteamx", nk_dots_packed_size_e5m2_graniteamx,
+                            nk_dots_pack_e5m2_graniteamx, nk_dots_packed_e5m2_graniteamx);
+    run_dots_symmetric<e5m2_k>("dots_symmetric_e5m2_graniteamx", nk_dots_symmetric_e5m2_graniteamx);
+
+    run_angulars_packed<e5m2_k>("angulars_packed_e5m2_graniteamx", nk_dots_packed_size_e5m2_graniteamx,
+                                nk_dots_pack_e5m2_graniteamx, nk_angulars_packed_e5m2_graniteamx);
+    run_angulars_symmetric<e5m2_k>("angulars_symmetric_e5m2_graniteamx", nk_angulars_symmetric_e5m2_graniteamx);
+
+    run_euclideans_packed<e5m2_k>("euclideans_packed_e5m2_graniteamx", nk_dots_packed_size_e5m2_graniteamx,
+                                  nk_dots_pack_e5m2_graniteamx, nk_euclideans_packed_e5m2_graniteamx);
+    run_euclideans_symmetric<e5m2_k>("euclideans_symmetric_e5m2_graniteamx", nk_euclideans_symmetric_e5m2_graniteamx);
 #endif
 }

c/dispatch_e5m2.c

Lines changed: 23 additions & 0 deletions
@@ -113,6 +113,29 @@ void nk_dispatch_e5m2_find_(nk_capability_t v, nk_kernel_kind_t k, nk_kernel_pun
     default: break;
     }
 #endif
+#if NK_TARGET_GRANITEAMX
+    if (v & nk_cap_graniteamx_k) switch (k) {
+        case nk_kernel_dots_packed_size_k:
+            *m = (m_t)&nk_dots_packed_size_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
+            return;
+        case nk_kernel_dots_pack_k: *m = (m_t)&nk_dots_pack_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
+        case nk_kernel_dots_packed_k: *m = (m_t)&nk_dots_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
+        case nk_kernel_angulars_packed_k:
+            *m = (m_t)&nk_angulars_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
+            return;
+        case nk_kernel_euclideans_packed_k:
+            *m = (m_t)&nk_euclideans_packed_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
+            return;
+        case nk_kernel_dots_symmetric_k: *m = (m_t)&nk_dots_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k; return;
+        case nk_kernel_angulars_symmetric_k:
+            *m = (m_t)&nk_angulars_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
+            return;
+        case nk_kernel_euclideans_symmetric_k:
+            *m = (m_t)&nk_euclideans_symmetric_e5m2_graniteamx, *c = nk_cap_graniteamx_k;
+            return;
+        default: break;
+        }
+#endif
 #if NK_TARGET_SAPPHIREAMX
     if (v & nk_cap_sapphireamx_k) switch (k) {
         case nk_kernel_dots_packed_size_k:

include/numkong/cast/README.md

Lines changed: 3 additions & 0 deletions
@@ -93,6 +93,9 @@ NEON backend uses `vreinterpretq_u16_u8` + `vzip` for zero-extension; Haswell us
 `nk_f16_to_f32_haswell`, `nk_f32_to_f16_haswell` use the F16C extension instructions `VCVTPH2PS` / `VCVTPS2PH` — single-instruction conversion of 8 elements with correct denormal handling, NaN propagation, and RNE rounding.
 The serial fallback (`nk_f16_to_f32_serial`) must handle denormals via explicit exponent/mantissa extraction and conditional re-normalization — ~15 integer ops per element vs 1 instruction with F16C.
 AVX-512 (`nk_cast_skylake`) doubles throughput to 16 elements per instruction.
+F16C also unlocks a cheaper FP8 → F32 path that bypasses i32-lane bit math: `nk_e5m2x16_to_f32x16_skylake_` and `nk_e5m2x8_to_f32x8_haswell_` widen u8 → u16 and left-shift by 8 (E5M2 shares F16's bias 15, so the result is a bit-exact F16 encoding of every input including subnormals and NaN), then feed `VCVTPH2PS` — three ops total.
+E4M3 can't use a plain shift (bias 7 vs 15), but the Giesen-style fake-F16 `((byte & 0x7F) << 7) | ((byte & 0x80) << 8)` gives an F16 whose value differs from the E4M3 magnitude by exactly 2⁸; `nk_e4m3x16_to_f32x16_skylake_` and `nk_e4m3x8_to_f32x8_haswell_` widen through `VCVTPH2PS`, multiply by 256 in F32 to correct, and blend in F32 NaN for the lone `|byte|==0x7F` encoding.
+For E4M3 GEMM specifically, `nk_e4m3x16_to_f16x16_skylake_` produces TRUE F16 (bias-corrected, with a small subnormal LUT and NaN blend) so the packed buffer stores 2 bytes/element instead of 4 — the inner loop reads F16 and widens to F32 once per B-load, trading ~10% compute for 50% pack memory.
 
 ## Performance
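The two FP8 → F32 identities above are easy to verify in scalar code. The sketch below is illustrative only (none of these names exist in the library): `f16_bits_to_f32` mimics what `VCVTPH2PS` does for one lane, `e5m2_to_f32` applies the `byte << 8` identity, and `e4m3_to_f32` applies the fake-F16 cast with the ×256 correction and a separate NaN patch.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar F16 -> F32 decoder (subnormals, Inf, NaN), standing in for one VCVTPH2PS lane. */
static float f16_bits_to_f32(uint16_t h) {
    unsigned sign = h >> 15, exp = (h >> 10) & 0x1F, mant = h & 0x3FF;
    float magnitude;
    if (exp == 0) magnitude = ldexpf((float)mant, -24);                   /* subnormal: mant * 2^-24 */
    else if (exp == 31) magnitude = mant ? nanf("") : INFINITY;           /* Inf / NaN */
    else magnitude = ldexpf(1.0f + (float)mant / 1024.0f, (int)exp - 15); /* normal */
    return sign ? -magnitude : magnitude;
}

/* E5M2 shares F16's bias (15): byte << 8 is already the matching F16 bit pattern. */
static float e5m2_to_f32(uint8_t byte) { return f16_bits_to_f32((uint16_t)(byte << 8)); }

/* E4M3 (bias 7): the fake-F16 pattern decodes to the true magnitude scaled by 2^-8,
 * so multiply by 256; the lone NaN encoding (|byte| == 0x7F) is patched separately. */
static float e4m3_to_f32(uint8_t byte) {
    if ((byte & 0x7F) == 0x7F) return nanf("");
    uint16_t fake_f16 = (uint16_t)(((byte & 0x7F) << 7) | ((byte & 0x80) << 8));
    return 256.0f * f16_bits_to_f32(fake_f16);
}

int main(void) {
    printf("e5m2 0x3C -> %g\n", e5m2_to_f32(0x3C)); /* 1 */
    printf("e5m2 0x01 -> %g\n", e5m2_to_f32(0x01)); /* 1.52588e-05 (2^-16, smallest subnormal) */
    printf("e4m3 0x38 -> %g\n", e4m3_to_f32(0x38)); /* 1 */
    printf("e4m3 0x01 -> %g\n", e4m3_to_f32(0x01)); /* 0.00195312 (2^-9, smallest subnormal) */
    return 0;
}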

include/numkong/cast/haswell.h

Lines changed: 7 additions & 24 deletions
@@ -194,31 +194,14 @@ NK_INTERNAL __m256 nk_e4m3x8_to_f32x8_haswell_(__m128i e4m3_i8x8) {
     return _mm256_mul_ps(fake_f32x8, _mm256_set1_ps(256.0f));
 }
 
-/** @brief Convert 8x e5m2 → 8x f32 via bit manipulation (AVX2).
- *  E5M2 format: S EEEEE MM (bias=15). F32: sign<<31, (exp+112)<<23, mant<<21.
- *  Subnormals (exp=0): value = mantissa × 2⁽¹⁻¹⁵⁾ × 2⁻² = mantissa ÷ 65536. */
+/** @brief Convert 8x e5m2 → 8x f32 via free-shift widen (AVX2 + F16C).
+ *  E5M2 shares F16's exponent bias (15): `(byte << 8)` is the matching F16 bit
+ *  pattern for every E5M2 value (normals, subnormals, zero, ±Inf, NaN — all
+ *  bit-exact). Widen u8 → u16, shift, then VCVTPH2PS to F32. Three ops total. */
 NK_INTERNAL __m256 nk_e5m2x8_to_f32x8_haswell_(__m128i e5m2_i8x8) {
-    __m256i e5m2_i32x8 = _mm256_cvtepu8_epi32(e5m2_i8x8);
-
-    // Extract fields
-    __m256i exp_i32x8 = _mm256_and_si256(_mm256_srli_epi32(e5m2_i32x8, 2), _mm256_set1_epi32(0x1F));
-    __m256i mant_i32x8 = _mm256_and_si256(e5m2_i32x8, _mm256_set1_epi32(0x03));
-
-    // Build F32 sign bit
-    __m256i f32_sign_i32x8 = _mm256_slli_epi32(_mm256_srli_epi32(e5m2_i32x8, 7), 31);
-
-    // Normal path: sign | ((exp+112)<<23) | (mant<<21)
-    __m256i f32_exp_i32x8 = _mm256_slli_epi32(_mm256_add_epi32(exp_i32x8, _mm256_set1_epi32(112)), 23);
-    __m256i f32_mant_i32x8 = _mm256_slli_epi32(mant_i32x8, 21);
-    __m256i normal_bits_i32x8 = _mm256_or_si256(f32_sign_i32x8, _mm256_or_si256(f32_exp_i32x8, f32_mant_i32x8));
-
-    // Subnormal path: value = mantissa / 65536.0f, then apply sign
-    __m256 subnorm_abs_f32x8 = _mm256_mul_ps(_mm256_cvtepi32_ps(mant_i32x8), _mm256_set1_ps(1.0f / 65536.0f));
-    __m256 subnorm_f32x8 = _mm256_or_ps(subnorm_abs_f32x8, _mm256_castsi256_ps(f32_sign_i32x8));
-
-    // Blend: if exp==0, use subnormal result; otherwise use normal bits
-    __m256i exp_zero_mask = _mm256_cmpeq_epi32(exp_i32x8, _mm256_setzero_si256());
-    return _mm256_blendv_ps(_mm256_castsi256_ps(normal_bits_i32x8), subnorm_f32x8, _mm256_castsi256_ps(exp_zero_mask));
+    __m128i e5m2_u16x8 = _mm_cvtepu8_epi16(e5m2_i8x8);
+    __m128i f16_bits_u16x8 = _mm_slli_epi16(e5m2_u16x8, 8);
+    return _mm256_cvtph_ps(f16_bits_u16x8);
 }
 
 /** @brief Convert 8x f32 → 8x e4m3 via bit manipulation (AVX2).

include/numkong/cast/skylake.h

Lines changed: 45 additions & 20 deletions
@@ -198,27 +198,40 @@ NK_INTERNAL __m512 nk_e4m3x16_to_f32x16_skylake_(__m128i e4m3_i8x16) {
     return _mm512_mul_ps(fake_f32x16, _mm512_set1_ps(256.0f));
 }
 
-/** @brief Convert 16x e5m2 → 16x f32 via bit manipulation (AVX-512).
- *  E5M2 format: S EEEEE MM (bias=15). F32: sign<<31, (exp+112)<<23, mantissa<<21.
- *  Subnormals (exp=0): value = mantissa × 2⁽¹⁻¹⁵⁾ × 2⁻² = mantissa ÷ 65536. */
+/** @brief Convert 16x e4m3 → 16x f16 via arithmetic + 8-entry subnormal LUT (AVX-512BW + AVX-512VL).
+ *  E4M3: S EEEE MMM (bias=7). F16: S EEEEE MMMMMMMMMM (bias=15).
+ *  Normal (exp != 0): F16 = ((lower7 << 7) + 0x2000) | (sign << 8) — bias delta 8 added at the
+ *  exp-position (8 << 10 = 0x2000) after placing magnitude bits at F16 positions 13..7.
+ *  Subnormal (exp == 0): looked up from 8-entry F16 LUT — values 0, 1/512, 2/512, …, 7/512 encoded as
+ *  F16 normals (the smallest E4M3 subnormal 1/512 = 2⁻⁹ is well within F16 normal range).
+ *  NaN (|byte| == 0x7F): blended in as F16 quiet NaN with original sign. */
+NK_INTERNAL __m256i nk_e4m3x16_to_f16x16_skylake_(__m128i e4m3_u8x16) {
+    __m256i e4m3_i16x16 = _mm256_cvtepu8_epi16(e4m3_u8x16);
+    __m256i sign_i16x16 = _mm256_and_si256(e4m3_i16x16, _mm256_set1_epi16((short)0x80));
+    __m256i lower7_i16x16 = _mm256_and_si256(e4m3_i16x16, _mm256_set1_epi16(0x7F));
+    __m256i normal_abs_i16x16 = _mm256_add_epi16(_mm256_slli_epi16(lower7_i16x16, 7), _mm256_set1_epi16(0x2000));
+    __m256i subn_lut_i16x16 = _mm256_set_epi16( //
+        0x2300, 0x2200, 0x2100, 0x2000, 0x1E00, 0x1C00, 0x1800, 0x0000, 0x2300, 0x2200, 0x2100, 0x2000, 0x1E00, 0x1C00,
+        0x1800, 0x0000);
+    __m256i mant_idx_i16x16 = _mm256_and_si256(e4m3_i16x16, _mm256_set1_epi16(0x07));
+    __m256i subn_abs_i16x16 = _mm256_permutexvar_epi16(mant_idx_i16x16, subn_lut_i16x16);
+    __mmask16 is_subnormal = _mm256_testn_epi16_mask(e4m3_i16x16, _mm256_set1_epi16(0x78));
+    __m256i abs_i16x16 = _mm256_mask_blend_epi16(is_subnormal, normal_abs_i16x16, subn_abs_i16x16);
+    __m256i shifted_sign_i16x16 = _mm256_slli_epi16(sign_i16x16, 8);
+    __m256i result_i16x16 = _mm256_or_si256(abs_i16x16, shifted_sign_i16x16);
+    __mmask16 is_nan = _mm256_cmpeq_epi16_mask(lower7_i16x16, _mm256_set1_epi16(0x7F));
+    __m256i nan_i16x16 = _mm256_or_si256(shifted_sign_i16x16, _mm256_set1_epi16(0x7E00));
+    return _mm256_mask_blend_epi16(is_nan, result_i16x16, nan_i16x16);
+}
+
+/** @brief Convert 16x e5m2 → 16x f32 via free-shift widen (AVX-512 + F16C).
+ *  E5M2 shares F16's exponent bias (15): `(byte << 8)` is the matching F16 bit
+ *  pattern for every E5M2 value (normals, subnormals, zero, ±Inf, NaN — all
+ *  bit-exact). Widen u8 → u16, shift, then VCVTPH2PS to F32. Three ops total. */
 NK_INTERNAL __m512 nk_e5m2x16_to_f32x16_skylake_(__m128i e5m2_i8x16) {
-    __m512i e5m2_i32x16 = _mm512_cvtepu8_epi32(e5m2_i8x16);
-
-    // Extract fields
-    __m512i exp_i32x16 = _mm512_and_si512(_mm512_srli_epi32(e5m2_i32x16, 2), _mm512_set1_epi32(0x1F));
-    __m512i mantissa_i32x16 = _mm512_and_si512(e5m2_i32x16, _mm512_set1_epi32(0x03));
-    __m512i sign_i32x16 = _mm512_slli_epi32(_mm512_srli_epi32(e5m2_i32x16, 7), 31);
-
-    // Normal path: sign | ((exp+112)<<23) | (mantissa<<21)
-    __m512i f32_exp_i32x16 = _mm512_slli_epi32(_mm512_add_epi32(exp_i32x16, _mm512_set1_epi32(112)), 23);
-    __m512i f32_mantissa_i32x16 = _mm512_slli_epi32(mantissa_i32x16, 21);
-    __m512 result_f32x16 = _mm512_castsi512_ps(
-        _mm512_ternarylogic_epi32(sign_i32x16, f32_exp_i32x16, f32_mantissa_i32x16, 0xFE));
-
-    // Subnormal fix: for exp==0 lanes, replace with (mantissa / 65536) | sign using masked OR
-    __mmask16 is_subnormal = _mm512_testn_epi32_mask(e5m2_i32x16, _mm512_set1_epi32(0x7C));
-    __m512 subnorm_abs_f32x16 = _mm512_mul_ps(_mm512_cvtepi32_ps(mantissa_i32x16), _mm512_set1_ps(1.0f / 65536.0f));
-    return _mm512_mask_or_ps(result_f32x16, is_subnormal, subnorm_abs_f32x16, _mm512_castsi512_ps(sign_i32x16));
+    __m256i e5m2_u16x16 = _mm256_cvtepu8_epi16(e5m2_i8x16);
+    __m256i f16_bits_u16x16 = _mm256_slli_epi16(e5m2_u16x16, 8);
+    return _mm512_cvtph_ps(f16_bits_u16x16);
 }
 
 /** @brief Convert 16x e2m3 → 16x f32 via bit manipulation (AVX-512).
@@ -650,6 +663,18 @@ NK_INTERNAL void nk_partial_load_e4m3x16_to_f32x16_skylake_(void const *src, nk_
     dst->zmm_ps = nk_e4m3x16_to_f32x16_skylake_(e4m3_partial.xmm);
 }
 
+/** @brief Load 16 e4m3 values and convert to 16 f16 (Skylake AVX-512BW). */
+NK_INTERNAL void nk_load_e4m3x16_to_f16x16_skylake_(void const *src, nk_b256_vec_t *dst) {
+    dst->ymm = nk_e4m3x16_to_f16x16_skylake_(_mm_loadu_si128((__m128i const *)src));
+}
+
+/** @brief Partial load of up to 16 e4m3 values with conversion to f16 (Skylake AVX-512BW). */
+NK_INTERNAL void nk_partial_load_e4m3x16_to_f16x16_skylake_(void const *src, nk_b256_vec_t *dst, nk_size_t n) {
+    nk_b128_vec_t e4m3_partial;
+    nk_partial_load_b8x16_skylake_(src, &e4m3_partial, n);
+    dst->ymm = nk_e4m3x16_to_f16x16_skylake_(e4m3_partial.xmm);
+}
+
 /** @brief Load 16 e5m2 values and convert to 16 f32 (Skylake AVX-512). */
 NK_INTERNAL void nk_load_e5m2x16_to_f32x16_skylake_(void const *src, nk_b512_vec_t *dst) {
     dst->zmm_ps = nk_e5m2x16_to_f32x16_skylake_(_mm_loadu_si128((__m128i const *)src));

include/numkong/dot/README.md

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ This processes 64 E4M3 bytes per iteration in u8, doubling the element density o
 
 `nk_dot_e5m2_genoa` converts FP8 values to BF16, then accumulates via `VDPBF16PS`, reusing Genoa's BF16 dot-product instruction for FP8 types.
 Each `VDPBF16PS` fuses two BF16 multiply-adds per 32-bit lane at 6-cycle throughput.
+On Skylake-X–class CPUs without BF16 dot-product hardware, `nk_dot_e4m3_skylake` / `nk_dot_e5m2_skylake` (and their Haswell twins `nk_dot_e4m3_haswell` / `nk_dot_e5m2_haswell`) instead route through the Giesen-style FP8 → F16 fake-bit-pattern cast, widen via `VCVTPH2PS`, and accumulate in F32 with two independent FMA chains reducing into a single register — avoiding the 3-chain scheduler-stall of the BF16 algebraic form on kernels without native BF16 FMA.
 `nk_dot_bf16c_genoa` uses the same instruction for complex BF16, preparing operands with `VPSHUFB` for lane swapping and `VPXORD` with `0x80000000` for sign flips before feeding into `VDPBF16PS`.
 
 ### Deferred Sign-Flip in Complex Dot Products
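As an illustration of the two-FMA-chain accumulation described in the added line above, here is a minimal AVX2 + F16C + FMA sketch of an E5M2 dot product (compile with -mavx2 -mfma -mf16c). It assumes n is a multiple of 16, omits the masked tail, and uses hypothetical names; it is not the library's `nk_dot_e5m2_haswell` itself.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Low 8 E5M2 bytes of the input -> 8x F32, via the byte << 8 == F16 identity. */
static inline __m256 e5m2x8_to_f32x8(__m128i e5m2_bytes) {
    __m128i f16_bits = _mm_slli_epi16(_mm_cvtepu8_epi16(e5m2_bytes), 8);
    return _mm256_cvtph_ps(f16_bits);
}

/* Two independent FMA accumulators hide FMA latency; they fold into one sum at the end. */
static float dot_e5m2_two_chain(uint8_t const *a, uint8_t const *b, size_t n) {
    __m256 acc0 = _mm256_setzero_ps(), acc1 = _mm256_setzero_ps();
    for (size_t i = 0; i + 16 <= n; i += 16) {
        __m128i a16 = _mm_loadu_si128((__m128i const *)(a + i));
        __m128i b16 = _mm_loadu_si128((__m128i const *)(b + i));
        acc0 = _mm256_fmadd_ps(e5m2x8_to_f32x8(a16), e5m2x8_to_f32x8(b16), acc0);
        acc1 = _mm256_fmadd_ps(e5m2x8_to_f32x8(_mm_srli_si128(a16, 8)),
                               e5m2x8_to_f32x8(_mm_srli_si128(b16, 8)), acc1);
    }
    __m256 acc = _mm256_add_ps(acc0, acc1); /* fold the two chains */
    __m128 sum4 = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
    __m128 sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
    __m128 sum1 = _mm_add_ss(sum2, _mm_shuffle_ps(sum2, sum2, 0x55));
    return _mm_cvtss_f32(sum1);
}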
