amd
diff --git a/classic/aocl_gemm_f16f16f16of16.c b/classic/aocl_gemm_f16f16f16of16.c
Lines changed: 22 additions & 152 deletions
@@ -27,6 +27,7 @@
  */
 
 #include "aocl_dlp_gemm_check.h"
+#include "classic/aocl_fp16_convert.h"
 #include "classic/aocl_gemm_interface_apis.h"
 #include "classic/aocl_lib_interface_apis.h"
 #include "classic/dlp_errors.h"
@@ -40,164 +41,33 @@
 #include "runtime/dlp_runtime.h"
 #include "threading/dlp_gemm_thread_decor_openmp.h"
 
-#if defined(__F16C__) && defined(__GNUC__)
-#include <immintrin.h>
-#endif
-
-/**
- * @brief Convert float32 to float16
- *
- * Uses compiler intrinsics when available (_cvtss_sh with F16C),
- * otherwise falls back to portable software bit manipulation.
- * Uses round-to-nearest-even rounding mode per IEEE-754.
- *
- * The software fallback correctly handles:
- * - Round-to-nearest-even rounding
- * - NaN propagation (preserves quiet NaN)
- * - Subnormal denormalization with proper rounding
- * - Overflow to infinity
- * - Underflow to zero or subnormal
- * - Rounding-induced exponent increment
- */
-static inline float16
-f32_to_fp16(float f32_val)
-{
-#if defined(__F16C__) && defined(__GNUC__)
-    /* Use F16C intrinsic for hardware conversion */
-    return (float16)_cvtss_sh(f32_val, 0);
-#else
-    /* Software conversion from float32 to float16 with IEEE-754 rounding */
-    union
-    {
-        float    f;
-        uint32_t u;
-    } x;
-    x.f = f32_val;
-
-    /* Extract components */
-    uint32_t sign   = (x.u & 0x80000000U) >> 16;   /* Bit 31 → 15 */
-    int32_t  exp32  = ((x.u & 0x7F800000U) >> 23); /* Extract exponent */
-    uint32_t mant32 = (x.u & 0x007FFFFFU);         /* Extract mantissa */
-
-    /* Special case: FP32 zero or subnormal */
-    if (exp32 == 0) {
-        /* FP32 subnormals are too small for FP16, flush to signed zero */
-        return (float16)(sign);
-    }
-
-    /* Special case: FP32 infinity or NaN */
-    if (exp32 == 0xFF) {
-        if (mant32 == 0) {
-            /* Infinity */
-            return (float16)(sign | 0x7C00U);
-        } else {
-            /* NaN: preserve some mantissa bits, ensure quiet NaN */
-            uint16_t mant16 = (uint16_t)((mant32 >> 13) | 0x0200U);
-            return (float16)(sign | 0x7C00U | (mant16 & 0x03FFU));
-        }
-    }
-
-    /* Rebias exponent: FP32 bias=127, FP16 bias=15 */
-    int32_t exp16 = exp32 - 112; /* exp32 - 127 + 15 = exp32 - 112 */
-
-    /* Add implicit leading 1 to mantissa for calculations */
-    mant32 |= 0x00800000U;
-
-    /* Check for underflow (handle denormals) */
-    if (exp16 <= 0) {
-        if (exp16 < -10) {
-            /* Too small, flush to zero */
-            return (float16)(sign);
-        }
-
-        /*
-         * Denormalize: shift mantissa right to align with FP16 denormal format.
-         * For FP16 denormals, the value is: mantissa * 2^-24
-         * We need to shift the 24-bit mantissa (with implicit 1) right by
-         * (14 - exp32 + 127) = (141 - exp32) positions to get the 10-bit
-         * result. This is equivalent to shifting by (1 - exp16 + 13) = (14 -
-         * exp16).
-         */
-        int total_shift = 14 - exp16; /* Total shift to get 10-bit mantissa */
-
-        /* Round to nearest even using the bits that will be shifted out */
-        uint32_t round_bit   = (mant32 >> (total_shift - 1)) & 1;
-        uint32_t sticky_mask = (1U << (total_shift - 1)) - 1;
-        uint32_t sticky      = (mant32 & sticky_mask) != 0;
-        uint32_t lsb         = (mant32 >> total_shift) & 1;
-
-        /* Compute the shifted mantissa */
-        uint32_t mant16 = mant32 >> total_shift;
-
-        /* Apply round-to-nearest-even */
-        if (round_bit && (sticky || lsb)) {
-            mant16++;
-        }
-
-        /* Check if rounding caused normalization (overflow into bit 10) */
-        if (mant16 >= 0x0400U) {
-            return (float16)(sign | 0x0400U); /* Smallest normal */
-        }
-
-        return (float16)(sign | (uint16_t)mant16);
-    }
-
-    /* Check for overflow before rounding */
-    if (exp16 >= 0x1F) {
-        return (float16)(sign | 0x7C00U);
-    }
-
-    /* Normal value: Round mantissa from 23 to 10 bits */
-    uint32_t round_bits = mant32 & 0x1FFFU; /* Bits 12-0 */
-    uint32_t lsb        = (mant32 >> 13) & 1;
-
-    /* Round to nearest even */
-    if (round_bits > 0x1000U || (round_bits == 0x1000U && lsb)) {
-        mant32 += 0x1000U;
-    }
-
-    /* Check for carry into exponent AFTER rounding */
-    if (mant32 & 0x01000000U) {
-        /* Mantissa overflowed into bit 24 */
-        exp16++;
-        mant32 = 0x00800000U; /* Reset to implicit 1 only */
-
-        /* Check if exponent overflowed to infinity */
-        if (exp16 >= 0x1F) {
-            return (float16)(sign | 0x7C00U);
-        }
-    }
-
-    /* Extract rounded 10-bit mantissa (remove implicit 1) */
-    uint16_t mant16 = (uint16_t)((mant32 >> 13) & 0x03FFU);
-
-    return (float16)(sign | ((uint16_t)exp16 << 10) | mant16);
-#endif
-}
-
 void
 aocl_gemm_f16f16f16of16(const char      order,
                         const char      transa,
                         const char      transb,
                         const md_t      m,
                         const md_t      n,
                         const md_t      k,
-                        const float     alpha,
+                        const float16   alpha,
                         const float16*  a,
                         const md_t      lda,
                         const char      mem_format_a,
                         const float16*  b,
                         const md_t      ldb,
                         const char      mem_format_b,
-                        const float     beta,
+                        const float16   beta,
                         float16*        c,
                         const md_t      ldc,
                         dlp_metadata_t* metadata)
 {
     DLP_GEMM_START_LOGGER();
+    // alpha/beta arrive as float16 (the FP16 GEMM API contract). The shared
+    // logger prints them as %f, so widen once at the call boundary via
+    // fp16_to_f32. The widening is for printing only and never propagates
+    // back into computation.
     DLP_GEMM_WRITE_LOGGER("f16f16f16of16", order, transa, transb, m, n, k,
-                          ((float)alpha), lda, mem_format_a, ldb, mem_format_b,
-                          ((float)beta), ldc, metadata);
+                          fp16_to_f32(alpha), lda, mem_format_a, ldb,
+                          mem_format_b, fp16_to_f32(beta), ldc, metadata);
 
     DLP_METADATA_SET_ERROR(metadata, DLP_CLSC_SUCCESS);
 
@@ -388,20 +258,20 @@ aocl_gemm_f16f16f16of16(const char      order,
     AOCL_DLP_MEMORY_TAG jit_mtag_a = mtag_a_use;
     AOCL_DLP_MEMORY_TAG jit_mtag_b = mtag_b_use;
 
-    // Convert alpha and beta from float to float16 for JIT kernel.
-    // The FP16 JIT kernel uses vpbroadcastw (16-bit broadcast) to load
-    // alpha/beta, so we must pass FP16 addresses.
-    float16 alpha_fp16 = f32_to_fp16(alpha);
-    float16 beta_fp16  = f32_to_fp16(beta);
-
     // Initialize DLP Plus kernel path (JIT support)
     lcntx_l.dlp_kernel_hndl.kernel_base = NULL;
 
+    // alpha/beta are passed as FP16. The decision engine reads
+    // (void*)&alpha and (void*)&beta as float16* via
+    // getScalingTypes<dlp::float16>, and the JIT consumes alpha natively
+    // as FP16 (vpbroadcastw + vmulph). Beta is consumed as FP16 on the
+    // of16 rail and widened to float by the 5-loop before each kernel
+    // call on the of32 rail.
     dlp_init_and_get_kernel_hndl(
         DLP_KERNEL_F16F16F16OF16, order, jit_mtag_a, jit_mtag_b, m_use, n_use,
-        k, rs_a_use, cs_a_use, rs_b_use, cs_b_use, rs_c, cs_c,
-        (void*)&alpha_fp16, (void*)&beta_fp16, post_op_list, mr_hint, nr_hint,
-        kc_hint, DLP_F16, &lcntx_l.dlp_kernel_hndl);
+        k, rs_a_use, cs_a_use, rs_b_use, cs_b_use, rs_c, cs_c, (void*)&alpha,
+        (void*)&beta, post_op_list, mr_hint, nr_hint, kc_hint, DLP_F16,
+        &lcntx_l.dlp_kernel_hndl);
 
     // FP16 is JIT-only (no intrinsic fallback), so check if JIT succeeded
     if (lcntx_l.dlp_kernel_hndl.kernel_base == NULL) {
@@ -417,13 +287,13 @@ aocl_gemm_f16f16f16of16(const char      order,
 #ifdef DLP_ENABLE_OPENMP
     dlp_gemm_f16f16f16of16_openmp_thread_decorator(
         m_use, n_use, k, a_use, rs_a_use, cs_a_use, mtag_a_use, b_use, rs_b_use,
-        cs_b_use, mtag_b_use, c, rs_c, cs_c, alpha_fp16, beta_fp16, &rntm_g,
-        &lcntx_l, &ops, DLP_F16);
+        cs_b_use, mtag_b_use, c, rs_c, cs_c, alpha, beta, &rntm_g, &lcntx_l,
+        &ops, DLP_F16);
 #else
     dlp_gemm_f16f16f16of16_thread_decorator(
         m_use, n_use, k, a_use, rs_a_use, cs_a_use, mtag_a_use, b_use, rs_b_use,
-        cs_b_use, mtag_b_use, c, rs_c, cs_c, alpha_fp16, beta_fp16, &rntm_g,
-        &lcntx_l, &ops, DLP_F16);
+        cs_b_use, mtag_b_use, c, rs_c, cs_c, alpha, beta, &rntm_g, &lcntx_l,
+        &ops, DLP_F16);
 #endif
 
 err_hndl:;