s8s8_sym_quant gtest: enable reorder, post-op coverage and fix leak/ref bugs (#557)

arnsharm · web-flow · commit b3db7f5800ac · 2026-06-04T16:07:12.000+05:30
- Added optional const GroupScaleParam* group_scale argument to
  IUal::reorder(), UalDlp::reorder() and UalRef::reorder() for the
  sym_quant APIs. The argument defaults to nullptr, keeping existing callers
  source-compatible.

- UalDlp::reorder() now selects the sym_quant reorder APIs
  aocl_get_reorder_buf_size_s8s8s32os32_sym_quant() and
  aocl_reorder_s8s8s32os32_sym_quant() when given s8 input, s32
  accumulation, f32/bf16 output and a non-null group_scale, normalizing
  group_size == 0 to the full K dimension since the sym_quant APIs
  require a strictly positive group size.

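- Fixed a leak of the A pack buffer in the GEMV (m=1) path of the
  s8s8_sym_quant kernel: the release was guarded by mtag_b == PACK instead
  of mtag_a == PACK, so the buffer was freed only when B was tagged PACK.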

- Hardened group pre-op validation in
  dlp_gemm_translate_to_group_postops_list() to also reject scale-factor
  and zero-point arrays shorter than the required length, i.e.,
  m * ceil(k / group_size) for the A matrix and n * ceil(k / group_size)
  for the B matrix. Short arrays are reported through the new
  DLP_CLSC_INVALID_SF_LEN and DLP_CLSC_INVALID_ZP_LEN error codes.

- Fixed the reference implementation skipping group-scale de-quantization
  when post-ops are present. RefUalPlan::execute() previously routed these
  cases through the integer GEMM's needsF32Intermediate path.
  isS8S8GroupScale is now computed earlier and excluded from
  needsF32Intermediate, so these cases fall through to the sym_quant
  reference, which de-quantizes and then applies post-ops via
  applyPostOps().

AMD-Internal: [CPUPL-8537]
Signed-off-by: Arnav Sharma <Arnav.Sharma@amd.com>
diff --git a/bench/bench_gemm.cc b/bench/bench_gemm.cc
@@ -150,8 +150,8 @@ class OptimizedGemmBenchmark : public ConcreteUAL
         // Apply memory tag for B (reorder and pack are mutually exclusive)
         if (config.reorderB) {
             Matrix B_reordered;
-            this->reorder(B_, B_reordered, a_type_, b_type_, c_type_,
-                          acc_type_);
+            this->reorder(B_, B_reordered, a_type_, b_type_, c_type_, acc_type_,
+                          config.group_scale_param.get());
             B_ = std::move(B_reordered);
             // Reorder handles transposition; reset trans flag for GEMM call
             transB_ = false;
diff --git a/classic/frame/dlp_gemm_post_ops.c b/classic/frame/dlp_gemm_post_ops.c
@@ -137,8 +137,6 @@ dlp_gemm_translate_to_group_postops_list(dlp_group_post_op*      metadata,
                                          md_t                    n,
                                          md_t                    k)
 {
-    (void)m;
-    (void)n;
     if ((metadata == NULL) || (metadata->seq_length <= 0)) {
         dlp_gemm_set_group_post_ops_node_params(post_op_list, 0, NULL, NULL, 0,
                                                 0, NULL, NULL, 0, 0,
@@ -166,25 +164,41 @@ dlp_gemm_translate_to_group_postops_list(dlp_group_post_op*      metadata,
             if (((metadata->a_zp)->zero_point_len > 0)
                 && ((metadata->a_zp)->zero_point == NULL))
                 return DLP_CLSC_NULL_POINTER;
+
+            if ((metadata->a_zp)->zero_point_len
+                < (m * ((k + group_size - 1) / group_size)))
+                return DLP_CLSC_INVALID_ZP_LEN;
         }
 
         if (metadata->a_scl != NULL) {
             if (((metadata->a_scl)->scale_factor_len > 0)
                 && ((metadata->a_scl)->scale_factor == NULL))
                 return DLP_CLSC_NULL_POINTER;
+
+            if ((metadata->a_scl)->scale_factor_len
+                < (m * ((k + group_size - 1) / group_size)))
+                return DLP_CLSC_INVALID_SF_LEN;
         }
 
         if (metadata->b_zp != NULL) {
             /* check for validity of pre-ops */
             if (((metadata->b_zp)->zero_point_len > 0)
                 && ((metadata->b_zp)->zero_point == NULL))
                 return DLP_CLSC_NULL_POINTER;
+
+            if ((metadata->b_zp)->zero_point_len
+                < (n * ((k + group_size - 1) / group_size)))
+                return DLP_CLSC_INVALID_ZP_LEN;
         }
 
         if (metadata->b_scl != NULL) {
             if (((metadata->b_scl)->scale_factor_len > 0)
                 && ((metadata->b_scl)->scale_factor == NULL))
                 return DLP_CLSC_NULL_POINTER;
+
+            if ((metadata->b_scl)->scale_factor_len
+                < (n * ((k + group_size - 1) / group_size)))
+                return DLP_CLSC_INVALID_SF_LEN;
         }
 
         if ((metadata->a_scl != NULL) && (metadata->b_scl != NULL)
diff --git a/classic/frame/s8s8s32/dlp_gemm_s8s8s32_sym_quant.c b/classic/frame/s8s8s32/dlp_gemm_s8s8s32_sym_quant.c
@@ -266,7 +266,7 @@ DLP_GEMV2(int8_t, int8_t, int32_t, s8s8s32o32_sym_quant)
         } // jc loop
 
         // Release pack buffers.
-        if (mtag_b == PACK && (pack_a_buffer_s8s8s32os32 != NULL)) {
+        if ((mtag_a == PACK) && (pack_a_buffer_s8s8s32os32 != NULL)) {
             dlp_free_page_aligned(pack_a_buffer_s8s8s32os32);
         }
     }
diff --git a/include/classic/dlp_errors.h b/include/classic/dlp_errors.h
@@ -52,9 +52,11 @@ typedef enum
                                          */
     DLP_CLSC_INVALID_MATRIX_TYPE,       /**< Invalid matrix type specified */
     DLP_CLSC_INVALID_GROUP_DIMENSION, /**< Invalid group dimension specified */
-    DLP_CLSC_TYPE_MISMATCH,           /**< Data type mismatch encountered */
-    DLP_CLSC_INVALID_JIT_KERNEL,      /**< JIT kernel generation failed or no
-                                         fallback kernel available */
+    DLP_CLSC_INVALID_SF_LEN,     /**< Invalid scale factor length specified */
+    DLP_CLSC_INVALID_ZP_LEN,     /**< Invalid zero point length specified */
+    DLP_CLSC_TYPE_MISMATCH,      /**< Data type mismatch encountered */
+    DLP_CLSC_INVALID_JIT_KERNEL, /**< JIT kernel generation failed or no
+                                    fallback kernel available */
     DLP_CLSC_INVALID_KERNEL, /**< Static kernel not found for given parameters
                               */
     DLP_CLSC_ERROR_MAX /**< Maximum error code value (for bounds checking) */
diff --git a/tests/adaptors/dlp/ual_dlp.cc b/tests/adaptors/dlp/ual_dlp.cc
@@ -116,12 +116,13 @@ UalDlp::toString(UALType type)
  * @return UALError Error code indicating success or failure
  */
 UALError
-UalDlp::reorder(const Matrix& in,
-                Matrix&       out,
-                MatrixType    A_type,
-                MatrixType    B_type,
-                MatrixType    C_type,
-                MatrixType    accType)
+UalDlp::reorder(const Matrix&          in,
+                Matrix&                out,
+                MatrixType             A_type,
+                MatrixType             B_type,
+                MatrixType             C_type,
+                MatrixType             accType,
+                const GroupScaleParam* group_scale)
 {
     dlp_metadata_t meta;
     meta.error_hndl.error_code = DLP_CLSC_SUCCESS;
@@ -130,6 +131,13 @@ UalDlp::reorder(const Matrix& in,
     md_t effective_rows = in.getEffectiveRows();
     md_t effective_cols = in.getEffectiveCols();
 
+    // Detect symmetric-quantization reorder path:
+    // s8 input, s32 accumulation, f32 or bf16 output, and group_scale provided.
+    const bool sym_quant =
+        (group_scale != nullptr) && (in.getMatrixType() == MatrixType::s8)
+        && (accType == MatrixType::s32)
+        && (C_type == MatrixType::f32 || C_type == MatrixType::bf16);
+
     // Determine appropriate reorder function based on input type and GEMM
     // context The A, B, C types provide context for optimal reordering strategy
     msz_t alloc_bytes = 0;
@@ -166,14 +174,20 @@ UalDlp::reorder(const Matrix& in,
                 effective_cols, &meta);
         }
     } else if (in.getMatrixType() == MatrixType::s8) {
-        // For s8, consider the accumulation type and output type
-        if (accType == MatrixType::s32) {
-            alloc_bytes = aocl_get_reorder_buf_size_s8s8s32os32(
+        // For s8, select sym_quant or standard reorder based on GEMM context
+        if (sym_quant) {
+            // group_size=0 means "full K dimension"; normalize before calling
+            // the AOCL sym_quant APIs which require a strictly positive value.
+            md_t gs = group_scale->getGroupSize();
+            if (gs == 0) {
+                gs = effective_rows; // effective_rows == K for B matrix
+            }
+            DLP_SYMM_STAT_QUANT symq = { gs };
+            alloc_bytes = aocl_get_reorder_buf_size_s8s8s32os32_sym_quant(
                 in.getLayout() == MatrixLayout::ROW_MAJOR ? 'r' : 'c',
                 in.isTransposed() ? 't' : 'n', 'B', effective_rows,
-                effective_cols, &meta);
+                effective_cols, &symq, &meta);
         } else {
-            // Handle other accumulation types - for now, fall back to standard
             alloc_bytes = aocl_get_reorder_buf_size_s8s8s32os32(
                 in.getLayout() == MatrixLayout::ROW_MAJOR ? 'r' : 'c',
                 in.isTransposed() ? 't' : 'n', 'B', effective_rows,
@@ -274,13 +288,31 @@ UalDlp::reorder(const Matrix& in,
                 &meta);
             break;
         case MatrixType::s8:
-            aocl_reorder_s8s8s32os32(
-                layout, in.isTransposed() ? 't' : 'n', 'B',
-                reinterpret_cast<const int8_t*>(
-                    in.getMatrixData().getMatrixPtr()),
-                reinterpret_cast<int8_t*>(out.getMatrixData().getMatrixPtr()),
-                effective_rows, effective_cols, in.getLeadingDimension(),
-                &meta);
+            if (sym_quant) {
+                // group_size=0 means "full K"; normalize to avoid div-by-zero.
+                md_t gs = group_scale->getGroupSize();
+                if (gs == 0) {
+                    gs = effective_rows;
+                }
+                DLP_SYMM_STAT_QUANT symq = { gs };
+                aocl_reorder_s8s8s32os32_sym_quant(
+                    layout, in.isTransposed() ? 't' : 'n', 'B',
+                    reinterpret_cast<const int8_t*>(
+                        in.getMatrixData().getMatrixPtr()),
+                    reinterpret_cast<int8_t*>(
+                        out.getMatrixData().getMatrixPtr()),
+                    effective_rows, effective_cols, in.getLeadingDimension(),
+                    &symq, &meta);
+            } else {
+                aocl_reorder_s8s8s32os32(
+                    layout, in.isTransposed() ? 't' : 'n', 'B',
+                    reinterpret_cast<const int8_t*>(
+                        in.getMatrixData().getMatrixPtr()),
+                    reinterpret_cast<int8_t*>(
+                        out.getMatrixData().getMatrixPtr()),
+                    effective_rows, effective_cols, in.getLeadingDimension(),
+                    &meta);
+            }
             break;
         case MatrixType::fp16:
             if (A_type == MatrixType::f32 && C_type == MatrixType::f32
diff --git a/tests/adaptors/ref/ual_plan_ref.cc b/tests/adaptors/ref/ual_plan_ref.cc
@@ -261,8 +261,12 @@ RefUalPlan::execute()
          || (aType == MatrixType::s8 && bType == MatrixType::s8));
     bool isBf16Gemm = (aType == MatrixType::bf16 && bType == MatrixType::bf16);
     bool isF32Gemm  = (aType == MatrixType::f32 && bType == MatrixType::f32);
-    bool needsF32Intermediate =
-        (isIntegerGemm || isBf16Gemm || isF32Gemm) && hasPostOps;
+
+    bool isS8S8GroupScale =
+        (aType == MatrixType::s8 && bType == MatrixType::s8 && m_group_scale);
+
+    bool needsF32Intermediate = (isIntegerGemm || isBf16Gemm || isF32Gemm)
+                                && hasPostOps && !isS8S8GroupScale;
 
     if (needsF32Intermediate && !ualRef.checkValidGemmParams(A, B, C, true)) {
         needsF32Intermediate = false;
@@ -301,9 +305,6 @@ RefUalPlan::execute()
     // Uses specialized ref that handles per-group scale application during
     // K-panel accumulation, which is required for correct results when
     // group_size > 0.
-    bool isS8S8GroupScale =
-        (aType == MatrixType::s8 && bType == MatrixType::s8 && m_group_scale);
-
     if (isS8S8GroupScale) {
         md_t gs = m_group_scale->getGroupSize();
 
diff --git a/tests/adaptors/ref/ual_ref.cc b/tests/adaptors/ref/ual_ref.cc
@@ -243,7 +243,8 @@ UalRef::reorder(const Matrix& in,
                 MatrixType    A_type,
                 MatrixType    B_type,
                 MatrixType    C_type,
-                MatrixType    accType)
+                MatrixType    accType,
+                const GroupScaleParam* /*group_scale*/)
 {
     /*
         Reordering operation in reference is
diff --git a/tests/classic/test_gemm.cc b/tests/classic/test_gemm.cc
@@ -1092,7 +1092,7 @@ class GemmParameterizedTest : public ::testing::TestWithParam<GemmTestConfig>
 
             dlp_reorder_status = ual_test_->reorder(
                 B, B_reordered, config_.a_type, config_.b_type, config_.c_type,
-                config_.acc_type);
+                config_.acc_type, config_.group_scale_param.get());
 
             // Skip test if DLP reorder is not supported
             if (dlp_reorder_status == UALError::UAL_NOT_SUPPORTED) {
@@ -1112,7 +1112,8 @@ class GemmParameterizedTest : public ::testing::TestWithParam<GemmTestConfig>
             if (params_valid) {
                 ref_reorder_status = ual_ref_->reorder(
                     B_ref, B_ref_reordered, config_.a_type, config_.b_type,
-                    config_.c_type, config_.acc_type);
+                    config_.c_type, config_.acc_type,
+                    config_.group_scale_param.get());
 
                 if (ref_reorder_status == UALError::UAL_SUCCESS) {
                     // For bf16×s4 and bf16×u4, the reference uses row-major B
diff --git a/tests/include/adaptors/dlp/ual_dlp.hh b/tests/include/adaptors/dlp/ual_dlp.hh
@@ -34,6 +34,7 @@
 namespace dlp::testing::classic {
 
 using dlp::testing::framework::BatchGroup;
+using dlp::testing::framework::GroupScaleParam;
 using dlp::testing::framework::IUal;
 using dlp::testing::framework::Matrix;
 using dlp::testing::framework::MatrixLayout;
@@ -112,14 +113,17 @@ class UalDlp : public IUal
      * @param B_type Type of matrix B in GEMM context
      * @param C_type Type of matrix C in GEMM context
      * @param accType Accumulation type
+     * @param group_scale Optional symmetric-quantization group-scale
+     * parameters; when non-null selects the sym_quant reorder path
      * @return UALError Error code indicating success or failure
      */
-    UALError reorder(const Matrix& in,
-                     Matrix&       out,
-                     MatrixType    A_type,
-                     MatrixType    B_type,
-                     MatrixType    C_type,
-                     MatrixType    accType) override;
+    UALError reorder(const Matrix&          in,
+                     Matrix&                out,
+                     MatrixType             A_type,
+                     MatrixType             B_type,
+                     MatrixType             C_type,
+                     MatrixType             accType,
+                     const GroupScaleParam* group_scale = nullptr) override;
 
     UALError batch_gemm(std::vector<BatchGroup>& groups,
                         MatrixType               accType) override;
diff --git a/tests/include/adaptors/ref/ual_ref.hh b/tests/include/adaptors/ref/ual_ref.hh
@@ -35,6 +35,7 @@
 namespace dlp::testing::classic {
 
 using dlp::testing::framework::BatchGroup;
+using dlp::testing::framework::GroupScaleParam;
 using dlp::testing::framework::IUal;
 using dlp::testing::framework::Matrix;
 using dlp::testing::framework::MatrixLayout;
@@ -114,14 +115,16 @@ class UalRef : public IUal
      * @param B_type Type of matrix B in GEMM context
      * @param C_type Type of matrix C in GEMM context
      * @param accType Accumulation type
+     * @param group_scale Optional symmetric-quantization group-scale parameters
      * @return UALError Error code indicating success or failure
      */
-    UALError reorder(const Matrix& in,
-                     Matrix&       out,
-                     MatrixType    A_type,
-                     MatrixType    B_type,
-                     MatrixType    C_type,
-                     MatrixType    accType) override;
+    UALError reorder(const Matrix&          in,
+                     Matrix&                out,
+                     MatrixType             A_type,
+                     MatrixType             B_type,
+                     MatrixType             C_type,
+                     MatrixType             accType,
+                     const GroupScaleParam* group_scale = nullptr) override;
 
     UALError batch_gemm(std::vector<BatchGroup>& groups,
                         MatrixType               accType) override;
diff --git a/tests/include/framework/ual.hh b/tests/include/framework/ual.hh
@@ -233,14 +233,17 @@ class IUal
      * @param B_type Type of matrix B in GEMM context
      * @param C_type Type of matrix C in GEMM context
      * @param accType Accumulation type
+     * @param group_scale Optional symmetric-quantization group-scale
+     * parameters; when non-null selects the sym_quant reorder path
      * @return UALError Error code indicating success or failure
      */
-    virtual UALError reorder(const Matrix& in,
-                             Matrix&       out,
-                             MatrixType    A_type,
-                             MatrixType    B_type,
-                             MatrixType    C_type,
-                             MatrixType    accType) = 0;
+    virtual UALError reorder(const Matrix&          in,
+                             Matrix&                out,
+                             MatrixType             A_type,
+                             MatrixType             B_type,
+                             MatrixType             C_type,
+                             MatrixType             accType,
+                             const GroupScaleParam* group_scale = nullptr) = 0;
 
     /**
      * @brief Create a backend-specific execution plan

| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | | `@@ -266,7 +266,7 @@ DLP_GEMV2(int8_t, int8_t, int32_t, s8s8s32o32_sym_quant)` |
| 266 | 266 | `} // jc loop` |
| 267 | 267 | |
| 268 | 268 | `// Release pack buffers.` |
| 269 | | `- if (mtag_b == PACK && (pack_a_buffer_s8s8s32os32 != NULL)) {` |
| | 269 | `+ if ((mtag_a == PACK) && (pack_a_buffer_s8s8s32os32 != NULL)) {` |
| 270 | 270 | `dlp_free_page_aligned(pack_a_buffer_s8s8s32os32);` |
| 271 | 271 | `}` |
| 272 | 272 | `}` |