Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 65 additions & 21 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@

namespace nntrainer {

/**
 * @brief Integer division of @p a by @p b, rounded up toward positive infinity.
 * @param a dividend
 * @param b divisor; must be non-zero
 * @return smallest unsigned int m such that m * b >= a
 * @note a + b - 1 must not overflow unsigned int.
 */
static inline unsigned int ceil_div(unsigned int a, unsigned int b) {
  return (a + b - 1) / b;
}

/**
 * @brief Rounds @p a up to the nearest multiple of @p b.
 * @param a value to align
 * @param b alignment granularity; must be non-zero
 * @return @p a unchanged when already a multiple of @p b, otherwise the next
 *         multiple of @p b above @p a
 * @note a - a % b + b must not overflow unsigned int.
 */
static inline unsigned int align(unsigned int a, unsigned int b) {
  return (a % b == 0) ? a : a - a % b + b;
}

void gemv_int4_async_cl(std::vector<void *> weights,
std::vector<uint16_t *> scales, uint16_t *input,
std::vector<uint16_t *> outputs, unsigned int K,
Expand Down Expand Up @@ -104,6 +112,9 @@ void gemv_int4_async_cl(std::vector<void *> weights,
void gemv_int4_cl(char *weight, uint16_t *scale, uint16_t *input,
uint16_t *output, unsigned int K, unsigned int N,
unsigned int quantization_group_size) {
const auto N_GROUP_SIZE = 32; // due to input data format
const unsigned int alignN = align(N, N_GROUP_SIZE);

bool result = false;
auto *blas_cc =
static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
Expand Down Expand Up @@ -155,7 +166,7 @@ void gemv_int4_cl(char *weight, uint16_t *scale, uint16_t *input,
throw std::runtime_error(
"Failed to set kernel argument 5 for fully_connected_gpu_int4_gemv");

const int work_groups_count[3] = {(int)(N / 2), 1, 16};
const int work_groups_count[3] = {(int)(alignN / 2), 1, 16};
const int work_group_size[3] = {16, 1, 16};

result = blas_cc->command_queue_inst_.DispatchCommand(
Expand Down Expand Up @@ -433,14 +444,6 @@ void openvino_gemm_async_cl(float *input, std::vector<void *> weights,

bool result = false;

auto ceil_div = [](unsigned int a, unsigned int b) -> unsigned int {
return (a + b - 1) / b;
};

auto align = [](unsigned int a, unsigned int b) -> unsigned int {
return (a % b == 0) ? a : a - a % b + b;
};

// copy fp32 input to fp16
copy_fp32_u16(M * K, input, (uint16_t *)clbuffInstance.getSVMInput());

Expand Down Expand Up @@ -592,9 +595,54 @@ void openvino_sgemm_cl(float *input, char *weight, uint16_t *scale,
copy_u16_fp32(M * N, (uint16_t *)clbuffInstance.getSVMOutput(), output);
}

// TODO remove it
/**
 * @brief Allocates a shared-virtual-memory (SVM) region on the registered
 *        GPU OpenCL context.
 * @param size_bytes number of bytes to allocate
 * @return pointer to the allocated SVM region (never nullptr)
 * @throws std::runtime_error when the SVM allocation fails
 */
void *allocateSVM(size_t size_bytes) {
  auto *blas_cc = static_cast<nntrainer::ClContext *>(
    nntrainer::Engine::Global().getRegisteredContext("gpu"));

  void *ptr = blas_cc->context_inst_.createSVMRegion(size_bytes);

  if (ptr == nullptr) {
    // Grammar fix: "Failed to allocated" -> "Failed to allocate"
    throw std::runtime_error(
      "Failed to allocate SVM for the OpenCL BLAS unit test.");
  }

  return ptr;
}

// TODO remove it
/**
 * @brief Releases an SVM region previously obtained from allocateSVM().
 * @param ptr SVM region to release; may be nullptr (release is a no-op then,
 *        presumably — TODO confirm releaseSVMRegion's nullptr behavior)
 * @note The caller's pointer variable is NOT modified (ptr is passed by
 *       value); callers must not reuse the pointer after this call.
 */
void freeSVM(void *ptr) {
  auto *blas_cc = static_cast<nntrainer::ClContext *>(
    nntrainer::Engine::Global().getRegisteredContext("gpu"));

  blas_cc->context_inst_.releaseSVMRegion(ptr);
  // Removed the original `ptr = nullptr;` — a dead store to a by-value
  // parameter that had no effect on the caller and only suggested a
  // safety guarantee this function cannot provide.
}

void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
unsigned int M, unsigned int N, unsigned int K,
unsigned int quantization_group_size) {

int alignK = align(K, quantization_group_size);
const auto N_GROUP_SIZE = 32; // due to input data format
int alignN = align(N, N_GROUP_SIZE);
// Padding input data - TODO remove this and do this in kernel quantize_input
uint16_t *input_ptr;
if (alignK != K) {
uint32_t padded_input_size = M * alignK;
input_ptr = (uint16_t *)allocateSVM(padded_input_size * sizeof(uint16_t));
for (int y = 0; y < M; y++) {
for (int x = 0; x < K; x++) {
input_ptr[y * alignK + x] = ((uint16_t *)input)[y * K + x];
}
for (int x = K; x < alignK; x++) {
input_ptr[y * alignK + x] = compute_fp32_to_fp16(0.f);
}
}
} else {
input_ptr = (uint16_t *)input;
}

bool result = false;
auto *blas_cc =
static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
Expand All @@ -605,14 +653,6 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
" -D SIZE_QUANTIZATION_GROUP=" + std::to_string(quantization_group_size) +
" -D SCALE_ROW_MAJOR=" + std::to_string(scale_row_major);

auto ceil_div = [](unsigned int a, unsigned int b) -> unsigned int {
return (a + b - 1) / b;
};

auto align = [](unsigned int a, unsigned int b) -> unsigned int {
return (a % b == 0) ? a : a - a % b + b;
};

std::vector<cl_event> quantize_event(1);
{
ClContext::SharedPtrClKernel kernel_ptr = blas_cc->registerClKernel(
Expand All @@ -624,7 +664,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,

int arg = 0;

result = kernel_ptr->SetKernelSVMArguments(arg++, input);
result = kernel_ptr->SetKernelSVMArguments(arg++, input_ptr);

if (!result)
throw std::runtime_error("Failed to set kernel argument 0 for "
Expand All @@ -643,7 +683,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
"quantize_input");

const int work_groups_count[3] = {
(int)align((M * K) / quantization_group_size, 64), 1, 1};
(int)align((M * ceil_div(K, quantization_group_size)), 64), 1, 1};
const int work_group_size[3] = {64, 1, 1};

result = blas_cc->command_queue_inst_.DispatchCommand(
Expand All @@ -665,7 +705,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,

int arg = 0;

result = kernel_ptr->SetKernelSVMArguments(arg++, input);
result = kernel_ptr->SetKernelSVMArguments(arg++, input_ptr);

if (!result)
throw std::runtime_error(
Expand Down Expand Up @@ -703,7 +743,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
throw std::runtime_error(
"Failed to set kernel argument 6 for fc_bf_tiled_kernel_default");

const int work_groups_count[3] = {(int)(N / 2),
const int work_groups_count[3] = {(int)(alignN / 2),
(int)(align(ceil_div(M, 8), 8)), 1};
const int work_group_size[3] = {16, 8, 1};

Expand All @@ -723,6 +763,10 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
"Failed to read output data for fc_bf_tiled_kernel_default");
return;
}

if (alignK != K) {
freeSVM(input_ptr);
}
}

void sgemv_q6_k_cl(void *matAdata, float *vecXdata, float *vecYdata,
Expand Down
21 changes: 12 additions & 9 deletions nntrainer/tensor/cl_operations/cl_kernels/int4_gemv.cl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#define CAT(x, y) __CAT(x, y)

#define unroll_for __attribute__((opencl_unroll_hint)) for
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define CEIL_DIV(a, b) (((a) + (b)-1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
Expand Down Expand Up @@ -78,8 +78,8 @@
CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)

#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)( \
(const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))
BLOCK_READN_FUNC(type_size, vector_size) \
((const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))

#define BLOCK_READN(type, vector_size, ptr, offset) \
AS_TYPE( \
Expand Down Expand Up @@ -189,7 +189,9 @@ fully_connected_gpu_int4_gemv(__global half *input, const __global half *scales,
__global half *output,
const __global char *weights, const int WEIGHTS_K,
const int WEIGHTS_N) {
const int SCALE_GROUP_NUM = WEIGHTS_K / SIZE_QUANTIZATION_GROUP;
const int SCALE_GROUP_NUM = CEIL_DIV(WEIGHTS_K, SIZE_QUANTIZATION_GROUP);
int ALIGN_WEIGHTS_N = ALIGN(WEIGHTS_N, 32);
int ALIGN_WEIGHTS_K = ALIGN(WEIGHTS_K, SIZE_QUANTIZATION_GROUP);

int n = get_global_id(0) * 2; // N
int thr_id = get_local_id(2); // 0~15
Expand All @@ -212,18 +214,19 @@ fully_connected_gpu_int4_gemv(__global half *input, const __global half *scales,
float2 sum_all = 0;
for (int gk = gk0; gk < gk1; gk++) {
__global half *A = input + gk * DECOMPRESSION_GROUP_SIZE;
const __global char *B =
weights + get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n,
WEIGHTS_K, WEIGHTS_N, 32);
int w_id = get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n,
ALIGN_WEIGHTS_K, ALIGN_WEIGHTS_N, 32);

const __global char *B = weights + w_id;

GEMV_ACCUMULATOR_VEC_TYPE sum = 0;

#if SCALE_ROW_MAJOR
float scale_0 = convert_float(scales[gk]);
float scale_1 = convert_float(scales[gk + 16 * SCALE_GROUP_NUM]);
#else
float scale_0 = convert_float(scales[gk * WEIGHTS_N]);
float scale_1 = convert_float(scales[gk * WEIGHTS_N + 16]);
float scale_0 = convert_float(scales[gk * ALIGN_WEIGHTS_N]);
float scale_1 = convert_float(scales[gk * ALIGN_WEIGHTS_N + 16]);
#endif

__attribute__((opencl_unroll_hint(4))) for (int g = 0;
Expand Down
51 changes: 29 additions & 22 deletions nntrainer/tensor/cl_operations/cl_kernels/openvino_gemm.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,27 @@
#define COMPRESSED_WEIGHTS_INT4 1
#define FILTER_LAYOUT_OS_IS_YX_OSV32_ISV2 1

#define CEIL_DIV(a, b) (((a) + (b)-1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLAMP(v, l, u) MAX((l), MIN((v), (u)))

#define ALIGN_SIZE_K ALIGN(SIZE_K, SIZE_QUANTIZATION_GROUP)

#define DECOMPRESSION_SCALE_TERM 1
#define DECOMPRESSION_SCALE_GROUP_SIZE SIZE_QUANTIZATION_GROUP
#define DECOMPRESSION_SCALE_GROUPS_NUM (SIZE_K / DECOMPRESSION_SCALE_GROUP_SIZE)
#define DECOMPRESSION_SCALE_GROUPS_NUM \
CEIL_DIV(SIZE_K, DECOMPRESSION_SCALE_GROUP_SIZE)

#define DECOMPRESSION_SCALE_BATCH_NUM SIZE_N
#define TILE_IFM_ELEMENTS_SIZE 32
#define ALIGN_SIZE_N ALIGN(SIZE_N, TILE_IFM_ELEMENTS_SIZE)

#define DECOMPRESSION_SCALE_BATCH_NUM ALIGN_SIZE_N
#define DECOMPRESSION_SCALE_BATCH_PITCH DECOMPRESSION_SCALE_GROUPS_NUM
#define DECOMPRESSION_SCALE_FEATURE_PITCH 1
#define DECOMPRESSION_SCALE_LENGTH (SIZE_N * DECOMPRESSION_SCALE_GROUPS_NUM)
#define DECOMPRESSION_SCALE_LENGTH \
((ALIGN_SIZE_N) * (DECOMPRESSION_SCALE_GROUPS_NUM))

#define INPUT0_TYPE half
#define OUTPUT_TYPE half
Expand All @@ -21,7 +34,7 @@
#define INPUT0_OFFSET 0
#define OUTPUT_OFFSET 0

#define IFM_SIZE SIZE_K
#define IFM_SIZE ALIGN_SIZE_K

#define ACCUMULATOR_TYPE float
#define ACTIVATION_TYPE float
Expand Down Expand Up @@ -54,12 +67,11 @@
#define OUTER_OFM 1
#define DISPATCH_BSV 1
#define DISPATCH_FSV 1
#define TILE_IFM_ELEMENTS_SIZE 32
#define NUM_LOOP_IN_DYN_QUAN_GROUP (QUANTIZE_GROUP_SIZE / (TILE_IFM * SIMD))
#define REALIGN_FP16_OFFSET 0
#define TILE_OUT_F_NUM SIZE_N
#define TILE_OUT_F_PITCH 1
#define TILE_IN_B_PITCH SIZE_K
#define TILE_IN_B_PITCH ALIGN_SIZE_K
#define TILE_OUT_B_PITCH SIZE_N

#define ACTIVATION_FUNC_TYPED(input, params) (input)
Expand Down Expand Up @@ -153,11 +165,6 @@ inline int imad_SW(int acc, uchar4 input, uchar4 weight)
#endif

#define unroll_for __attribute__((opencl_unroll_hint)) for
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLAMP(v, l, u) MAX((l), MIN((v), (u)))

// Creates vector type.
#define MAKE_VECTOR_TYPE_IMPL_1(elem_type) elem_type
Expand Down Expand Up @@ -252,8 +259,8 @@ inline int imad_SW(int acc, uchar4 input, uchar4 weight)
CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)

#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)( \
(const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))
BLOCK_READN_FUNC(type_size, vector_size) \
((const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))

#define BLOCK_READN(type, vector_size, ptr, offset) \
AS_TYPE( \
Expand Down Expand Up @@ -458,9 +465,9 @@ DECLARE_BLOCK_READ_EMULATION(8, 8)
CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)

#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
BLOCK_WRITEN_FUNC(type_size, vector_size)( \
(addr_space BLOCK_WRITE_TYPE(type_size) *)(ptr) + (offset), \
AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_WRITE_TYPE(type_size), vector_size), val))
BLOCK_WRITEN_FUNC(type_size, vector_size) \
((addr_space BLOCK_WRITE_TYPE(type_size) *)(ptr) + (offset), \
AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_WRITE_TYPE(type_size), vector_size), val))

#define BLOCK_WRITEN(type, vector_size, ptr, offset, val) \
BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset, val)
Expand Down Expand Up @@ -1335,16 +1342,16 @@ inline void fc_bf_tiled_kernel_dyn_quan(

// =====================================================================================================================================
// Main computation loop
const uint iterations =
MAIN_LOOP_ELEMENTS_COUNT /
TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD)
const uint iterations = CEIL_DIV(
MAIN_LOOP_ELEMENTS_COUNT,
TILE_IFM_ELEMENTS_SIZE); // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD)
// Each sub-group loads 2 Batch
const uint idx_sglid =
(sglid * TILE_K) %
TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction
const uint batch_sglid =
(sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction
const uint scale_pitch = (TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE);
const uint scale_pitch = CEIL_DIV(TILE_IN_B_PITCH, QUANTIZE_GROUP_SIZE);

#if PER_TOKEN_SIZE_DYN_QUANTIZE
// Each token is quantized by once. So, all MAIN_LOOP_ELEMENTS_COUNT share
Expand Down Expand Up @@ -1379,7 +1386,7 @@ inline void fc_bf_tiled_kernel_dyn_quan(
++ni) {
uint in_offset =
input_offset + (idx_sglid + batch_sglid * TILE_IN_B_PITCH);
uint scale_offset = input_offset / QUANTIZE_GROUP_SIZE;
uint scale_offset = CEIL_DIV(input_offset, QUANTIZE_GROUP_SIZE);
for (uint bi = 0; bi < HALF_TILE_B; ++bi) {
// Load quantizing info from pre-quantizing kernel
tiled_input_0[bi] = vload4(0, &quantized_input[in_offset]);
Expand Down Expand Up @@ -1685,7 +1692,7 @@ inline void fc_bf_tiled_kernel_dyn_quan(
#else
const uint scale_offset =
(offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) +
ni_offset * SIZE_N;
ni_offset * ALIGN_SIZE_N;
#endif
ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
#else
Expand Down
Loading
Loading