Extend MlasSBGemmBatch to accept ZeroMode

Rohanjames1997 · Rohanjames1997 · commit 6cbbd2548504 · 2025-12-19T17:13:18.000Z
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
@@ -1955,6 +1955,7 @@ struct MLAS_SBGEMM_DATA_PARAMS {
     const MLAS_SBGEMM_POSTPROCESSOR* OutputProcessor = nullptr;
     bool AIsfp32 = false; /**< matrix A is fp32, needs to be converted to bf16*/
     bool BIsfp32 = false; /**< matrix B is fp32, needs to be converted to bf16*/
+    bool ZeroMode = true; /**< true: C = A*B, false: C += A*B */
 };
 
 /**
diff --git a/onnxruntime/core/mlas/lib/sbconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sbconv_kernel_neon.cpp
@@ -183,76 +183,105 @@ void
 }
 
 //
-// BF16 Pointwise Convolution Kernel
+// BF16 Pointwise (1x1) Convolution Kernel using SBGEMM.
+//
+// Issues one SBGEMM (M = OutputCount, N = K = BlockSize) per
+// (filter block, input channel block) pair. The first input channel block
+// overwrites the output unless MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT is set;
+// every later block accumulates into it. The *Stride arguments are divided
+// by sizeof(float) below, i.e. they are byte counts. The explicit
+// +0/+4/+8/+12 vector offsets cover 16 floats per output position, so the
+// code presumes BlockSize == 16.
 //
 void MLASCALL
 MlasConvPointwiseBf16KernelNeon(
     const float* Input,
     const float* Filter,
     float* Output,
     size_t StrideWidth,
-    size_t InputChannels, /* numChannels/BlockSize = 16/16 = 1 */
+    size_t InputChannels,
     size_t FilterCount,
-    size_t /*InputStride*/,
+    size_t InputStride,
     size_t FilterStride,
     size_t OutputStride,
     size_t OutputCount,
     const float* Bias,
     unsigned KernelFlags
 )
 {
+    const bool AccumulateOutput = (KernelFlags & MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT) != 0;
     const bool BiasAddition = (KernelFlags & MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION) != 0;
+    const bool ReluActivation = (KernelFlags & MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION) != 0;
 
     const size_t StrideWidthElements = StrideWidth / sizeof(float);
+    const size_t InputStrideElements = InputStride / sizeof(float);
     const size_t FilterStrideElements = FilterStride / sizeof(float);
     const size_t OutputStrideElements = OutputStride / sizeof(float);
 
-    const float32x4_t ZeroVector = MlasBroadcastFloat32x4(0.0f);
-    const float32x4_t ReluMask = vreinterpretq_f32_s32(MlasBroadcastInt32x4(-(KernelFlags & MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION)));
+    // SBGEMM only adds bias when ZeroMode=true. When accumulating (ZeroMode=false),
+    // pre-add bias to existing output before the GEMM operations.
+    if (BiasAddition && AccumulateOutput) {
+        for (size_t f = 0; f < FilterCount; f++) {
+            float* output = Output + f * OutputStrideElements;
+            const float32x4_t b0 = MlasLoadFloat32x4(&Bias[f * BlockSize]);
+            const float32x4_t b1 = MlasLoadFloat32x4(&Bias[f * BlockSize + 4]);
+            const float32x4_t b2 = MlasLoadFloat32x4(&Bias[f * BlockSize + 8]);
+            const float32x4_t b3 = MlasLoadFloat32x4(&Bias[f * BlockSize + 12]);
+            for (size_t i = 0; i < OutputCount; i++) {
+                MlasStoreFloat32x4(&output[i * BlockSize], MlasAddFloat32x4(b0, MlasLoadFloat32x4(&output[i * BlockSize])));
+                MlasStoreFloat32x4(&output[i * BlockSize + 4], MlasAddFloat32x4(b1, MlasLoadFloat32x4(&output[i * BlockSize + 4])));
+                MlasStoreFloat32x4(&output[i * BlockSize + 8], MlasAddFloat32x4(b2, MlasLoadFloat32x4(&output[i * BlockSize + 8])));
+                MlasStoreFloat32x4(&output[i * BlockSize + 12], MlasAddFloat32x4(b3, MlasLoadFloat32x4(&output[i * BlockSize + 12])));
+            }
+        }
+    }
 
-    std::vector<MLAS_SBGEMM_DATA_PARAMS> gemm_params(FilterCount);
+    // Build SBGEMM params for all (filter, input_channel) combinations.
+    // Callers are expected to pass FilterCount <= 4 and InputChannels <= 8
+    // (at most 32 entries), but nothing here enforces that, so the fixed-size
+    // buffer is flushed whenever it fills up instead of being overrun.
+    // Entries targeting the same C rely on running in index order: ic == 0
+    // initializes C (unless accumulating) and the later ones add to it.
+    // Bias is set on all elements but SBGEMM only uses it when ZeroMode=true.
+    constexpr size_t MaxBatchEntries = 32;
+    MLAS_SBGEMM_DATA_PARAMS gemm_params[MaxBatchEntries];
 
+    size_t idx = 0;
     for (size_t f = 0; f < FilterCount; f++) {
         const float* filter = Filter + f * FilterStrideElements;
         float* output = Output + f * OutputStrideElements;
-
-        gemm_params[f].A = Input;
-        gemm_params[f].B = filter;
-        gemm_params[f].C = output;
-        gemm_params[f].lda = StrideWidthElements;
-        gemm_params[f].ldb = BlockSize;
-        gemm_params[f].ldc = BlockSize;
-        gemm_params[f].Bias = BiasAddition ? (Bias + f * BlockSize) : nullptr;
-        gemm_params[f].AIsfp32 = true;
-        gemm_params[f].BIsfp32 = true;
-        gemm_params[f].OutputProcessor = nullptr;
+        for (size_t ic = 0; ic < InputChannels; ic++, idx++) {
+            if (idx == MaxBatchEntries) {
+                // Buffer full: run the queued GEMMs, then reuse the buffer.
+                MlasSBGemmBatch(OutputCount, BlockSize, BlockSize, idx, gemm_params, nullptr);
+                idx = 0;
+            }
+            gemm_params[idx].A = Input + ic * InputStrideElements;
+            gemm_params[idx].B = filter + ic * BlockSize * BlockSize;
+            gemm_params[idx].C = output;
+            gemm_params[idx].lda = StrideWidthElements;
+            gemm_params[idx].ldb = BlockSize;
+            gemm_params[idx].ldc = BlockSize;
+            gemm_params[idx].Bias = BiasAddition ? (Bias + f * BlockSize) : nullptr;
+            gemm_params[idx].AIsfp32 = true;
+            gemm_params[idx].BIsfp32 = true;
+            gemm_params[idx].ZeroMode = (ic == 0) && !AccumulateOutput;
+            gemm_params[idx].OutputProcessor = nullptr;
+        }
     }
 
-    MlasSBGemmBatch(OutputCount, BlockSize, InputChannels * BlockSize, FilterCount, gemm_params.data(), nullptr);
-
-    for (size_t f = 0; f < FilterCount; f++) {
-        float* output = Output + f * OutputStrideElements;
-
-        for (size_t output_idx = 0; output_idx < OutputCount; output_idx++) {
-            float32x4_t Accumulator0 = MlasLoadFloat32x4(&output[output_idx * BlockSize]);
-            float32x4_t Accumulator1 = MlasLoadFloat32x4(&output[output_idx * BlockSize + 4]);
-            float32x4_t Accumulator2 = MlasLoadFloat32x4(&output[output_idx * BlockSize + 8]);
-            float32x4_t Accumulator3 = MlasLoadFloat32x4(&output[output_idx * BlockSize + 12]);
-
-            float32x4_t Relu0 = MlasMaximumFloat32x4(Accumulator0, ZeroVector);
-            float32x4_t Relu1 = MlasMaximumFloat32x4(Accumulator1, ZeroVector);
-            float32x4_t Relu2 = MlasMaximumFloat32x4(Accumulator2, ZeroVector);
-            float32x4_t Relu3 = MlasMaximumFloat32x4(Accumulator3, ZeroVector);
-
-            Accumulator0 = MlasBlendFloat32x4(Accumulator0, Relu0, ReluMask);
-            Accumulator1 = MlasBlendFloat32x4(Accumulator1, Relu1, ReluMask);
-            Accumulator2 = MlasBlendFloat32x4(Accumulator2, Relu2, ReluMask);
-            Accumulator3 = MlasBlendFloat32x4(Accumulator3, Relu3, ReluMask);
-
-            MlasStoreFloat32x4(&output[output_idx * BlockSize], Accumulator0);
-            MlasStoreFloat32x4(&output[output_idx * BlockSize + 4], Accumulator1);
-            MlasStoreFloat32x4(&output[output_idx * BlockSize + 8], Accumulator2);
-            MlasStoreFloat32x4(&output[output_idx * BlockSize + 12], Accumulator3);
+    MlasSBGemmBatch(OutputCount, BlockSize, BlockSize, idx, gemm_params, nullptr);
+
+    if (ReluActivation) {
+        const float32x4_t ZeroVector = MlasBroadcastFloat32x4(0.0f);
+        for (size_t f = 0; f < FilterCount; f++) {
+            float* output = Output + f * OutputStrideElements;
+            for (size_t i = 0; i < OutputCount; i++) {
+                MlasStoreFloat32x4(&output[i * BlockSize], MlasMaximumFloat32x4(MlasLoadFloat32x4(&output[i * BlockSize]), ZeroVector));
+                MlasStoreFloat32x4(&output[i * BlockSize + 4], MlasMaximumFloat32x4(MlasLoadFloat32x4(&output[i * BlockSize + 4]), ZeroVector));
+                MlasStoreFloat32x4(&output[i * BlockSize + 8], MlasMaximumFloat32x4(MlasLoadFloat32x4(&output[i * BlockSize + 8]), ZeroVector));
+                MlasStoreFloat32x4(&output[i * BlockSize + 12], MlasMaximumFloat32x4(MlasLoadFloat32x4(&output[i * BlockSize + 12]), ZeroVector));
+            }
         }
     }
 }
diff --git a/onnxruntime/core/mlas/lib/sbgemm.h b/onnxruntime/core/mlas/lib/sbgemm.h
@@ -112,7 +112,7 @@ MlasSBGemmKernel(const size_t CountM, const size_t CountN, const size_t CountK,
 
 template <typename KernelType>
 MLAS_FORCEINLINE void
-MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size_t AlignedN, size_t K, const float* A, size_t lda, const void* PackedB, float* C, size_t ldc, const float* Bias, void* PostProcessor)
+MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size_t AlignedN, size_t K, const float* A, size_t lda, const void* PackedB, float* C, size_t ldc, const float* Bias, void* PostProcessor, bool InitialZeroMode)
 {
     constexpr MLAS_SBGEMM_STRIDES Strides = KernelType::Strides;
     size_t PackedStrideN = Strides.N;
@@ -131,7 +131,7 @@ MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size
         //
         size_t CountK;
         for (size_t k = 0; k < K; k += CountK) {
-            bool ZeroMode = (k == 0);
+            bool ZeroMode = (k == 0) && InitialZeroMode;
             CountK = std::min(K - k, PackedStrideK);
 
             const bfloat16_t* pb = (const bfloat16_t*)PackedB + AlignedN * k + CountK * SliceStartN;
@@ -148,7 +148,7 @@ MlasSBGemmPackedOperation(size_t M, size_t RangeStartN, size_t RangeCountN, size
 
 template <typename KernelType>
 void
-MlasSBGemmNonPackedOperation(size_t M, size_t N, size_t K, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc, const float* Bias, void* PostProcessor)
+MlasSBGemmNonPackedOperation(size_t M, size_t N, size_t K, const float* A, size_t lda, const float* B, size_t ldb, float* C, size_t ldc, const float* Bias, void* PostProcessor, bool InitialZeroMode)
 {
     //
     // Compute the strides to step through slices of the input matrices.
@@ -201,7 +201,7 @@ MlasSBGemmNonPackedOperation(size_t M, size_t N, size_t K, const float* A, size_
             const float* pbias =
                 ((nullptr == Bias) ? nullptr : Bias + n);  // TODO: check the SliceNStart
 
-            bool ZeroMode = (k == 0);
+            bool ZeroMode = (k == 0) && InitialZeroMode;
             MlasSBGemmKernel<KernelType>(M, CountN, CountK, A + k, lda, PanelB, c, ldc, ZeroMode ? pbias : nullptr, ZeroMode);
         }
         if (PostProcessor != nullptr) {
@@ -249,16 +249,17 @@ MlasSBGemmOperation(const ptrdiff_t ThreadCountM, const ptrdiff_t ThreadCountN,
     const float* A = (const float*)DataParams->A + RangeStartM * lda;
     float* C = DataParams->C + RangeStartM * ldc + RangeStartN;
     const float* bias = DataParams->Bias;
+    const bool zeroMode = DataParams->ZeroMode;
 
     if (!DataParams->BIsfp32) {
         MlasSBGemmPackedOperation<KernelType>(
             RangeCountM, RangeStartN, RangeCountN, BlockedN * MLAS_SGEMM_STRIDEN_THREAD_ALIGN, K, A,
-            lda, DataParams->B, C, ldc, bias, (void*)DataParams->OutputProcessor
+            lda, DataParams->B, C, ldc, bias, (void*)DataParams->OutputProcessor, zeroMode
         );
     } else {
         const size_t ldb = DataParams->ldb;
         const float* B = (const float*)DataParams->B + RangeStartN;
-        MlasSBGemmNonPackedOperation<KernelType>(RangeCountM, RangeCountN, K, A, lda, B, ldb, C, ldc, bias, (void*)DataParams->OutputProcessor);
+        MlasSBGemmNonPackedOperation<KernelType>(RangeCountM, RangeCountN, K, A, lda, B, ldb, C, ldc, bias, (void*)DataParams->OutputProcessor, zeroMode);
     }
 }