Address comments from reviewers

milpuz01 · milpuz01 · commit 040fe4944480 · 2026-02-07T21:36:38.000Z
Signed-off-by: Milos Puzovic <milos.puzovic@arm.com>
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -327,13 +327,17 @@ function (setup_arm_neon_nchwc)
    ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.h
    ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.cpp
    ${MLAS_SRC_DIR}/spool_nchwc_kernel_neon.cpp
-   # Hand written AArch64 micro-kernel for NCHW convolution.  Using a
-   # separate assembly file allows tighter control over register allocation
-   # and avoids the overhead of C++/intrinsics based code generation.
-   ${MLAS_SRC_DIR}/aarch64/SconvKernelNeon.S
-   ${MLAS_SRC_DIR}/aarch64/SconvDepthwiseKernelNeon.S
-   ${MLAS_SRC_DIR}/aarch64/SconvPointwiseKernelNeon.S
   )
+  if(NOT WIN32)
+    target_sources(onnxruntime_mlas PRIVATE
+     # Hand written AArch64 micro-kernel for NCHW convolution.  Using a
+     # separate assembly file allows tighter control over register allocation
+     # and avoids the overhead of C++/intrinsics based code generation.
+     ${MLAS_SRC_DIR}/aarch64/SconvKernelNeon.S
+     ${MLAS_SRC_DIR}/aarch64/SconvDepthwiseKernelNeon.S
+     ${MLAS_SRC_DIR}/aarch64/SconvPointwiseKernelNeon.S
+    )
+  endif()
   list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC)
   set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE)
 endfunction ()
@@ -466,8 +470,6 @@ else()
           ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSdot.S
           ${MLAS_SRC_DIR}/aarch64/SgemmKernelNeon.S
           ${MLAS_SRC_DIR}/aarch64/SgemvKernelNeon.S
-          ${MLAS_SRC_DIR}/aarch64/SconvDepthwiseKernelNeon.S
-          ${MLAS_SRC_DIR}/aarch64/SconvPointwiseKernelNeon.S
           ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelNeon.S
           ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdot.S
           ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdotLd64.S
diff --git a/onnxruntime/core/mlas/lib/aarch64/SconvDepthwiseKernelNeon.S b/onnxruntime/core/mlas/lib/aarch64/SconvDepthwiseKernelNeon.S
@@ -4,7 +4,7 @@ SPDX-License-Identifier: MIT
 
 Module Name:
 
-    SconvDepthwiseFloatKernelNeon.S
+    SconvDepthwiseKernelNeon.S
 
 Abstract:
 
@@ -18,7 +18,6 @@ Abstract:
       * When an output position touches padding, only the affected 4-wide
         lanes are checked individually and loaded; others are zeroed.  This
         mirrors the behavior of the C++ helper LoadInputVectorWithBounds.
-        mirrors the behaviour of the C++ helper LoadInputVectorWithBounds.
       * Keep the multiply/accumulate operations tightly scheduled to hide the
         load latency.
 
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
@@ -963,15 +963,25 @@ extern "C" {
     MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
 #if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
+    // Intrinsics kernel for direct NCHW convolution
+    MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
+#if !defined(_WIN32)
     // AArch64 assembly micro-kernel for direct NCHW convolution
     MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeonAsm;
+#endif
     MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
+    // Intrinsics kernel for depthwise NCHWc convolution
+    MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
+#if !defined(_WIN32)
     // AArch64 assembly micro-kernel for depthwise NCHWc convolution
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeonAsm;
-    MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
+#endif
+    // Intrinsics kernel for pointwise NCHWc convolution
+    MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelNeon;
+#if !defined(_WIN32)
     // AArch64 assembly micro-kernel for pointwise NCHWc convolution
     MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelNeonAsm;
-    MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelNeon;
+#endif
 #if defined(__aarch64__) && defined(__linux__)
     MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseBf16KernelNeon;
 #endif
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -571,10 +571,15 @@ Return Value:
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;
 
 #if defined(MLAS_USE_ARM_NEON_NCHWC)
+    // Use the AArch64 assembly implementation on non-Windows platforms.
+#if !defined(_WIN32)
     // Prefer the hand written micro-kernel for the NCHW convolution path. It
     // offers a tighter schedule and a specialised two-output inner loop that
     // reduces pressure on the memory system compared to the generic kernel.
     this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeonAsm;
+#else
+    this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
+#endif
     // Prefer the hand written AArch64 micro-kernel for pointwise convolution
     // as it computes multiple output positions at once and significantly
     // reduces memory traffic. The AArch64 assembly kernel is selected by
diff --git a/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp
@@ -183,6 +183,58 @@ void
 }
 
 
+// MlasConvNchwFloatKernelNeon: NEON intrinsics kernel for direct NCHW
+// convolution. Thin wrapper that forwards every argument unchanged to
+// MlasConvFloatKernelNeonImpl<false>.
+
+void
+    MLASCALL
+    MlasConvNchwFloatKernelNeon(
+        const float* Input,
+        const float* Filter,
+        float* Output,
+        size_t StrideWidth,
+        size_t DilationWidth,
+        size_t FilterCount,
+        size_t InputStride,
+        size_t FilterStride,
+        size_t OutputStride,
+        size_t KernelHeight,
+        size_t KernelWidth,
+        const float* InputBase,
+        size_t InputWidth,
+        size_t DilatedInputWidth,
+        size_t OutputCountLeftPad,
+        size_t OutputCount,
+        size_t OutputCountRightPad,
+        const float* Bias,
+        unsigned KernelFlags
+    )
+{
+    MlasConvFloatKernelNeonImpl<false>(  // NOTE(review): <false> presumably selects the non-NCHWc input path - confirm against the template
+        Input,
+        Filter,
+        Output,
+        StrideWidth,
+        DilationWidth,
+        FilterCount,
+        InputStride,
+        FilterStride,
+        OutputStride,
+        KernelHeight,
+        KernelWidth,
+        InputBase,
+        InputWidth,
+        DilatedInputWidth,
+        OutputCountLeftPad,
+        OutputCount,
+        OutputCountRightPad,
+        Bias,
+        KernelFlags
+    );
+}
+
+
 //
 // Implementation of MlasConvNchwcFloatKernelNeon
 //
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -882,7 +882,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
 
 #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* const KernelFast = MlasConvPointwiseFloatKernelNeonAsm;
 #endif
 #if defined(__aarch64__) && defined(__linux__)
@@ -940,7 +940,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
                 //
 
                 MLAS_CONV_POINTWISE_FLOAT_KERNEL* KernelToUse = Kernel;
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
                 if (!WorkBlock->UseBf16 && OutputThisIteration >= 4 &&
                     StrideHeight == 1 && StrideWidth == 1) {
                     KernelToUse = KernelFast;
@@ -1034,7 +1034,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
 
 #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* const KernelFast = MlasConvDepthwiseFloatKernelNeonAsm;
 #endif
 #else
@@ -1061,7 +1061,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
             //
 
             MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* KernelToUse = Kernel;
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
             if (OutputWidth >= 4) {
                 KernelToUse = KernelFast;
             }