Move comment to more appropriate place

milpuz01 · milpuz01 · commit bd38b0e2c854 · 2026-02-12T22:24:15.000Z
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -580,11 +580,6 @@ Return Value:
 #else
     this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
 #endif
-    // Prefer the hand written AArch64 micro-kernel for pointwise convolution
-    // as it computes multiple output positions at once and significantly
-    // reduces memory traffic. The AArch64 assembly kernel is selected by
-    // heuristics in snchwc.cpp to avoid regressions on small convolutions, so
-    // we set the default to the intrinsics version here.
     this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
     this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
     this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelNeon;
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -883,6 +883,8 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
 #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
+        // The AArch64 assembly pointwise convolution kernel computes multiple
+        // output positions at once, significantly reducing memory traffic.
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* const KernelFast = MlasConvPointwiseFloatKernelNeonAsm;
 #endif
 #if defined(__aarch64__) && defined(__linux__)
@@ -941,6 +943,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
 
                 MLAS_CONV_POINTWISE_FLOAT_KERNEL* KernelToUse = Kernel;
 #if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC) && !defined(_WIN32)
+                // Heuristically select the AArch64 assembly kernel for larger convolutions
                 if (!WorkBlock->UseBf16 && OutputThisIteration >= 4 &&
                     StrideHeight == 1 && StrideWidth == 1) {
                     KernelToUse = KernelFast;