Allow MSVC to build KleidiAI in Windows on Arm environments. (#26995)

Colm-in-Arm · web-flow · commit 13774261c728 · 2026-01-14T00:57:49.000Z
### Description Remove the limitations on using onnxruntime_USE_KLEIDIAI in a Windows on Arm environment. ### Motivation and Context Historically, the KleidiAI build had difficulties with the Microsoft compiler (MSVC) for Arm environments. As a result, a hard exclusion of onnxruntime_USE_KLEIDIAI and MSVC was added and subsequently consolidated into cmake/CMakeLists.txt by [this](2e8a45a) commit. The problems in KleidiAI were resolved in their v1.14.0 release. v1.15.0 was introduced via [this](8fe4804) commit. This PR removes the limitation, allowing MSVC to be used to compile with onnxruntime_USE_KLEIDIAI enabled in a Windows on Arm environment. In addition, there were legacy restrictions in CMakeLists.txt relating to the DOTPROD and I8MM CPU features. These are already handled in the KleidiAI build. ### Verification Following the Windows build instructions [here](https://onnxruntime.ai/docs/build/inferencing.html#windows), KleidiAI and its associated logic in MLAS will be built when ARM64 is detected. **Note**: As is made clear in these build instructions, MSVC must include support for ARM64. Both Python and CMake must be native ARM64. Signed-off-by: Colm Donelan <colm.donelan@arm.com>
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -539,25 +539,6 @@ if(onnxruntime_USE_KLEIDIAI)
       set(${is_supported_var} FALSE PARENT_SCOPE)
       return()
     endif()
-
-    # check for compiler support
-    if(MSVC)
-      # TODO detect on MSVC
-    else()
-      check_cxx_compiler_flag(-march=armv8.2-a+dotprod HAS_ARM64_DOTPROD)
-      check_cxx_compiler_flag(-march=armv8.2-a+i8mm HAS_ARM64_I8MM)
-      if(NOT HAS_ARM64_DOTPROD)
-        message(WARNING "The compiler doesn't support dotprod instructions.")
-      endif()
-      if(NOT HAS_ARM64_I8MM)
-        message(WARNING "The compiler doesn't support i8mm instructions.")
-      endif()
-      if(NOT HAS_ARM64_DOTPROD OR NOT HAS_ARM64_I8MM)
-        set(${is_supported_var} FALSE PARENT_SCOPE)
-        return()
-      endif()
-    endif()
-
     set(${is_supported_var} TRUE PARENT_SCOPE)
   endfunction()
 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -163,7 +163,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
 
   Status Compute(OpKernelContext* context) const override;
 
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override {
@@ -307,7 +307,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
  private:
   // Indicates when MlasDynamicQGemmBatch() can be used
   bool can_use_dynamic_quant_mlas_{false};
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
   // Indicates that the biases are a constant input and thus already quantized / packed
   bool dynamic_quant_mlas_bias_data_was_packed_{false};
 #endif
@@ -382,7 +382,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   }
   // Guard against KleidiAI functions being called in non kleidi builds
   // TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
   else {
     MatMulComputeHelper helper;
     ORT_RETURN_IF_ERROR(helper.Compute(ctx->Input<Tensor>(IN_A)->Shape(),
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
@@ -2116,7 +2116,7 @@ MlasFlashAttention(
     MLAS_THREADPOOL* ThreadPool
 );
 
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
 /**
  * @brief Function to override the packing mechanism decision if kleidi ai is included
  * @param enable     enable kleidiai packing (allow or disallow depending on true/false)
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -19,7 +19,7 @@ Module Name:
 #ifdef MLAS_USE_SVE
 #include "sve/mlasi_sve.h"
 #endif
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
 #include "kleidiai/mlasi_kleidiai.h"
 #endif
 
@@ -603,7 +603,7 @@ Return Value:
         this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;
     }
 
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
     if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){
         this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch;
         this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize;
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -19,7 +19,7 @@ Module Name:
 #include "qgemm.h"
 
 // TODO: When overrides are implemented, remove this
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
 #include "kleidiai/mlasi_kleidiai.h"
 #endif
 
@@ -205,7 +205,7 @@ bool
 MLASCALL
 MlasIsDynamicQGemmAvailable()
 {
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
   return ArmKleidiAI::UseSME2;
 #else
   return false;
@@ -222,7 +222,7 @@ MlasDynamicQGemmBatch (
 ) {
     assert(MlasIsDynamicQGemmAvailable());
 
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
     //No fallback
     ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
 #endif
@@ -346,7 +346,7 @@ MlasDynamicQgemmPackBSize(
     assert(MlasIsDynamicQGemmAvailable());
 
     size_t bytes = 0;
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
     //No fallback available
     //TODO: Insert Override
     bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
@@ -440,7 +440,7 @@ MlasDynamicQgemmPackB(
 {
     assert(MlasIsDynamicQGemmAvailable());
 
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
     //No fallback
     ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
 #endif
diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp
@@ -1670,7 +1670,7 @@ Return Value:
     // Compute the number of bytes required to hold the packed buffer.
     //
     // KleidiAI or other override
-    #if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+    #if defined(USE_KLEIDIAI)
     if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr &&
         // TODO: Remove once KAI supports transposing for A
         TransA != CBLAS_TRANSPOSE::CblasTrans) {
@@ -1737,7 +1737,7 @@ Return Value:
 
 --*/
 {
-#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+#if defined(USE_KLEIDIAI)
     if (GetMlasPlatform().MlasGemmPackBOverride != nullptr  &&
         // TODO: Remove once KAI supports transposing for A
         TransA != CBLAS_TRANSPOSE::CblasTrans    &&