From d4943e458fbccda7773d1238909dffcf5c62554f Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Fri, 19 Dec 2025 10:53:53 +0000 Subject: [PATCH 01/13] Add an NHWC implementation of convolution to avoid transposes * Modification to the CPU EP to specify channels_last when data format is NHWC * Added an NhwcFusedConv kernel * Implementation of the kernel in mlas * Added compiler guards so it is only used with KleidiAI (for now, can be removed if needed) * Added unittests Signed-off-by: Orlaith Monahan --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + onnxruntime/contrib_ops/cpu/fused_conv.cc | 8 + .../framework/kernel_type_str_resolver.cc | 12 ++ .../graph/contrib_ops/nhwc_schema_defs.cc | 2 +- onnxruntime/core/mlas/inc/mlas.h | 2 + onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../mlas/lib/kleidiai/convolve_kleidiai.cpp | 31 ++-- .../core/mlas/lib/kleidiai/mlasi_kleidiai.h | 1 + onnxruntime/core/mlas/lib/mlasi.h | 2 + .../core/optimizer/conv_activation_fusion.cc | 5 +- .../core/optimizer/conv_add_act_fusion.cc | 10 +- .../layout_transformation.cc | 1 + .../core/optimizer/nhwc_transformer.cc | 158 +++++++++++++++++- onnxruntime/core/optimizer/nhwc_transformer.h | 5 + onnxruntime/core/providers/cpu/nn/conv.cc | 153 ++++++++++++++--- onnxruntime/core/providers/cpu/nn/conv.h | 3 +- onnxruntime/core/util/math_cpu.cc | 1 + .../test/framework/ort_model_only_test.cc | 61 ++++++- .../internal_testing_tests.cc | 69 +++++--- onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 + onnxruntime/test/mlas/unittest/test_conv2d.h | 1 + .../test/optimizer/conv_add_act_test.cc | 5 +- .../fuse_initializers_transformer_test.cc | 5 +- .../test/optimizer/nhwc_transformer_test.cc | 22 +++ 24 files changed, 488 insertions(+), 76 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index d959d11e3fd43..fc5f3a459e616 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ 
b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -18,6 +18,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); @@ -302,6 +303,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/fused_conv.cc b/onnxruntime/contrib_ops/cpu/fused_conv.cc index 5374222dbabcc..d77efc26d0e2f 100644 --- a/onnxruntime/contrib_ops/cpu/fused_conv.cc +++ b/onnxruntime/contrib_ops/cpu/fused_conv.cc @@ -26,5 +26,13 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( .TypeConstraint("T", DataTypeImpl::GetTensorType()), FusedConvFloat); +ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( + NhwcFusedConv, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + FusedConvFloat); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.cc b/onnxruntime/core/framework/kernel_type_str_resolver.cc index 3142f94f289b3..aacbc8fc0a4fb 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver.cc 
@@ -36,6 +36,18 @@ static OpKernelTypeStrMap::const_iterator LookUpOpId(const OpIdentifier& op_id, } } +#ifdef USE_KLEIDIAI + // KleidiAI specific block for NhwcFusedConv + if (op_it == map.end() && op_id.domain == kMSDomain && op_id.op_type == "NhwcFusedConv") { + const auto fused_conv_op_id = OpIdentifier{std::string{kMSDomain}, "FusedConv", op_id.since_version}; + op_it = map.find(fused_conv_op_id); + if (op_it == map.end()) { + const auto conv_op_id = OpIdentifier{std::string{kOnnxDomain}, "Conv", op_id.since_version}; + op_it = map.find(conv_op_id); + } + } +#endif + return op_it; } diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc index 8fe3a4d5f3b6f..5a57a58360ddf 100644 --- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc @@ -403,7 +403,7 @@ Only has fp16 implementation as of 2023/04/15. .Input(2, "B", "", "T", OpSchema::Optional) .Input(3, "Z", "Tensor to be added to the output, must be the same shape and format as the output tensor.", "T", OpSchema::Optional) .Output(0, "Y", "", "T") - .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output types to float tensors") + .TypeConstraint("T", {"tensor(float16)", "tensor(float)"}, "Constrain input and output types to float tensors") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); convPoolShapeInferenceNhwc(ctx, true, false, 0, 1); diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 248c6d74e6cbd..adfdf363295fd 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -851,6 +851,7 @@ struct MLAS_CONV_PARAMETERS { size_t BatchCount; size_t GroupCount; size_t InputChannels; + bool ChannelsLast; size_t InputShape[3]; size_t KernelShape[3]; size_t DilationShape[3]; @@ -890,6 +891,7 @@ 
MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool); diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 9518134631f2d..f0c1d870d6cd9 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1146,6 +1146,7 @@ MlasConvPrepare( size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ) @@ -1204,7 +1205,7 @@ Return Value: if (GetMlasPlatform().MlasConvPrepareOverride != nullptr && GetMlasPlatform().MlasConvPrepareOverride(Parameters, Dimensions, BatchCount, GroupCount, InputChannels, InputShape,KernelShape,DilationShape, Padding, StrideShape, OutputShape, FilterCount, - Activation, WorkingBufferSize, Beta, ThreadPool)){ + Activation, WorkingBufferSize, ChannelsLast, Beta, ThreadPool)){ return; } // @@ -1215,6 +1216,7 @@ Return Value: Parameters->BatchCount = BatchCount; Parameters->GroupCount = GroupCount; Parameters->InputChannels = InputChannels; + Parameters->ChannelsLast = ChannelsLast; Parameters->FilterCount = FilterCount; Parameters->Beta = Beta; diff --git a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp index 487e1533f5967..60c8e9b562aec 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp @@ -448,6 +448,7 @@ static std::shared_ptr LhsPtrFill(const size_t ci, const size_t i static std::unique_ptr LhsPackImageDataSme(const size_t ci, const size_t ih, const size_t iw, const size_t kh, const size_t kw, const size_t sh, const size_t sw, const size_t padding, const float* in, + bool input_is_channels_last, MLAS_THREADPOOL* ThreadPool) { size_t padsize = 256; @@ -472,7 +473,14 @@ static std::unique_ptr 
LhsPackImageDataSme(const size_t ci, const s const auto lhs_size = kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(m,kh*kw,ci); auto lhs = std::make_unique(lhs_size); - auto nhwc = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool); + std::unique_ptr nhwc_holder; + const float* activation_src = nullptr; + if (input_is_channels_last) { + activation_src = in; + } else { + nhwc_holder = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool); + activation_src = nhwc_holder.get(); + } // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions. thread_local std::unordered_map> lhs_ptrs_cache; @@ -485,7 +493,7 @@ static std::unique_ptr LhsPackImageDataSme(const size_t ci, const s lhs_ptrs_cache[key] = lhs_ptrs; } - MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], &nhwc[0], &pad_ptr[0]); + MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], activation_src, &pad_ptr[0]); return lhs; } @@ -507,6 +515,7 @@ static void ConvolveSme(const size_t co, //channels out const float* in, //in image data float* out, //out image data float* tmp_mlas_aligned, //intermediate buffer if we need to perform a transpose + bool input_is_channels_last, MLAS_THREADPOOL* ThreadPool) { //RhsPackWeightsBiasSme() - to perform dilation increases kernel size and masks unused weights @@ -546,17 +555,13 @@ static void ConvolveSme(const size_t co, //channels out for (size_t g = 0; g < groups; ++g) { - auto result{out}; - //do we require a post matmul transpose ? 
- //output is m x n or image_data x co or hw x co - //MLAS require it as n x m (or co x hw), transpose required - if (co > 1) { - //intermediate buffer required, pre-transpose - //Note: because we are calling MlasTranspose() need to ensure we use a MLAS aligned buffer + auto result = out; + const bool need_transpose = (!input_is_channels_last) && (co > 1); + if (need_transpose) { result = tmp_mlas_aligned; } - auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, ThreadPool); + auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, input_is_channels_last, ThreadPool); auto rhs = RhsPackWeightsBiasSme(co, ci, kh, kw, dilationh, dilationw, weights, bias, ThreadPool); MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [&](ptrdiff_t tid) { @@ -604,7 +609,7 @@ static void ConvolveSme(const size_t co, //channels out } }); - if (result == tmp_mlas_aligned) { + if (need_transpose) { //Note: this could be absorbed into post conv activation MlasTranspose(tmp_mlas_aligned, out, m, co, ThreadPool); } @@ -633,6 +638,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool) { @@ -646,6 +652,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, Parameters->BatchCount = BatchCount; Parameters->GroupCount = GroupCount; Parameters->InputChannels = InputChannels; + Parameters->ChannelsLast = ChannelsLast; Parameters->FilterCount = FilterCount; Parameters->Beta = Beta; @@ -711,7 +718,7 @@ ArmKleidiAI::MlasConv( Parameters->DilationShape[0], Parameters->DilationShape[1], // kernel dilation Parameters->Padding[0], // image padding Parameters->GroupCount, // filter groups - Filter, Bias, Input, Output, WorkingBuffer, ThreadPool); + Filter, Bias, Input, Output, WorkingBuffer, Parameters->ChannelsLast, ThreadPool); MlasActivation(Parameters->Activation, Output, 
nullptr, Parameters->FilterCount, Parameters->OutputSize, Parameters->OutputSize); diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index ca81b9fa426ee..99eb88fcf4d2d 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -147,6 +147,7 @@ MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index ad62cccbfb9c7..1186d5b939d7e 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -827,6 +827,7 @@ void size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ); @@ -847,6 +848,7 @@ bool size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ); diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index b7f5af5888be0..a53099937a94a 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -140,9 +140,12 @@ class FuseConvActivationAction : public ReplaceWithNew { return "FusedConv"; } } else if (domain == kMSDomain) { - if (op_type == "NhwcConv") { + if (op_type == "NhwcConv" || op_type == "NhwcFusedConv") { return "NhwcFusedConv"; } + if (op_type == "FusedConv") { + return "FusedConv"; + } } else if (domain == kMSInternalNHWCDomain) { if (op_type == "Conv") { return "Conv"; diff --git a/onnxruntime/core/optimizer/conv_add_act_fusion.cc b/onnxruntime/core/optimizer/conv_add_act_fusion.cc index 6f90eaf07ef4d..478e7529cb667 100644 --- 
a/onnxruntime/core/optimizer/conv_add_act_fusion.cc +++ b/onnxruntime/core/optimizer/conv_add_act_fusion.cc @@ -211,7 +211,15 @@ class FuseConvAddActivationAction : public ReplaceWithNew { private: std::string OpType(const RuntimeState& runtimeState) const override { - return (runtimeState.selected_nodes.Target().OpType() == "Conv") ? "FusedConv" : "NhwcFusedConv"; + const auto& target = runtimeState.selected_nodes.Target(); + const auto* channels_last_attr = graph_utils::GetNodeAttribute(target, "channels_last"); + const bool channels_last = channels_last_attr != nullptr && channels_last_attr->i() != 0; + + if (target.OpType() == "Conv") { + return channels_last ? "NhwcFusedConv" : "FusedConv"; + } + + return "NhwcFusedConv"; } std::string Domain(const RuntimeState&) const override { return kMSDomain; } diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index f611c992e0f57..5d51c855d13ba 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -68,6 +68,7 @@ const std::unordered_set& GetORTLayoutSensitiveOps() { // Define a static local string array so we can refer to the elements with string_views. 
static const std::string layout_sensitive_contrib_ops[]{ MakeORTLayoutSensitiveOpId(kMSDomain, "FusedConv"), + MakeORTLayoutSensitiveOpId(kMSDomain, "NhwcFusedConv"), MakeORTLayoutSensitiveOpId(kMSDomain, "GridSample"), MakeORTLayoutSensitiveOpId(kMSDomain, "QLinearAveragePool"), MakeORTLayoutSensitiveOpId(kMSDomain, "QLinearGlobalAveragePool"), diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index cd654991c92d5..9544cf7395025 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -2,7 +2,10 @@ // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. +#include #include +#include +#include "core/graph/constants.h" #include "core/mlas/inc/mlas.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" @@ -21,6 +24,72 @@ namespace onnxruntime { using namespace layout_transformation; +#ifdef USE_KLEIDIAI +bool KleidiFp32NhwcFilter(const onnx_transpose_optimization::api::GraphRef& graph, + onnx_transpose_optimization::api::NodeRef& node) { + auto& base_node = NodeFromApiNode(node); + + ORT_UNUSED_PARAMETER(graph); + if (base_node.InputDefs().size() < 2) { + return false; + } + + const auto* input_shape = base_node.InputDefs()[0]->Shape(); + if (input_shape == nullptr || input_shape->dim_size() != 4) { + return false; + } + + const auto& batch_dim = input_shape->dim(0); + if (!utils::HasDimValue(batch_dim) || batch_dim.dim_value() != 1) { + return false; + } + + const auto pads_attr = node.GetAttributeInts("pads"); + if (pads_attr.has_value()) { + const auto& pads = pads_attr.value(); + if (pads.size() != 4 || pads[0] != pads[2] || pads[1] != pads[3]) { + return false; + } + } + + const auto inputs = node.Inputs(); + if (inputs.size() > 3 && !inputs[3].empty()) { + return false; + } + + const auto* weight_shape = base_node.InputDefs()[1]->Shape(); + if (weight_shape == nullptr 
|| weight_shape->dim_size() != 4) { + return false; + } + + const auto& filter_dim = weight_shape->dim(0); + const auto& kernel_h_dim = weight_shape->dim(2); + const auto& kernel_w_dim = weight_shape->dim(3); + + if (!utils::HasDimValue(filter_dim) || filter_dim.dim_value() <= 1 || + !utils::HasDimValue(kernel_h_dim) || kernel_h_dim.dim_value() < 3 || + !utils::HasDimValue(kernel_w_dim) || kernel_w_dim.dim_value() < 3) { + return false; + } + + const auto dilations_opt = node.GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + return false; + } + } + + const auto group_opt = node.GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + return false; + } + + return true; +} +#endif + static inline const OpTransformInfo* NhwcConvLookup( const OpTransformMap& conv_table, @@ -41,6 +110,13 @@ NhwcConvLookup( if (iter == conv_table.end()) { return nullptr; } + + if (iter->second.filter_ != nullptr) { + if (!iter->second.filter_(graph, node)) { + return nullptr; + } + } + return &(iter->second); } @@ -108,15 +184,62 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator, nhwc_conv_fp16.version_, nhwc_conv_fp16.type_constraints_, logger, &kernel_create_info); if (status.IsOK() && kernel_create_info != nullptr) { kernel_create_info = nullptr; + const auto filter = [](const api::GraphRef&, api::NodeRef& node) { + const auto dilations_opt = node.GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + return false; + } + } + + const auto group_opt = node.GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + return false; + } + + return true; + }; + conv_table_.emplace( 
OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT16), - OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false}); + OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, filter}); conv_table_.emplace( OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT16), - OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false}); + OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, filter}); } } +#ifdef USE_KLEIDIAI + // KleidiAI specific block for NhwcFusedConv + { + // F32 Conv -> F32 NHWC Conv + OpKernelRegistryId nhwc_conv_fp32{ + "NhwcFusedConv", kMSDomain, 1, {{"T", {DataTypeImpl::GetTensorType()}}}}; + + const KernelCreateInfo* kernel_create_info{}; + const auto status = cpu_kernel_registry->TryFindKernel( + kCpuExecutionProvider, nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, + nhwc_conv_fp32.version_, nhwc_conv_fp32.type_constraints_, logger, &kernel_create_info); + + if (status.IsOK() && kernel_create_info != nullptr) { + kernel_create_info = nullptr; + + const auto filter = [](const api::GraphRef& graph, api::NodeRef& node) { + return KleidiFp32NhwcFilter(graph, node); + }; + + conv_table_.emplace( + OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT), + OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, filter}); + conv_table_.emplace( + OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT), + OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, filter}); + } + } +#endif + { // fp16 MaxPool -> fp16 nhwc MaxPool OpKernelRegistryId nhwc_maxpool_fp16{ @@ -214,10 +337,39 @@ Status NhwcTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, if (transform->has_channels_last_attrib_) { node->SetAttributeInt("channels_last", 1); } + + if (node->OpType() == "Conv" || node->OpType() == 
"FusedConv") { + const auto group_opt = node->GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + continue; + } + + const auto dilations_opt = node->GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + continue; + } + } + } + size_t rank = shape->dim_size(); std::vector input_perm = ChannelFirstToLastPerm(rank); std::vector output_perm = ChannelLastToFirstPerm(rank); - WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm}); + const auto inputs = node->Inputs(); + std::vector*> input_perms(inputs.size(), nullptr); + if (!inputs.empty()) { + input_perms[0] = &input_perm; + } + // Optional Sum (Z) input for FusedConv variants resides at index 3. When present, + // it must be converted to NHWC alongside the activation tensor. + const bool has_fused_sum_input = (node->Domain() == kMSDomain && node->OpType() == "FusedConv"); + if (has_fused_sum_input && inputs.size() > 3 && !inputs[3].empty()) { + input_perms[3] = &input_perm; + } + + WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm}); // Replace the operator if needed if (node->Domain() != transform->domain_ || diff --git a/onnxruntime/core/optimizer/nhwc_transformer.h b/onnxruntime/core/optimizer/nhwc_transformer.h index c65f851fdab9d..6dd11bdba6bdd 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.h +++ b/onnxruntime/core/optimizer/nhwc_transformer.h @@ -3,6 +3,7 @@ #pragma once +#include #include "core/common/common.h" #include "core/framework/execution_provider.h" #include "core/framework/kernel_registry.h" @@ -54,10 +55,14 @@ class OpIdHash { * @brief Information needed for operator layout transformation */ struct OpTransformInfo { + using FilterFn = std::function; + const std::string optype_; const std::string domain_; const int version_; const bool 
has_channels_last_attrib_; + const FilterFn filter_{nullptr}; }; using OpTransformMap = std::unordered_map; diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index d10213f55d5d4..4cc0df42d2969 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -15,6 +15,8 @@ */ /* Modifications Copyright (c) Microsoft. */ +#include + #include "core/providers/cpu/nn/conv.h" #include "core/common/narrow.h" @@ -24,6 +26,44 @@ namespace onnxruntime { using ConvPadVector = ConvAttributes::ConvPadVector; +namespace { + +template +void ConvertNHWCToNCHW(const T* src, T* dst, + int64_t n, int64_t c, int64_t h, int64_t w) { + const int64_t hw = (SafeInt(h) * w); + for (int64_t n_idx = 0; n_idx < n; ++n_idx) { + const int64_t n_src_offset = n_idx * hw * c; + const int64_t n_dst_offset = n_idx * c * hw; + for (int64_t c_idx = 0; c_idx < c; ++c_idx) { + const T* src_ptr = src + n_src_offset + c_idx; + T* dst_ptr = dst + n_dst_offset + c_idx * hw; + for (int64_t hw_idx = 0; hw_idx < hw; ++hw_idx) { + dst_ptr[hw_idx] = src_ptr[hw_idx * c]; + } + } + } +} + +template +void ConvertNCHWToNHWC(const T* src, T* dst, + int64_t n, int64_t c, int64_t h, int64_t w) { + const int64_t hw = (SafeInt(h) * w); + for (int64_t n_idx = 0; n_idx < n; ++n_idx) { + const int64_t n_src_offset = n_idx * c * hw; + const int64_t n_dst_offset = n_idx * hw * c; + for (int64_t hw_idx = 0; hw_idx < hw; ++hw_idx) { + const T* src_ptr = src + n_src_offset + hw_idx; + T* dst_ptr = dst + n_dst_offset + hw_idx * c; + for (int64_t c_idx = 0; c_idx < c; ++c_idx) { + dst_ptr[c_idx] = src_ptr[c_idx * hw]; + } + } + } +} + +} // namespace + template Status Conv::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); @@ -160,11 +200,10 @@ Status Conv::Compute(OpKernelContext* context) const { const Tensor* B = num_inputs >= 3 ? context->Input(2) : nullptr; const Tensor* Sum = num_inputs >= 4 ? 
context->Input(3) : nullptr; const int64_t N = X->Shape()[0]; - const int64_t C = X->Shape()[1]; + const int64_t C = X->Shape()[channels_last_ ? 3 : 1]; const int64_t M = W->Shape()[0]; - ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W)); + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last_)); - // kernel_shape is an optional attribute and has to be inferred from W if not provided TensorShapeVector kernel_shape; ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape)); @@ -182,12 +221,14 @@ Status Conv::Compute(OpKernelContext* context) const { } TensorShapeVector Y_dims({N, M}); - TensorShape input_shape = X->Shape().Slice(2); + TensorShape input_shape = channels_last_ ? X->Shape().Slice(1, 3) : X->Shape().Slice(2); ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); + if (channels_last_) { + Y_dims = {Y_dims[0], Y_dims[2], Y_dims[3], Y_dims[1]}; + } Tensor* Y = context->Output(0, TensorShape(Y_dims)); - TensorShape output_shape = Y->Shape().Slice(2); + TensorShape output_shape = channels_last_ ? TensorShape(Y_dims).Slice(1, 3) : Y->Shape().Slice(2); - // Bail out early if one of the dimensions is zero. if (Y->Shape().Size() == 0) { return Status::OK(); } @@ -198,20 +239,39 @@ Status Conv::Compute(OpKernelContext* context) const { auto Xdata = X->DataAsSpan(); const auto* Bdata = B != nullptr ? B->Data() : nullptr; auto Ydata = Y->MutableDataAsSpan(); - // Check for the optional Conv/Sum fusion. 
+ const size_t kernel_rank = kernel_shape.size(); + concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); + + if (channels_last_) { + ORT_RETURN_IF_NOT(kernel_rank == 2, "NhwcFusedConv currently supports 2D kernels."); + ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, "NhwcFusedConv currently supports dilation == 1."); + } + + const bool wants_channels_last = channels_last_; + const bool sum_present = Sum != nullptr; + const bool nhwc_fastpath = + wants_channels_last && kernel_rank == 2 && conv_attrs_.group == 1 && + dilations[0] == 1 && dilations[1] == 1 && !sum_present; + const bool manual_sum = wants_channels_last && !nhwc_fastpath && sum_present; + + std::vector sum_manual_buffer; + const float* sum_manual_data = nullptr; + float Beta = 0.0f; - if (Sum != nullptr) { + if (sum_present) { const auto& sum_shape = Sum->Shape(); ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match"); - // If the output was not allocated inplace with the sum tensor, then copy here. - auto sum_data = Sum->DataAsSpan(); - if (Ydata.data() != sum_data.data()) { - gsl::copy(sum_data, Ydata); + if (manual_sum) { + sum_manual_buffer.assign(Sum->Data(), Sum->Data() + Y->Shape().Size()); + sum_manual_data = sum_manual_buffer.data(); + } else { + auto sum_span = Sum->DataAsSpan(); + if (Ydata.data() != sum_span.data()) { + gsl::copy(sum_span, Ydata); + } + Beta = 1.0f; } - Beta = 1.0f; } - const size_t kernel_rank = kernel_shape.size(); - concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); if (kernel_rank >= 1 && kernel_rank <= 3) { MLAS_CONV_PARAMETERS Parameters; @@ -230,20 +290,66 @@ Status Conv::Compute(OpKernelContext* context) const { narrow(M / conv_attrs_.group), &activation_, &WorkingBufferSize, - Beta, + nhwc_fastpath, + nhwc_fastpath ? 0.0f : Beta, thread_pool); - auto* working_data = WorkingBufferSize > 0 ? 
alloc->Alloc(sizeof(float) * SafeInt(WorkingBufferSize)) - : nullptr; - BufferUniquePtr working_buffer(working_data, BufferDeleter(std::move(alloc))); + float* working_data = nullptr; + BufferUniquePtr working_buffer; + if (WorkingBufferSize > 0) { + working_data = static_cast(alloc->Alloc(sizeof(float) * SafeInt(WorkingBufferSize))); + working_buffer = BufferUniquePtr(working_data, BufferDeleter(alloc)); + } + + float* output_compute = Ydata.data(); + BufferUniquePtr output_temp; + if (wants_channels_last && !nhwc_fastpath) { + const SafeInt output_compute_size = + SafeInt(Y->Shape()[0]) * SafeInt(M) * + SafeInt(output_shape[0]) * SafeInt(output_shape[1]); + float* temp_output = static_cast(alloc->Alloc(sizeof(float) * output_compute_size)); + output_temp = BufferUniquePtr(temp_output, BufferDeleter(alloc)); + output_compute = temp_output; + } + + const float* input_compute = Xdata.data(); + BufferUniquePtr input_temp; + if (wants_channels_last && !nhwc_fastpath) { + ORT_RETURN_IF_NOT(X->Shape().NumDimensions() == 4, "Nhwc fallback expects 4D input."); + const auto& x_dims = X->Shape().GetDims(); + const int64_t input_n = x_dims[0]; + const int64_t input_h = x_dims[1]; + const int64_t input_w = x_dims[2]; + const int64_t input_c = x_dims[3]; + const SafeInt input_elements = SafeInt(X->Shape().Size()); + float* temp_input = static_cast(alloc->Alloc(sizeof(float) * input_elements)); + input_temp = BufferUniquePtr(temp_input, BufferDeleter(alloc)); + ConvertNHWCToNCHW(X->Data(), temp_input, + input_n, input_c, input_h, input_w); + input_compute = temp_input; + } MlasConv(&Parameters, - Xdata.data(), + input_compute, W->Data(), Bdata, - static_cast(working_buffer.get()), - Ydata.data(), + working_data, + output_compute, thread_pool); + + if (wants_channels_last && !nhwc_fastpath) { + const auto& y_dims = Y->Shape().GetDims(); + ORT_RETURN_IF_NOT(y_dims.size() == 4, "Nhwc fallback expects 4D output."); + ConvertNCHWToNHWC(output_compute, + Ydata.data(), + y_dims[0], 
y_dims[3], y_dims[1], y_dims[2]); + if (manual_sum) { + auto y_span = gsl::make_span(Ydata.data(), Ydata.size()); + for (size_t i = 0; i < y_span.size(); ++i) { + y_span[i] += sum_manual_data[i]; + } + } + } } else { const int64_t input_image_size = input_shape.Size(); const int64_t output_image_size = output_shape.Size(); @@ -284,7 +390,8 @@ Status Conv::Compute(OpKernelContext* context) const { thread_pool); } - MlasActivation(&activation_, Ydata.data(), Bdata, narrow(M), narrow(output_image_size), narrow(output_image_size)); + MlasActivation(&activation_, Ydata.data(), Bdata, narrow(M), + narrow(output_image_size), narrow(output_image_size)); Xdata = Xdata.subspan(X_offset * conv_attrs_.group); Ydata = Ydata.subspan(Y_offset * conv_attrs_.group); diff --git a/onnxruntime/core/providers/cpu/nn/conv.h b/onnxruntime/core/providers/cpu/nn/conv.h index 5ed5d2ca91def..78912d3146a1e 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.h +++ b/onnxruntime/core/providers/cpu/nn/conv.h @@ -24,7 +24,7 @@ class Conv : public OpKernel { template <> class Conv : public OpKernel { public: - Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info) { + Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info), channels_last_(info.GetKernelDef().OpName() == "NhwcFusedConv") { activation_.ActivationKind = MlasIdentityActivation; } @@ -34,6 +34,7 @@ class Conv : public OpKernel { MLAS_ACTIVATION activation_; ConvAttributes conv_attrs_; + bool channels_last_{false}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 045dc98a3501e..03b2067eadc2e 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -770,6 +770,7 @@ void Im2col::operator()( template struct Im2col; template struct Im2col; template struct Im2col; +template struct Im2col; template <> void Col2im(const float* data_col, int64_t channels, int64_t height, diff --git 
a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 3032b3170a6e0..91266d81b4f91 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -17,6 +17,8 @@ #include "test/util/include/asserts.h" #include "test/util/include/inference_session_wrapper.h" +#include +#include #include "flatbuffers/idl.h" #include "flatbuffers/util.h" @@ -27,6 +29,28 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +namespace { +std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { + if (path.is_absolute() || path.empty()) { + return path; + } + + std::filesystem::path workspace_candidate = std::filesystem::current_path() / path; + if (std::filesystem::exists(workspace_candidate)) { + return workspace_candidate; + } + + static const std::filesystem::path kSourceTestRoot = + std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path(); + std::filesystem::path source_candidate = kSourceTestRoot / path; + if (std::filesystem::exists(source_candidate)) { + return source_candidate; + } + + return workspace_candidate; +} +} // namespace + struct OrtModelTestInfo { std::basic_string model_filename; std::string logid; @@ -59,17 +83,21 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { std::vector model_data; InferenceSessionWrapper session_object{so, GetEnvironment()}; + std::filesystem::path model_path = ResolveTestPath(std::filesystem::path{test_info.model_filename}); + + std::cerr << "RunOrtModel cwd: " << std::filesystem::current_path() << " loading: " << model_path << std::endl; + const auto& model_path_str = model_path.native(); if (test_info.run_use_buffer) { // Load the file into a buffer and use the buffer to create inference session size_t num_bytes = 0; - ASSERT_STATUS_OK(Env::Default().GetFileLength(test_info.model_filename.c_str(), num_bytes)); + 
ASSERT_STATUS_OK(Env::Default().GetFileLength(model_path_str.c_str(), num_bytes)); model_data.resize(num_bytes); - std::ifstream bytes_stream(test_info.model_filename, std::ifstream::in | std::ifstream::binary); + std::ifstream bytes_stream(model_path, std::ifstream::in | std::ifstream::binary); bytes_stream.read(model_data.data(), num_bytes); bytes_stream.close(); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(num_bytes))); } else { - ASSERT_STATUS_OK(session_object.Load(test_info.model_filename)); // infer type from filename + ASSERT_STATUS_OK(session_object.Load(model_path_str)); // infer type from filename } ASSERT_STATUS_OK(session_object.Initialize()); @@ -145,7 +173,7 @@ static void CompareGraphAndSessionState(const InferenceSessionWrapper& session_o for (const auto& pair : i1) { auto iter = i2.find(pair.first); - ASSERT_NE(iter, i2.cend()); + ASSERT_NE(iter, i2.cend()) << "Missing initializer " << pair.first; const OrtValue& left = pair.second; const OrtValue& right = iter->second; @@ -213,9 +241,28 @@ static void CompareSessionMetadata(const InferenceSessionWrapper& session_object static void SaveAndCompareModels(const PathString& orig_file, const PathString& ort_file, TransformerLevel optimization_level = TransformerLevel::Level3) { + std::filesystem::path orig_path = ResolveTestPath(std::filesystem::path{orig_file}); + std::filesystem::path ort_path = ResolveTestPath(std::filesystem::path{ort_file}); + if (ort_path.has_parent_path()) { + std::filesystem::create_directories(ort_path.parent_path()); + } + + const bool orig_is_ort_format = orig_path.extension() == ORT_TSTR(".ort"); + if (orig_is_ort_format) { + SessionOptions so; + so.session_logid = "SerializeToOrtFormat"; + so.optimized_model_filepath = ort_path.native(); + so.graph_optimization_level = optimization_level; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigSaveModelFormat, "ORT")); + InferenceSessionWrapper session_object{so, 
GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(orig_path.native())); + ASSERT_STATUS_OK(session_object.Initialize()); + return; + } + SessionOptions so; so.session_logid = "SerializeToOrtFormat"; - so.optimized_model_filepath = ort_file; + so.optimized_model_filepath = ort_path.native(); so.graph_optimization_level = optimization_level; // not strictly necessary - type should be inferred from the filename @@ -223,7 +270,7 @@ static void SaveAndCompareModels(const PathString& orig_file, InferenceSessionWrapper session_object{so, GetEnvironment()}; // create .ort file during Initialize due to values in SessionOptions - ASSERT_STATUS_OK(session_object.Load(orig_file)); + ASSERT_STATUS_OK(session_object.Load(orig_path.native())); ASSERT_STATUS_OK(session_object.Initialize()); SessionOptions so2; @@ -234,7 +281,7 @@ static void SaveAndCompareModels(const PathString& orig_file, // load serialized version InferenceSessionWrapper session_object2{so2, GetEnvironment()}; - ASSERT_STATUS_OK(session_object2.Load(ort_file)); + ASSERT_STATUS_OK(session_object2.Load(ort_path.native())); ASSERT_STATUS_OK(session_object2.Initialize()); CompareSessionMetadata(session_object, session_object2); diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index 74a812062875a..b9c58ca386b12 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "gmock/gmock.h" +#include using namespace ONNX_NAMESPACE; using namespace onnxruntime::logging; @@ -36,12 +37,35 @@ using namespace onnxruntime::internal_testing_ep; #define ORT_MODEL_FOLDER ORT_TSTR("testdata/") +namespace { +std::filesystem::path ResolveInternalTestPath(const std::filesystem::path& path) { + if (path.is_absolute() || path.empty()) { + return path; + } + + std::filesystem::path candidate = 
std::filesystem::current_path() / path; + if (std::filesystem::exists(candidate)) { + return candidate; + } + + static const std::filesystem::path kSourceTestRoot = + std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path().parent_path(); + return kSourceTestRoot / path; +} + +std::basic_string ResolveInternalTestPathString(const ORTCHAR_T* path) { + return ResolveInternalTestPath(std::filesystem::path{path}).native(); +} +} // namespace + static Status CreateSession(const SessionOptions& so, std::unique_ptr& session, const ORTCHAR_T* model_path = ORT_MODEL_FOLDER "mnist.onnx", // arbitrary test model bool enable_custom_ep = true, const std::unordered_set* override_supported_ops = nullptr) { session = std::make_unique(so, GetEnvironment()); + std::filesystem::path resolved_model_path = ResolveInternalTestPath(std::filesystem::path{model_path}); + // set supported ops to ops that are ideally found consecutively in the model. // we can say the EP potentially handles them all, but can also test removing handling of one or more ops // at runtime to simulate a lower spec device where not all ops can be handled. this allows us to test @@ -55,7 +79,7 @@ static Status CreateSession(const SessionOptions& so, std::unique_ptr(*supported_ops))); } - ORT_RETURN_IF_ERROR(session->Load(model_path)); + ORT_RETURN_IF_ERROR(session->Load(resolved_model_path.c_str())); ORT_RETURN_IF_ERROR(session->Initialize()); return Status::OK(); } @@ -98,7 +122,7 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable #if !defined(ORT_MINIMAL_BUILD) TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"); // // First load the onnx format model and save as an ORT model. 
@@ -121,10 +145,10 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { so.optimized_model_filepath.clear(); bool enable_custom_ep = false; - ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep)); const auto& graph1 = session2->GetGraph(); - // model should have all the original nodes and we should be able to execute with the fallback to CPU EP - ASSERT_EQ(graph1.NumberOfNodes(), num_nodes); + // ensure we can execute with the fallback to CPU EP even if additional nodes are introduced during loading + ASSERT_GE(graph1.NumberOfNodes(), num_nodes); ExecuteMnist(*session2, enable_custom_ep); session2 = nullptr; @@ -133,7 +157,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { // for the ORT format model. // enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep)); const auto& graph2 = session2->GetGraph(); // model should be able to be loaded, and we should compile using custom ep. that will result in one node for the // custom EP (with Conv/Add/Relu/MaxPool), one for a reshape, and one for the fused MatMul+Add. @@ -142,7 +166,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { } TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); // make sure we can't save a model with compiled ops. 
input/output model format doesn't matter SessionOptions so; @@ -154,7 +178,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { ASSERT_STATUS_OK(session->RegisterExecutionProvider( std::make_unique(supported_ops))); - ASSERT_STATUS_OK(session->Load(ort_model_path)); + ASSERT_STATUS_OK(session->Load(ort_model_path.c_str())); ASSERT_STATUS_NOT_OK_AND_HAS_SUBSTR(session->Initialize(), "Unable to serialize model as it contains compiled nodes"); } @@ -163,7 +187,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { // version of the ONNX operator when matching a static kernel, those are required. #if !defined(DISABLE_CONTRIB_OPS) TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx"); SessionOptions so; InferenceSessionWrapper session(so, GetEnvironment()); @@ -175,7 +199,7 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { ep->EnableStaticKernels(); ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep))); - ASSERT_STATUS_OK(session.Load(ort_model_path)); + ASSERT_STATUS_OK(session.Load(ort_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); TensorShape input_shape_x{1, 1, 7, 7}; @@ -204,7 +228,8 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { auto run_test = [&](const ORTCHAR_T* model_path) { - SCOPED_TRACE("model path: " + ToUTF8String(model_path)); + auto resolved_model_path = ResolveInternalTestPathString(model_path); + SCOPED_TRACE("model path: " + ToUTF8String(resolved_model_path.c_str())); SessionOptions so; // set this if you want to manually inspect the optimized model @@ -218,7 +243,7 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { ep->EnableStaticKernels(); 
ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep))); - ASSERT_STATUS_OK(session.Load(model_path)); + ASSERT_STATUS_OK(session.Load(resolved_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); const auto& graph = session.GetGraph(); @@ -249,13 +274,11 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { }; // the internal NHWC domain supports opset 11 and later - const ORTCHAR_T* onnx_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"; - run_test(onnx_model_path); + run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"); // Note: Using ORT format model with runtime optimizations so that the Conv nodes are preserved in the graph, // not converted into FusedConv nodes. The InternalTestingExecutionProvider handles Conv nodes. - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort"; - run_test(ort_model_path); + run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort"); } // make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator @@ -283,8 +306,8 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep)); } - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model.onnx"; - ASSERT_STATUS_OK(session.Load(ort_model_path)); + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "squeezenet/model.onnx"); + ASSERT_STATUS_OK(session.Load(ort_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); // Need to undo the wrapping that happens in Environment::RegisterAllocator to be able to compare the pointers @@ -301,25 +324,25 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag // test to validate a minimal build TEST(InternalTestingEP, TestLoadOrtModel) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto 
ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); std::unique_ptr session; bool enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep)); ExecuteMnist(*session, enable_custom_ep); } // test that if the custom EP cannot take all nodes due to device limitations // that we fallback to the CPU implementations and can execute the model TEST(InternalTestingEP, TestLoadOrtModelWithReducedOpCoverage) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); const std::unordered_set supported_ops{"Conv", "Add", "Relu" /*, "MaxPool"*/}; std::unique_ptr session; bool enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep, &supported_ops)); + ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep, &supported_ops)); const auto& graph = session->GetGraph(); // Conv+Add gets fused by level 1 optimizer into single node. The 'Conv'/'Add'/'Relu' nodes should be compiled and @@ -454,7 +477,7 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) { // the layout transformation for this EP is already done at this stage and reverting // can result in more failures. // This is to test the model initialization fails if compile fails. 
- const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); const std::unordered_set& supported_ops{"Conv", "Gemm"}; const std::unordered_set& compile_failure_ops{"Gemm"}; diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index dc37980002978..163f7f1dc2f16 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -110,6 +110,7 @@ void SCONV_NCHW(benchmark::State& state, const char* /*dummy*/) { static_cast(output_channels_per_group), &activation, &WorkingBufferSize, + false, 0.0f, nullptr); diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.h b/onnxruntime/test/mlas/unittest/test_conv2d.h index 20bf0ec84f5bf..736d8587b2546 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d.h +++ b/onnxruntime/test/mlas/unittest/test_conv2d.h @@ -57,6 +57,7 @@ class MlasConv2DTest : public MlasTestBase { FilterCount, &Activation, &WorkingBufferSize, + false, 0.0f, threadpool_); diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index f61f9b29d9cce..704d7ac907450 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -30,9 +30,10 @@ void TestConvPath(const std::vector& input_shape, const std::vector disabled_optimizers = {"NchwcTransformer"}; + InlinedHashSet disabled_optimizers = {"NchwcTransformer", "NhwcTransformer"}; TransformerTester(build_test_case, check_graph, TransformerLevel::Default, diff --git a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc index de973679c8f80..7bb492c4854d9 100644 --- a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc +++ 
b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc @@ -363,6 +363,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -402,6 +403,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -443,6 +445,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -494,7 +497,7 @@ TEST(TransformerTest, FuseFp16InitializersWithGraphOutputs) { // by folding it with Add node. This will not allow us to test the // scenario where Cast node is producing graph output and need to // kept untouched by FuseInitializersTransformer. 
- ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"ConstantFolding"})); + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"ConstantFolding", "NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); _graph_structure_at_load(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 21ea7af4e7389..4d270ba014eae 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -224,6 +224,28 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) { TransformerLevel::Level3); } +TEST(NhwcTransformerTests, ConvDepthwiseFloat) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 8, 7, 7}, -1.0f, 1.0f); + auto* weight_arg = builder.MakeInitializer({8, 1, 3, 3}, -1.0f, 1.0f); + auto* output_arg = builder.MakeOutput(); + + Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg); + conv_node.AddAttribute("group", static_cast(8)); + }; + + auto check_nhwc_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], 0); + EXPECT_EQ(op_to_count["Transpose"], 0); + }; + + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3); +} + TEST(NhwcTransformerTests, ConvAveragePool) { DNNL_GTEST_SKIP(); From 1606a1c473ca281d63952e4314fe068c8a6e8b0c Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Fri, 19 Dec 2025 13:20:49 +0000 Subject: [PATCH 02/13] Add a value for channels_last to bench_sconv.cpp Signed-off-by: Orlaith Monahan --- onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index 163f7f1dc2f16..e5559a8f838b0 100644 --- 
a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -218,6 +218,7 @@ void SCONV_NCHW_THREADED(benchmark::State& state, const char* /*dummy*/) { static_cast(output_channels_per_group), &activation, &WorkingBufferSize, + false, 0.0f, tp); From 2dd199e1051bf7c0b575ee6514ed3516090c9613 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 12 Jan 2026 10:57:00 +0000 Subject: [PATCH 03/13] Update internal_testing_tests.cc Update to the internal_testing_tests helper macros for file expansion so it works on other platforms --- onnxruntime/test/internal_testing_ep/internal_testing_tests.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index b9c58ca386b12..e8bab013de97a 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -49,7 +49,7 @@ std::filesystem::path ResolveInternalTestPath(const std::filesystem::path& path) } static const std::filesystem::path kSourceTestRoot = - std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path().parent_path(); + std::filesystem::path{ORT_TSTR_ON_MACRO(__FILE__)}.parent_path().parent_path().parent_path(); return kSourceTestRoot / path; } From 4df9cea096cce5c595fb25fb07d0be7f10af5b36 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:05:28 +0000 Subject: [PATCH 04/13] Update nhwc_transformer_test.cc Fix for failing ConvDepthwiseFloat test, allows for a small tolerance when running on different hardware --- onnxruntime/test/optimizer/nhwc_transformer_test.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 4d270ba014eae..3ad70b7f6ff5e 100644 ---
a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -243,7 +243,11 @@ TEST(NhwcTransformerTests, ConvDepthwiseFloat) { TransformerTester(build_test_case, check_nhwc_graph, TransformerLevel::Level2, - TransformerLevel::Level3); + TransformerLevel::Level3, + /*opset_version*/ 12, + /*per_sample_tolerance*/ 1e-6, + /*relative_per_sample_tolerance*/ 1e-6); + } TEST(NhwcTransformerTests, ConvAveragePool) { From b133782d1e4eb97276e98318d123db36e3c97252 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:07:22 +0000 Subject: [PATCH 05/13] Update internal_testing_tests.cc Fix for failing TestSaveAndLoadOrtModel test Make sure the model being saved / loaded is being done from a writeable location --- .../test/internal_testing_ep/internal_testing_tests.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index e8bab013de97a..83fb3f07c8e76 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -122,7 +122,9 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable #if !defined(ORT_MINIMAL_BUILD) TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { - const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"); + const auto ort_model_dir = ResolveInternalTestPath(std::filesystem::path{ORT_MODEL_FOLDER}); + const std::basic_string ort_model_path = + (ort_model_dir / ORT_TSTR("mnist.internal_testing_ep.test_output.ort")).native(); // // First load the onnx format model and save as an ORT model.
From 0c2d1cd4b7abd6d33e5231b77db41358b2dceee0 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:09:10 +0000 Subject: [PATCH 06/13] Update ort_model_only_test.cc Fix for undeclared identifier linker error --- onnxruntime/test/framework/ort_model_only_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 91266d81b4f91..0de93a25f89f1 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -24,6 +24,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#define WIDEN2(x) L##x +#define WIDEN(x) WIDEN2(x) using namespace ONNX_NAMESPACE; @@ -41,7 +43,7 @@ std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { } static const std::filesystem::path kSourceTestRoot = - std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path(); + std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); std::filesystem::path source_candidate = kSourceTestRoot / path; if (std::filesystem::exists(source_candidate)) { return source_candidate; From 25c0be7081bc52310679a4339540cc3c4b1ea4a7 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 18:00:49 +0000 Subject: [PATCH 07/13] Lintrunner fixes Signed-off-by: Orlaith Monahan --- onnxruntime/test/framework/ort_model_only_test.cc | 2 +- onnxruntime/test/optimizer/nhwc_transformer_test.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 0de93a25f89f1..72f3c6e08095b 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -43,7 +43,7 @@ std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { } static const std::filesystem::path kSourceTestRoot = - 
std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); + std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); std::filesystem::path source_candidate = kSourceTestRoot / path; if (std::filesystem::exists(source_candidate)) { return source_candidate; diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 3ad70b7f6ff5e..87afd865a60a5 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -247,7 +247,6 @@ TEST(NhwcTransformerTests, ConvDepthwiseFloat) { /*opset_version*/ 12, /*per_sample_tolerance*/ 1e-6, /*relative_per_sample_tolerance*/ 1e-6); - } TEST(NhwcTransformerTests, ConvAveragePool) { From 04821506db7767e7afbd3d262c866d5c73cf5d70 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:07:16 +0000 Subject: [PATCH 08/13] Update onnxruntime/core/optimizer/nhwc_transformer.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/core/optimizer/nhwc_transformer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index 9544cf7395025..5bd592f8ef01d 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -212,7 +212,7 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator, } #ifdef USE_KLEIDIAI - // Klediai specific block for NhwcFusedConvolutions + // KleidiAI specific block for NhwcFusedConvolutions { // F32 Conv -> F32 NHWC Conv OpKernelRegistryId nhwc_conv_fp32{ From f9606cdddf31c4320216c69e7df8fc46dabddfbe Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:07:31 +0000 Subject: [PATCH 09/13] Update onnxruntime/core/framework/kernel_type_str_resolver.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
onnxruntime/core/framework/kernel_type_str_resolver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.cc b/onnxruntime/core/framework/kernel_type_str_resolver.cc index aacbc8fc0a4fb..f73550c14ebc0 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver.cc @@ -37,7 +37,7 @@ static OpKernelTypeStrMap::const_iterator LookUpOpId(const OpIdentifier& op_id, } #ifdef USE_KLEIDIAI - // Klediai specific block for NhwcFusedConvolutions + // KleidiAI specific block for NhwcFusedConvolutions if (op_it == map.end() && op_id.domain == kMSDomain && op_id.op_type == "NhwcFusedConv") { const auto fused_conv_op_id = OpIdentifier{std::string{kMSDomain}, "FusedConv", op_id.since_version}; op_it = map.find(fused_conv_op_id); From 63d9c555b8b05e59fc4862a77e92264232cd35e2 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:13 +0000 Subject: [PATCH 10/13] Update onnxruntime/core/providers/cpu/nn/conv.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cpu/nn/conv.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index 4cc0df42d2969..f5615015366d0 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -243,8 +243,8 @@ Status Conv::Compute(OpKernelContext* context) const { concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); if (channels_last_) { - ORT_RETURN_IF_NOT(kernel_rank == 2, "NhwcFusedConv currently supports 2D kernels."); - ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, "NhwcFusedConv currently supports dilation == 1."); + ORT_RETURN_IF_NOT(kernel_rank == 2, "Conv with channels_last layout currently supports 2D kernels."); + ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, 
"Conv with channels_last layout currently supports dilation == 1."); } const bool wants_channels_last = channels_last_; From 457513b5fee415a1f4ff0ab2ba67d23f744bd0a6 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:41 +0000 Subject: [PATCH 11/13] Update onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index fc5f3a459e616..2d604a86561df 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -18,7 +18,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv); +#ifdef USE_KLEIDIAI class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv); +#endif class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); From b836bd3b0584b15f3b153a98f036c0b6c008d010 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:56 +0000 Subject: [PATCH 12/13] Update onnxruntime/test/framework/ort_model_only_test.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/test/framework/ort_model_only_test.cc | 1 - 1 file changed, 1 deletion(-) 
diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 72f3c6e08095b..da1622dfd1af9 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -87,7 +87,6 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { InferenceSessionWrapper session_object{so, GetEnvironment()}; std::filesystem::path model_path = ResolveTestPath(std::filesystem::path{test_info.model_filename}); - std::cerr << "RunOrtModel cwd: " << std::filesystem::current_path() << " loading: " << model_path << std::endl; const auto& model_path_str = model_path.native(); if (test_info.run_use_buffer) { // Load the file into a buffer and use the buffer to create inference session From 891dad554f4ca41b3db8826fcaa7468508844c4a Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 4 Feb 2026 12:57:14 +0000 Subject: [PATCH 13/13] Additional guards to not include KLEIDIAI specific kernels Signed-off-by: Orlaith Monahan --- onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 2d604a86561df..692412a8efcce 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -305,7 +305,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, +#ifdef USE_KLEIDIAI BuildKernelCreateInfo, +#endif BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo,