From d4943e458fbccda7773d1238909dffcf5c62554f Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Fri, 19 Dec 2025 10:53:53 +0000 Subject: [PATCH 01/13] Add an NHWC implementation of convolution to avoid transposes * Modification to the CPU EP to specify channels_last when data format is NHWC * Added an NhwcFusedConv kernel * Implementation of the kernel in mlas * Added compiler guards so it is only used with KleidiAI (for now, can be removed if needed) * Added unittests Signed-off-by: Orlaith Monahan --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 2 + onnxruntime/contrib_ops/cpu/fused_conv.cc | 8 + .../framework/kernel_type_str_resolver.cc | 12 ++ .../graph/contrib_ops/nhwc_schema_defs.cc | 2 +- onnxruntime/core/mlas/inc/mlas.h | 2 + onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../mlas/lib/kleidiai/convolve_kleidiai.cpp | 31 ++-- .../core/mlas/lib/kleidiai/mlasi_kleidiai.h | 1 + onnxruntime/core/mlas/lib/mlasi.h | 2 + .../core/optimizer/conv_activation_fusion.cc | 5 +- .../core/optimizer/conv_add_act_fusion.cc | 10 +- .../layout_transformation.cc | 1 + .../core/optimizer/nhwc_transformer.cc | 158 +++++++++++++++++- onnxruntime/core/optimizer/nhwc_transformer.h | 5 + onnxruntime/core/providers/cpu/nn/conv.cc | 153 ++++++++++++++--- onnxruntime/core/providers/cpu/nn/conv.h | 3 +- onnxruntime/core/util/math_cpu.cc | 1 + .../test/framework/ort_model_only_test.cc | 61 ++++++- .../internal_testing_tests.cc | 69 +++++--- onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 + onnxruntime/test/mlas/unittest/test_conv2d.h | 1 + .../test/optimizer/conv_add_act_test.cc | 5 +- .../fuse_initializers_transformer_test.cc | 5 +- .../test/optimizer/nhwc_transformer_test.cc | 22 +++ 24 files changed, 488 insertions(+), 76 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index d959d11e3fd43..fc5f3a459e616 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ 
b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -18,6 +18,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); @@ -302,6 +303,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cpu/fused_conv.cc b/onnxruntime/contrib_ops/cpu/fused_conv.cc index 5374222dbabcc..d77efc26d0e2f 100644 --- a/onnxruntime/contrib_ops/cpu/fused_conv.cc +++ b/onnxruntime/contrib_ops/cpu/fused_conv.cc @@ -26,5 +26,13 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( .TypeConstraint("T", DataTypeImpl::GetTensorType()), FusedConvFloat); +ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( + NhwcFusedConv, + 1, + float, + KernelDefBuilder() + .TypeConstraint("T", DataTypeImpl::GetTensorType()), + FusedConvFloat); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.cc b/onnxruntime/core/framework/kernel_type_str_resolver.cc index 3142f94f289b3..aacbc8fc0a4fb 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver.cc 
@@ -36,6 +36,18 @@ static OpKernelTypeStrMap::const_iterator LookUpOpId(const OpIdentifier& op_id, } } +#ifdef USE_KLEIDIAI + // KleidiAI specific block for NhwcFusedConv + if (op_it == map.end() && op_id.domain == kMSDomain && op_id.op_type == "NhwcFusedConv") { + const auto fused_conv_op_id = OpIdentifier{std::string{kMSDomain}, "FusedConv", op_id.since_version}; + op_it = map.find(fused_conv_op_id); + if (op_it == map.end()) { + const auto conv_op_id = OpIdentifier{std::string{kOnnxDomain}, "Conv", op_id.since_version}; + op_it = map.find(conv_op_id); + } + } +#endif + return op_it; } diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc index 8fe3a4d5f3b6f..5a57a58360ddf 100644 --- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc @@ -403,7 +403,7 @@ Only has fp16 implementation as of 2023/04/15. .Input(2, "B", "", "T", OpSchema::Optional) .Input(3, "Z", "Tensor to be added to the output, must be the same shape and format as the output tensor.", "T", OpSchema::Optional) .Output(0, "Y", "", "T") - .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output types to float tensors") + .TypeConstraint("T", {"tensor(float16)", "tensor(float)"}, "Constrain input and output types to float tensors") .TypeAndShapeInferenceFunction([](InferenceContext& ctx) { ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); convPoolShapeInferenceNhwc(ctx, true, false, 0, 1); diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 248c6d74e6cbd..adfdf363295fd 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -851,6 +851,7 @@ struct MLAS_CONV_PARAMETERS { size_t BatchCount; size_t GroupCount; size_t InputChannels; + bool ChannelsLast; size_t InputShape[3]; size_t KernelShape[3]; size_t DilationShape[3]; @@ -890,6 +891,7 @@ 
MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool); diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 9518134631f2d..f0c1d870d6cd9 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1146,6 +1146,7 @@ MlasConvPrepare( size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ) @@ -1204,7 +1205,7 @@ Return Value: if (GetMlasPlatform().MlasConvPrepareOverride != nullptr && GetMlasPlatform().MlasConvPrepareOverride(Parameters, Dimensions, BatchCount, GroupCount, InputChannels, InputShape,KernelShape,DilationShape, Padding, StrideShape, OutputShape, FilterCount, - Activation, WorkingBufferSize, Beta, ThreadPool)){ + Activation, WorkingBufferSize, ChannelsLast, Beta, ThreadPool)){ return; } // @@ -1215,6 +1216,7 @@ Return Value: Parameters->BatchCount = BatchCount; Parameters->GroupCount = GroupCount; Parameters->InputChannels = InputChannels; + Parameters->ChannelsLast = ChannelsLast; Parameters->FilterCount = FilterCount; Parameters->Beta = Beta; diff --git a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp index 487e1533f5967..60c8e9b562aec 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp @@ -448,6 +448,7 @@ static std::shared_ptr LhsPtrFill(const size_t ci, const size_t i static std::unique_ptr LhsPackImageDataSme(const size_t ci, const size_t ih, const size_t iw, const size_t kh, const size_t kw, const size_t sh, const size_t sw, const size_t padding, const float* in, + bool input_is_channels_last, MLAS_THREADPOOL* ThreadPool) { size_t padsize = 256; @@ -472,7 +473,14 @@ static std::unique_ptr 
LhsPackImageDataSme(const size_t ci, const s const auto lhs_size = kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(m,kh*kw,ci); auto lhs = std::make_unique(lhs_size); - auto nhwc = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool); + std::unique_ptr nhwc_holder; + const float* activation_src = nullptr; + if (input_is_channels_last) { + activation_src = in; + } else { + nhwc_holder = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool); + activation_src = nhwc_holder.get(); + } // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions. thread_local std::unordered_map> lhs_ptrs_cache; @@ -485,7 +493,7 @@ static std::unique_ptr LhsPackImageDataSme(const size_t ci, const s lhs_ptrs_cache[key] = lhs_ptrs; } - MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], &nhwc[0], &pad_ptr[0]); + MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], activation_src, &pad_ptr[0]); return lhs; } @@ -507,6 +515,7 @@ static void ConvolveSme(const size_t co, //channels out const float* in, //in image data float* out, //out image data float* tmp_mlas_aligned, //intermediate buffer if we need to perform a transpose + bool input_is_channels_last, MLAS_THREADPOOL* ThreadPool) { //RhsPackWeightsBiasSme() - to perform dilation increases kernel size and masks unused weights @@ -546,17 +555,13 @@ static void ConvolveSme(const size_t co, //channels out for (size_t g = 0; g < groups; ++g) { - auto result{out}; - //do we require a post matmul transpose ? 
- //output is m x n or image_data x co or hw x co - //MLAS require it as n x m (or co x hw), transpose required - if (co > 1) { - //intermediate buffer required, pre-transpose - //Note: because we are calling MlasTranspose() need to ensure we use a MLAS aligned buffer + auto result = out; + const bool need_transpose = (!input_is_channels_last) && (co > 1); + if (need_transpose) { result = tmp_mlas_aligned; } - auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, ThreadPool); + auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, input_is_channels_last, ThreadPool); auto rhs = RhsPackWeightsBiasSme(co, ci, kh, kw, dilationh, dilationw, weights, bias, ThreadPool); MlasTrySimpleParallel(ThreadPool, static_cast(dim[0] * dim[1] * dim[2]), [&](ptrdiff_t tid) { @@ -604,7 +609,7 @@ static void ConvolveSme(const size_t co, //channels out } }); - if (result == tmp_mlas_aligned) { + if (need_transpose) { //Note: this could be absorbed into post conv activation MlasTranspose(tmp_mlas_aligned, out, m, co, ThreadPool); } @@ -633,6 +638,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool) { @@ -646,6 +652,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, Parameters->BatchCount = BatchCount; Parameters->GroupCount = GroupCount; Parameters->InputChannels = InputChannels; + Parameters->ChannelsLast = ChannelsLast; Parameters->FilterCount = FilterCount; Parameters->Beta = Beta; @@ -711,7 +718,7 @@ ArmKleidiAI::MlasConv( Parameters->DilationShape[0], Parameters->DilationShape[1], // kernel dilation Parameters->Padding[0], // image padding Parameters->GroupCount, // filter groups - Filter, Bias, Input, Output, WorkingBuffer, ThreadPool); + Filter, Bias, Input, Output, WorkingBuffer, Parameters->ChannelsLast, ThreadPool); MlasActivation(Parameters->Activation, Output, 
nullptr, Parameters->FilterCount, Parameters->OutputSize, Parameters->OutputSize); diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index ca81b9fa426ee..99eb88fcf4d2d 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -147,6 +147,7 @@ MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters, size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index ad62cccbfb9c7..1186d5b939d7e 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -827,6 +827,7 @@ void size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ); @@ -847,6 +848,7 @@ bool size_t FilterCount, const MLAS_ACTIVATION* Activation, size_t* WorkingBufferSize, + bool ChannelsLast, float Beta, MLAS_THREADPOOL* ThreadPool ); diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index b7f5af5888be0..a53099937a94a 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -140,9 +140,12 @@ class FuseConvActivationAction : public ReplaceWithNew { return "FusedConv"; } } else if (domain == kMSDomain) { - if (op_type == "NhwcConv") { + if (op_type == "NhwcConv" || op_type == "NhwcFusedConv") { return "NhwcFusedConv"; } + if (op_type == "FusedConv") { + return "FusedConv"; + } } else if (domain == kMSInternalNHWCDomain) { if (op_type == "Conv") { return "Conv"; diff --git a/onnxruntime/core/optimizer/conv_add_act_fusion.cc b/onnxruntime/core/optimizer/conv_add_act_fusion.cc index 6f90eaf07ef4d..478e7529cb667 100644 --- 
a/onnxruntime/core/optimizer/conv_add_act_fusion.cc +++ b/onnxruntime/core/optimizer/conv_add_act_fusion.cc @@ -211,7 +211,15 @@ class FuseConvAddActivationAction : public ReplaceWithNew { private: std::string OpType(const RuntimeState& runtimeState) const override { - return (runtimeState.selected_nodes.Target().OpType() == "Conv") ? "FusedConv" : "NhwcFusedConv"; + const auto& target = runtimeState.selected_nodes.Target(); + const auto* channels_last_attr = graph_utils::GetNodeAttribute(target, "channels_last"); + const bool channels_last = channels_last_attr != nullptr && channels_last_attr->i() != 0; + + if (target.OpType() == "Conv") { + return channels_last ? "NhwcFusedConv" : "FusedConv"; + } + + return "NhwcFusedConv"; } std::string Domain(const RuntimeState&) const override { return kMSDomain; } diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index f611c992e0f57..5d51c855d13ba 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -68,6 +68,7 @@ const std::unordered_set& GetORTLayoutSensitiveOps() { // Define a static local string array so we can refer to the elements with string_views. 
static const std::string layout_sensitive_contrib_ops[]{ MakeORTLayoutSensitiveOpId(kMSDomain, "FusedConv"), + MakeORTLayoutSensitiveOpId(kMSDomain, "NhwcFusedConv"), MakeORTLayoutSensitiveOpId(kMSDomain, "GridSample"), MakeORTLayoutSensitiveOpId(kMSDomain, "QLinearAveragePool"), MakeORTLayoutSensitiveOpId(kMSDomain, "QLinearGlobalAveragePool"), diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index cd654991c92d5..9544cf7395025 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -2,7 +2,10 @@ // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates // Licensed under the MIT License. +#include #include +#include +#include "core/graph/constants.h" #include "core/mlas/inc/mlas.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" @@ -21,6 +24,72 @@ namespace onnxruntime { using namespace layout_transformation; +#ifdef USE_KLEIDIAI +bool KleidiFp32NhwcFilter(const onnx_transpose_optimization::api::GraphRef& graph, + onnx_transpose_optimization::api::NodeRef& node) { + auto& base_node = NodeFromApiNode(node); + + ORT_UNUSED_PARAMETER(graph); + if (base_node.InputDefs().size() < 2) { + return false; + } + + const auto* input_shape = base_node.InputDefs()[0]->Shape(); + if (input_shape == nullptr || input_shape->dim_size() != 4) { + return false; + } + + const auto& batch_dim = input_shape->dim(0); + if (!utils::HasDimValue(batch_dim) || batch_dim.dim_value() != 1) { + return false; + } + + const auto pads_attr = node.GetAttributeInts("pads"); + if (pads_attr.has_value()) { + const auto& pads = pads_attr.value(); + if (pads.size() != 4 || pads[0] != pads[2] || pads[1] != pads[3]) { + return false; + } + } + + const auto inputs = node.Inputs(); + if (inputs.size() > 3 && !inputs[3].empty()) { + return false; + } + + const auto* weight_shape = base_node.InputDefs()[1]->Shape(); + if (weight_shape == nullptr 
|| weight_shape->dim_size() != 4) { + return false; + } + + const auto& filter_dim = weight_shape->dim(0); + const auto& kernel_h_dim = weight_shape->dim(2); + const auto& kernel_w_dim = weight_shape->dim(3); + + if (!utils::HasDimValue(filter_dim) || filter_dim.dim_value() <= 1 || + !utils::HasDimValue(kernel_h_dim) || kernel_h_dim.dim_value() < 3 || + !utils::HasDimValue(kernel_w_dim) || kernel_w_dim.dim_value() < 3) { + return false; + } + + const auto dilations_opt = node.GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + return false; + } + } + + const auto group_opt = node.GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + return false; + } + + return true; +} +#endif + static inline const OpTransformInfo* NhwcConvLookup( const OpTransformMap& conv_table, @@ -41,6 +110,13 @@ NhwcConvLookup( if (iter == conv_table.end()) { return nullptr; } + + if (iter->second.filter_ != nullptr) { + if (!iter->second.filter_(graph, node)) { + return nullptr; + } + } + return &(iter->second); } @@ -108,15 +184,62 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator, nhwc_conv_fp16.version_, nhwc_conv_fp16.type_constraints_, logger, &kernel_create_info); if (status.IsOK() && kernel_create_info != nullptr) { kernel_create_info = nullptr; + const auto filter = [](const api::GraphRef&, api::NodeRef& node) { + const auto dilations_opt = node.GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + return false; + } + } + + const auto group_opt = node.GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + return false; + } + + return true; + }; + conv_table_.emplace( 
OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT16), - OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false}); + OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, filter}); conv_table_.emplace( OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT16), - OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false}); + OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, filter}); } } +#ifdef USE_KLEIDIAI + // KleidiAI specific block for NhwcFusedConv + { + // F32 Conv -> F32 NHWC Conv + OpKernelRegistryId nhwc_conv_fp32{ + "NhwcFusedConv", kMSDomain, 1, {{"T", {DataTypeImpl::GetTensorType()}}}}; + + const KernelCreateInfo* kernel_create_info{}; + const auto status = cpu_kernel_registry->TryFindKernel( + kCpuExecutionProvider, nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, + nhwc_conv_fp32.version_, nhwc_conv_fp32.type_constraints_, logger, &kernel_create_info); + + if (status.IsOK() && kernel_create_info != nullptr) { + kernel_create_info = nullptr; + + const auto filter = [](const api::GraphRef& graph, api::NodeRef& node) { + return KleidiFp32NhwcFilter(graph, node); + }; + + conv_table_.emplace( + OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT), + OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, filter}); + conv_table_.emplace( + OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT), + OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, filter}); + } + } +#endif + { // fp16 MaxPool -> fp16 nhwc MaxPool OpKernelRegistryId nhwc_maxpool_fp16{ @@ -214,10 +337,39 @@ Status NhwcTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level, if (transform->has_channels_last_attrib_) { node->SetAttributeInt("channels_last", 1); } + + if (node->OpType() == "Conv" || node->OpType() == 
"FusedConv") { + const auto group_opt = node->GetAttributeInt("group"); + if (group_opt.has_value() && group_opt.value() != 1) { + continue; + } + + const auto dilations_opt = node->GetAttributeInts("dilations"); + if (dilations_opt.has_value()) { + const auto& dilations = dilations_opt.value(); + if ((dilations.size() >= 1 && dilations[0] != 1) || + (dilations.size() >= 2 && dilations[1] != 1)) { + continue; + } + } + } + size_t rank = shape->dim_size(); std::vector input_perm = ChannelFirstToLastPerm(rank); std::vector output_perm = ChannelLastToFirstPerm(rank); - WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm}); + const auto inputs = node->Inputs(); + std::vector*> input_perms(inputs.size(), nullptr); + if (!inputs.empty()) { + input_perms[0] = &input_perm; + } + // Optional Sum (Z) input for FusedConv variants resides at index 3. When present, + // it must be converted to NHWC alongside the activation tensor. + const bool has_fused_sum_input = (node->Domain() == kMSDomain && node->OpType() == "FusedConv"); + if (has_fused_sum_input && inputs.size() > 3 && !inputs[3].empty()) { + input_perms[3] = &input_perm; + } + + WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm}); // Replace the operator if needed if (node->Domain() != transform->domain_ || diff --git a/onnxruntime/core/optimizer/nhwc_transformer.h b/onnxruntime/core/optimizer/nhwc_transformer.h index c65f851fdab9d..6dd11bdba6bdd 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.h +++ b/onnxruntime/core/optimizer/nhwc_transformer.h @@ -3,6 +3,7 @@ #pragma once +#include #include "core/common/common.h" #include "core/framework/execution_provider.h" #include "core/framework/kernel_registry.h" @@ -54,10 +55,14 @@ class OpIdHash { * @brief Information needed for operator layout transformation */ struct OpTransformInfo { + using FilterFn = std::function; + const std::string optype_; const std::string domain_; const int version_; const bool 
has_channels_last_attrib_; + const FilterFn filter_{nullptr}; }; using OpTransformMap = std::unordered_map; diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index d10213f55d5d4..4cc0df42d2969 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -15,6 +15,8 @@ */ /* Modifications Copyright (c) Microsoft. */ +#include + #include "core/providers/cpu/nn/conv.h" #include "core/common/narrow.h" @@ -24,6 +26,44 @@ namespace onnxruntime { using ConvPadVector = ConvAttributes::ConvPadVector; +namespace { + +template +void ConvertNHWCToNCHW(const T* src, T* dst, + int64_t n, int64_t c, int64_t h, int64_t w) { + const int64_t hw = (SafeInt(h) * w); + for (int64_t n_idx = 0; n_idx < n; ++n_idx) { + const int64_t n_src_offset = n_idx * hw * c; + const int64_t n_dst_offset = n_idx * c * hw; + for (int64_t c_idx = 0; c_idx < c; ++c_idx) { + const T* src_ptr = src + n_src_offset + c_idx; + T* dst_ptr = dst + n_dst_offset + c_idx * hw; + for (int64_t hw_idx = 0; hw_idx < hw; ++hw_idx) { + dst_ptr[hw_idx] = src_ptr[hw_idx * c]; + } + } + } +} + +template +void ConvertNCHWToNHWC(const T* src, T* dst, + int64_t n, int64_t c, int64_t h, int64_t w) { + const int64_t hw = (SafeInt(h) * w); + for (int64_t n_idx = 0; n_idx < n; ++n_idx) { + const int64_t n_src_offset = n_idx * c * hw; + const int64_t n_dst_offset = n_idx * hw * c; + for (int64_t hw_idx = 0; hw_idx < hw; ++hw_idx) { + const T* src_ptr = src + n_src_offset + hw_idx; + T* dst_ptr = dst + n_dst_offset + hw_idx * c; + for (int64_t c_idx = 0; c_idx < c; ++c_idx) { + dst_ptr[c_idx] = src_ptr[c_idx * hw]; + } + } + } +} + +} // namespace + template Status Conv::Compute(OpKernelContext* context) const { const auto* X = context->Input(0); @@ -160,11 +200,10 @@ Status Conv::Compute(OpKernelContext* context) const { const Tensor* B = num_inputs >= 3 ? context->Input(2) : nullptr; const Tensor* Sum = num_inputs >= 4 ? 
context->Input(3) : nullptr; const int64_t N = X->Shape()[0]; - const int64_t C = X->Shape()[1]; + const int64_t C = X->Shape()[channels_last_ ? 3 : 1]; const int64_t M = W->Shape()[0]; - ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W)); + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last_)); - // kernel_shape is an optional attribute and has to be inferred from W if not provided TensorShapeVector kernel_shape; ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape)); @@ -182,12 +221,14 @@ Status Conv::Compute(OpKernelContext* context) const { } TensorShapeVector Y_dims({N, M}); - TensorShape input_shape = X->Shape().Slice(2); + TensorShape input_shape = channels_last_ ? X->Shape().Slice(1, 3) : X->Shape().Slice(2); ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); + if (channels_last_) { + Y_dims = {Y_dims[0], Y_dims[2], Y_dims[3], Y_dims[1]}; + } Tensor* Y = context->Output(0, TensorShape(Y_dims)); - TensorShape output_shape = Y->Shape().Slice(2); + TensorShape output_shape = channels_last_ ? TensorShape(Y_dims).Slice(1, 3) : Y->Shape().Slice(2); - // Bail out early if one of the dimensions is zero. if (Y->Shape().Size() == 0) { return Status::OK(); } @@ -198,20 +239,39 @@ Status Conv::Compute(OpKernelContext* context) const { auto Xdata = X->DataAsSpan(); const auto* Bdata = B != nullptr ? B->Data() : nullptr; auto Ydata = Y->MutableDataAsSpan(); - // Check for the optional Conv/Sum fusion. 
+ const size_t kernel_rank = kernel_shape.size(); + concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); + + if (channels_last_) { + ORT_RETURN_IF_NOT(kernel_rank == 2, "NhwcFusedConv currently supports 2D kernels."); + ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, "NhwcFusedConv currently supports dilation == 1."); + } + + const bool wants_channels_last = channels_last_; + const bool sum_present = Sum != nullptr; + const bool nhwc_fastpath = + wants_channels_last && kernel_rank == 2 && conv_attrs_.group == 1 && + dilations[0] == 1 && dilations[1] == 1 && !sum_present; + const bool manual_sum = wants_channels_last && !nhwc_fastpath && sum_present; + + std::vector sum_manual_buffer; + const float* sum_manual_data = nullptr; + float Beta = 0.0f; - if (Sum != nullptr) { + if (sum_present) { const auto& sum_shape = Sum->Shape(); ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match"); - // If the output was not allocated inplace with the sum tensor, then copy here. - auto sum_data = Sum->DataAsSpan(); - if (Ydata.data() != sum_data.data()) { - gsl::copy(sum_data, Ydata); + if (manual_sum) { + sum_manual_buffer.assign(Sum->Data(), Sum->Data() + Y->Shape().Size()); + sum_manual_data = sum_manual_buffer.data(); + } else { + auto sum_span = Sum->DataAsSpan(); + if (Ydata.data() != sum_span.data()) { + gsl::copy(sum_span, Ydata); + } + Beta = 1.0f; } - Beta = 1.0f; } - const size_t kernel_rank = kernel_shape.size(); - concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); if (kernel_rank >= 1 && kernel_rank <= 3) { MLAS_CONV_PARAMETERS Parameters; @@ -230,20 +290,66 @@ Status Conv::Compute(OpKernelContext* context) const { narrow(M / conv_attrs_.group), &activation_, &WorkingBufferSize, - Beta, + nhwc_fastpath, + nhwc_fastpath ? 0.0f : Beta, thread_pool); - auto* working_data = WorkingBufferSize > 0 ? 
alloc->Alloc(sizeof(float) * SafeInt(WorkingBufferSize)) - : nullptr; - BufferUniquePtr working_buffer(working_data, BufferDeleter(std::move(alloc))); + float* working_data = nullptr; + BufferUniquePtr working_buffer; + if (WorkingBufferSize > 0) { + working_data = static_cast(alloc->Alloc(sizeof(float) * SafeInt(WorkingBufferSize))); + working_buffer = BufferUniquePtr(working_data, BufferDeleter(alloc)); + } + + float* output_compute = Ydata.data(); + BufferUniquePtr output_temp; + if (wants_channels_last && !nhwc_fastpath) { + const SafeInt output_compute_size = + SafeInt(Y->Shape()[0]) * SafeInt(M) * + SafeInt(output_shape[0]) * SafeInt(output_shape[1]); + float* temp_output = static_cast(alloc->Alloc(sizeof(float) * output_compute_size)); + output_temp = BufferUniquePtr(temp_output, BufferDeleter(alloc)); + output_compute = temp_output; + } + + const float* input_compute = Xdata.data(); + BufferUniquePtr input_temp; + if (wants_channels_last && !nhwc_fastpath) { + ORT_RETURN_IF_NOT(X->Shape().NumDimensions() == 4, "Nhwc fallback expects 4D input."); + const auto& x_dims = X->Shape().GetDims(); + const int64_t input_n = x_dims[0]; + const int64_t input_h = x_dims[1]; + const int64_t input_w = x_dims[2]; + const int64_t input_c = x_dims[3]; + const SafeInt input_elements = SafeInt(X->Shape().Size()); + float* temp_input = static_cast(alloc->Alloc(sizeof(float) * input_elements)); + input_temp = BufferUniquePtr(temp_input, BufferDeleter(alloc)); + ConvertNHWCToNCHW(X->Data(), temp_input, + input_n, input_c, input_h, input_w); + input_compute = temp_input; + } MlasConv(&Parameters, - Xdata.data(), + input_compute, W->Data(), Bdata, - static_cast(working_buffer.get()), - Ydata.data(), + working_data, + output_compute, thread_pool); + + if (wants_channels_last && !nhwc_fastpath) { + const auto& y_dims = Y->Shape().GetDims(); + ORT_RETURN_IF_NOT(y_dims.size() == 4, "Nhwc fallback expects 4D output."); + ConvertNCHWToNHWC(output_compute, + Ydata.data(), + y_dims[0], 
y_dims[3], y_dims[1], y_dims[2]); + if (manual_sum) { + auto y_span = gsl::make_span(Ydata.data(), Ydata.size()); + for (size_t i = 0; i < y_span.size(); ++i) { + y_span[i] += sum_manual_data[i]; + } + } + } } else { const int64_t input_image_size = input_shape.Size(); const int64_t output_image_size = output_shape.Size(); @@ -284,7 +390,8 @@ Status Conv::Compute(OpKernelContext* context) const { thread_pool); } - MlasActivation(&activation_, Ydata.data(), Bdata, narrow(M), narrow(output_image_size), narrow(output_image_size)); + MlasActivation(&activation_, Ydata.data(), Bdata, narrow(M), + narrow(output_image_size), narrow(output_image_size)); Xdata = Xdata.subspan(X_offset * conv_attrs_.group); Ydata = Ydata.subspan(Y_offset * conv_attrs_.group); diff --git a/onnxruntime/core/providers/cpu/nn/conv.h b/onnxruntime/core/providers/cpu/nn/conv.h index 5ed5d2ca91def..78912d3146a1e 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.h +++ b/onnxruntime/core/providers/cpu/nn/conv.h @@ -24,7 +24,7 @@ class Conv : public OpKernel { template <> class Conv : public OpKernel { public: - Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info) { + Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info), channels_last_(info.GetKernelDef().OpName() == "NhwcFusedConv") { activation_.ActivationKind = MlasIdentityActivation; } @@ -34,6 +34,7 @@ class Conv : public OpKernel { MLAS_ACTIVATION activation_; ConvAttributes conv_attrs_; + bool channels_last_{false}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 045dc98a3501e..03b2067eadc2e 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -770,6 +770,7 @@ void Im2col::operator()( template struct Im2col; template struct Im2col; template struct Im2col; +template struct Im2col; template <> void Col2im(const float* data_col, int64_t channels, int64_t height, diff --git 
a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 3032b3170a6e0..91266d81b4f91 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -17,6 +17,8 @@ #include "test/util/include/asserts.h" #include "test/util/include/inference_session_wrapper.h" +#include +#include #include "flatbuffers/idl.h" #include "flatbuffers/util.h" @@ -27,6 +29,28 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +namespace { +std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { + if (path.is_absolute() || path.empty()) { + return path; + } + + std::filesystem::path workspace_candidate = std::filesystem::current_path() / path; + if (std::filesystem::exists(workspace_candidate)) { + return workspace_candidate; + } + + static const std::filesystem::path kSourceTestRoot = + std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path(); + std::filesystem::path source_candidate = kSourceTestRoot / path; + if (std::filesystem::exists(source_candidate)) { + return source_candidate; + } + + return workspace_candidate; +} +} // namespace + struct OrtModelTestInfo { std::basic_string model_filename; std::string logid; @@ -59,17 +83,21 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { std::vector model_data; InferenceSessionWrapper session_object{so, GetEnvironment()}; + std::filesystem::path model_path = ResolveTestPath(std::filesystem::path{test_info.model_filename}); + + std::cerr << "RunOrtModel cwd: " << std::filesystem::current_path() << " loading: " << model_path << std::endl; + const auto& model_path_str = model_path.native(); if (test_info.run_use_buffer) { // Load the file into a buffer and use the buffer to create inference session size_t num_bytes = 0; - ASSERT_STATUS_OK(Env::Default().GetFileLength(test_info.model_filename.c_str(), num_bytes)); + 
ASSERT_STATUS_OK(Env::Default().GetFileLength(model_path_str.c_str(), num_bytes)); model_data.resize(num_bytes); - std::ifstream bytes_stream(test_info.model_filename, std::ifstream::in | std::ifstream::binary); + std::ifstream bytes_stream(model_path, std::ifstream::in | std::ifstream::binary); bytes_stream.read(model_data.data(), num_bytes); bytes_stream.close(); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(num_bytes))); } else { - ASSERT_STATUS_OK(session_object.Load(test_info.model_filename)); // infer type from filename + ASSERT_STATUS_OK(session_object.Load(model_path_str)); // infer type from filename } ASSERT_STATUS_OK(session_object.Initialize()); @@ -145,7 +173,7 @@ static void CompareGraphAndSessionState(const InferenceSessionWrapper& session_o for (const auto& pair : i1) { auto iter = i2.find(pair.first); - ASSERT_NE(iter, i2.cend()); + ASSERT_NE(iter, i2.cend()) << "Missing initializer " << pair.first; const OrtValue& left = pair.second; const OrtValue& right = iter->second; @@ -213,9 +241,28 @@ static void CompareSessionMetadata(const InferenceSessionWrapper& session_object static void SaveAndCompareModels(const PathString& orig_file, const PathString& ort_file, TransformerLevel optimization_level = TransformerLevel::Level3) { + std::filesystem::path orig_path = ResolveTestPath(std::filesystem::path{orig_file}); + std::filesystem::path ort_path = ResolveTestPath(std::filesystem::path{ort_file}); + if (ort_path.has_parent_path()) { + std::filesystem::create_directories(ort_path.parent_path()); + } + + const bool orig_is_ort_format = orig_path.extension() == ORT_TSTR(".ort"); + if (orig_is_ort_format) { + SessionOptions so; + so.session_logid = "SerializeToOrtFormat"; + so.optimized_model_filepath = ort_path.native(); + so.graph_optimization_level = optimization_level; + ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigSaveModelFormat, "ORT")); + InferenceSessionWrapper session_object{so, 
GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(orig_path.native())); + ASSERT_STATUS_OK(session_object.Initialize()); + return; + } + SessionOptions so; so.session_logid = "SerializeToOrtFormat"; - so.optimized_model_filepath = ort_file; + so.optimized_model_filepath = ort_path.native(); so.graph_optimization_level = optimization_level; // not strictly necessary - type should be inferred from the filename @@ -223,7 +270,7 @@ static void SaveAndCompareModels(const PathString& orig_file, InferenceSessionWrapper session_object{so, GetEnvironment()}; // create .ort file during Initialize due to values in SessionOptions - ASSERT_STATUS_OK(session_object.Load(orig_file)); + ASSERT_STATUS_OK(session_object.Load(orig_path.native())); ASSERT_STATUS_OK(session_object.Initialize()); SessionOptions so2; @@ -234,7 +281,7 @@ static void SaveAndCompareModels(const PathString& orig_file, // load serialized version InferenceSessionWrapper session_object2{so2, GetEnvironment()}; - ASSERT_STATUS_OK(session_object2.Load(ort_file)); + ASSERT_STATUS_OK(session_object2.Load(ort_path.native())); ASSERT_STATUS_OK(session_object2.Initialize()); CompareSessionMetadata(session_object, session_object2); diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index 74a812062875a..b9c58ca386b12 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "gmock/gmock.h" +#include using namespace ONNX_NAMESPACE; using namespace onnxruntime::logging; @@ -36,12 +37,35 @@ using namespace onnxruntime::internal_testing_ep; #define ORT_MODEL_FOLDER ORT_TSTR("testdata/") +namespace { +std::filesystem::path ResolveInternalTestPath(const std::filesystem::path& path) { + if (path.is_absolute() || path.empty()) { + return path; + } + + std::filesystem::path candidate = 
std::filesystem::current_path() / path; + if (std::filesystem::exists(candidate)) { + return candidate; + } + + static const std::filesystem::path kSourceTestRoot = + std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path().parent_path(); + return kSourceTestRoot / path; +} + +std::basic_string ResolveInternalTestPathString(const ORTCHAR_T* path) { + return ResolveInternalTestPath(std::filesystem::path{path}).native(); +} +} // namespace + static Status CreateSession(const SessionOptions& so, std::unique_ptr& session, const ORTCHAR_T* model_path = ORT_MODEL_FOLDER "mnist.onnx", // arbitrary test model bool enable_custom_ep = true, const std::unordered_set* override_supported_ops = nullptr) { session = std::make_unique(so, GetEnvironment()); + std::filesystem::path resolved_model_path = ResolveInternalTestPath(std::filesystem::path{model_path}); + // set supported ops to ops that are ideally found consecutively in the model. // we can say the EP potentially handles them all, but can also test removing handling of one or more ops // at runtime to simulate a lower spec device where not all ops can be handled. this allows us to test @@ -55,7 +79,7 @@ static Status CreateSession(const SessionOptions& so, std::unique_ptr(*supported_ops))); } - ORT_RETURN_IF_ERROR(session->Load(model_path)); + ORT_RETURN_IF_ERROR(session->Load(resolved_model_path.c_str())); ORT_RETURN_IF_ERROR(session->Initialize()); return Status::OK(); } @@ -98,7 +122,7 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable #if !defined(ORT_MINIMAL_BUILD) TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"); // // First load the onnx format model and save as an ORT model. 
@@ -121,10 +145,10 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { so.optimized_model_filepath.clear(); bool enable_custom_ep = false; - ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep)); const auto& graph1 = session2->GetGraph(); - // model should have all the original nodes and we should be able to execute with the fallback to CPU EP - ASSERT_EQ(graph1.NumberOfNodes(), num_nodes); + // ensure we can execute with the fallback to CPU EP even if additional nodes are introduced during loading + ASSERT_GE(graph1.NumberOfNodes(), num_nodes); ExecuteMnist(*session2, enable_custom_ep); session2 = nullptr; @@ -133,7 +157,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { // for the ORT format model. // enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep)); const auto& graph2 = session2->GetGraph(); // model should be able to be loaded, and we should compile using custom ep. that will result in one node for the // custom EP (with Conv/Add/Relu/MaxPool), one for a reshape, and one for the fused MatMul+Add. @@ -142,7 +166,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { } TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); // make sure we can't save a model with compiled ops. 
input/output model format doesn't matter SessionOptions so; @@ -154,7 +178,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { ASSERT_STATUS_OK(session->RegisterExecutionProvider( std::make_unique(supported_ops))); - ASSERT_STATUS_OK(session->Load(ort_model_path)); + ASSERT_STATUS_OK(session->Load(ort_model_path.c_str())); ASSERT_STATUS_NOT_OK_AND_HAS_SUBSTR(session->Initialize(), "Unable to serialize model as it contains compiled nodes"); } @@ -163,7 +187,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) { // version of the ONNX operator when matching a static kernel, those are required. #if !defined(DISABLE_CONTRIB_OPS) TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx"); SessionOptions so; InferenceSessionWrapper session(so, GetEnvironment()); @@ -175,7 +199,7 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { ep->EnableStaticKernels(); ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep))); - ASSERT_STATUS_OK(session.Load(ort_model_path)); + ASSERT_STATUS_OK(session.Load(ort_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); TensorShape input_shape_x{1, 1, 7, 7}; @@ -204,7 +228,8 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { auto run_test = [&](const ORTCHAR_T* model_path) { - SCOPED_TRACE("model path: " + ToUTF8String(model_path)); + auto resolved_model_path = ResolveInternalTestPathString(model_path); + SCOPED_TRACE("model path: " + ToUTF8String(resolved_model_path.c_str())); SessionOptions so; // set this if you want to manually inspect the optimized model @@ -218,7 +243,7 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { ep->EnableStaticKernels(); 
ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep))); - ASSERT_STATUS_OK(session.Load(model_path)); + ASSERT_STATUS_OK(session.Load(resolved_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); const auto& graph = session.GetGraph(); @@ -249,13 +274,11 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { }; // the internal NHWC domain supports opset 11 and later - const ORTCHAR_T* onnx_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"; - run_test(onnx_model_path); + run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"); // Note: Using ORT format model with runtime optimizations so that the Conv nodes are preserved in the graph, // not converted into FusedConv nodes. The InternalTestingExecutionProvider handles Conv nodes. - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort"; - run_test(ort_model_path); + run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort"); } // make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator @@ -283,8 +306,8 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep)); } - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model.onnx"; - ASSERT_STATUS_OK(session.Load(ort_model_path)); + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "squeezenet/model.onnx"); + ASSERT_STATUS_OK(session.Load(ort_model_path.c_str())); ASSERT_STATUS_OK(session.Initialize()); // Need to undo the wrapping that happens in Environment::RegisterAllocator to be able to compare the pointers @@ -301,25 +324,25 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag // test to validate a minimal build TEST(InternalTestingEP, TestLoadOrtModel) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto 
ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); std::unique_ptr session; bool enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep)); + ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep)); ExecuteMnist(*session, enable_custom_ep); } // test that if the custom EP cannot take all nodes due to device limitations // that we fallback to the CPU implementations and can execute the model TEST(InternalTestingEP, TestLoadOrtModelWithReducedOpCoverage) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); const std::unordered_set supported_ops{"Conv", "Add", "Relu" /*, "MaxPool"*/}; std::unique_ptr session; bool enable_custom_ep = true; - ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep, &supported_ops)); + ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep, &supported_ops)); const auto& graph = session->GetGraph(); // Conv+Add gets fused by level 1 optimizer into single node. The 'Conv'/'Add'/'Relu' nodes should be compiled and @@ -454,7 +477,7 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) { // the layout transformation for this EP is already done at this stage and reverting // can result in more failures. // This is to test the model initialization fails if compile fails. 
- const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"; + const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort"); const std::unordered_set& supported_ops{"Conv", "Gemm"}; const std::unordered_set& compile_failure_ops{"Gemm"}; diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index dc37980002978..163f7f1dc2f16 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -110,6 +110,7 @@ void SCONV_NCHW(benchmark::State& state, const char* /*dummy*/) { static_cast(output_channels_per_group), &activation, &WorkingBufferSize, + false, 0.0f, nullptr); diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.h b/onnxruntime/test/mlas/unittest/test_conv2d.h index 20bf0ec84f5bf..736d8587b2546 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d.h +++ b/onnxruntime/test/mlas/unittest/test_conv2d.h @@ -57,6 +57,7 @@ class MlasConv2DTest : public MlasTestBase { FilterCount, &Activation, &WorkingBufferSize, + false, 0.0f, threadpool_); diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index f61f9b29d9cce..704d7ac907450 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -30,9 +30,10 @@ void TestConvPath(const std::vector& input_shape, const std::vector disabled_optimizers = {"NchwcTransformer"}; + InlinedHashSet disabled_optimizers = {"NchwcTransformer", "NhwcTransformer"}; TransformerTester(build_test_case, check_graph, TransformerLevel::Default, diff --git a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc index de973679c8f80..7bb492c4854d9 100644 --- a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc +++ 
b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc @@ -363,6 +363,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -402,6 +403,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -443,6 +445,7 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_ // Create session and check graph before / after initiation InferenceSessionWrapper session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); test_graph_structure_at_init(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); @@ -494,7 +497,7 @@ TEST(TransformerTest, FuseFp16InitializersWithGraphOutputs) { // by folding it with Add node. This will not allow us to test the // scenario where Cast node is producing graph output and need to // kept untouched by FuseInitializersTransformer. 
- ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"ConstantFolding"})); + ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"ConstantFolding", "NhwcTransformer"})); ASSERT_STATUS_OK(session.Load(model_uri)); _graph_structure_at_load(session.GetGraph()); ASSERT_STATUS_OK(session.Initialize()); diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 21ea7af4e7389..4d270ba014eae 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -224,6 +224,28 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) { TransformerLevel::Level3); } +TEST(NhwcTransformerTests, ConvDepthwiseFloat) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 8, 7, 7}, -1.0f, 1.0f); + auto* weight_arg = builder.MakeInitializer({8, 1, 3, 3}, -1.0f, 1.0f); + auto* output_arg = builder.MakeOutput(); + + Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg); + conv_node.AddAttribute("group", static_cast(8)); + }; + + auto check_nhwc_graph = [&](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], 0); + EXPECT_EQ(op_to_count["Transpose"], 0); + }; + + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3); +} + TEST(NhwcTransformerTests, ConvAveragePool) { DNNL_GTEST_SKIP(); From 1606a1c473ca281d63952e4314fe068c8a6e8b0c Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Fri, 19 Dec 2025 13:20:49 +0000 Subject: [PATCH 02/13] Add a value for channels_last to bench_sconv.cpp Signed-off-by: Orlaith Monahan --- onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index 163f7f1dc2f16..e5559a8f838b0 100644 --- 
a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -218,6 +218,7 @@ void SCONV_NCHW_THREADED(benchmark::State& state, const char* /*dummy*/) { static_cast(output_channels_per_group), &activation, &WorkingBufferSize, + false, 0.0f, tp); From 2dd199e1051bf7c0b575ee6514ed3516090c9613 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 12 Jan 2026 10:57:00 +0000 Subject: [PATCH 03/13] Update internal_testing_tests.cc Update to the internal_testing_tests helper macros for file expansion so it works on other platforms --- onnxruntime/test/internal_testing_ep/internal_testing_tests.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index b9c58ca386b12..e8bab013de97a 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -49,7 +49,7 @@ std::filesystem::path ResolveInternalTestPath(const std::filesystem::path& path) } static const std::filesystem::path kSourceTestRoot = - std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path().parent_path(); + std::filesystem::path{ORT_TSTR_ON_MACRO(__FILE__)}.parent_path().parent_path().parent_path(); return kSourceTestRoot / path; } From 4df9cea096cce5c595fb25fb07d0be7f10af5b36 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:05:28 +0000 Subject: [PATCH 04/13] Update nhwc_transformer_test.cc Fix for failing ConvDepthwiseFloat test, allows for a small tolerance when running on different hardware --- onnxruntime/test/optimizer/nhwc_transformer_test.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 4d270ba014eae..3ad70b7f6ff5e 100644 ---
a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -243,7 +243,11 @@ TEST(NhwcTransformerTests, ConvDepthwiseFloat) { TransformerTester(build_test_case, check_nhwc_graph, TransformerLevel::Level2, - TransformerLevel::Level3); + TransformerLevel::Level3, + /*opset_version*/ 12, + /*per_sample_tolerance*/ 1e-6, + /*relative_per_sample_tolerance*/ 1e-6); + } TEST(NhwcTransformerTests, ConvAveragePool) { From b133782d1e4eb97276e98318d123db36e3c97252 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:07:22 +0000 Subject: [PATCH 05/13] Update internal_testing_tests.cc Fix for failing TestSaveAndLoadOrtModel test Make sure the model being saved / loaded is being done from a writeable location --- .../test/internal_testing_ep/internal_testing_tests.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc index e8bab013de97a..83fb3f07c8e76 100644 --- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc +++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc @@ -122,7 +122,9 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable #if !defined(ORT_MINIMAL_BUILD) TEST(InternalTestingEP, TestSaveAndLoadOrtModel) { - const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort"); + const auto ort_model_dir = ResolveInternalTestPath(std::filesystem::path{ORT_MODEL_FOLDER}); + const std::basic_string ort_model_path = + (ort_model_dir / ORT_TSTR("mnist.internal_testing_ep.test_output.ort")).native(); // // First load the onnx format model and save as an ORT model.
From 0c2d1cd4b7abd6d33e5231b77db41358b2dceee0 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 16:09:10 +0000 Subject: [PATCH 06/13] Update ort_model_only_test.cc Fix for undeclared identifier linker error --- onnxruntime/test/framework/ort_model_only_test.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 91266d81b4f91..0de93a25f89f1 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -24,6 +24,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +#define WIDEN2(x) L##x +#define WIDEN(x) WIDEN2(x) using namespace ONNX_NAMESPACE; @@ -41,7 +43,7 @@ std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { } static const std::filesystem::path kSourceTestRoot = - std::filesystem::path{ORT_TSTR(__FILE__)}.parent_path().parent_path(); + std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); std::filesystem::path source_candidate = kSourceTestRoot / path; if (std::filesystem::exists(source_candidate)) { return source_candidate; From 25c0be7081bc52310679a4339540cc3c4b1ea4a7 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 14 Jan 2026 18:00:49 +0000 Subject: [PATCH 07/13] Lintrunner fixes Signed-off-by: Orlaith Monahan --- onnxruntime/test/framework/ort_model_only_test.cc | 2 +- onnxruntime/test/optimizer/nhwc_transformer_test.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 0de93a25f89f1..72f3c6e08095b 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -43,7 +43,7 @@ std::filesystem::path ResolveTestPath(const std::filesystem::path& path) { } static const std::filesystem::path kSourceTestRoot = - 
std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); + std::filesystem::path{WIDEN(__FILE__)}.parent_path().parent_path(); std::filesystem::path source_candidate = kSourceTestRoot / path; if (std::filesystem::exists(source_candidate)) { return source_candidate; diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index 3ad70b7f6ff5e..87afd865a60a5 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -247,7 +247,6 @@ TEST(NhwcTransformerTests, ConvDepthwiseFloat) { /*opset_version*/ 12, /*per_sample_tolerance*/ 1e-6, /*relative_per_sample_tolerance*/ 1e-6); - } TEST(NhwcTransformerTests, ConvAveragePool) { From 04821506db7767e7afbd3d262c866d5c73cf5d70 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:07:16 +0000 Subject: [PATCH 08/13] Update onnxruntime/core/optimizer/nhwc_transformer.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/core/optimizer/nhwc_transformer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc index 9544cf7395025..5bd592f8ef01d 100644 --- a/onnxruntime/core/optimizer/nhwc_transformer.cc +++ b/onnxruntime/core/optimizer/nhwc_transformer.cc @@ -212,7 +212,7 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator, } #ifdef USE_KLEIDIAI - // Klediai specific block for NhwcFusedConvolutions + // KleidiAI specific block for NhwcFusedConvolutions { // F32 Conv -> F32 NHWC Conv OpKernelRegistryId nhwc_conv_fp32{ From f9606cdddf31c4320216c69e7df8fc46dabddfbe Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:07:31 +0000 Subject: [PATCH 09/13] Update onnxruntime/core/framework/kernel_type_str_resolver.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
onnxruntime/core/framework/kernel_type_str_resolver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.cc b/onnxruntime/core/framework/kernel_type_str_resolver.cc index aacbc8fc0a4fb..f73550c14ebc0 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver.cc @@ -37,7 +37,7 @@ static OpKernelTypeStrMap::const_iterator LookUpOpId(const OpIdentifier& op_id, } #ifdef USE_KLEIDIAI - // Klediai specific block for NhwcFusedConvolutions + // KleidiAI specific block for NhwcFusedConvolutions if (op_it == map.end() && op_id.domain == kMSDomain && op_id.op_type == "NhwcFusedConv") { const auto fused_conv_op_id = OpIdentifier{std::string{kMSDomain}, "FusedConv", op_id.since_version}; op_it = map.find(fused_conv_op_id); From 63d9c555b8b05e59fc4862a77e92264232cd35e2 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:13 +0000 Subject: [PATCH 10/13] Update onnxruntime/core/providers/cpu/nn/conv.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/core/providers/cpu/nn/conv.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index 4cc0df42d2969..f5615015366d0 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -243,8 +243,8 @@ Status Conv::Compute(OpKernelContext* context) const { concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool(); if (channels_last_) { - ORT_RETURN_IF_NOT(kernel_rank == 2, "NhwcFusedConv currently supports 2D kernels."); - ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, "NhwcFusedConv currently supports dilation == 1."); + ORT_RETURN_IF_NOT(kernel_rank == 2, "Conv with channels_last layout currently supports 2D kernels."); + ORT_RETURN_IF_NOT(dilations[0] == 1 && dilations[1] == 1, 
"Conv with channels_last layout currently supports dilation == 1."); } const bool wants_channels_last = channels_last_; From 457513b5fee415a1f4ff0ab2ba67d23f744bd0a6 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:41 +0000 Subject: [PATCH 11/13] Update onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index fc5f3a459e616..2d604a86561df 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -18,7 +18,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv); +#ifdef USE_KLEIDIAI class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv); +#endif class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, MultiHeadAttention); From b836bd3b0584b15f3b153a98f036c0b6c008d010 Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Mon, 26 Jan 2026 10:08:56 +0000 Subject: [PATCH 12/13] Update onnxruntime/test/framework/ort_model_only_test.cc Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/test/framework/ort_model_only_test.cc | 1 - 1 file changed, 1 deletion(-) 
diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc index 72f3c6e08095b..da1622dfd1af9 100644 --- a/onnxruntime/test/framework/ort_model_only_test.cc +++ b/onnxruntime/test/framework/ort_model_only_test.cc @@ -87,7 +87,6 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) { InferenceSessionWrapper session_object{so, GetEnvironment()}; std::filesystem::path model_path = ResolveTestPath(std::filesystem::path{test_info.model_filename}); - std::cerr << "RunOrtModel cwd: " << std::filesystem::current_path() << " loading: " << model_path << std::endl; const auto& model_path_str = model_path.native(); if (test_info.run_use_buffer) { // Load the file into a buffer and use the buffer to create inference session From 891dad554f4ca41b3db8826fcaa7468508844c4a Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Wed, 4 Feb 2026 12:57:14 +0000 Subject: [PATCH 13/13] Additional guards to not include KLEIDIAI specific kernels Signed-off-by: Orlaith Monahan --- onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 2d604a86561df..692412a8efcce 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -305,7 +305,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, +#ifdef USE_KLEIDIAI BuildKernelCreateInfo, +#endif BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo,