diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index 6405236da1734..6c5464851937c 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -20,8 +20,13 @@ else()
   endif()
 endif()

-if(Patch_FOUND AND WIN32)
-  set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_windows.patch)
+if(Patch_FOUND)
+  if(WIN32)
+    set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_windows.patch &&
+        ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_cuda_warnings.patch)
+  else()
+    set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_cuda_warnings.patch)
+  endif()
 else()
   set(ABSL_PATCH_COMMAND "")
 endif()
diff --git a/cmake/external/cuda_configuration.cmake b/cmake/external/cuda_configuration.cmake
index be6a5febf3e14..00f7d81eda53d 100644
--- a/cmake/external/cuda_configuration.cmake
+++ b/cmake/external/cuda_configuration.cmake
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
   #   * Always use accelerated (`-a` suffix) target for supported real architectures.
   # cmake-format: on

+  # Allow override via the CUDAARCHS environment variable (standard CMake variable)
+  if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
+    set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
+  endif()
+
   if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     # Detect highest available compute capability
     set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
@@ -139,12 +144,12 @@ macro(setup_cuda_architectures)
       continue()
     endif()

-    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
+    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
       set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
-      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
       list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
     else()
       message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
     endif()
@@ -156,7 +161,7 @@ macro(setup_cuda_architectures)
   set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
   message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

-  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
+  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
@@ -165,10 +170,13 @@ endforeach()

   # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
-  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
+  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
   unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
   foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
+    if(CUDA_ARCH MATCHES "^([0-9]+)f$")
+      # Family code, no -real suffix
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
+    elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
     else()
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
diff --git a/cmake/patches/abseil/absl_cuda_warnings.patch b/cmake/patches/abseil/absl_cuda_warnings.patch
new file mode 100644
index 0000000000000..144b9f904bf0f
--- /dev/null
+++ b/cmake/patches/abseil/absl_cuda_warnings.patch
@@ -0,0 +1,40 @@
+diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/internal/hash.h
++++ b/absl/hash/internal/hash.h
+@@ -477,7 +477,7 @@ H AbslHashValue(H hash_state, T (&)[N]) {
+ template <typename H, typename T, size_t N>
+ H AbslHashValue(H hash_state, T (&)[N]) {
+   static_assert(
+-      sizeof(T) == -1,
++      sizeof(T) == size_t(-1),
+       "Hashing C arrays is not allowed. For string literals, wrap the literal "
+       "in absl::string_view(). To hash the array contents, use "
+       "absl::MakeSpan() or make the array an std::array. To hash the array "
+diff --git a/absl/hash/hash.h b/absl/hash/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/hash.h
++++ b/absl/hash/hash.h
+@@ -333,7 +333,8 @@ class HashState : public hash_internal::HashStateBase<HashState> {
+             absl::enable_if_t<
+                 std::is_base_of<HashStateBase<T>, T>::value, int> = 0>
+   static HashState Create(T* state) {
+-    HashState s;
++    HashState s = {};
++    (void)s;
+     s.Init(state);
+     return s;
+   }
+diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
+index 1234567..abcdefg 100644
+--- a/absl/container/internal/raw_hash_set.h
++++ b/absl/container/internal/raw_hash_set.h
+@@ -464,7 +464,7 @@ inline uint16_t NextSeed() {
+ inline uint16_t NextSeed() {
+   static_assert(PerTableSeed::kBitCount == 16);
+   thread_local uint16_t seed =
+-      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed));
++      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed) & 0xFFFFu);
+   seed += uint16_t{0xad53};
+   return seed;
+ }
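For context on the first hunk in this patch: `sizeof(T)` is an unsigned `size_t`, so `sizeof(T) == -1` compares unsigned against a signed literal, which NVCC/GCC flag when such warnings are promoted; wrapping the literal keeps the assert always false while making the comparison unsigned-to-unsigned. A minimal standalone sketch of the idiom (not abseil code):

#include <cstddef>

// size_t(-1) converts the literal once, explicitly, so the comparison is
// unsigned == unsigned and still false for every instantiable T.
template <typename T>
void reject_arrays() {
  static_assert(sizeof(T) == std::size_t(-1), "this overload must never be instantiated");
}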
diff --git a/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch b/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch
new file mode 100644
index 0000000000000..144b9f904bf0f
--- /dev/null
+++ b/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch
@@ -0,0 +1,40 @@
+diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/internal/hash.h
++++ b/absl/hash/internal/hash.h
+@@ -477,7 +477,7 @@ H AbslHashValue(H hash_state, T (&)[N]) {
+ template <typename H, typename T, size_t N>
+ H AbslHashValue(H hash_state, T (&)[N]) {
+   static_assert(
+-      sizeof(T) == -1,
++      sizeof(T) == size_t(-1),
+       "Hashing C arrays is not allowed. For string literals, wrap the literal "
+       "in absl::string_view(). To hash the array contents, use "
+       "absl::MakeSpan() or make the array an std::array. To hash the array "
+diff --git a/absl/hash/hash.h b/absl/hash/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/hash.h
++++ b/absl/hash/hash.h
+@@ -333,7 +333,8 @@ class HashState : public hash_internal::HashStateBase<HashState> {
+             absl::enable_if_t<
+                 std::is_base_of<HashStateBase<T>, T>::value, int> = 0>
+   static HashState Create(T* state) {
+-    HashState s;
++    HashState s = {};
++    (void)s;
+     s.Init(state);
+     return s;
+   }
+diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
+index 1234567..abcdefg 100644
+--- a/absl/container/internal/raw_hash_set.h
++++ b/absl/container/internal/raw_hash_set.h
+@@ -464,7 +464,7 @@ inline uint16_t NextSeed() {
+ inline uint16_t NextSeed() {
+   static_assert(PerTableSeed::kBitCount == 16);
+   thread_local uint16_t seed =
+-      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed));
++      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed) & 0xFFFFu);
+   seed += uint16_t{0xad53};
+   return seed;
+ }
diff --git a/cmake/vcpkg-ports/abseil/portfile.cmake b/cmake/vcpkg-ports/abseil/portfile.cmake
index 3cdedca7265ef..1e9c48ea834b2 100644
--- a/cmake/vcpkg-ports/abseil/portfile.cmake
+++ b/cmake/vcpkg-ports/abseil/portfile.cmake
@@ -9,6 +9,7 @@ vcpkg_from_github(
     SHA512 4ee1a217203933382e728d354a149253a517150eee7580a0abecc69584b2eb200d91933ef424487e3a3fe0e8ab5e77b0288485cac982171b3585314a4417e7d4
     HEAD_REF master
     PATCHES absl_windows.patch
+            absl_cuda_warnings.patch
)
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index 1ae7b5c9eb991..abe73b77f4071 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 using System;
+using System.Reflection;
 using System.Runtime.InteropServices;
 using static Microsoft.ML.OnnxRuntime.NativeMethods;

@@ -474,6 +475,12 @@ internal static class NativeMethods
         static NativeMethods()
         {
+#if !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__
+            // Register a custom DllImportResolver to handle platform-specific library loading.
+            // Replaces default resolution specifically on Windows for case-sensitivity.
+            NativeLibrary.SetDllImportResolver(typeof(NativeMethods).Assembly, DllImportResolver);
+#endif
+
 #if NETSTANDARD2_0
             IntPtr ortApiBasePtr = OrtGetApiBase();
             OrtApiBase ortApiBase = (OrtApiBase)Marshal.PtrToStructure(ortApiBasePtr, typeof(OrtApiBase));
@@ -847,7 +854,7 @@ static NativeMethods()
                 api_.CreateSyncStreamForEpDevice,
                 typeof(DOrtCreateSyncStreamForEpDevice));

-            OrtSyncStream_GetHandle = 
+            OrtSyncStream_GetHandle =
                 (DOrtSyncStream_GetHandle)Marshal.GetDelegateForFunctionPointer(
                     api_.SyncStream_GetHandle,
                     typeof(DOrtSyncStream_GetHandle));
@@ -872,11 +879,127 @@ internal class NativeLib
         // Define the library name required for iOS
         internal const string DllName = "__Internal";
#else
-        // Note: the file name in ONNX Runtime nuget package must be onnxruntime.dll instead of onnxruntime.DLL(Windows filesystem can be case sensitive)
-        internal const string DllName = "onnxruntime.dll";
+        // For desktop platforms (including .NET Standard 2.0), we use the simple name
+        // to allow .NET's automatic platform-specific resolution (lib*.so, lib*.dylib, *.dll).
+        // For .NET Core 3.0+, case-sensitivity on Windows is handled by DllImportResolver.
+        internal const string DllName = "onnxruntime";
#endif
     }

+#if !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__
+    /// <summary>
+    /// Custom DllImportResolver to handle platform-specific library loading.
+    /// On Windows, it explicitly loads the library with a lowercase .dll extension to handle
+    /// case-sensitive filesystems.
+    /// </summary>
+    private static IntPtr DllImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
+    {
+        if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName)
+        {
+            string mappedName = null;
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                // Explicitly load with .dll extension to avoid issues where the OS might try .DLL
+                mappedName = libraryName + ".dll";
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+            {
+                // Explicitly load with .so extension and lib prefix
+                mappedName = "lib" + libraryName + ".so";
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                // Explicitly load with .dylib extension and lib prefix
+                mappedName = "lib" + libraryName + ".dylib";
+            }
+
+            if (mappedName != null)
+            {
+                // 1. Try default loading (name only)
+                if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle))
+                {
+                    return handle;
+                }
+
+                // 2. Try relative to assembly location (look into runtimes subfolders)
+                string assemblyLocation = null;
+                try { assemblyLocation = assembly.Location; } catch { }
+                if (!string.IsNullOrEmpty(assemblyLocation))
+                {
+                    string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation);
+                    string rid = RuntimeInformation.RuntimeIdentifier;
+
+                    // Probe the specific RID first, then common fallbacks for the current OS
+                    string[] ridsToTry;
+                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+                    {
+                        ridsToTry = new[] { rid, "win-x64", "win-arm64" };
+                    }
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                    {
+                        ridsToTry = new[] { rid, "linux-x64", "linux-arm64" };
+                    }
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+                    {
+                        // We no longer provide osx-x64 in the official package since 1.24.
+                        // However, we keep it in the list for build-from-source users.
+                        ridsToTry = new[] { rid, "osx-arm64", "osx-x64" };
+                    }
+                    else
+                    {
+                        ridsToTry = new[] { rid };
+                    }
+
+                    foreach (var tryRid in ridsToTry)
+                    {
+                        string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName);
+                        if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                        {
+                            LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                            return handle;
+                        }
+                    }
+                }
+
+                // 3. 
Try AppContext.BaseDirectory as a fallback
+                string baseDir = AppContext.BaseDirectory;
+                if (!string.IsNullOrEmpty(baseDir))
+                {
+                    string probePath = System.IO.Path.Combine(baseDir, mappedName);
+                    if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                    {
+                        LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                        return handle;
+                    }
+
+                    string rid = RuntimeInformation.RuntimeIdentifier;
+                    probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName);
+                    if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                    {
+                        LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                        return handle;
+                    }
+                }
+
+                LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})");
+
+            }
+        }
+
+        // Fall back to default resolution
+        return IntPtr.Zero;
+    }
+
+    private static void LogLibLoad(string message)
+    {
+        System.Diagnostics.Trace.WriteLine(message);
+        if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable("ORT_LOADER_VERBOSITY")))
+        {
+            Console.WriteLine(message);
+        }
+    }
+#endif
+
     [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
#if NETSTANDARD2_0
     public static extern IntPtr OrtGetApiBase();
@@ -2644,7 +2767,7 @@ public delegate void DOrtAddKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps,
                                          byte[] /* const char* */ value);

     /// <summary>
-    /// Get the value for the provided key. 
+    /// Get the value for the provided key.
     /// </summary>
     /// <returns>Value. Returns IntPtr.Zero if key was not found.</returns>
     [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -2767,7 +2890,7 @@ out IntPtr /* OrtSyncStream** */ stream
     // Auto Selection EP registration and selection customization

     /// <summary>
-    /// Register an execution provider library. 
+    /// Register an execution provider library.
     /// The library must implement CreateEpFactories and ReleaseEpFactory.
     /// </summary>
     /// <param name="env">Environment to add the EP library to.</param>
@@ -2952,9 +3075,10 @@ internal static class OrtExtensionsNativeMethods
#elif __IOS__
        internal const string ExtensionsDllName = "__Internal";
#else
-        // For desktop platforms, explicitly specify the DLL name with extension to avoid
-        // issues on case-sensitive filesystems. See NativeLib.DllName for detailed explanation.
-        internal const string ExtensionsDllName = "ortextensions.dll";
+        // For desktop platforms, use the simple name to allow .NET's
+        // automatic platform-specific resolution (lib*.so, lib*.dylib, *.dll).
+        // Case-sensitivity on Windows is handled by DllImportResolver.
+        internal const string ExtensionsDllName = "ortextensions";
#endif

        [DllImport(ExtensionsDllName, CharSet = CharSet.Ansi,
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
index d049c8d2d8990..c3cd38c9cd56b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
@@ -113,7 +113,8 @@
+             Condition="'$(PlatformTarget)' == 'ARM64' AND
+                        Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm64\native\onnxruntime.dll')">
       <Link>onnxruntime.dll</Link>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Visible>false</Visible>
@@ -128,7 +129,8 @@
+             Condition="'$(PlatformTarget)' == 'ARM' AND
+                        Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm\native\onnxruntime.dll')">
       <Link>onnxruntime.dll</Link>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Visible>false</Visible>
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
index f0d1313783643..c0475bb6102c1 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
@@ -601,6 +601,29 @@ private static Dictionary<string, string> GetSkippedModels(DirectoryInfo modelsD
                 skipModels["VGG 16-fp32"] = "bad allocation";
             }

+            // The following models are from the onnx repo and fail on the macOS nuget test pipeline.
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                var macOSSkips = new[]
+                {
+                    "test_castlike_FLOAT_to_STRING_expanded",
+                    "test_castlike_FLOAT_to_BFLOAT16_expanded",
+                    "test_castlike_BFLOAT16_to_FLOAT",
+                    "test_cast_FLOAT_to_STRING",
+                    "test_castlike_FLOAT_to_BFLOAT16",
+                    "test_castlike_STRING_to_FLOAT_expanded",
+                    "test_castlike_STRING_to_FLOAT",
+                    "test_cast_STRING_to_FLOAT",
+                    "test_castlike_BFLOAT16_to_FLOAT_expanded",
+                    "test_cast_BFLOAT16_to_FLOAT",
+                    "test_castlike_FLOAT_to_STRING"
+                };
+                foreach (var model in macOSSkips)
+                {
+                    skipModels[model] = "Skipped on macOS due to flakes or lack of support";
+                }
+            }
+
             return skipModels;
         }

@@ -934,6 +957,7 @@ public void TestPretrainedModelsWithOrtValue(string opsetDir, string modelName)
         [MemberData(nameof(GetSkippedModelForTest), Skip = "Skipped due to Error, please fix the error and enable the test")]
         private void TestPreTrainedModels(string opsetDir, string modelName, bool useOrtValueAPIs = false)
         {
+
             var opsetDirInfo = new DirectoryInfo(opsetDir);
             var opset = opsetDirInfo.Name;
             string onnxModelFileName = null;
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
index 18bf30a325d83..994aeb83a0ed5 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
@@ -132,7 +132,7 @@ export const parseConvTransposeAttributes = (attributes: Record
     typeof attributes.autoPad == 'undefined' ? 0 : (attributes.autoPad as number)
   ];
   const dilations = attributes.dilations as [number, number];
-  const group = attributes.group as number;
+  const group = (attributes.group as number) ?? 
1; // default to 1 per ONNX spec
   const kernelShape = attributes.kernelShape as [number, number];
   const pads = attributes.pads as [number, number, number, number];
   const strides = attributes.strides as [number, number];
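The next file's hunks all apply the same mechanical fix: `size_t` loop counters compared against `int64_t` values trip signed/unsigned-comparison warnings, so the signed side is cast explicitly. A standalone illustration of the pattern (hypothetical helper, not ORT code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Without the cast, the int64_t value is converted implicitly to unsigned for
// the comparison, so a negative axis would silently wrap to a huge number.
bool is_axis(const std::vector<int64_t>& axes, int64_t target) {
  for (size_t i = 0; i < axes.size(); ++i) {
    if (i == static_cast<size_t>(target)) return true;
  }
  return false;
}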
diff --git a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
index e413ccf580870..f4c3eb9914118 100644
--- a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
   //  S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
     DeviceMesh dst_device_mesh;
     std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
         dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else if (dst_shape[decomposition_axis_in_dst] == 1) {
         // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
     // Source tensor is sharded on non-decomposed axis.
     std::vector<AxisPartitionSpec> dst_axis_specs;
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
        dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else {
         // R -> RR
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
index 167b2af946183..5170c982f248d 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
@@ -73,9 +73,9 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
index 1fe8035cbcdae..7722cd5a84f07 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
@@ -29,7 +29,14 @@
#if defined(ENABLE_FP4)
#include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
#include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
#endif

namespace onnxruntime::llm {
diff --git a/onnxruntime/core/mlas/lib/qlutgemm.cpp b/onnxruntime/core/mlas/lib/qlutgemm.cpp
index cb099c2409a44..32c72342b4803 100644
--- a/onnxruntime/core/mlas/lib/qlutgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qlutgemm.cpp
@@ -25,33 +25,53 @@ Module Name:

#include <cassert>
#include <cstring>
#include <memory>
+#include <mutex>
#include <string>

-/** T-MAC GEMM kernel Config */
+/**
+ * Global cache for T-MAC kernel parameters, indexed by configuration.
+ * This map and its associated mutex ensure thread-safe parameter management
+ * across concurrent MLAS calls.
+ */
static std::unordered_map<std::string, MlasTMACKernelParams> tmac_kernel_configs;
+static std::mutex tmac_kernel_configs_mutex;

-const MlasTMACKernelParams&
+static std::string
+GetTmacKey(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point)
+{
+    // Generate a unique cache key based on the GEMM and quantization configuration.
+    return std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" +
+           std::to_string(block_size) + "_" + (has_zero_point ? "1" : "0");
+}
+
+MlasTMACKernelParams
MlasGetLutGemmKernelParams(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point)
{
-    std::string key = std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" + std::to_string(block_size) + "_" + (has_zero_point ? 
"1" : "0"); - if (tmac_kernel_configs.count(key)) { - return tmac_kernel_configs[key]; + std::string key = GetTmacKey(M, N, nbits, block_size, has_zero_point); + std::lock_guard lock(tmac_kernel_configs_mutex); + auto it = tmac_kernel_configs.find(key); + if (it != tmac_kernel_configs.end()) { + return it->second; } - MLAS_THROW_EX(std::runtime_error, "T-MAC kernel parameters not initialized"); + MLAS_THROW_EX(std::runtime_error, "T-MAC kernel parameters not initialized for key: " + key); } void MLASCALL MlasClearLutGemmKernelConfig() { + std::lock_guard lock(tmac_kernel_configs_mutex); tmac_kernel_configs.clear(); } void MLASCALL MlasInitLutGemmKernelConfig(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point) { - std::string key = std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" + std::to_string(block_size) + "_" + (has_zero_point ? "1" : "0"); - if (tmac_kernel_configs.count(key)) { - return; + std::string key = GetTmacKey(M, N, nbits, block_size, has_zero_point); + { + std::lock_guard lock(tmac_kernel_configs_mutex); + if (tmac_kernel_configs.find(key) != tmac_kernel_configs.end()) { + return; + } } MlasTMACKernelParams params; @@ -121,7 +141,10 @@ MlasInitLutGemmKernelConfig(size_t M, size_t N, size_t nbits, size_t block_size, params.has_zero_point = has_zero_point; params.one_scale = false; // TODO(vraspar): support one scale case for bitnet - tmac_kernel_configs[key] = params; + { + std::lock_guard lock(tmac_kernel_configs_mutex); + tmac_kernel_configs[key] = params; + } return; } @@ -222,53 +245,52 @@ LutGemmPackQuantBData( const size_t PackedQuantBDataSize = (N * bits) * (K / g / ngroups_per_elem); memset(PackedQuantBDataBegin, 0, PackedQuantBDataSize); // TODO: is this needed? - MlasTrySimpleParallel( - ThreadPool, Iterations, - [&](ptrdiff_t tid) { - size_t im = static_cast(tid); - for (size_t ib = 0; ib < bits; ib++) { - for (size_t ik = 0; ik < K / g; ik++) { - // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3) - size_t new_im = im / simd_n_out; - size_t new_isno = im % simd_n_out; - size_t new_ib = ib; - size_t new_ik = ik; - size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik; - - // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3) - new_im = new_idx / c1_nb0; - size_t new_ing = (new_idx % c1_nb0) / c1_nb1; - size_t new_isni = (new_idx % c1_nb1) / c1_nb2; - new_ik = (new_idx % c1_nb2); - new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik; - - // # 0 1 2 3 4 5 - // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3) - new_im = new_idx / c2_nb0; - size_t new_ibm = (new_idx % c2_nb0) / c2_nb1; - new_isni = (new_idx % c2_nb1) / c2_nb2; - new_ing = (new_idx % c2_nb2) / c2_nb3; - new_ik = (new_idx % c2_nb3) / c2_nb4; - size_t new_ikf = (new_idx % c2_nb4); - new_idx = new_im * c2_fac0 + - new_ik * c2_fac1 + - new_ibm * c2_fac2 + - new_ikf * c2_fac3 + - new_isni * ngroups_per_elem + - new_ing; - new_idx = new_idx / ngroups_per_elem; - size_t buf_idx = im * bits * K / g + ib * K / g + ik; - uint8_t buf_val = buf[buf_idx]; - - // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)]) - PackedQuantBDataBegin[new_idx] = static_cast( - static_cast(PackedQuantBDataBegin[new_idx]) + - (buf_val << (new_ing * g)) - ); - } + // NOTE: The second packing loop is intentionally serialized to avoid data races. 
@@ -222,53 +245,52 @@ LutGemmPackQuantBData(
    const size_t PackedQuantBDataSize = (N * bits) * (K / g / ngroups_per_elem);
    memset(PackedQuantBDataBegin, 0, PackedQuantBDataSize);  // TODO: is this needed?

-    MlasTrySimpleParallel(
-        ThreadPool, Iterations,
-        [&](ptrdiff_t tid) {
-            size_t im = static_cast<size_t>(tid);
-            for (size_t ib = 0; ib < bits; ib++) {
-                for (size_t ik = 0; ik < K / g; ik++) {
-                    // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3)
-                    size_t new_im = im / simd_n_out;
-                    size_t new_isno = im % simd_n_out;
-                    size_t new_ib = ib;
-                    size_t new_ik = ik;
-                    size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik;
-
-                    // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3)
-                    new_im = new_idx / c1_nb0;
-                    size_t new_ing = (new_idx % c1_nb0) / c1_nb1;
-                    size_t new_isni = (new_idx % c1_nb1) / c1_nb2;
-                    new_ik = (new_idx % c1_nb2);
-                    new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik;
-
-                    //                 #  0        1            2          3                4                 5
-                    // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3)
-                    new_im = new_idx / c2_nb0;
-                    size_t new_ibm = (new_idx % c2_nb0) / c2_nb1;
-                    new_isni = (new_idx % c2_nb1) / c2_nb2;
-                    new_ing = (new_idx % c2_nb2) / c2_nb3;
-                    new_ik = (new_idx % c2_nb3) / c2_nb4;
-                    size_t new_ikf = (new_idx % c2_nb4);
-                    new_idx = new_im * c2_fac0 +
-                              new_ik * c2_fac1 +
-                              new_ibm * c2_fac2 +
-                              new_ikf * c2_fac3 +
-                              new_isni * ngroups_per_elem +
-                              new_ing;
-                    new_idx = new_idx / ngroups_per_elem;
-                    size_t buf_idx = im * bits * K / g + ib * K / g + ik;
-                    uint8_t buf_val = buf[buf_idx];
-
-                    // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)])
-                    PackedQuantBDataBegin[new_idx] = static_cast<std::byte>(
-                        static_cast<uint8_t>(PackedQuantBDataBegin[new_idx]) +
-                        (buf_val << (new_ing * g))
-                    );
-                }
+    // NOTE: The second packing loop is intentionally serialized to avoid data races.
+    // T-MAC packs multiple output features (N) into a single byte if ngroups_per_elem > 1.
+    // Parallelizing this across N would lead to concurrent bit-plane updates on the same memory location.
+    for (size_t im = 0; im < Iterations; im++) {
+        for (size_t ib = 0; ib < bits; ib++) {
+            for (size_t ik = 0; ik < K / g; ik++) {
+                // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3)
+                size_t new_im = im / simd_n_out;
+                size_t new_isno = im % simd_n_out;
+                size_t new_ib = ib;
+                size_t new_ik = ik;
+                size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik;
+
+                // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3)
+                new_im = new_idx / c1_nb0;
+                size_t new_ing = (new_idx % c1_nb0) / c1_nb1;
+                size_t new_isni = (new_idx % c1_nb1) / c1_nb2;
+                new_ik = (new_idx % c1_nb2);
+                new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik;
+
+                //                 #  0        1            2          3                4                 5
+                // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3)
+                new_im = new_idx / c2_nb0;
+                size_t new_ibm = (new_idx % c2_nb0) / c2_nb1;
+                new_isni = (new_idx % c2_nb1) / c2_nb2;
+                new_ing = (new_idx % c2_nb2) / c2_nb3;
+                new_ik = (new_idx % c2_nb3) / c2_nb4;
+                size_t new_ikf = (new_idx % c2_nb4);
+                new_idx = new_im * c2_fac0 +
+                          new_ik * c2_fac1 +
+                          new_ibm * c2_fac2 +
+                          new_ikf * c2_fac3 +
+                          new_isni * ngroups_per_elem +
+                          new_ing;
+                new_idx = new_idx / ngroups_per_elem;
+                size_t buf_idx = im * bits * K / g + ib * K / g + ik;
+                uint8_t buf_val = buf[buf_idx];
+
+                // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)])
+                PackedQuantBDataBegin[new_idx] = static_cast<std::byte>(
+                    static_cast<uint8_t>(PackedQuantBDataBegin[new_idx]) +
+                    (buf_val << (new_ing * g))
+                );
            }
        }
-    );
+    }
}

// Internal helper: calculates packed scales and zero points size in floats
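Why the serialization above is required, in miniature: with ngroups_per_elem > 1 two logical weight groups share one output byte, and the accumulate-with-shift is a plain read-modify-write. A toy demonstration of the hazard (hypothetical layout, g = 4):

#include <cstdint>

// Groups 0 and 1 occupy the low and high nibble of the same byte. Two threads
// calling this for the same dst with different `group` values can interleave
// the load and the store and drop one nibble entirely.
inline void pack_nibble(uint8_t& dst, uint8_t val, unsigned group /* 0 or 1 */) {
  dst = static_cast<uint8_t>(dst + (val << (group * 4)));  // non-atomic RMW
}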
@@ -472,16 +494,15 @@
size_t
CalculateLutBufferSize(size_t n, size_t k, size_t m, const MlasTMACKernelParams& tmac_params)
{
    MLAS_UNREFERENCED_PARAMETER(n);
-    constexpr size_t kAllockAligment = 64;
    const size_t lut_scales_size = k / tmac_params.act_group_size;

-    size_t wsize = k * m * 4 * sizeof(int8_t);  // 4 bytes per k element for 2-bit LUT
-    wsize += lut_scales_size * m * 2 * sizeof(float);  // scales + biases
-
-    wsize = ((wsize - 1) / kAllockAligment + 1) * kAllockAligment;
+    // The AVX2 kernel (g=4) expects 16 entries (16 bytes) per group of 4 activations.
+    // This effectively requires 4 bytes per activation in the K dimension.
+    size_t lut_size_bytes = m * k * 4;
+    size_t scales_size_bytes = m * lut_scales_size * sizeof(float);
+    size_t biases_size_bytes = m * lut_scales_size * sizeof(float);

-    // TODO(vrapar): add temp buffer for FP16
-    return wsize;
+    return lut_size_bytes + scales_size_bytes + biases_size_bytes + 256;  // + alignment/safety padding
}

void MLASCALL
@@ -532,17 +553,23 @@ MlasLutGemm(
    // n_tiles_num = m * bits / bm;

    // TODO(vraspar): support other bitwidths
+    // For T-MAC, kernel properties (bm, n_tiles_num) are primarily driven by the number of output features (N).
+    // Initialization during packing (LutGemmPackQuantBDataSize) uses N as the major dimension,
+    // so we must match that here to ensure consistent weight tiling.
+    MlasInitLutGemmKernelConfig(N, K, 2, BlkLen, HasZeroPoint);
    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(N, K, 2, BlkLen, HasZeroPoint);
    const size_t lut_scales_size = K / tmac_params.act_group_size;
+    const size_t lut_size_bytes = static_cast<size_t>(M) * static_cast<size_t>(K) * 4;

    size_t lut_buffer_size = CalculateLutBufferSize(N, K, M, tmac_params);

    // make buffer of lut_buffer_size bytes
    // TODO(vraspar): other way to do it
    auto lut_buffer = std::make_unique<std::byte[]>(lut_buffer_size);
+    memset(lut_buffer.get(), 0, lut_buffer_size);

    int8_t* qlut = reinterpret_cast<int8_t*>(lut_buffer.get());
-    float* lut_scales = reinterpret_cast<float*>(qlut + K * M * 4);               // after lut
-    float* lut_biases = reinterpret_cast<float*>(lut_scales + lut_scales_size * M);  // after scales
+    float* lut_scales = reinterpret_cast<float*>(qlut + lut_size_bytes);             // after lut
+    float* lut_biases = reinterpret_cast<float*>(lut_scales + lut_scales_size * M);  // after scales

    const auto* a_float = reinterpret_cast<const float*>(A);  // Activation data

@@ -558,11 +585,12 @@ MlasLutGemm(
    for (size_t ine11 = 0; ine11 < static_cast<size_t>(M); ine11++) {
        const size_t row_offset = ine11 * K;
-        const size_t lut_offset = ine11 * K * 4;  // 4 bytes per K element for 2-bit LUT
+        // Call the LUT generation kernel for this activation row.
+        // We use a 4-byte stride (per activation) for the LUT entries to satisfy
+        // the memory layout requirements of the computation kernel.
+        const size_t lut_offset = ine11 * K * 4;
        const size_t scale_bias_offset = ine11 * lut_scales_size;

-        // Call the dispatch function for this row
-        // ggml_tmac_mul_mat_task_init
        Dispatch->GenerateLUT(
            const_cast<float*>(a_float + row_offset),  // Input activation for this row
            qlut + lut_offset,                         // Output LUT for this row
            lut_scales + scale_bias_offset,
            lut_biases + scale_bias_offset,
            M,
            K,
            N,
-            tmac_params.act_group_size
+            tmac_params.act_group_size,
+            tmac_params.act_group_size * 4
        );
    }
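The single allocation carved up above is partitioned as [qlut | scales | biases]. A sketch of the layout arithmetic, under the assumptions stated in CalculateLutBufferSize (4 LUT bytes per activation, one scale and one bias per activation group; names are illustrative):

#include <cstddef>

struct LutLayout {
  size_t lut_bytes;     // M * K * 4: a 16-entry int8 LUT per group of 4 activations
  size_t scales_bytes;  // M * (K / act_group_size) floats
  size_t biases_bytes;  // same count as the scales
};

LutLayout ComputeLutLayout(size_t M, size_t K, size_t act_group_size) {
  const size_t groups = K / act_group_size;
  return {M * K * 4, M * groups * sizeof(float), M * groups * sizeof(float)};
}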
@@ -657,15 +686,17 @@ MlasLutGemm(
            // Process all batch items in this chunk
            for (size_t ine11 = ir1_start; ine11 < ir1_end; ine11++) {
-                // Calculate LUT offsets for this batch item
+                // Calculate LUT offsets with 4-byte stride (per activation) for consistent access.
                const size_t qlut_offset = K * ine11 * 4;
                const size_t lut_scales_offset = lut_scales_size * ine11;

                // Calculate output offset
                const size_t dst_offset = OutputRows * ine11 + ichunk0 * ChunkSize0;

-                // Call the dispatch function to compute this tile
-                // Note M and N are swapped in TMAC terminology
+                // Call the dispatch function to compute this tile.
+                // We pass one batch item at a time (M=1) and ChunkSize0 output features.
+                // TotalN is passed specifically to allow the kernel to find the correct
+                // parameters (bm, tiles) used during weight packing.
                Dispatch->ComputeGemm(
                    packed_weights + w_offset,        // Weight tile
                    QuantBScale + scales_offset,      // Weight scales for this tile
                    qlut + qlut_offset,               // LUT for this batch item
                    lut_scales + lut_scales_offset,   // LUT scales
                    lut_biases + lut_scales_offset,   // LUT biases
                    act_output + dst_offset,          // Output location
                    static_cast<int>(K),              // K dimension
-                    static_cast<int>(N),              // N dimension
-                    static_cast<int>(1),              // M dimension (processing one batch item at a time)
+                    static_cast<int>(1),              // M dimension (batch size = 1)
+                    static_cast<int>(ir0_end - ir0_start),  // N dimension (output features in chunk)
+                    static_cast<int>(N),              // TotalN (total output features in weights)
                    BlkLen,                           // Weight quantization group size
                    HasZeroPoint                      // Whether zero points are used
                );
diff --git a/onnxruntime/core/mlas/lib/qlutgemm.h b/onnxruntime/core/mlas/lib/qlutgemm.h
index ef4d01a2c5809..0a733199ea2e8 100644
--- a/onnxruntime/core/mlas/lib/qlutgemm.h
+++ b/onnxruntime/core/mlas/lib/qlutgemm.h
@@ -42,7 +42,11 @@ struct MlasTMACKernelParams {
    bool one_scale;
};

-const MlasTMACKernelParams&
+/**
+ * Retrieves the T-MAC kernel configuration for a given GEMM problem.
+ * Returns the parameters by value to ensure thread-safety across concurrent calls.
+ */
+MlasTMACKernelParams
MlasGetLutGemmKernelParams(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point);

typedef void(MLAS_QNBIT_GEMM_LUT_GEN)(
@@ -53,19 +57,21 @@
    size_t M,
    size_t K,
    size_t N,
-    size_t act_group_size
+    size_t act_group_size,
+    size_t lut_stride  // Stride (in bytes) between consecutive LUT entries along the batch dimension.
);

typedef void(MLAS_QNBIT_LUT_GEMM_COMPUTE)(
-    const uint8_t* weights,
-    const float* scales,
+    const uint8_t* A,
+    const float* Scales,
    const int8_t* LUT,
    const float* LUT_Scales,
    const float* LUT_Biases,
    float* C,
    int K,
-    int M,  // batch size (number of rows in activation)
-    int N,
+    int M,       // Batch size (current activation rows).
+    int N,       // Number of output features to compute in this tile/chunk.
+    int TotalN,  // Total number of output features in the weights (used for parameter mapping).
    size_t BlkLen,
    bool HasZeroPoint
);
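The split between N and TotalN in the compute signature above is easy to get backwards: N is what this particular call must produce, TotalN is what the packed weights were laid out for. A sketch of a conforming caller (hypothetical function-pointer type, not the MLAS dispatch itself):

typedef void (LutGemmComputeFn)(int K, int M, int N, int TotalN);

// Walk the output features in chunks; every call computes `chunk` features
// but always passes the full feature count, so the kernel can recover the
// packing parameters (bm, tile count) chosen at weight-prepack time.
void RunChunks(LutGemmComputeFn* fn, int K, int total_n, int chunk) {
  for (int n0 = 0; n0 < total_n; n0 += chunk) {
    const int n = (total_n - n0 < chunk) ? (total_n - n0) : chunk;
    fn(K, /*M=*/1, n, total_n);
  }
}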
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
index a89993d4515b8..7e4df13423be2 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
@@ -361,7 +361,8 @@ GenerateLUT_avx2(
    size_t M,
    size_t K,
    size_t N,
-    size_t act_group_size
+    size_t act_group_size,
+    size_t lut_stride
)
{
    (void)M;  // silence unused parameter warning
@@ -379,7 +380,9 @@ GenerateLUT_avx2(
    }

    for (int32_t k_outer_1 = 0; k_outer_1 < kk_outer_max; ++k_outer_1) {
-        lut_ctor_g4_int8_impl(static_cast<int32_t>(act_group_size), (&(qlut[(k_outer_1 * act_group_size * 4)])), (&(b[(k_outer_1 * act_group_size)])), (&(lut_scales[k_outer_1])), (&(lut_biases[k_outer_1])));
+        // Use the explicit lut_stride provided by the dispatch/caller to ensure
+        // consistent memory layout between construction and compute paths.
+        lut_ctor_g4_int8_impl(static_cast<int32_t>(act_group_size), (&(qlut[(k_outer_1 * lut_stride)])), (&(b[(k_outer_1 * act_group_size)])), (&(lut_scales[k_outer_1])), (&(lut_biases[k_outer_1])));
    }
}

@@ -400,6 +403,20 @@ tbl_g4_int8_float_gather_bit2_impl(int32_t m, float* C_global, float* CBits, flo
        }
    }

+    // Handle tail cases where m is not a multiple of 32.
+    // This ensures C_global is fully initialized for all m elements.
+    int32_t m_tail = m % 32;
+    if (m_tail > 0) {
+        int32_t m_c_outer = m_c_outer_max;
+        int32_t cse_var_2 = (m_c_outer * 32 * bits);
+        int32_t cse_var_1 = (m_c_outer * 32);
+        for (int32_t m_c_inner = 0; m_c_inner < m_tail; ++m_c_inner) {
+            int32_t bit_offset_0 = (m_c_inner / 8) * 8 * bits + (m_c_inner % 8);
+            int32_t bit_offset_1 = (m_c_inner / 8) * 8 * bits + (m_c_inner % 8) + 8;
+            C_global[cse_var_1 + m_c_inner] = (CBits[cse_var_2 + bit_offset_0] * (float)5.000000e-01f) + (CBits[cse_var_2 + bit_offset_1]);
+        }
+    }
+
    for (int32_t m_inner_outer = 0; m_inner_outer < m_c_outer_max; ++m_inner_outer) {
        PRAGMA_UNROLL
        for (int32_t m_inner = 0; m_inner < 32; ++m_inner) {
            int offset = m_inner_outer * 32 + m_inner;
            C[offset] = C_global[offset];
        }
    }
+
+    // Transfer the remaining tail results from C_global to the final output matrix C.
+    // This is necessary when m is not a multiple of 32, ensuring all output features
+    // are correctly written to the destination buffer.
+    if (m_tail > 0) {
+        int offset_base = m_c_outer_max * 32;
+        for (int32_t m_inner = 0; m_inner < m_tail; ++m_inner) {
+            int offset = offset_base + m_inner;
+            C[offset] = C_global[offset];
+        }
+    }
}

// When FastAggregation is enabled, FastAggregationK = ActK
@@ -451,8 +479,8 @@ tbl_g4_int8_float_update_impl(int32_t m, float* c, const int8_t* lut, const uint
        __m256 vec_v_high_low = _mm256_cvtepi32_ps(extract_low_epi16_epi32(adder.get_high()));
        __m256 vec_v_high_high = _mm256_cvtepi32_ps(extract_high_epi16_epi32(adder.get_high()));

-        float lut_s = lut_scales[kk / ActK];
-        float lut_b = lut_biases[kk / ActK];
+        float lut_s = lut_scales[kk / (ActK * 4)];
+        float lut_b = lut_biases[kk / (ActK * 4)];

        partial_sum += lut_b;
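The tail loops added above replicate, one element at a time, what the unrolled 32-wide loop does. The per-element arithmetic reduces to combining two bit-planes, which the following scalar model shows (illustrative only; names are not from the kernel):

// For 2-bit weights, each run of 8 outputs stores its low-bit partial sums
// 8 floats ahead of its high-bit partial sums; the low plane carries weight
// 0.5 after the kernel's scaling, matching the gather code above.
inline float gather_bit2(const float* cbits, int m_inner, int bits = 2) {
  const int lo = (m_inner / 8) * 8 * bits + (m_inner % 8);
  const int hi = lo + 8;
  return cbits[lo] * 0.5f + cbits[hi];
}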
@@ -542,17 +570,20 @@ TMACComputeGemm_avx2(
    int K,
    int M,
    int N,
+    int TotalN,
    size_t BlkLen,  // Weight quantization group size (q_group_size)
    bool HasZeroPoint
)
{
-    // Validate batch size
-    if (N != 1) {
-        MLAS_THROW_EX(std::runtime_error, "N > 1 is not supported yet");
+    // Validate batch size (M)
+    // For now, TMAC AVX2 kernel processes one batch row at a time.
+    if (M != 1) {
+        MLAS_THROW_EX(std::runtime_error, "M > 1 is not supported yet in TMAC AVX2 kernel");
    }

-    // get kernel config
-    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(M, K, 2, BlkLen, HasZeroPoint);
+    // get kernel config using the total output features (TotalN)
+    // This matches the parameters used during weight packing.
+    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(TotalN, K, 2, BlkLen, HasZeroPoint);

    // ==================== CONFIGURATION ====================
    // Fixed parameters for this kernel implementation
@@ -572,7 +603,11 @@ TMACComputeGemm_avx2(
    const int32_t actk = static_cast<int32_t>(tmac_params.actk);  // CRITICAL: = 16 for BlkLen=64, NOT BlkLen!
    const int32_t bm = static_cast<int32_t>(tmac_params.bm);

-    int32_t m = bm / bits;
+    // m is the number of output features this kernel tile produces.
+    // We clamp m by N (the number of features in the current chunk) to ensure
+    // we don't read or write past the tile boundary during the gather phase.
+    int32_t m_full = bm / bits;
+    int32_t m = std::min(m_full, N);

    // Validate configuration
    assert(bm % bits == 0);
@@ -590,8 +625,9 @@ TMACComputeGemm_avx2(
    float* CBits = new float[bm];
    float* C_global = new float[m];

-    // Reset accumulator buffer to zero
-    tbl_int32_reset(bm * sizeof(float) / sizeof(int32_t), reinterpret_cast<int32_t*>(CBits));
+    // Explicitly zero-initialize accumulation buffers to ensure determinism.
+    memset(CBits, 0, bm * sizeof(float));
+    memset(C_global, 0, m * sizeof(float));

    // ==================== CALCULATE LOOP PARAMETERS ====================
    const int32_t k_outer_max = K / (kfactor * g);
diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc
index 9b71f4ba2ebec..6d5a400be703b 100644
--- a/onnxruntime/core/platform/windows/telemetry.cc
+++ b/onnxruntime/core/platform/windows/telemetry.cc
@@ -3,6 +3,10 @@

#include "core/platform/windows/telemetry.h"
#include <windows.h>
+#include <mutex>
+#include <string>
+#include <vector>
+#include <winsvc.h>
#include "core/common/logging/logging.h"
#include "onnxruntime_config.h"

@@ -51,6 +55,80 @@ TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntim
                             // {3a26b1ff-7484-7484-7484-15261f42614d}
                             (0x3a26b1ff, 0x7484, 0x7484, 0x74, 0x84, 0x15, 0x26, 0x1f, 0x42, 0x61, 0x4d),
                             TraceLoggingOptionMicrosoftTelemetry());
+
+std::string ConvertWideStringToUtf8(const std::wstring& wide) {
+  if (wide.empty())
+    return {};
+
+  const UINT code_page = CP_UTF8;
+  const DWORD flags = 0;
+  LPCWCH const src = wide.data();
+  const int src_len = static_cast<int>(wide.size());
+  int utf8_length = ::WideCharToMultiByte(code_page, flags, src, src_len, nullptr, 0, nullptr, nullptr);
+  if (utf8_length == 0)
+    return {};
+
+  std::string utf8(utf8_length, '\0');
+  if (::WideCharToMultiByte(code_page, flags, src, src_len, utf8.data(), utf8_length, nullptr, nullptr) == 0)
+    return {};
+
+  return utf8;
+}
+
+std::string GetServiceNamesForCurrentProcess() {
+  static std::once_flag once_flag;
+  static std::string service_names;
+
+  std::call_once(once_flag, [] {
+    SC_HANDLE service_manager = ::OpenSCManagerW(nullptr, nullptr, SC_MANAGER_ENUMERATE_SERVICE);
+    if (service_manager == nullptr)
+      return;
+
+    DWORD bytes_needed = 0;
+    DWORD services_returned = 0;
+    DWORD resume_handle = 0;
+    if (!::EnumServicesStatusExW(service_manager, SC_ENUM_PROCESS_INFO, SERVICE_WIN32, SERVICE_ACTIVE, nullptr, 0, &bytes_needed,
+                                 &services_returned, &resume_handle, nullptr) &&
+        ::GetLastError() != ERROR_MORE_DATA) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    if (bytes_needed == 0) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    std::vector<BYTE> buffer(bytes_needed);
+    auto* services = reinterpret_cast<ENUM_SERVICE_STATUS_PROCESSW*>(buffer.data());
+    services_returned = 0;
+    resume_handle = 0;
+    if (!::EnumServicesStatusExW(service_manager, SC_ENUM_PROCESS_INFO, SERVICE_WIN32, SERVICE_ACTIVE, reinterpret_cast<LPBYTE>(services),
+                                 bytes_needed, &bytes_needed, &services_returned, &resume_handle, nullptr)) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    DWORD current_pid = ::GetCurrentProcessId();
+    std::wstring aggregated;
+    bool first = true;
+    for (DWORD i = 0; i < services_returned; ++i) {
+      if (services[i].ServiceStatusProcess.dwProcessId == current_pid) {
+        if (!first) {
+          aggregated.push_back(L',');
+        }
+        aggregated.append(services[i].lpServiceName);
+        first = false;
+      }
+    }
+
+    ::CloseServiceHandle(service_manager);
+
+    service_names = ConvertWideStringToUtf8(aggregated);
+  });
+
+  return service_names;
+}
}  // namespace

#ifdef _MSC_VER
@@ -178,6 +256,7 @@ void WindowsTelemetry::LogProcessInfo() const {
#if BUILD_INBOX
  isRedist = false;
#endif
+  const std::string service_names = GetServiceNamesForCurrentProcess();
  TraceLoggingWrite(telemetry_provider_handle,
                    "ProcessInfo",
                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
                    TraceLoggingString(ORT_VERSION, "runtimeVersion"),
                    TraceLoggingBool(IsDebuggerPresent(), "isDebuggerAttached"),
                    TraceLoggingBool(isRedist, "isRedist"),
-                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"),
+                    TraceLoggingString(service_names.c_str(), "serviceNames"));

  process_info_logged = true;
}

@@ -204,7 +284,8 @@ void WindowsTelemetry::LogSessionCreationStart(uint32_t session_id) const {
                    TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
                    TraceLoggingUInt32(session_id, "sessionId"),
-                    TraceLoggingLevel(WINEVENT_LEVEL_INFO));
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
}

void WindowsTelemetry::LogEvaluationStop(uint32_t session_id) const {
@@ -278,6 +359,7 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
    execution_provider_string += i;
  }

+  const std::string service_names = GetServiceNamesForCurrentProcess();
  // Difference is MeasureEvent & isCaptureState, but keep in sync otherwise
  if (!captureState) {
    TraceLoggingWrite(telemetry_provider_handle,
@@ -304,7 +386,9 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                      TraceLoggingString(model_weight_hash.c_str(), "modelWeightHash"),
                      TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                      TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
-                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"));
+                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
  } else {
    TraceLoggingWrite(telemetry_provider_handle,
                      "SessionCreation_CaptureState",
@@ -330,7 +414,9 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                      TraceLoggingString(model_weight_hash.c_str(), "modelWeightHash"),
                      TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                      TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
-                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"));
+                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
  }
}

@@ -419,7 +505,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                      TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                      TraceLoggingString(file, "file"),
                      TraceLoggingString(function, "function"),
-                      TraceLoggingInt32(line, "line"));
+                      TraceLoggingInt32(line, "line"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
#else
    TraceLoggingWrite(telemetry_provider_handle,
                      "RuntimeError",
                      TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                      TraceLoggingString(file, "file"),
                      TraceLoggingString(function, "function"),
-                      TraceLoggingInt32(line, "line"));
+                      TraceLoggingInt32(line, "line"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
#endif
}

@@ -465,7 +553,8 @@ void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_s
                    TraceLoggingUInt32(session_id, "sessionId"),
                    TraceLoggingUInt32(total_runs_since_last, "totalRuns"),
                    TraceLoggingInt64(total_run_duration_since_last, "totalRunDuration"),
-                    TraceLoggingString(total_duration_per_batch_size.c_str(), 
"totalRunDurationPerBatchSize")); + TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } void WindowsTelemetry::LogExecutionProviderEvent(LUID* adapterLuid) const { @@ -541,7 +630,8 @@ void WindowsTelemetry::LogAutoEpSelection(uint32_t session_id, const std::string TraceLoggingUInt32(session_id, "sessionId"), TraceLoggingString(selection_policy.c_str(), "selectionPolicy"), TraceLoggingString(requested_execution_provider_string.c_str(), "requestedExecutionProviderIds"), - TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds")); + TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const std::string& provider_options_string, bool captureState) const { @@ -560,7 +650,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const // Telemetry info TraceLoggingUInt8(0, "schemaVersion"), TraceLoggingString(provider_id.c_str(), "providerId"), - TraceLoggingString(provider_options_string.c_str(), "providerOptions")); + TraceLoggingString(provider_options_string.c_str(), "providerOptions"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } else { TraceLoggingWrite(telemetry_provider_handle, "ProviderOptions_CaptureState", @@ -572,7 +663,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const // Telemetry info TraceLoggingUInt8(0, "schemaVersion"), TraceLoggingString(provider_id.c_str(), "providerId"), - TraceLoggingString(provider_options_string.c_str(), "providerOptions")); + TraceLoggingString(provider_options_string.c_str(), "providerOptions"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } } diff --git a/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc b/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc index af67419f4fb91..60ebf862e1601 100644 --- a/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc +++ b/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc @@ -73,10 +73,10 @@ common::Status ArrayFeatureExtractorOp::Compute(OpKernelContext* context) con } for (int64_t i = 0; i < num_indices; ++i) { - if (y_data[i] >= stride) { + if (y_data[i] < 0 || y_data[i] >= stride) { return ORT_MAKE_STATUS( ONNXRUNTIME, INVALID_ARGUMENT, - "Invalid Y argument: index is out of range: Y[", i, "] (", y_data[i], ") >=", stride); + "Invalid Y argument: index is out of range: Y[", i, "] (", y_data[i], ") must be in [0, ", stride, ")"); } } diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 32f5c98da1585..d50a4deca3298 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -15,12 +15,17 @@ #pragma warning(push) // 'fp4_interpretation' : unreferenced parameter #pragma warning(disable : 4100) +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include #if defined(_MSC_VER) #pragma warning(pop) +#elif defined(__GNUC__) +#pragma GCC diagnostic pop #endif #endif diff --git a/onnxruntime/core/providers/cuda/cuda_type_conversion.h b/onnxruntime/core/providers/cuda/cuda_type_conversion.h index 38cdce1380fad..04e47a9930710 100644 --- a/onnxruntime/core/providers/cuda/cuda_type_conversion.h +++ 
diff --git a/onnxruntime/core/providers/cuda/cuda_type_conversion.h b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
index 38cdce1380fad..04e47a9930710 100644
--- a/onnxruntime/core/providers/cuda/cuda_type_conversion.h
+++ b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
@@ -14,12 +14,17 @@
#pragma warning(push)
// 'fp4_interpretation' : unreferenced parameter
#pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

#include <cuda_fp4.h>

#if defined(_MSC_VER)
#pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
#endif
#endif
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index 656890e796a1c..d75c6e947e09c 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {

  TArray<fast_divmod> fdm_output_strides(dimension_count);
  TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
    fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
  }
diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.cc b/onnxruntime/core/providers/openvino/ov_shared_context.cc
index b529009a205ea..900196c3f652a 100644
--- a/onnxruntime/core/providers/openvino/ov_shared_context.cc
+++ b/onnxruntime/core/providers/openvino/ov_shared_context.cc
@@ -10,9 +10,10 @@
namespace onnxruntime {
namespace openvino_ep {

-SharedContext::SharedContext(std::filesystem::path bin_path)
-    : bin_path_(std::move(bin_path)),
-      bin_manager_(bin_path_) {
+SharedContext::SharedContext(const std::filesystem::path& bin_path)
+    : bin_path_(bin_path),
+      bin_manager_(bin_path_),
+      weight_file_manager_(WeightFileManager::Get()) {
}

static bool InRange(size_t offset, size_t size, size_t total_size) {
@@ -74,7 +75,7 @@ void SharedContext::LoadTensorFromFile(
  const auto weights_location = model_dir / value.serialized.location;
  auto& weights_file = weight_files_[weights_location];
  if (!weights_file) {
-    weights_file = std::make_unique<WeightsFile>(weights_location);
+    weights_file = weight_file_manager_->GetOrCreateWeightsFile(weights_location);
  }

  ov::Tensor tensor;
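GetOrCreateWeightsFile above routes every lookup through the WeightFileManager introduced in the header below, so all SharedContext instances share one handle per weights file. A self-contained sketch of that keyed shared_ptr cache (File stands in for the real WeightsFile type):

#include <filesystem>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct File {
  explicit File(std::filesystem::path p) : path(std::move(p)) {}
  std::filesystem::path path;
};

class FileCache {
 public:
  std::shared_ptr<File> GetOrCreate(const std::filesystem::path& p) {
    const std::string key = std::filesystem::absolute(p).string();  // normalize aliases
    std::lock_guard<std::mutex> lock(mutex_);
    auto [it, inserted] = files_.try_emplace(key, nullptr);
    if (inserted) it->second = std::make_shared<File>(p);  // only the inserting thread constructs
    return it->second;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::string, std::shared_ptr<File>> files_;
};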
diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.h b/onnxruntime/core/providers/openvino/ov_shared_context.h
index f6cfe56086517..99af8bf208805 100644
--- a/onnxruntime/core/providers/openvino/ov_shared_context.h
+++ b/onnxruntime/core/providers/openvino/ov_shared_context.h
@@ -19,10 +19,13 @@
namespace onnxruntime {
namespace openvino_ep {

+class WeightFileManager;
+
class SharedContext : public std::enable_shared_from_this<SharedContext> {
 public:
-  explicit SharedContext(std::filesystem::path bin_path);
+  explicit SharedContext(const std::filesystem::path& bin_path);
  SharedContext() : SharedContext("") {}
+  virtual ~SharedContext() {}

  struct Metadata {
    struct Value {
@@ -83,7 +86,6 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
    return BinManager::GetBinPathForModel(model_path);
  }

- private:
  struct WeightsFile {
    ORT_DISALLOW_COPY_AND_ASSIGNMENT(WeightsFile);
    WeightsFile() = delete;
@@ -104,7 +106,9 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
    std::map imported_device_tensors_;
  };

-  void LoadTensorFromFile(
+ private:
+  void
+  LoadTensorFromFile(
      Metadata::Value& value,
      const std::filesystem::path& model_dir,
      std::optional<ov::RemoteContext>& remote_context,
@@ -114,10 +118,29 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
  mutable std::shared_mutex mutex_;
  std::filesystem::path bin_path_;
  BinManager bin_manager_;
-  std::unordered_map<std::filesystem::path, std::unique_ptr<WeightsFile>> weight_files_;
+  std::shared_ptr<WeightFileManager> weight_file_manager_;
+  std::unordered_map<std::filesystem::path, std::shared_ptr<WeightsFile>> weight_files_;
  Metadata::Map metadata_;
};

+class WeightFileManager : public WeakSingleton<WeightFileManager> {
+ public:
+  using WeightsFile = SharedContext::WeightsFile;
+  std::shared_ptr<WeightsFile> GetOrCreateWeightsFile(const std::filesystem::path& weights_path) {
+    auto absolute_path = std::filesystem::absolute(weights_path);
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto [it, inserted] = files_.try_emplace(absolute_path, nullptr);
+    if (inserted) {
+      it->second = std::make_shared<WeightsFile>(absolute_path);
+    }
+    return it->second;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  std::unordered_map<std::filesystem::path, std::shared_ptr<WeightsFile>> files_;
+};
+
class SharedContextManager : public WeakSingleton<SharedContextManager> {
 public:
  std::shared_ptr<SharedContext> GetOrCreateActiveSharedContext(const std::filesystem::path& model_path) {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 9fc1cd7f42939..eba0a8c2615aa 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() {
  return SetContextPriority(context_priority_);
}

-Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
+Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
  if (true == context_created_) {
    LOGS_DEFAULT(INFO) << "Context created already.";
    return Status::OK();
  }
@@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
  QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
  ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));

+  QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
+  QnnHtpContext_CustomConfig_t udma_custom_config;
+  udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
+  udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
+  context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+  context_config_extended_udma.customConfig = &udma_custom_config;
+
  const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                      &context_config_weight_sharing,
+                                                      &context_config_extended_udma,
                                                      nullptr};
  const QnnContext_Config_t* empty_context_configs[] = {nullptr};
@@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
                                       bool enable_vtcm_backup_buffer_sharing,
                                       bool enable_file_mapped_weights,
                                       std::shared_ptr<RpcMemLibrary> rpcmem_library,
-                                       std::unordered_map>>& context_bin_map) {
+                                       std::unordered_map>>& context_bin_map,
+                                       bool enable_htp_extended_udma_mode) {
  std::lock_guard lock(logger_recursive_mutex_);
  if (backend_setup_completed_) {
    LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
  if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
    status = vtcm_backup_buffer_sharing_enabled_ ? 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 9fc1cd7f42939..eba0a8c2615aa 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() { return SetContextPriority(context_priority_); } -Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { +Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) { if (true == context_created_) { LOGS_DEFAULT(INFO) << "Context created already."; return Status::OK(); } @@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT; ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config)); + QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT; + QnnHtpContext_CustomConfig_t udma_custom_config; + udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA; + udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode; + context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM; + context_config_extended_udma.customConfig = &udma_custom_config; + const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config, &context_config_weight_sharing, + &context_config_extended_udma, nullptr}; const QnnContext_Config_t* empty_context_configs[] = {nullptr}; @@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool enable_vtcm_backup_buffer_sharing, bool enable_file_mapped_weights, std::shared_ptr rpcmem_library, - std::unordered_map>>& context_bin_map) { + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode) { std::lock_guard lock(logger_recursive_mutex_); if (backend_setup_completed_) { LOGS(logger, VERBOSE) << "Backend setup already!"; @@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) { status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map) - : CreateContext(enable_htp_weight_sharing); + : CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode); if (status.IsOK()) { LOGS(logger, VERBOSE) << "CreateContext succeed."; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 9b573531f7c3d..dfa40a2c8aa0d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -171,7 +171,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager> bool enable_vtcm_backup_buffer_sharing, bool enable_file_mapped_weights, std::shared_ptr rpcmem_library, - std::unordered_map>>& context_bin_map); + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode); Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id); @@ -299,7 +300,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager> Status ReleaseProfilehandle(); - Status CreateContext(bool enable_htp_weight_sharing); + Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode); Status GetFileSizeIfValid(const std::string& filepath, size_t& file_size); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index a6f1d1c1681cf..c3d8328b37411 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -602,6 +602,19 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } } + static const std::string QNN_HTP_EXTENDED_UDMA_MODE = "extended_udma"; + auto htp_extended_udma_pos = provider_options_map.find(QNN_HTP_EXTENDED_UDMA_MODE); + if (htp_extended_udma_pos != provider_options_map.end()) { + if ("1" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = true; + } else if ("0" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = false; + } else { + LOGS_DEFAULT(WARNING) << "Invalid extended_udma value: " << htp_extended_udma_pos->second << ". Only '0' or '1' allowed. Set to '0'."; + } + LOGS_DEFAULT(VERBOSE) << "User specified extended_udma mode: " << enable_htp_extended_udma_mode_; + } +
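The new option only accepts the strings '0' and '1', mirroring how skip_qnn_version_check is parsed via ParseBoolOption just below. If this parsing were factored out, a helper along these lines would cover both — a hedged sketch; ParseBinaryFlag is illustrative and not an existing ORT function:

#include <stdexcept>
#include <string>
#include <unordered_map>

// Strict "0"/"1" flag parsing: missing key -> default, anything else -> error.
bool ParseBinaryFlag(const std::unordered_map<std::string, std::string>& options,
                     const std::string& key, bool default_value) {
  auto it = options.find(key);
  if (it == options.end()) return default_value;
  if (it->second == "1") return true;
  if (it->second == "0") return false;
  throw std::invalid_argument("Invalid value for '" + key + "': '" + it->second +
                              "'. Only '0' or '1' allowed.");
}

Whatever form the error takes, it should carry the rejected user value rather than a still-default member variable, so misconfigurations are diagnosable from logs.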
// Option to skip QNN API interface version check to use other QNN library other than default. static const std::string SKIP_QNN_VERSION_CHECK = "skip_qnn_version_check"; auto skip_qnn_version_check = ParseBoolOption(SKIP_QNN_VERSION_CHECK, false, provider_options_map); @@ -1006,7 +1019,8 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer enable_vtcm_backup_buffer_sharing_, enable_file_mapped_weights_, rpcmem_library_, - context_bin_map); + context_bin_map, + enable_htp_extended_udma_mode_); context_bin_map.clear(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index f7022229f6c7b..c5d41789e7a1f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -127,6 +127,7 @@ class QNNExecutionProvider : public IExecutionProvider { qnn::ModelSettings model_settings_ = {}; bool dump_json_qnn_graph_ = false; std::string json_qnn_graph_dir_ = ""; + bool enable_htp_extended_udma_mode_ = false; // Whether this is set depends on a session option enabling it and if the RPCMEM dynamic library is available. // This is potentially shared with HtpSharedMemoryAllocator which may be returned by CreatePreferredAllocators(). diff --git a/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc b/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc index 84a0afd873d23..c3842a5c875e3 100644 --- a/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc @@ -57,6 +57,11 @@ Status ConvTranspose::ComputeInternal(ComputeContext& context) bool has_bias = context.InputCount() > 2; const auto* bias = has_bias ? context.Input<Tensor>(2) : nullptr; + // Validate bias shape if provided + if (has_bias && (bias->Shape().NumDimensions() != 1 || bias->Shape()[0] != num_output_channels)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Bias must be a 1-D tensor whose size equals the number of output channels."); + } + if (input_shape.NumDimensions() == 3 && filter_shape.NumDimensions() == 3) { // ConvTranspose1D TensorShapeVector input_shape_vector = input_shape.AsShapeVector(); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 7cb6a852e8d7e..8b8d884a35281 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -138,10 +138,10 @@ void WebGpuContext::Initialize(const WebGpuContextConfig& config) { config.buffer_cache_config.uniform.mode, config.buffer_cache_config.query_resolve.mode); - // create initializer buffer manager. cache is always disabled for initializer buffer manager + // create initializer buffer manager. initializer_buffer_mgr_ = BufferManagerFactory::Create(*this, - BufferCacheMode::Disabled, - BufferCacheMode::Disabled, + BufferCacheMode::LazyRelease, + BufferCacheMode::LazyRelease, BufferCacheMode::Disabled); // create program manager
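Switching the initializer buffer manager from Disabled to LazyRelease changes when GPU buffers die: rather than being destroyed the moment a kernel releases them, they sit in a pending list until the commands that may still reference them have been flushed. An illustrative sketch of that lifetime rule — not the actual WebGPU BufferManager; GpuBuffer and the class below are stand-ins:

#include <memory>
#include <vector>

struct GpuBuffer { /* wraps the underlying GPU buffer handle in a real implementation */ };

class LazyReleaseList {
 public:
  // Called when a kernel is done with a buffer. Destruction is deferred:
  // commands recorded but not yet submitted may still read from it.
  void Release(std::unique_ptr<GpuBuffer> buffer) {
    pending_.push_back(std::move(buffer));
  }

  // Called after a flush submits the command buffer; nothing on the GPU
  // timeline can reference the pending buffers anymore, so freeing is safe.
  void OnRefresh() { pending_.clear(); }

 private:
  std::vector<std::unique_ptr<GpuBuffer>> pending_;
};

This is also why the WebGpuKernel::PrePack change below flushes explicitly after packing: per its own comment, the flush is what lets the initializer buffer manager actually release its temporary buffers instead of accumulating them across all initializers.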
diff --git a/onnxruntime/core/providers/webgpu/webgpu_kernel.cc b/onnxruntime/core/providers/webgpu/webgpu_kernel.cc index 8303d2ff4293f..8a52b7a188fd5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_kernel.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_kernel.cc @@ -49,6 +49,12 @@ Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr / Status s = PrePackInternal(context, tensor, input_idx, ep_.PrepackAllocator(), is_packed); + if (is_packed) { + // Flush pending commands to ensure GPU buffer creations are completed. + // This allows the initializer buffer manager to release temporary buffers and reduce memory usage. + webgpu_context_.Flush(webgpu_context_.InitializerBufferManager()); + } + if (webgpu_context_.ValidationMode() >= ValidationMode::Full) { ORT_RETURN_IF_ERROR(webgpu_context_.PopErrorScope()); } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 7881004671290..2806eb7a7a8d8 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4843,7 +4843,7 @@ static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.24.1", +static_assert(std::string_view(ORT_VERSION) == "1.24.2", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_24 above:
diff --git a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc index 15bce163ba16a..55e0660622f87 100644 --- a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc +++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc @@ -73,6 +73,8 @@ namespace qnnctxgen { "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" "\t Defaults to '1' (another EP (typically CPU EP) handles the graph I/O quantization and dequantization). \n" "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n" "\n" "\t-h: help\n"); @@ -253,7 +255,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); } } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || - key == "enable_htp_spill_fill_buffer") { + key == "enable_htp_spill_fill_buffer" || key == "extended_udma") { std::unordered_set<std::string> supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -266,7 +268,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi ORT_THROW( "Wrong key type entered. Choose from options: ['backend_type', 'backend_path', 'vtcm_mb', " "'htp_performance_mode', 'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', " - "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer']"); + "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer', 'extended_udma']"); test_config.run_config.provider_options[key] = value; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 8446f88639436..f4e15c49d92f0 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -90,6 +90,8 @@ void usage() { "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" "\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_type|cpu\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" @@ -612,7 +614,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_arch. select from: " + str); } - } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") { + } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || key == "extended_udma") { std::unordered_set<std::string> supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -626,7 +628,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { "Wrong key type entered. Choose from options: ['backend_type', 'backend_path', " "'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode', " "'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'op_packages', 'qnn_context_priority', " - "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization']"); + "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'extended_udma']"); qnn_options[key] = value;
\n" " [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n" "\n" " [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 71f9050730c0b..91f0581af0633 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -258,7 +258,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device "qnn_saver_path", "htp_graph_finalization_optimization_mode", "qnn_context_priority", "htp_arch", "enable_htp_fp16_precision", "offload_graph_io_quantization", "enable_htp_spill_fill_buffer", "enable_htp_shared_memory_allocator", "dump_json_qnn_graph", - "json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing"}); + "json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing", "extended_udma"}); + for (const auto& provider_option : provider_options) { const std::string& key = provider_option.first; const std::string& value = provider_option.second; @@ -323,6 +324,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device key == "enable_htp_spill_fill_buffer" || key == "enable_htp_shared_memory_allocator" || key == "dump_json_qnn_graph" || + key == "extended_udma" || key == "disable_file_mapped_weights" || key == "enable_vtcm_backup_buffer_sharing") { std::set supported_options = {"0", "1"}; diff --git a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc index c7fc73456dcba..671ada7d36383 100644 --- a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc +++ b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc @@ -109,5 +109,13 @@ TEST_F(ArrayFeatureExtractorTest, InvalidInputOutOfBoundsY) { test_.Run(OpTester::ExpectResult::kExpectFailure); } +TEST_F(ArrayFeatureExtractorTest, InvalidInputNegativeY) { + test_.AddInput("X", {10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + test_.AddInput("Y", {1}, {-10}); + // Should fail due to negative index -10 + test_.AddOutput("Z", {0}, {}); + test_.Run(OpTester::ExpectResult::kExpectFailure); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 418842ee0a81b..d1f43787c7717 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -1314,6 +1314,27 @@ TEST_F(QnnHTPBackendTests, DumpJsonQNNGraph) { std::filesystem::remove_all(dump_dir); } +// Test extended UDMA mode on supported hardware (should run successfully) +TEST_F(QnnHTPBackendTests, ExtendedUdmaModeTest) { + // Create provider options with extended UDMA mode enabled + ProviderOptions options; + options["backend_type"] = "htp"; + options["offload_graph_io_quantization"] = "0"; + options["htp_arch"] = "81"; + options["extended_udma"] = "1"; + + // Define a simple model with Add operation + auto input_defs = {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}; + + // Run the test - this should succeed because v81 supports extended UDMA + RunQnnModelTest(BuildOpTestCase("Add", input_defs, {}, {}, kOnnxDomain), + options, + 13, + ExpectedEPNodeAssignment::All, + 0.008f); +} + // Test option for offloading quantization of graph 
// Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP. TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) { // Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU diff --git a/onnxruntime/test/unittest_util/base_tester.cc b/onnxruntime/test/unittest_util/base_tester.cc index d8bfd425f1f1a..2e0459103a7c9 100644 --- a/onnxruntime/test/unittest_util/base_tester.cc +++ b/onnxruntime/test/unittest_util/base_tester.cc @@ -424,7 +424,7 @@ void BaseTester::ExecuteModel(Model& model, SessionType& session, bool SetEpsForAllNodes(Graph& graph, const std::vector<std::unique_ptr<IExecutionProvider>>& execution_providers, const std::vector<std::shared_ptr<CustomRegistry>>* custom_registries, - const std::function<bool(const IExecutionProvider&)>& ep_uses_kernel_registry_fn) { + const std::function<bool(const IExecutionProvider&)>& ep_only_uses_kernel_registry_fn) { const OpSchemaKernelTypeStrResolver kernel_type_str_resolver{}; const KernelRegistry::TypeConstraintMap type_constraint_map{}; @@ -440,7 +440,7 @@ bool SetEpsForAllNodes(Graph& graph, node.SetExecutionProviderType(provider_type); - if (!ep_uses_kernel_registry_fn(*ep)) { + if (!ep_only_uses_kernel_registry_fn(*ep)) { found = true; break; } @@ -659,7 +659,12 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, #endif kDnnlExecutionProvider, kTensorrtExecutionProvider, +#ifdef USE_NV + // Only include the NV TRT RTX EP when ORT is built with the provider-bridge + // version of the EP (i.e., USE_NV is defined). This allows use of the plugin EP version + // when ORT is not built with any provider-bridge EPs. kNvTensorRTRTXExecutionProvider, +#endif kOpenVINOExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, @@ -830,12 +835,15 @@ void BaseTester::ExecuteModelForEps( ASSERT_TRUE(!execution_providers.empty()) << "Empty execution providers vector."; if (try_assign_ep_for_nodes) { - auto ep_uses_kernel_registry = [](const IExecutionProvider& ep) { + auto ep_only_uses_kernel_registry = [](const IExecutionProvider& ep) { const auto& provider_type = ep.Type(); - constexpr std::array kEpsThatDoNotUseKernelRegistry{ + constexpr std::array kEpsThatCompileNodes{ kOpenVINOExecutionProvider, - kTensorrtExecutionProvider, + kTensorrtExecutionProvider, // uses kernel registry for Memcpy* nodes only +#ifdef USE_NV + kNvTensorRTRTXExecutionProvider, // uses kernel registry for Memcpy* nodes only +#endif kNnapiExecutionProvider, kVSINPUExecutionProvider, kCoreMLExecutionProvider, @@ -844,24 +852,33 @@ void BaseTester::ExecuteModelForEps( kSnpeExecutionProvider, }; - // check list of known EPs that do not use a kernel registry - if (const auto ep_it = std::find(kEpsThatDoNotUseKernelRegistry.begin(), kEpsThatDoNotUseKernelRegistry.end(), + // check list of known EPs that compile nodes + if (const auto ep_it = std::find(kEpsThatCompileNodes.begin(), kEpsThatCompileNodes.end(), provider_type); - ep_it != kEpsThatDoNotUseKernelRegistry.end()) { + ep_it != kEpsThatCompileNodes.end()) { return false; } - // assume that a dynamic plugin EP which does not return a kernel registry does not use one - if (provider_type == dynamic_plugin_ep_infra::GetEpName() && - ep.GetKernelRegistry() == nullptr) { - return false; + const OrtEp* ort_ep = ep.GetOrtEp(); + + if (ort_ep != nullptr) { // This is a plugin EP + + if (ep.GetKernelRegistry() == nullptr) { + // assume that a dynamic plugin EP which does not return a kernel registry does not use one + return false; + } + + if (ort_ep->Compile != nullptr) { + // assume that a plugin EP that compiles nodes does not use a kernel registry for all nodes + return false; + } }
// otherwise, assume that the EP uses a kernel registry return true; }; - if (!SetEpsForAllNodes(model.MainGraph(), execution_providers, custom_registries, ep_uses_kernel_registry)) { + if (!SetEpsForAllNodes(model.MainGraph(), execution_providers, custom_registries, ep_only_uses_kernel_registry)) { std::string providers; for (const auto& ep : execution_providers) { providers.append(ep->Type() + " "); diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 7242c5fe7b6a6..8d96c1ae99e0a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -104,9 +104,18 @@ stages: - template: nuget/templates/test_macos.yml parameters: - AgentPool: macOS-14 + AgentPool: 'AcesShared' + UseHostedVmImage: 'false' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' ArtifactSuffix: 'CPU' +- template: nodejs/templates/test_macos.yml + parameters: + AgentPool: 'AcesShared' + UseHostedVmImage: 'false' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' + StageSuffix: 'MacOS_ARM64' + - template: nodejs/templates/test_win.yml parameters: AgentPool: 'onnxruntime-Win-CPU-VS2022-Latest' @@ -117,10 +126,6 @@ stages: AgentPool: 'onnxruntime-Ubuntu2204-AMD-CPU' StageSuffix: 'Linux_CPU_x64' -- template: nodejs/templates/test_macos.yml - parameters: - StageSuffix: 'macOS_CPU_x64' - - template: nuget/templates/test_win.yml parameters: AgentPool: 'onnxruntime-Win2022-GPU-A10' @@ -225,7 +230,7 @@ stages: - checkout: self clean: true submodules: none - + - download: build artifact: 'Windows_Packaging_tensorrt_build_artifacts' displayName: 'Download Windows GPU Packages Build' @@ -246,7 +251,7 @@ stages: versionSpec: "17" jdkArchitectureOption: x64 jdkSourceOption: 'PreInstalled' - + - task: PythonScript@0 displayName: 'Update CTest Path References' inputs: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index b4012b74196ee..ec3e8a9621e4c 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -23,11 +23,6 @@ parameters: type: number default: 0 -- name: PackageName - displayName: What is the package name? Override using an environment variable CustomPackageName. 
- type: string - default: 'Microsoft.ML.OnnxRuntime.Foundry' - variables: - template: templates/common-variables.yml - name: ReleaseVersionSuffix @@ -121,7 +116,7 @@ extends: buildArch: x64 msbuildPlatform: arm64 packageName: arm64 - buildparameter: --arm64ec --buildasx --caller_framework WinAI + buildparameter: --arm64 --buildasx --caller_framework WinAI runTests: false buildJava: false buildNodejs: false @@ -137,141 +132,8 @@ extends: AdditionalBuildFlags: '--use_webgpu --skip_tests' DoEsrp: true - - stage: NugetPackaging - dependsOn: [Windows_Packaging_CUDA, Windows_Packaging_CPU_arm64, ManagedNugetPackaging, MacOS_C_API_Package_Publish] - jobs: - - job: CreateNugetPackage - pool: 'Onnxruntime-Win2022-GPU-A10' - timeoutInMinutes: 120 - steps: - - checkout: self - clean: true - submodules: none - - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.12' - addToPath: true - - task: PipAuthenticate@1 - displayName: 'Pip Authenticate' - inputs: - artifactFeeds: 'Lotus' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - managed nuget' - inputs: - artifactName: 'onnxruntime-managed-nuget' - targetPath: '$(Build.BinariesDirectory)/managed-nuget' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - win-x64' - inputs: - artifactName: 'onnxruntime-win-x64-cuda' - targetPath: '$(Build.BinariesDirectory)/win-x64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - win-arm64' - inputs: - artifactName: 'onnxruntime-win-arm64' - targetPath: '$(Build.BinariesDirectory)/win-arm64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - osx' - inputs: - artifactName: 'onnxruntime-osx' - targetPath: '$(Build.BinariesDirectory)/osx' - - - task: PowerShell@2 - displayName: 'Create osx directories' - inputs: - targetType: 'inline' - script: | - mkdir -p $(Build.BinariesDirectory)/osx-arm64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 - - - task: PowerShell@2 - displayName: 'List all files downloaded' - inputs: - targetType: 'inline' - script: | - $files = Get-ChildItem $(Build.BinariesDirectory) -Recurse - foreach ($file in $files) { - Write-Host "File: $($file.FullName)" - if ($file -like "*onnxruntime*") { - Write-Host "File onnxruntime: $($file.FullName) - Size: $($file.Length)" - } - } - $dirs = Get-ChildItem $(Build.BinariesDirectory) -Directory - foreach ($dir in $dirs) { - Write-Host "Directory: $($dir.FullName)" - } - $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* - if ($osx_arm64_archive.Count -eq 0) { - Write-Host "No osx-arm64 archive found." 
- } else { - Write-Host "osx-arm64 archive found: $($osx_arm64_archive[0].FullName)" - } - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Extract Nuget Package Version' - inputs: - targetType: 'inline' - script: | - $nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/managed-nuget -Filter Microsoft.ML.OnnxRuntime.Managed.*.nupkg -Recurse) - $package_name = $nupkgs[0].Name - $version_length = $package_name.Length - "Microsoft.ML.OnnxRuntime.Managed.".Length - ".nupkg".Length - $package_version = $package_name.Substring("Microsoft.ML.OnnxRuntime.Managed.".Length, $version_length) - Write-Host "##vso[task.setvariable variable=package_version;]$package_version" - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Extract Archives' - inputs: - targetType: 'inline' - script: | - Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 - Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null - $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName - $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64*)[0].FullName - $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" - Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" - Write-Host "##vso[task.setvariable variable=osx_x64;]$osx_x64" - Write-Host "##vso[task.setvariable variable=osx_arm64;]$osx_arm64" - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Get Package Name' - inputs: - targetType: 'inline' - script: | - if ($env:CustomPackageName) { - Write-Host "##vso[task.setvariable variable=PackageName;]$env:CustomPackageName" - Write-Host "PackageName: $env:CustomPackageName" - } else { - Write-Host "##vso[task.setvariable variable=PackageName;]${{ parameters.PackageName }}" - Write-Host "PackageName: ${{ parameters.PackageName }}" - } - workingDirectory: $(Build.BinariesDirectory) - - - task: PythonScript@0 - displayName: 'Generate Nuget Package' - inputs: - scriptPath: '$(Build.SourcesDirectory)/tools/nuget/generate_nuspec_for_custom_nuget.py' - arguments: '--nuspec_path "$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec" --root_dir "$(Build.SourcesDirectory)" --commit_id "$(Build.SourceVersion)" --win_arm64 "$(win_arm64)" --win_x64 "$(win_x64)" --osx_arm64 "$(osx_arm64)" --osx_x64 "$(osx_x64)" --package_version "$(package_version)" --package_name "$(PackageName)"' - - - task: NuGetCommand@2 - displayName: 'Pack Nuget Package' - inputs: - command: 'pack' - packagesToPack: '$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec' - packDestination: $(Build.ArtifactStagingDirectory)\ - - - task: 1ES.PublishPipelineArtifact@1 - displayName: 'Publish Artifact: Nuget' - inputs: - artifactName: '${{ parameters.PackageName }}' - targetPath: '$(Build.ArtifactStagingDirectory)' + - template: templates/foundry-local-nuget-packaging.yml + parameters: + DependsOn: [Setup, Windows_Packaging_CUDA, 
Windows_Packaging_CPU_arm64, ManagedNugetPackaging, MacOS_C_API_Package_Publish] + DoEsrp: true + PackageName: 'Microsoft.ML.OnnxRuntime.Foundry' diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index 9d831df54096a..275d911b7cca2 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -21,7 +21,8 @@ stages: - template: templates/final-jar-testing-linux.yml parameters: OS: MacOS - PoolName: 'macOS-14' + PoolName: 'AcesShared' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' - stage: GPU_JAR_Testing dependsOn: [] diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml index ae595bbf0c96b..cd41fc575020b 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml @@ -6,12 +6,20 @@ steps: - task: PowerShell@2 - displayName: 'Move Artifact Directory' + condition: and(succeeded(), eq(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Move Artifact Directory (Windows)' inputs: targetType: 'inline' script: | Move-Item -Path "$(Pipeline.Workspace)/build/NPM_packages" -Destination "$(Build.BinariesDirectory)/nodejs-artifact" +- task: CmdLine@2 + condition: and(succeeded(), ne(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Move Artifact Directory (POSIX)' + inputs: + script: | + mv "$(Pipeline.Workspace)/build/NPM_packages" "$(Build.BinariesDirectory)/nodejs-artifact" + - script: mkdir e2e_test workingDirectory: '$(Build.BinariesDirectory)' @@ -38,4 +46,4 @@ steps: npm init -y npm install $(NpmPackageFilesForTest) --onnxruntime-node-install-cuda=skip node -p "require('onnxruntime-node')" - workingDirectory: '$(Build.BinariesDirectory)/e2e_test' \ No newline at end of file + workingDirectory: '$(Build.BinariesDirectory)/e2e_test' diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml index 4dd19ce2c250c..7e184492fab59 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml @@ -1,5 +1,9 @@ parameters: StageSuffix: '' + AgentPool : 'macOS-15' + UseHostedVmImage: 'true' + PoolDemands: '' + stages: - stage: Nodejs_Test_MacOS_${{ parameters.StageSuffix }} dependsOn: @@ -11,7 +15,12 @@ stages: clean: all timeoutInMinutes: 120 pool: - vmImage: 'macOS-15' + ${{ if eq(parameters.UseHostedVmImage, 'true') }}: + vmImage: ${{ parameters.AgentPool }} + ${{ else }}: + name: ${{ parameters.AgentPool }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 02613871d61ff..2548eebeb9d42 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -49,8 +49,8 @@ stages: clean: true submodules: none - - - template: ../../templates/setup-build-tools.yml + + - template: ../../templates/setup-build-tools.yml parameters: host_cpu_arch: 'x64' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml 
b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml index 1d122d64b1211..5fc52e2c76468 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml @@ -1,6 +1,10 @@ parameters: + AgentPool : 'macOS-15' + UseHostedVmImage: 'true' IsMacOS : 'true' ArtifactSuffix: '' + PoolDemands: '' + stages: - stage: NuGet_Test_MacOS dependsOn: @@ -11,7 +15,12 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-15' + ${{ if eq(parameters.UseHostedVmImage, 'true') }}: + vmImage: ${{ parameters.AgentPool }} + ${{ else }}: + name: ${{ parameters.AgentPool }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} variables: - name: OnnxRuntimeBuildDirectory @@ -27,18 +36,36 @@ stages: - script: | mv $(Pipeline.Workspace)/build/drop-signed-nuget-${{ parameters.ArtifactSuffix }} $(Build.BinariesDirectory)/nuget-artifact - mv $(Pipeline.Workspace)/build/onnxruntime-osx $(Build.BinariesDirectory)/testdata + + # Artifact is a folder containing tgz. Extract it to testdata. + mkdir -p $(Build.BinariesDirectory)/testdata + for archive in $(Pipeline.Workspace)/build/onnxruntime-osx/*.tgz; do + tar -xzf "$archive" -C $(Build.BinariesDirectory)/testdata + done + + # Ensure libcustom_op_library.dylib is where EndToEndTests expects it (testdata/testdata) + mkdir -p $(Build.BinariesDirectory)/testdata/testdata + find $(Build.BinariesDirectory)/testdata -name "libcustom_op_library.dylib" -exec cp {} $(Build.BinariesDirectory)/testdata/testdata/ \; + - template: get-nuget-package-version-as-variable.yml parameters: packageFolder: '$(Build.BinariesDirectory)/nuget-artifact' + - script: | + git submodule update --init cmake/external/onnx + cd cmake/external/onnx + git fetch origin v1.13.1 --depth=1 + git checkout v1.13.1 + cd ../../.. + displayName: 'Initialize ONNX submodule for test data (pinned to v1.13.1 since new data types like float8 are not supported in nuget)' + - script: | $(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \ $(Build.BinariesDirectory)/nuget-artifact \ $(NuGetPackageVersionNumber) \ true - + if [ $?
-ne 0 ]; then echo "Failed to run test" exit 1 @@ -48,4 +75,5 @@ stages: OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) - IsReleaseBuild: $(IsReleaseBuild) \ No newline at end of file + IsReleaseBuild: $(IsReleaseBuild) + ORT_LOADER_VERBOSITY: 1 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 6eb7c52712671..f767ef110561a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -66,131 +66,17 @@ stages: - stage: Python_Packaging_Windows_CPU dependsOn: [] jobs: - - job: Windows_py_Wheels - pool: - name: 'onnxruntime-Win-CPU-VS2022-Latest' - os: windows - templateContext: - sdl: - codeSignValidation: - enabled: true - # TODO: check why pyd file was not signed - break: false - additionalTargetsGlobPattern: f|**\*.pyd - psscriptanalyzer: - enabled: true - binskim: - enabled: true - scanOutputDirectoryOnly: true - outputs: - - output: pipelineArtifact - targetPath: $(Build.ArtifactStagingDirectory) - artifactName: onnxruntime-win-$(PythonVersion) - strategy: - matrix: - Python311_x64: - PythonVersion: '3.11' - Python312_x64: - PythonVersion: '3.12' - Python313_x64: - PythonVersion: '3.13' - Python314_x64: - PythonVersion: '3.14' - variables: - OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' - ExtraParam: ${{ parameters.build_py_parameters }} - timeoutInMinutes: 180 - workspace: - clean: all - - steps: - - checkout: self - clean: true - submodules: recursive - - - template: ../templates/setup-build-tools.yml - parameters: - host_cpu_arch: 'x64' - python_version: $(PythonVersion) - - - template: ../templates/set-nightly-build-option-variable-step.yml - - - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt - env: - TMPDIR: "$(Agent.TempDirectory)" - - - task: PythonScript@0 - displayName: 'Build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config ${{ parameters.cmake_build_type }} - --enable_lto - --build_dir $(Build.SourcesDirectory)\build - --skip_submodule_sync - --cmake_generator "Visual Studio 17 2022" - --enable_pybind - --enable_onnx_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache - ${{ parameters.build_py_parameters }} - --parallel --use_binskim_compliant_compile_flags --update --build - $(TelemetryOption) - - - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - - template: ../templates/publish-symbolrequestprod-api.yml - parameters: - ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: - symbolExpiryTime: 60 - includePublicSymbolServer: true - symbolsArtifactName: onnxruntime_cpu_win_x64_$(PythonVersion) - symbolsVersion: $(Build.BuildId) - symbolProject: 'ONNX Runtime' - subscription: 'OnnxrunTimeCodeSign_20240611' - searchPattern: | - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime.pdb - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_providers_shared.pdb - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_pybind11_state.pdb - - # Esrp signing - - template: 
../templates/win-esrp-dll.yml - parameters: - FolderPath: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd,*.dll' - - - task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.build_py_parameters }} $(NightlyBuildOption)' - workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\dist' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - script: | - 7z x *.whl - workingDirectory: '$(Build.ArtifactStagingDirectory)' - displayName: 'unzip the package' - + - template: ../templates/py-win-cpu.yml + parameters: + architecture: 'x64' + build_py_parameters: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} - - powershell: | - if ("$(PythonVersion)" -notcontains "3.14") { - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - if ("$(ExtraParam)" -contains "--use_azure") { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" - python onnxruntime_test_python_azure.py - } - python onnx_backend_test_series.py - } - workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' - displayName: 'Run Python Tests' + - template: ../templates/py-win-cpu.yml + parameters: + architecture: 'arm64' + build_py_parameters: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} - ${{ if eq(parameters.enable_mac_cpu, true) }}: - stage: Python_Packaging_MacOS diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 5025046a02b0e..a0f023325be04 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -203,6 +203,10 @@ stages: - input: pipelineArtifact artifactName: drop-onnxruntime-java-linux-aarch64 targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-linux-aarch64' + + - input: pipelineArtifact + artifactName: drop-onnxruntime-java-osx-arm64 + targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-arm64' outputs: - output: pipelineArtifact targetPath: $(Build.BinariesDirectory)\java-artifact\onnxruntime-java-win-x64 diff --git a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml index f5ec5be2c1557..738ac27bafde2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml @@ -8,6 +8,10 @@ parameters: - name: PoolName type: string +- 
name: PoolDemands + type: string + default: '' + stages: - stage: Final_Jar_Testing_${{parameters.OS}} dependsOn: [] @@ -17,7 +21,16 @@ stages: clean: all ${{ if eq(parameters.OS, 'MacOS') }}: pool: - vmImage: 'macOS-15' + # Use PoolName if provided, otherwise fallback to macOS-15 + ${{ if ne(parameters.PoolName, '') }}: + ${{ if contains(parameters.PoolName, '-') }}: + vmImage: ${{ parameters.PoolName }} + ${{ else }}: + name: ${{ parameters.PoolName }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} + ${{ else }}: + vmImage: 'macOS-15' ${{ if eq(parameters.OS, 'Linux') }}: pool: name: ${{ parameters.PoolName }} @@ -29,10 +42,15 @@ stages: - template: set-version-number-variables-step.yml - bash: | - echo "Downloading and installing Maven $(mavenVersion) for Linux..." + echo "Downloading and installing Maven $(mavenVersion)..." MAVEN_DIR="$(Agent.TempDirectory)/apache-maven-$(mavenVersion)" + # Download Maven binary - wget https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz -O $(Agent.TempDirectory)/maven.tar.gz + if command -v wget &> /dev/null; then + wget https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz -O $(Agent.TempDirectory)/maven.tar.gz + else + curl -L -o $(Agent.TempDirectory)/maven.tar.gz https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz + fi # Extract to the temp directory mkdir -p ${MAVEN_DIR} @@ -40,13 +58,25 @@ stages: # Add Maven's bin directory to the PATH for subsequent tasks in the job echo "##vso[task.prependpath]${MAVEN_DIR}/bin" - displayName: 'Install Maven (Linux)' - condition: and(succeeded(), eq(variables['Agent.OS'], 'Linux')) + displayName: 'Install Maven' + condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) - script: | echo "Maven is now on the PATH." mvn --version + - script: | + set -e -x + if ! /usr/libexec/java_home -v 17 >/dev/null 2>&1; then + brew install --cask temurin@17 + fi + JAVA_HOME=$(/usr/libexec/java_home -v 17) + echo "JAVA_HOME is set to: $JAVA_HOME" + echo "##vso[task.setvariable variable=JAVA_HOME]$JAVA_HOME" + echo "##vso[task.prependpath]$JAVA_HOME/bin" + displayName: 'Install JDK 17 (macOS)' + condition: and(succeeded(), eq(variables['Agent.OS'], 'Darwin')) + - download: build artifact: 'onnxruntime-java' displayName: 'Download Final Jar' @@ -58,16 +88,17 @@ stages: goals: 'dependency:copy-dependencies' options: '-DoutputDirectory=$(Pipeline.Workspace)/build/onnxruntime-java' publishJUnitTestResults: false - javaHomeOption: 'JDKVersion' - jdkVersionOption: '1.17' mavenVersionOption: 'Default' + ${{ if eq(parameters.OS, 'MacOS') }}: + javaHomeOption: 'Path' + jdkDirectory: '$(JAVA_HOME)' + ${{ if eq(parameters.OS, 'Linux') }}: + javaHomeOption: 'JDKVersion' + jdkVersionOption: '1.17' - task: Bash@3 - displayName: 'Run Java Tests on Linux' -# condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) - # MacOS packages have been removed from the JAR here: - # https://github.com/microsoft/onnxruntime/commit/5ed340f7a51f3cbdb62577a874daf2b3f23d6a93#diff-a14cc5ea231eb4fa49f13510a242043c47ae48516c860f8a87b0e55762632f49 - condition: and(succeeded(), in(variables['Agent.OS'], 'Linux')) + displayName: 'Run Java Tests' + condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) inputs: targetType: 'inline' script: | @@ -83,24 +114,54 @@ stages: cd .. mkdir tests cd tests + # 1. 
Diagnostics + echo "System Info:" + uname -a + if [[ "$(uname)" == "Darwin" ]]; then arch; fi + echo "Java Version" + java -version + + # 2. Extract jar xf $(Pipeline.Workspace)/build/onnxruntime-java/testing.jar rm -f $(Pipeline.Workspace)/build/onnxruntime-java/testing.jar - ls $(Pipeline.Workspace)/build/tests + + # Identify main jar (avoiding sources and javadoc jars) + MAIN_JAR=$(ls $(Pipeline.Workspace)/build/onnxruntime-java/onnxruntime-*.jar | grep -v 'sources' | grep -v 'javadoc' | head -n 1) + echo "Extracting native libs from $MAIN_JAR" + jar xf $MAIN_JAR ai/onnxruntime/native + + ls -R $(Pipeline.Workspace)/build/tests/ai echo "Java Version" java -version - # Set the correct library path based on the OS + + # 3. Locate the native library (robust to jar layout differences) os_name=$(uname) - if [[ "$os_name" == "Linux" ]]; then - echo "Platform: Linux. Setting LD_LIBRARY_PATH." - export LD_LIBRARY_PATH="$(pwd):$LD_LIBRARY_PATH" - java -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ - --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" - elif [[ "$os_name" == "Darwin" ]]; then - echo "Platform: macOS. Setting DYLD_LIBRARY_PATH." - export DYLD_LIBRARY_PATH="$(pwd):$DYLD_LIBRARY_PATH" - java -DUSE_WEBGPU=1 -DUSE_COREML=1 -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ - --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + if [[ "$os_name" == "Linux" ]]; then S_FILE="libonnxruntime.so"; else S_FILE="libonnxruntime.dylib"; fi + + echo "Searching for $S_FILE in $(pwd)..." + # Exclude .dSYM paths and find actual file + NATIVE_LIB_PATH=$(find $(pwd) -name "$S_FILE" -not -path "*.dSYM*" -type f | head -n 1) + + if [[ -n "$NATIVE_LIB_PATH" ]]; then + NATIVE_LIB_DIR=$(dirname "$NATIVE_LIB_PATH") + echo "Found native lib dir: $NATIVE_LIB_DIR" + + if [[ "$os_name" == "Linux" ]]; then + echo "Platform: Linux. Setting LD_LIBRARY_PATH." + export LD_LIBRARY_PATH="$NATIVE_LIB_DIR:$(pwd):$LD_LIBRARY_PATH" + java -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ + --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + elif [[ "$os_name" == "Darwin" ]]; then + echo "Platform: macOS. Setting DYLD_LIBRARY_PATH." + export DYLD_LIBRARY_PATH="$NATIVE_LIB_DIR:$(pwd):$DYLD_LIBRARY_PATH" + java -DUSE_WEBGPU=1 -DUSE_COREML=1 -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ + --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + fi + else + echo "Error: $S_FILE not found!"
+ ls -R ai + exit 1 fi diff --git a/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml b/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml new file mode 100644 index 0000000000000..0ad230f835778 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml @@ -0,0 +1,149 @@ +parameters: + DoEsrp: false + StageName: 'FoundryLocalNugetPackaging' + DependsOn: [] + PackageName: 'Microsoft.ML.OnnxRuntime.Foundry' + +stages: +- stage: ${{ parameters.StageName }} + dependsOn: ${{ parameters.DependsOn }} + jobs: + - job: ${{ parameters.StageName }} + timeoutInMinutes: 120 + pool: + name: 'onnxruntime-Win2022-GPU-A10' + os: windows + templateContext: + sdl: + codeSignValidation: + enabled: true + break: true + psscriptanalyzer: + enabled: true + binskim: + enabled: true + scanOutputDirectoryOnly: true + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: "onnxruntime-foundry-nuget" + variables: + DoEsrp: ${{ parameters.DoEsrp }} + ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + BuildDate: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] + BuildTime: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] + + steps: + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - managed nuget' + inputs: + artifactName: 'onnxruntime-managed-nuget' + targetPath: '$(Build.BinariesDirectory)/managed-nuget' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-x64' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/win-x64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-arm64' + inputs: + artifactName: 'onnxruntime-win-arm64' + targetPath: '$(Build.BinariesDirectory)/win-arm64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - osx' + inputs: + artifactName: 'onnxruntime-osx' + targetPath: '$(Build.BinariesDirectory)/osx' + + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.12' + addToPath: true + + - task: PipAuthenticate@1 + displayName: 'Pip Authenticate' + inputs: + artifactFeeds: 'Lotus' + + - task: PowerShell@2 + displayName: 'Create osx directories' + inputs: + targetType: 'inline' + script: | + New-Item -ItemType Directory -Force -Path "$(Build.BinariesDirectory)/osx-arm64" | Out-Null + Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 + + - task: PowerShell@2 + displayName: 'List all files downloaded' + inputs: + targetType: 'inline' + script: | + $files = Get-ChildItem $(Build.BinariesDirectory) -Recurse + foreach ($file in $files) { + Write-Host "File: $($file.FullName)" + if ($file -like "*onnxruntime*") { + Write-Host "File onnxruntime: $($file.FullName) - Size: $($file.Length)" + } + } + $dirs = Get-ChildItem $(Build.BinariesDirectory) -Directory + foreach ($dir in $dirs) { + Write-Host "Directory: $($dir.FullName)" + } + $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* + if ($osx_arm64_archive.Count -eq 0) { + Write-Host "No osx-arm64 archive found." 
+ } else { + Write-Host "osx-arm64 archive found: $($osx_arm64_archive[0].FullName)" + } + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Extract Nuget Package Version' + inputs: + targetType: 'inline' + script: | + $nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/managed-nuget -Filter Microsoft.ML.OnnxRuntime.Managed.*.nupkg -Recurse) + $package_name = $nupkgs[0].Name + $version_length = $package_name.Length - "Microsoft.ML.OnnxRuntime.Managed.".Length - ".nupkg".Length + $package_version = $package_name.Substring("Microsoft.ML.OnnxRuntime.Managed.".Length, $version_length) + Write-Host "##vso[task.setvariable variable=package_version;]$package_version" + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Extract Archives' + inputs: + targetType: 'inline' + script: | + Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 + Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 + $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName + tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null + $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Directory -Filter onnxruntime-win-x64-cuda*)[0].FullName + $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Directory -Filter onnxruntime-win-arm64*)[0].FullName + $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Directory -Filter onnxruntime-osx-arm64*)[0].FullName + Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" + Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" + Write-Host "##vso[task.setvariable variable=osx_arm64;]$osx_arm64" + workingDirectory: $(Build.BinariesDirectory) + + - task: PythonScript@0 + displayName: 'Generate Nuget Package' + inputs: + scriptPath: '$(Build.SourcesDirectory)/tools/nuget/generate_nuspec_for_custom_nuget.py' + arguments: '--nuspec_path "$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec" --root_dir "$(Build.SourcesDirectory)" --commit_id "$(Build.SourceVersion)" --win_arm64 "$(win_arm64)" --win_x64 "$(win_x64)" --osx_arm64 "$(osx_arm64)" --package_version "$(package_version)" --package_name "${{ parameters.PackageName }}"' + + - task: NuGetCommand@2 + displayName: 'Pack Nuget Package' + inputs: + command: 'pack' + packagesToPack: '$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec' + packDestination: $(Build.ArtifactStagingDirectory)\ + + - template: esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index 8e454f2137ce8..795945a8581ba 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -26,6 +26,15 @@ steps: args: '-r $(Build.BinariesDirectory) -a onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion) -l libonnxruntime.$(OnnxRuntimeVersion).dylib -c Release -s $(Build.SourcesDirectory) -t $(Build.SourceVersion)' workingDirectory: '$(Build.BinariesDirectory)/Release' +- 
bash: | + mkdir -p $(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)/testdata + cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)/testdata/libcustom_op_library.dylib + # Copy to testdata/testdata so EndToEndTests can find it when running in Debug configuration + mkdir -p $(Build.BinariesDirectory)/testdata/testdata + cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.BinariesDirectory)/testdata/testdata/libcustom_op_library.dylib + displayName: 'Copy custom op library' + condition: succeeded() + - task: ArchiveFiles@2 inputs: rootFolderOrFile: '$(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)' @@ -40,6 +49,14 @@ steps: targetPath: '$(Build.ArtifactStagingDirectory)' artifactName: 'onnxruntime-osx-${{ parameters.MacosArch }}' +- template: java-api-artifacts-package-and-publish-steps-posix.yml + parameters: + arch: 'osx-${{ parameters.MacosArch }}' + buildConfig: 'Release' + artifactName: 'onnxruntime-java-osx-${{ parameters.MacosArch }}' + libraryName: 'libonnxruntime.dylib' + nativeLibraryName: 'libonnxruntime4j_jni.dylib' + - template: nodejs-artifacts-package-and-publish-steps-posix.yml parameters: arch: arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index bfccaef1c9852..de16ce483a9f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -45,9 +45,20 @@ jobs: set -e -x export ONNX_ML=1 export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" - python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' + python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' + + - script: | + set -e -x + if ! /usr/libexec/java_home -v 17 >/dev/null 2>&1; then + brew install --cask temurin@17 + fi + JAVA_HOME=$(/usr/libexec/java_home -v 17) + echo "JAVA_HOME is set to: $JAVA_HOME" + echo "##vso[task.setvariable variable=JAVA_HOME]$JAVA_HOME" + echo "##vso[task.prependpath]$JAVA_HOME/bin" + displayName: 'Install JDK 17' - template: mac-cpu-packaging-steps.yml parameters: MacosArch: arm64 - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_java --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml new file mode 100644 index 0000000000000..09603f2350657 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml @@ -0,0 +1,168 @@ +parameters: +- name: architecture + type: string + default: 'x64' + values: + - x64 + - arm64 + +- name: build_py_parameters + displayName: 'Specify extra build parameters' + type: string + default: '--use_azure' + +- name: cmake_build_type + type: string + displayName: 'CMake build type for Windows. Only for Windows CPU packages.' 
+ default: 'RelWithDebInfo' + values: + - Debug + - Release + - RelWithDebInfo + - MinSizeRel + +jobs: +- job: Windows_py_Wheels_${{parameters.architecture}} + ${{ if eq(parameters.architecture, 'arm64') }}: + pool: + name: 'onnxruntime-qnn-windows-vs-2022-arm64' + os: windows + hostArchitecture: Arm64 + demands: + - Agent.Version -equals 4.264.2 + ${{ else }}: + pool: + name: 'onnxruntime-Win-CPU-VS2022-Latest' + os: windows + templateContext: + sdl: + codeSignValidation: + enabled: true + # TODO: check why pyd file was not signed + break: false + additionalTargetsGlobPattern: f|**\*.pyd + psscriptanalyzer: + enabled: true + binskim: + enabled: true + scanOutputDirectoryOnly: true + ${{ if eq(parameters.architecture, 'arm64') }}: + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: onnxruntime-win-$(PythonVersion)-arm64 + ${{ else }}: + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: onnxruntime-win-$(PythonVersion) + strategy: + matrix: + Python311_${{parameters.architecture}}: + PythonVersion: '3.11' + Python312_${{parameters.architecture}}: + PythonVersion: '3.12' + Python313_${{parameters.architecture}}: + PythonVersion: '3.13' + Python314_${{parameters.architecture}}: + PythonVersion: '3.14' + variables: + OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' + ExtraParam: ${{ parameters.build_py_parameters }} + timeoutInMinutes: 180 + workspace: + clean: all + + steps: + - checkout: self + clean: true + submodules: recursive + + - template: setup-build-tools.yml + parameters: + host_cpu_arch: ${{parameters.architecture}} + python_version: $(PythonVersion) + + - template: set-nightly-build-option-variable-step.yml + + - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt + env: + TMPDIR: "$(Agent.TempDirectory)" + + - task: PythonScript@0 + displayName: 'Build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config ${{ parameters.cmake_build_type }} + --enable_lto + --build_dir $(Build.SourcesDirectory)\build + --skip_submodule_sync + --cmake_generator "Visual Studio 17 2022" + --enable_pybind + --enable_onnx_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --build + ${{ parameters.build_py_parameters }} + --parallel --use_binskim_compliant_compile_flags --update + $(TelemetryOption) + + - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: + - template: publish-symbolrequestprod-api.yml + parameters: + ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: + symbolExpiryTime: 60 + includePublicSymbolServer: true + symbolsArtifactName: onnxruntime_cpu_win_${{ parameters.architecture }}_$(PythonVersion) + symbolsVersion: $(Build.BuildId) + symbolProject: 'ONNX Runtime' + subscription: 'OnnxrunTimeCodeSign_20240611' + searchPattern: | + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime.pdb + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_providers_shared.pdb + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_pybind11_state.pdb + + # Esrp signing + - template: win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ 
parameters.cmake_build_type }}\onnxruntime\capi' + DisplayName: 'ESRP - Sign Native dlls' + DoEsrp: true + Pattern: '*.pyd,*.dll' + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel ${{ parameters.build_py_parameters }} $(NightlyBuildOption)' + workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - script: | + 7z x *.whl + workingDirectory: '$(Build.ArtifactStagingDirectory)' + displayName: 'Unzip the package' + + - powershell: | + # Skip all tests on Python 3.14 + if ("$(PythonVersion)" -ne "3.14") { + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + Remove-Item -Recurse -Force onnxruntime + if ("$(ExtraParam)".Split() -contains "--use_azure") { + + if ("${{parameters.architecture}}" -eq 'arm64') { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" + } else { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" + } + python onnxruntime_test_python_azure.py + } + python onnx_backend_test_series.py + } + workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' + displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index f5b4c38c85d4c..88eff3ebff86a 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -27,6 +27,17 @@ if [[ $LIB_NAME == *.dylib ]] then dsymutil $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME -o $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME.dSYM strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME + + # ORT NuGet packaging expects the unversioned library (libonnxruntime.dylib) to contain the binary content, + # because the versioned library is excluded by the nuspec generation script. + # We explicitly overwrite the symlink with the real file to ensure 'nuget pack' (especially on Windows) + # doesn't pack an empty/broken symlink. + # Only applies to versioned libonnxruntime libraries (e.g. libonnxruntime.1.24.0.dylib). 
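+  # Illustrative example (hypothetical version number): with LIB_NAME=libonnxruntime.1.24.0.dylib,
+  # the block below deletes the libonnxruntime.dylib symlink and replaces it with a real copy of
+  # the versioned binary, so both names resolve to regular files in the packaged artifact.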
+ if [[ "$LIB_NAME" =~ ^libonnxruntime\..*\.dylib$ && -L "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" ]]; then + rm "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" + cp "$BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME" "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" + fi + # copy the CoreML EP header for macOS build (libs with .dylib ext) cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include else diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu index 766a2c8a8b73b..0c63b7775256a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu @@ -49,7 +49,9 @@ RUN apt-get update && \ libnvonnxparsers-dev=${TRT_VERSION} \ libnvonnxparsers10=${TRT_VERSION} \ tensorrt-dev=${TRT_VERSION} \ - libnvinfer-bin=${TRT_VERSION} && \ + libnvinfer-bin=${TRT_VERSION} \ + libnvinfer-headers-python-plugin-dev=${TRT_VERSION} \ + libnvinfer-win-builder-resource10=${TRT_VERSION} && \ rm -rf /var/lib/apt/lists/* COPY scripts /tmp/scripts diff --git a/tools/ci_build/github/windows/jar_packaging.py b/tools/ci_build/github/windows/jar_packaging.py index 8ec380a5d2523..f4bc6899260c1 100644 --- a/tools/ci_build/github/windows/jar_packaging.py +++ b/tools/ci_build/github/windows/jar_packaging.py @@ -232,6 +232,7 @@ def run_packaging(package_type: str, build_dir: str): "platforms": [ {"path": "onnxruntime-java-linux-x64", "lib": "libcustom_op_library.so", "archive_lib": True}, {"path": "onnxruntime-java-linux-aarch64", "lib": "libcustom_op_library.so", "archive_lib": False}, + {"path": "onnxruntime-java-osx-arm64", "lib": "libcustom_op_library.dylib", "archive_lib": True}, ] }, "gpu": { diff --git a/tools/ci_build/github/windows/jar_packaging_test.py b/tools/ci_build/github/windows/jar_packaging_test.py index 2dd61cf9c3088..e4f7e4945442c 100644 --- a/tools/ci_build/github/windows/jar_packaging_test.py +++ b/tools/ci_build/github/windows/jar_packaging_test.py @@ -52,14 +52,19 @@ def _setup_test_directory(package_type: str, version_string: str): create_empty_file(linux_native_dir / "libonnxruntime_providers_cuda.so") (linux_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - # --- Additional platforms (for CPU test) --- + # --- macOS and other platforms (for CPU test) --- if package_type == "cpu": - # Add linux-aarch64 for CPU test + # Add linux-aarch64 and osx-arm64 for CPU test linux_aarch64_dir = java_artifact_dir / "onnxruntime-java-linux-aarch64" linux_aarch64_native_dir = linux_aarch64_dir / "ai" / "onnxruntime" / "native" / "linux-aarch64" linux_aarch64_native_dir.mkdir(parents=True, exist_ok=True) create_empty_file(linux_aarch64_dir / "libcustom_op_library.so") + osx_arm64_dir = java_artifact_dir / "onnxruntime-java-osx-arm64" + osx_arm64_native_dir = osx_arm64_dir / "ai" / "onnxruntime" / "native" / "osx-arm64" + osx_arm64_native_dir.mkdir(parents=True, exist_ok=True) + create_empty_file(osx_arm64_dir / "libcustom_op_library.dylib") + return tmp_path return _setup_test_directory @@ -128,9 +133,12 @@ def test_cpu_packaging(directory_setup_factory, version_string): with zipfile.ZipFile(testing_jar_path, "r") as zf: jar_contents = zf.namelist() assert "libcustom_op_library.so" in jar_contents + assert "libcustom_op_library.dylib" in jar_contents # 3. 
Verify the custom op libraries were removed from the source directories linux_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-x64" linux_aarch64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-aarch64" + osx_arm64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-arm64" assert not (linux_dir / "libcustom_op_library.so").exists() assert not (linux_aarch64_dir / "libcustom_op_library.so").exists() + assert not (osx_arm64_dir / "libcustom_op_library.dylib").exists() diff --git a/tools/nuget/generate_nuspec_for_custom_nuget.py b/tools/nuget/generate_nuspec_for_custom_nuget.py index 3abd03119cbc5..6e51c51895191 100644 --- a/tools/nuget/generate_nuspec_for_custom_nuget.py +++ b/tools/nuget/generate_nuspec_for_custom_nuget.py @@ -14,7 +14,6 @@ def generate_files(lines, args): platform_map = { "win-arm64": args.win_arm64, "win-x64": args.win_x64, - "osx-x64": args.osx_x64, "osx-arm64": args.osx_arm64, } @@ -116,7 +115,6 @@ def parse_arguments(): parser.add_argument("--win_arm64", required=True, help="Ort win-arm64 directory") parser.add_argument("--win_x64", required=True, help="Ort win-x64 directory") parser.add_argument("--osx_arm64", required=True, help="Ort osx-arm64 directory") - parser.add_argument("--osx_x64", required=True, help="Ort osx-x64 directory") parser.add_argument("--package_version", required=True, help="Version of the package") parser.add_argument("--package_name", required=True, help="Name of the package") diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 9884cbf5793df..1f882c847c707 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -238,6 +238,9 @@ def add_common_dependencies(xml_text, package_name, version): xml_text.append('') xml_text.append('') + if package_name == "Microsoft.ML.OnnxRuntime.Foundry": + xml_text.append('') + def generate_dependencies(xml_text, package_name, version): dml_dependency = ''
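For context, a minimal sketch of the dependency-append pattern used by add_common_dependencies, assuming xml_text is the list of nuspec XML lines being accumulated; the dependency ids and versions below are illustrative placeholders, not values taken from the script:

    # Sketch only: dependency ids/versions are hypothetical placeholders.
    def add_common_dependencies(xml_text: list[str], package_name: str, version: str) -> None:
        # Dependencies shared by every native package.
        xml_text.append(f'<dependency id="Example.Common.Dependency" version="{version}"/>')
        # The new Foundry package appends one extra dependency on top of the common set.
        if package_name == "Microsoft.ML.OnnxRuntime.Foundry":
            xml_text.append('<dependency id="Example.Foundry.Dependency" version="1.0.0"/>')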