diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index 6405236da1734..6c5464851937c 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -20,8 +20,13 @@ else()
   endif()
 endif()

-if(Patch_FOUND AND WIN32)
-  set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_windows.patch)
+if(Patch_FOUND)
+  if(WIN32)
+    set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_windows.patch &&
+        ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_cuda_warnings.patch)
+  else()
+    set(ABSL_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/abseil/absl_cuda_warnings.patch)
+  endif()
 else()
   set(ABSL_PATCH_COMMAND "")
 endif()
diff --git a/cmake/external/cuda_configuration.cmake b/cmake/external/cuda_configuration.cmake
index be6a5febf3e14..00f7d81eda53d 100644
--- a/cmake/external/cuda_configuration.cmake
+++ b/cmake/external/cuda_configuration.cmake
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
   #   * Always use accelerated (`-a` suffix) target for supported real architectures.
   # cmake-format: on

+  # Allow override via the CUDAARCHS environment variable (standard CMake variable)
+  if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
+    set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
+  endif()
+
   if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     # Detect highest available compute capability
     set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
@@ -139,12 +144,12 @@ macro(setup_cuda_architectures)
       continue()
     endif()

-    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
+    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
       set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
-      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
       list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
     else()
       message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
     endif()
@@ -156,7 +161,7 @@ macro(setup_cuda_architectures)
   set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
   message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

-  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
+  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
@@ -165,10 +170,13 @@ endforeach()

   # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
-  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
+  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
   unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
   foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
+    if(CUDA_ARCH MATCHES "^([0-9]+)f$")
+      # Family code, no -real suffix
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
+    elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
     else()
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
diff --git a/cmake/patches/abseil/absl_cuda_warnings.patch b/cmake/patches/abseil/absl_cuda_warnings.patch
new file mode 100644
index 0000000000000..144b9f904bf0f
--- /dev/null
+++ b/cmake/patches/abseil/absl_cuda_warnings.patch
@@ -0,0 +1,40 @@
+diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/internal/hash.h
++++ b/absl/hash/internal/hash.h
+@@ -477,7 +477,7 @@ H AbslHashValue(H hash_state, T (&)[N]) {
+ template <typename H, typename T, size_t N>
+ H AbslHashValue(H hash_state, T (&)[N]) {
+   static_assert(
+-      sizeof(T) == -1,
++      sizeof(T) == size_t(-1),
+       "Hashing C arrays is not allowed. For string literals, wrap the literal "
+       "in absl::string_view(). To hash the array contents, use "
+       "absl::MakeSpan() or make the array an std::array. To hash the array "
+diff --git a/absl/hash/hash.h b/absl/hash/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/hash.h
++++ b/absl/hash/hash.h
+@@ -333,7 +333,8 @@ class HashState : public hash_internal::HashStateBase<HashState> {
+             absl::enable_if_t<
+                 std::is_base_of<HashStateBase<T>, T>::value, int> = 0>
+   static HashState Create(T* state) {
+-    HashState s;
++    HashState s = {};
++    (void)s;
+     s.Init(state);
+     return s;
+   }
+diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
+index 1234567..abcdefg 100644
+--- a/absl/container/internal/raw_hash_set.h
++++ b/absl/container/internal/raw_hash_set.h
+@@ -464,7 +464,7 @@ inline uint16_t NextSeed() {
+ inline uint16_t NextSeed() {
+   static_assert(PerTableSeed::kBitCount == 16);
+   thread_local uint16_t seed =
+-      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed));
++      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed) & 0xFFFFu);
+   seed += uint16_t{0xad53};
+   return seed;
+ }
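For context on the first hunk in this patch: `sizeof(T)` is an unsigned `size_t`, so `sizeof(T) == -1` compares unsigned against a signed literal, which NVCC/GCC flag when such warnings are promoted; wrapping the literal keeps the assert always false while making the comparison unsigned-to-unsigned. A minimal standalone sketch of the idiom (not abseil code):

#include <cstddef>

// size_t(-1) converts the literal once, explicitly, so the comparison is
// unsigned == unsigned and still false for every instantiable T.
template <typename T>
void reject_arrays() {
  static_assert(sizeof(T) == std::size_t(-1), "this overload must never be instantiated");
}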
diff --git a/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch b/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch
new file mode 100644
index 0000000000000..144b9f904bf0f
--- /dev/null
+++ b/cmake/vcpkg-ports/abseil/absl_cuda_warnings.patch
@@ -0,0 +1,40 @@
+diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/internal/hash.h
++++ b/absl/hash/internal/hash.h
+@@ -477,7 +477,7 @@ H AbslHashValue(H hash_state, T (&)[N]) {
+ template <typename H, typename T, size_t N>
+ H AbslHashValue(H hash_state, T (&)[N]) {
+   static_assert(
+-      sizeof(T) == -1,
++      sizeof(T) == size_t(-1),
+       "Hashing C arrays is not allowed. For string literals, wrap the literal "
+       "in absl::string_view(). To hash the array contents, use "
+       "absl::MakeSpan() or make the array an std::array. To hash the array "
+diff --git a/absl/hash/hash.h b/absl/hash/hash.h
+index 1234567..abcdefg 100644
+--- a/absl/hash/hash.h
++++ b/absl/hash/hash.h
+@@ -333,7 +333,8 @@ class HashState : public hash_internal::HashStateBase<HashState> {
+             absl::enable_if_t<
+                 std::is_base_of<HashStateBase<T>, T>::value, int> = 0>
+   static HashState Create(T* state) {
+-    HashState s;
++    HashState s = {};
++    (void)s;
+     s.Init(state);
+     return s;
+   }
+diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
+index 1234567..abcdefg 100644
+--- a/absl/container/internal/raw_hash_set.h
++++ b/absl/container/internal/raw_hash_set.h
+@@ -464,7 +464,7 @@ inline uint16_t NextSeed() {
+ inline uint16_t NextSeed() {
+   static_assert(PerTableSeed::kBitCount == 16);
+   thread_local uint16_t seed =
+-      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed));
++      static_cast<uint16_t>(reinterpret_cast<uintptr_t>(&seed) & 0xFFFFu);
+   seed += uint16_t{0xad53};
+   return seed;
+ }
diff --git a/cmake/vcpkg-ports/abseil/portfile.cmake b/cmake/vcpkg-ports/abseil/portfile.cmake
index 3cdedca7265ef..1e9c48ea834b2 100644
--- a/cmake/vcpkg-ports/abseil/portfile.cmake
+++ b/cmake/vcpkg-ports/abseil/portfile.cmake
@@ -9,6 +9,7 @@ vcpkg_from_github(
     SHA512 4ee1a217203933382e728d354a149253a517150eee7580a0abecc69584b2eb200d91933ef424487e3a3fe0e8ab5e77b0288485cac982171b3585314a4417e7d4
     HEAD_REF master
     PATCHES absl_windows.patch
+            absl_cuda_warnings.patch
)
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index 1ae7b5c9eb991..abe73b77f4071 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 using System;
+using System.Reflection;
 using System.Runtime.InteropServices;
 using static Microsoft.ML.OnnxRuntime.NativeMethods;

@@ -474,6 +475,12 @@ internal static class NativeMethods
         static NativeMethods()
         {
+#if !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__
+            // Register a custom DllImportResolver to handle platform-specific library loading.
+            // Replaces default resolution specifically on Windows for case-sensitivity.
+            NativeLibrary.SetDllImportResolver(typeof(NativeMethods).Assembly, DllImportResolver);
+#endif
+
 #if NETSTANDARD2_0
             IntPtr ortApiBasePtr = OrtGetApiBase();
             OrtApiBase ortApiBase = (OrtApiBase)Marshal.PtrToStructure(ortApiBasePtr, typeof(OrtApiBase));
@@ -847,7 +854,7 @@ static NativeMethods()
                 api_.CreateSyncStreamForEpDevice,
                 typeof(DOrtCreateSyncStreamForEpDevice));

-            OrtSyncStream_GetHandle = 
+            OrtSyncStream_GetHandle =
                 (DOrtSyncStream_GetHandle)Marshal.GetDelegateForFunctionPointer(
                     api_.SyncStream_GetHandle,
                     typeof(DOrtSyncStream_GetHandle));
@@ -872,11 +879,127 @@ internal class NativeLib
         // Define the library name required for iOS
         internal const string DllName = "__Internal";
#else
-        // Note: the file name in ONNX Runtime nuget package must be onnxruntime.dll instead of onnxruntime.DLL(Windows filesystem can be case sensitive)
-        internal const string DllName = "onnxruntime.dll";
+        // For desktop platforms (including .NET Standard 2.0), we use the simple name
+        // to allow .NET's automatic platform-specific resolution (lib*.so, lib*.dylib, *.dll).
+        // For .NET Core 3.0+, case-sensitivity on Windows is handled by DllImportResolver.
+        internal const string DllName = "onnxruntime";
#endif
     }

+#if !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__
+    /// <summary>
+    /// Custom DllImportResolver to handle platform-specific library loading.
+    /// On Windows, it explicitly loads the library with a lowercase .dll extension to handle
+    /// case-sensitive filesystems.
+    /// </summary>
+    private static IntPtr DllImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
+    {
+        if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName)
+        {
+            string mappedName = null;
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                // Explicitly load with .dll extension to avoid issues where the OS might try .DLL
+                mappedName = libraryName + ".dll";
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+            {
+                // Explicitly load with .so extension and lib prefix
+                mappedName = "lib" + libraryName + ".so";
+            }
+            else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                // Explicitly load with .dylib extension and lib prefix
+                mappedName = "lib" + libraryName + ".dylib";
+            }
+
+            if (mappedName != null)
+            {
+                // 1. Try default loading (name only)
+                if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle))
+                {
+                    return handle;
+                }
+
+                // 2. Try relative to assembly location (look into runtimes subfolders)
+                string assemblyLocation = null;
+                try { assemblyLocation = assembly.Location; } catch { }
+                if (!string.IsNullOrEmpty(assemblyLocation))
+                {
+                    string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation);
+                    string rid = RuntimeInformation.RuntimeIdentifier;
+
+                    // Probe the specific RID first, then common fallbacks for the current OS
+                    string[] ridsToTry;
+                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+                    {
+                        ridsToTry = new[] { rid, "win-x64", "win-arm64" };
+                    }
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                    {
+                        ridsToTry = new[] { rid, "linux-x64", "linux-arm64" };
+                    }
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+                    {
+                        // We no longer provide osx-x64 in the official package since 1.24.
+                        // However, we keep it in the list for build-from-source users.
+                        ridsToTry = new[] { rid, "osx-arm64", "osx-x64" };
+                    }
+                    else
+                    {
+                        ridsToTry = new[] { rid };
+                    }
+
+                    foreach (var tryRid in ridsToTry)
+                    {
+                        string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName);
+                        if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                        {
+                            LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                            return handle;
+                        }
+                    }
+                }
+
+                // 3. 
Try AppContext.BaseDirectory as a fallback
+                string baseDir = AppContext.BaseDirectory;
+                if (!string.IsNullOrEmpty(baseDir))
+                {
+                    string probePath = System.IO.Path.Combine(baseDir, mappedName);
+                    if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                    {
+                        LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                        return handle;
+                    }
+
+                    string rid = RuntimeInformation.RuntimeIdentifier;
+                    probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName);
+                    if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                    {
+                        LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                        return handle;
+                    }
+                }
+
+                LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})");
+
+            }
+        }
+
+        // Fall back to default resolution
+        return IntPtr.Zero;
+    }
+
+    private static void LogLibLoad(string message)
+    {
+        System.Diagnostics.Trace.WriteLine(message);
+        if (!string.IsNullOrEmpty(Environment.GetEnvironmentVariable("ORT_LOADER_VERBOSITY")))
+        {
+            Console.WriteLine(message);
+        }
+    }
+#endif
+
     [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)]
#if NETSTANDARD2_0
     public static extern IntPtr OrtGetApiBase();
@@ -2644,7 +2767,7 @@ public delegate void DOrtAddKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps,
                                          byte[] /* const char* */ value);

     /// <summary>
-    /// Get the value for the provided key. 
+    /// Get the value for the provided key.
     /// </summary>
     /// <returns>Value. Returns IntPtr.Zero if key was not found.</returns>
     [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -2767,7 +2890,7 @@ out IntPtr /* OrtSyncStream** */ stream
     // Auto Selection EP registration and selection customization

     /// <summary>
-    /// Register an execution provider library. 
+    /// Register an execution provider library.
     /// The library must implement CreateEpFactories and ReleaseEpFactory.
     /// </summary>
     /// <param name="env">Environment to add the EP library to.</param>
@@ -2952,9 +3075,10 @@ internal static class OrtExtensionsNativeMethods
#elif __IOS__
        internal const string ExtensionsDllName = "__Internal";
#else
-        // For desktop platforms, explicitly specify the DLL name with extension to avoid
-        // issues on case-sensitive filesystems. See NativeLib.DllName for detailed explanation.
-        internal const string ExtensionsDllName = "ortextensions.dll";
+        // For desktop platforms, use the simple name to allow .NET's
+        // automatic platform-specific resolution (lib*.so, lib*.dylib, *.dll).
+        // Case-sensitivity on Windows is handled by DllImportResolver.
+        internal const string ExtensionsDllName = "ortextensions";
#endif

        [DllImport(ExtensionsDllName, CharSet = CharSet.Ansi,
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
index d049c8d2d8990..c3cd38c9cd56b 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
@@ -113,7 +113,8 @@
+             Condition="'$(PlatformTarget)' == 'ARM64' AND
+                        Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm64\native\onnxruntime.dll')">
       <Link>onnxruntime.dll</Link>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Visible>false</Visible>
@@ -128,7 +129,8 @@
+             Condition="'$(PlatformTarget)' == 'ARM' AND
+                        Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-arm\native\onnxruntime.dll')">
       <Link>onnxruntime.dll</Link>
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Visible>false</Visible>
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
index f0d1313783643..c0475bb6102c1 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
@@ -601,6 +601,29 @@ private static Dictionary<string, string> GetSkippedModels(DirectoryInfo modelsD
                 skipModels["VGG 16-fp32"] = "bad allocation";
             }

+            // The following models are from the onnx repo and fail on the macOS nuget test pipeline.
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+            {
+                var macOSSkips = new[]
+                {
+                    "test_castlike_FLOAT_to_STRING_expanded",
+                    "test_castlike_FLOAT_to_BFLOAT16_expanded",
+                    "test_castlike_BFLOAT16_to_FLOAT",
+                    "test_cast_FLOAT_to_STRING",
+                    "test_castlike_FLOAT_to_BFLOAT16",
+                    "test_castlike_STRING_to_FLOAT_expanded",
+                    "test_castlike_STRING_to_FLOAT",
+                    "test_cast_STRING_to_FLOAT",
+                    "test_castlike_BFLOAT16_to_FLOAT_expanded",
+                    "test_cast_BFLOAT16_to_FLOAT",
+                    "test_castlike_FLOAT_to_STRING"
+                };
+                foreach (var model in macOSSkips)
+                {
+                    skipModels[model] = "Skipped on macOS due to flakes or lack of support";
+                }
+            }
+
             return skipModels;
         }

@@ -934,6 +957,7 @@ public void TestPretrainedModelsWithOrtValue(string opsetDir, string modelName)
         [MemberData(nameof(GetSkippedModelForTest), Skip = "Skipped due to Error, please fix the error and enable the test")]
         private void TestPreTrainedModels(string opsetDir, string modelName, bool useOrtValueAPIs = false)
         {
+
             var opsetDirInfo = new DirectoryInfo(opsetDir);
             var opset = opsetDirInfo.Name;
             string onnxModelFileName = null;
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
index 18bf30a325d83..994aeb83a0ed5 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts
@@ -132,7 +132,7 @@ export const parseConvTransposeAttributes = (attributes: Record
     typeof attributes.autoPad == 'undefined' ? 0 : (attributes.autoPad as number)
   ];
   const dilations = attributes.dilations as [number, number];
-  const group = attributes.group as number;
+  const group = (attributes.group as number) ?? 
1; // default to 1 per ONNX spec
   const kernelShape = attributes.kernelShape as [number, number];
   const pads = attributes.pads as [number, number, number, number];
   const strides = attributes.strides as [number, number];
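The next file's hunks all apply the same mechanical fix: `size_t` loop counters compared against `int64_t` values trip signed/unsigned-comparison warnings, so the signed side is cast explicitly. A standalone illustration of the pattern (hypothetical helper, not ORT code):

#include <cstddef>
#include <cstdint>
#include <vector>

// Without the cast, the int64_t value is converted implicitly to unsigned for
// the comparison, so a negative axis would silently wrap to a huge number.
bool is_axis(const std::vector<int64_t>& axes, int64_t target) {
  for (size_t i = 0; i < axes.size(); ++i) {
    if (i == static_cast<size_t>(target)) return true;
  }
  return false;
}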
diff --git a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
index e413ccf580870..f4c3eb9914118 100644
--- a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
   //  S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
     DeviceMesh dst_device_mesh;
     std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
         dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else if (dst_shape[decomposition_axis_in_dst] == 1) {
         // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple ComputeNativeSpecForTwoAxisDecomposition(
     // Source tensor is sharded on non-decomposed axis.
     std::vector<AxisPartitionSpec> dst_axis_specs;
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
        dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else {
         // R -> RR
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
index 167b2af946183..5170c982f248d 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
@@ -73,9 +73,9 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
index 1fe8035cbcdae..7722cd5a84f07 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
@@ -29,7 +29,14 @@
#if defined(ENABLE_FP4)
#include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
#include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
#endif

namespace onnxruntime::llm {
diff --git a/onnxruntime/core/mlas/lib/qlutgemm.cpp b/onnxruntime/core/mlas/lib/qlutgemm.cpp
index cb099c2409a44..32c72342b4803 100644
--- a/onnxruntime/core/mlas/lib/qlutgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qlutgemm.cpp
@@ -25,33 +25,53 @@ Module Name:

#include <cassert>
#include <cstring>
#include <memory>
+#include <mutex>
#include <string>

-/** T-MAC GEMM kernel Config */
+/**
+ * Global cache for T-MAC kernel parameters, indexed by configuration.
+ * This map and its associated mutex ensure thread-safe parameter management
+ * across concurrent MLAS calls.
+ */
static std::unordered_map<std::string, MlasTMACKernelParams> tmac_kernel_configs;
+static std::mutex tmac_kernel_configs_mutex;

-const MlasTMACKernelParams&
+static std::string
+GetTmacKey(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point)
+{
+    // Generate a unique cache key based on the GEMM and quantization configuration.
+    return std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" +
+           std::to_string(block_size) + "_" + (has_zero_point ? "1" : "0");
+}
+
+MlasTMACKernelParams
MlasGetLutGemmKernelParams(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point)
{
-    std::string key = std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" + std::to_string(block_size) + "_" + (has_zero_point ? 
"1" : "0"); - if (tmac_kernel_configs.count(key)) { - return tmac_kernel_configs[key]; + std::string key = GetTmacKey(M, N, nbits, block_size, has_zero_point); + std::lock_guard lock(tmac_kernel_configs_mutex); + auto it = tmac_kernel_configs.find(key); + if (it != tmac_kernel_configs.end()) { + return it->second; } - MLAS_THROW_EX(std::runtime_error, "T-MAC kernel parameters not initialized"); + MLAS_THROW_EX(std::runtime_error, "T-MAC kernel parameters not initialized for key: " + key); } void MLASCALL MlasClearLutGemmKernelConfig() { + std::lock_guard lock(tmac_kernel_configs_mutex); tmac_kernel_configs.clear(); } void MLASCALL MlasInitLutGemmKernelConfig(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point) { - std::string key = std::to_string(M) + "_" + std::to_string(N) + "_" + std::to_string(nbits) + "_" + std::to_string(block_size) + "_" + (has_zero_point ? "1" : "0"); - if (tmac_kernel_configs.count(key)) { - return; + std::string key = GetTmacKey(M, N, nbits, block_size, has_zero_point); + { + std::lock_guard lock(tmac_kernel_configs_mutex); + if (tmac_kernel_configs.find(key) != tmac_kernel_configs.end()) { + return; + } } MlasTMACKernelParams params; @@ -121,7 +141,10 @@ MlasInitLutGemmKernelConfig(size_t M, size_t N, size_t nbits, size_t block_size, params.has_zero_point = has_zero_point; params.one_scale = false; // TODO(vraspar): support one scale case for bitnet - tmac_kernel_configs[key] = params; + { + std::lock_guard lock(tmac_kernel_configs_mutex); + tmac_kernel_configs[key] = params; + } return; } @@ -222,53 +245,52 @@ LutGemmPackQuantBData( const size_t PackedQuantBDataSize = (N * bits) * (K / g / ngroups_per_elem); memset(PackedQuantBDataBegin, 0, PackedQuantBDataSize); // TODO: is this needed? - MlasTrySimpleParallel( - ThreadPool, Iterations, - [&](ptrdiff_t tid) { - size_t im = static_cast(tid); - for (size_t ib = 0; ib < bits; ib++) { - for (size_t ik = 0; ik < K / g; ik++) { - // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3) - size_t new_im = im / simd_n_out; - size_t new_isno = im % simd_n_out; - size_t new_ib = ib; - size_t new_ik = ik; - size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik; - - // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3) - new_im = new_idx / c1_nb0; - size_t new_ing = (new_idx % c1_nb0) / c1_nb1; - size_t new_isni = (new_idx % c1_nb1) / c1_nb2; - new_ik = (new_idx % c1_nb2); - new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik; - - // # 0 1 2 3 4 5 - // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3) - new_im = new_idx / c2_nb0; - size_t new_ibm = (new_idx % c2_nb0) / c2_nb1; - new_isni = (new_idx % c2_nb1) / c2_nb2; - new_ing = (new_idx % c2_nb2) / c2_nb3; - new_ik = (new_idx % c2_nb3) / c2_nb4; - size_t new_ikf = (new_idx % c2_nb4); - new_idx = new_im * c2_fac0 + - new_ik * c2_fac1 + - new_ibm * c2_fac2 + - new_ikf * c2_fac3 + - new_isni * ngroups_per_elem + - new_ing; - new_idx = new_idx / ngroups_per_elem; - size_t buf_idx = im * bits * K / g + ib * K / g + ik; - uint8_t buf_val = buf[buf_idx]; - - // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)]) - PackedQuantBDataBegin[new_idx] = static_cast( - static_cast(PackedQuantBDataBegin[new_idx]) + - (buf_val << (new_ing * g)) - ); - } + // NOTE: The second packing loop is intentionally serialized to avoid data races. 
@@ -222,53 +245,52 @@ LutGemmPackQuantBData(
    const size_t PackedQuantBDataSize = (N * bits) * (K / g / ngroups_per_elem);
    memset(PackedQuantBDataBegin, 0, PackedQuantBDataSize);  // TODO: is this needed?

-    MlasTrySimpleParallel(
-        ThreadPool, Iterations,
-        [&](ptrdiff_t tid) {
-            size_t im = static_cast<size_t>(tid);
-            for (size_t ib = 0; ib < bits; ib++) {
-                for (size_t ik = 0; ik < K / g; ik++) {
-                    // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3)
-                    size_t new_im = im / simd_n_out;
-                    size_t new_isno = im % simd_n_out;
-                    size_t new_ib = ib;
-                    size_t new_ik = ik;
-                    size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik;
-
-                    // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3)
-                    new_im = new_idx / c1_nb0;
-                    size_t new_ing = (new_idx % c1_nb0) / c1_nb1;
-                    size_t new_isni = (new_idx % c1_nb1) / c1_nb2;
-                    new_ik = (new_idx % c1_nb2);
-                    new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik;
-
-                    //                 #  0        1            2          3                4                 5
-                    // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3)
-                    new_im = new_idx / c2_nb0;
-                    size_t new_ibm = (new_idx % c2_nb0) / c2_nb1;
-                    new_isni = (new_idx % c2_nb1) / c2_nb2;
-                    new_ing = (new_idx % c2_nb2) / c2_nb3;
-                    new_ik = (new_idx % c2_nb3) / c2_nb4;
-                    size_t new_ikf = (new_idx % c2_nb4);
-                    new_idx = new_im * c2_fac0 +
-                              new_ik * c2_fac1 +
-                              new_ibm * c2_fac2 +
-                              new_ikf * c2_fac3 +
-                              new_isni * ngroups_per_elem +
-                              new_ing;
-                    new_idx = new_idx / ngroups_per_elem;
-                    size_t buf_idx = im * bits * K / g + ib * K / g + ik;
-                    uint8_t buf_val = buf[buf_idx];
-
-                    // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)])
-                    PackedQuantBDataBegin[new_idx] = static_cast<std::byte>(
-                        static_cast<uint8_t>(PackedQuantBDataBegin[new_idx]) +
-                        (buf_val << (new_ing * g))
-                    );
-                }
+    // NOTE: The second packing loop is intentionally serialized to avoid data races.
+    // T-MAC packs multiple output features (N) into a single byte if ngroups_per_elem > 1.
+    // Parallelizing this across N would lead to concurrent bit-plane updates on the same memory location.
+    for (size_t im = 0; im < Iterations; im++) {
+        for (size_t ib = 0; ib < bits; ib++) {
+            for (size_t ik = 0; ik < K / g; ik++) {
+                // w = w.reshape(M // bits // simd_n_out, simd_n_out, bits, K // g).transpose(0, 2, 1, 3)
+                size_t new_im = im / simd_n_out;
+                size_t new_isno = im % simd_n_out;
+                size_t new_ib = ib;
+                size_t new_ik = ik;
+                size_t new_idx = new_im * c0_fac0 + new_ib * c0_fac1 + new_isno * c0_fac2 + new_ik;
+
+                // w = w.reshape(M // mgroup, ngroups_per_elem, simd_n_in, K // g).transpose(0, 2, 1, 3)
+                new_im = new_idx / c1_nb0;
+                size_t new_ing = (new_idx % c1_nb0) / c1_nb1;
+                size_t new_isni = (new_idx % c1_nb1) / c1_nb2;
+                new_ik = (new_idx % c1_nb2);
+                new_idx = new_im * c1_fac0 + new_isni * c1_fac1 + new_ing * c1_fac2 + new_ik;
+
+                //                 #  0        1            2          3                4                 5
+                // w = w.reshape(M // bm, bm // mgroup, simd_n_in, ngroups_per_elem, K // g // kfactor, kfactor).transpose(0, 4, 1, 5, 2, 3)
+                new_im = new_idx / c2_nb0;
+                size_t new_ibm = (new_idx % c2_nb0) / c2_nb1;
+                new_isni = (new_idx % c2_nb1) / c2_nb2;
+                new_ing = (new_idx % c2_nb2) / c2_nb3;
+                new_ik = (new_idx % c2_nb3) / c2_nb4;
+                size_t new_ikf = (new_idx % c2_nb4);
+                new_idx = new_im * c2_fac0 +
+                          new_ik * c2_fac1 +
+                          new_ibm * c2_fac2 +
+                          new_ikf * c2_fac3 +
+                          new_isni * ngroups_per_elem +
+                          new_ing;
+                new_idx = new_idx / ngroups_per_elem;
+                size_t buf_idx = im * bits * K / g + ib * K / g + ik;
+                uint8_t buf_val = buf[buf_idx];
+
+                // w = sum([(w[:, :, :, :, :, ng] << (ng * g)) for ng in range(ngroups_per_elem)])
+                PackedQuantBDataBegin[new_idx] = static_cast<std::byte>(
+                    static_cast<uint8_t>(PackedQuantBDataBegin[new_idx]) +
+                    (buf_val << (new_ing * g))
+                );
            }
        }
-    );
+    }
}

// Internal helper: calculates packed scales and zero points size in floats
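Why the serialization above is required, in miniature: with ngroups_per_elem > 1 two logical weight groups share one output byte, and the accumulate-with-shift is a plain read-modify-write. A toy demonstration of the hazard (hypothetical layout, g = 4):

#include <cstdint>

// Groups 0 and 1 occupy the low and high nibble of the same byte. Two threads
// calling this for the same dst with different `group` values can interleave
// the load and the store and drop one nibble entirely.
inline void pack_nibble(uint8_t& dst, uint8_t val, unsigned group /* 0 or 1 */) {
  dst = static_cast<uint8_t>(dst + (val << (group * 4)));  // non-atomic RMW
}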
@@ -472,16 +494,15 @@
size_t
CalculateLutBufferSize(size_t n, size_t k, size_t m, const MlasTMACKernelParams& tmac_params)
{
    MLAS_UNREFERENCED_PARAMETER(n);
-    constexpr size_t kAllockAligment = 64;
    const size_t lut_scales_size = k / tmac_params.act_group_size;

-    size_t wsize = k * m * 4 * sizeof(int8_t);  // 4 bytes per k element for 2-bit LUT
-    wsize += lut_scales_size * m * 2 * sizeof(float);  // scales + biases
-
-    wsize = ((wsize - 1) / kAllockAligment + 1) * kAllockAligment;
+    // The AVX2 kernel (g=4) expects 16 entries (16 bytes) per group of 4 activations.
+    // This effectively requires 4 bytes per activation in the K dimension.
+    size_t lut_size_bytes = m * k * 4;
+    size_t scales_size_bytes = m * lut_scales_size * sizeof(float);
+    size_t biases_size_bytes = m * lut_scales_size * sizeof(float);

-    // TODO(vrapar): add temp buffer for FP16
-    return wsize;
+    return lut_size_bytes + scales_size_bytes + biases_size_bytes + 256;  // + alignment/safety padding
}

void MLASCALL
@@ -532,17 +553,23 @@ MlasLutGemm(
    // n_tiles_num = m * bits / bm;

    // TODO(vraspar): support other bitwidths
+    // For T-MAC, kernel properties (bm, n_tiles_num) are primarily driven by the number of output features (N).
+    // Initialization during packing (LutGemmPackQuantBDataSize) uses N as the major dimension,
+    // so we must match that here to ensure consistent weight tiling.
+    MlasInitLutGemmKernelConfig(N, K, 2, BlkLen, HasZeroPoint);
    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(N, K, 2, BlkLen, HasZeroPoint);
    const size_t lut_scales_size = K / tmac_params.act_group_size;
+    const size_t lut_size_bytes = static_cast<size_t>(M) * static_cast<size_t>(K) * 4;

    size_t lut_buffer_size = CalculateLutBufferSize(N, K, M, tmac_params);

    // make buffer of lut_buffer_size bytes
    // TODO(vraspar): other way to do it
    auto lut_buffer = std::make_unique<std::byte[]>(lut_buffer_size);
+    memset(lut_buffer.get(), 0, lut_buffer_size);

    int8_t* qlut = reinterpret_cast<int8_t*>(lut_buffer.get());
-    float* lut_scales = reinterpret_cast<float*>(qlut + K * M * 4);               // after lut
-    float* lut_biases = reinterpret_cast<float*>(lut_scales + lut_scales_size * M);  // after scales
+    float* lut_scales = reinterpret_cast<float*>(qlut + lut_size_bytes);             // after lut
+    float* lut_biases = reinterpret_cast<float*>(lut_scales + lut_scales_size * M);  // after scales

    const auto* a_float = reinterpret_cast<const float*>(A);  // Activation data

@@ -558,11 +585,12 @@ MlasLutGemm(
    for (size_t ine11 = 0; ine11 < static_cast<size_t>(M); ine11++) {
        const size_t row_offset = ine11 * K;
-        const size_t lut_offset = ine11 * K * 4;  // 4 bytes per K element for 2-bit LUT
+        // Call the LUT generation kernel for this activation row.
+        // We use a 4-byte stride (per activation) for the LUT entries to satisfy
+        // the memory layout requirements of the computation kernel.
+        const size_t lut_offset = ine11 * K * 4;
        const size_t scale_bias_offset = ine11 * lut_scales_size;

-        // Call the dispatch function for this row
-        // ggml_tmac_mul_mat_task_init
        Dispatch->GenerateLUT(
            const_cast<float*>(a_float + row_offset),  // Input activation for this row
            qlut + lut_offset,                         // Output LUT for this row
            lut_scales + scale_bias_offset,
            lut_biases + scale_bias_offset,
            M,
            K,
            N,
-            tmac_params.act_group_size
+            tmac_params.act_group_size,
+            tmac_params.act_group_size * 4
        );
    }
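The single allocation carved up above is partitioned as [qlut | scales | biases]. A sketch of the layout arithmetic, under the assumptions stated in CalculateLutBufferSize (4 LUT bytes per activation, one scale and one bias per activation group; names are illustrative):

#include <cstddef>

struct LutLayout {
  size_t lut_bytes;     // M * K * 4: a 16-entry int8 LUT per group of 4 activations
  size_t scales_bytes;  // M * (K / act_group_size) floats
  size_t biases_bytes;  // same count as the scales
};

LutLayout ComputeLutLayout(size_t M, size_t K, size_t act_group_size) {
  const size_t groups = K / act_group_size;
  return {M * K * 4, M * groups * sizeof(float), M * groups * sizeof(float)};
}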
@@ -657,15 +686,17 @@ MlasLutGemm(
            // Process all batch items in this chunk
            for (size_t ine11 = ir1_start; ine11 < ir1_end; ine11++) {
-                // Calculate LUT offsets for this batch item
+                // Calculate LUT offsets with 4-byte stride (per activation) for consistent access.
                const size_t qlut_offset = K * ine11 * 4;
                const size_t lut_scales_offset = lut_scales_size * ine11;

                // Calculate output offset
                const size_t dst_offset = OutputRows * ine11 + ichunk0 * ChunkSize0;

-                // Call the dispatch function to compute this tile
-                // Note M and N are swapped in TMAC terminology
+                // Call the dispatch function to compute this tile.
+                // We pass one batch item at a time (M=1) and ChunkSize0 output features.
+                // TotalN is passed specifically to allow the kernel to find the correct
+                // parameters (bm, tiles) used during weight packing.
                Dispatch->ComputeGemm(
                    packed_weights + w_offset,        // Weight tile
                    QuantBScale + scales_offset,      // Weight scales for this tile
                    qlut + qlut_offset,               // LUT for this batch item
                    lut_scales + lut_scales_offset,   // LUT scales
                    lut_biases + lut_scales_offset,   // LUT biases
                    act_output + dst_offset,          // Output location
                    static_cast<int>(K),              // K dimension
-                    static_cast<int>(N),              // N dimension
-                    static_cast<int>(1),              // M dimension (processing one batch item at a time)
+                    static_cast<int>(1),              // M dimension (batch size = 1)
+                    static_cast<int>(ir0_end - ir0_start),  // N dimension (output features in chunk)
+                    static_cast<int>(N),              // TotalN (total output features in weights)
                    BlkLen,                           // Weight quantization group size
                    HasZeroPoint                      // Whether zero points are used
                );
diff --git a/onnxruntime/core/mlas/lib/qlutgemm.h b/onnxruntime/core/mlas/lib/qlutgemm.h
index ef4d01a2c5809..0a733199ea2e8 100644
--- a/onnxruntime/core/mlas/lib/qlutgemm.h
+++ b/onnxruntime/core/mlas/lib/qlutgemm.h
@@ -42,7 +42,11 @@ struct MlasTMACKernelParams {
    bool one_scale;
};

-const MlasTMACKernelParams&
+/**
+ * Retrieves the T-MAC kernel configuration for a given GEMM problem.
+ * Returns the parameters by value to ensure thread-safety across concurrent calls.
+ */
+MlasTMACKernelParams
MlasGetLutGemmKernelParams(size_t M, size_t N, size_t nbits, size_t block_size, bool has_zero_point);

typedef void(MLAS_QNBIT_GEMM_LUT_GEN)(
@@ -53,19 +57,21 @@
    size_t M,
    size_t K,
    size_t N,
-    size_t act_group_size
+    size_t act_group_size,
+    size_t lut_stride  // Stride (in bytes) between consecutive LUT entries along the batch dimension.
);

typedef void(MLAS_QNBIT_LUT_GEMM_COMPUTE)(
-    const uint8_t* weights,
-    const float* scales,
+    const uint8_t* A,
+    const float* Scales,
    const int8_t* LUT,
    const float* LUT_Scales,
    const float* LUT_Biases,
    float* C,
    int K,
-    int M,  // batch size (number of rows in activation)
-    int N,
+    int M,       // Batch size (current activation rows).
+    int N,       // Number of output features to compute in this tile/chunk.
+    int TotalN,  // Total number of output features in the weights (used for parameter mapping).
    size_t BlkLen,
    bool HasZeroPoint
);
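The split between N and TotalN in the compute signature above is easy to get backwards: N is what this particular call must produce, TotalN is what the packed weights were laid out for. A sketch of a conforming caller (hypothetical function-pointer type, not the MLAS dispatch itself):

typedef void (LutGemmComputeFn)(int K, int M, int N, int TotalN);

// Walk the output features in chunks; every call computes `chunk` features
// but always passes the full feature count, so the kernel can recover the
// packing parameters (bm, tile count) chosen at weight-prepack time.
void RunChunks(LutGemmComputeFn* fn, int K, int total_n, int chunk) {
  for (int n0 = 0; n0 < total_n; n0 += chunk) {
    const int n = (total_n - n0 < chunk) ? (total_n - n0) : chunk;
    fn(K, /*M=*/1, n, total_n);
  }
}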
diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
index a89993d4515b8..7e4df13423be2 100644
--- a/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
+++ b/onnxruntime/core/mlas/lib/sqnbitgemm_lut_kernel_avx2.cpp
@@ -361,7 +361,8 @@ GenerateLUT_avx2(
    size_t M,
    size_t K,
    size_t N,
-    size_t act_group_size
+    size_t act_group_size,
+    size_t lut_stride
)
{
    (void)M;  // silence unused parameter warning
@@ -379,7 +380,9 @@ GenerateLUT_avx2(
    }

    for (int32_t k_outer_1 = 0; k_outer_1 < kk_outer_max; ++k_outer_1) {
-        lut_ctor_g4_int8_impl(static_cast<int32_t>(act_group_size), (&(qlut[(k_outer_1 * act_group_size * 4)])), (&(b[(k_outer_1 * act_group_size)])), (&(lut_scales[k_outer_1])), (&(lut_biases[k_outer_1])));
+        // Use the explicit lut_stride provided by the dispatch/caller to ensure
+        // consistent memory layout between construction and compute paths.
+        lut_ctor_g4_int8_impl(static_cast<int32_t>(act_group_size), (&(qlut[(k_outer_1 * lut_stride)])), (&(b[(k_outer_1 * act_group_size)])), (&(lut_scales[k_outer_1])), (&(lut_biases[k_outer_1])));
    }
}

@@ -400,6 +403,20 @@ tbl_g4_int8_float_gather_bit2_impl(int32_t m, float* C_global, float* CBits, flo
        }
    }

+    // Handle tail cases where m is not a multiple of 32.
+    // This ensures C_global is fully initialized for all m elements.
+    int32_t m_tail = m % 32;
+    if (m_tail > 0) {
+        int32_t m_c_outer = m_c_outer_max;
+        int32_t cse_var_2 = (m_c_outer * 32 * bits);
+        int32_t cse_var_1 = (m_c_outer * 32);
+        for (int32_t m_c_inner = 0; m_c_inner < m_tail; ++m_c_inner) {
+            int32_t bit_offset_0 = (m_c_inner / 8) * 8 * bits + (m_c_inner % 8);
+            int32_t bit_offset_1 = (m_c_inner / 8) * 8 * bits + (m_c_inner % 8) + 8;
+            C_global[cse_var_1 + m_c_inner] = (CBits[cse_var_2 + bit_offset_0] * (float)5.000000e-01f) + (CBits[cse_var_2 + bit_offset_1]);
+        }
+    }
+
    for (int32_t m_inner_outer = 0; m_inner_outer < m_c_outer_max; ++m_inner_outer) {
        PRAGMA_UNROLL
        for (int32_t m_inner = 0; m_inner < 32; ++m_inner) {
            int offset = m_inner_outer * 32 + m_inner;
            C[offset] = C_global[offset];
        }
    }
+
+    // Transfer the remaining tail results from C_global to the final output matrix C.
+    // This is necessary when m is not a multiple of 32, ensuring all output features
+    // are correctly written to the destination buffer.
+    if (m_tail > 0) {
+        int offset_base = m_c_outer_max * 32;
+        for (int32_t m_inner = 0; m_inner < m_tail; ++m_inner) {
+            int offset = offset_base + m_inner;
+            C[offset] = C_global[offset];
+        }
+    }
}

// When FastAggregation is enabled, FastAggregationK = ActK
@@ -451,8 +479,8 @@ tbl_g4_int8_float_update_impl(int32_t m, float* c, const int8_t* lut, const uint
        __m256 vec_v_high_low = _mm256_cvtepi32_ps(extract_low_epi16_epi32(adder.get_high()));
        __m256 vec_v_high_high = _mm256_cvtepi32_ps(extract_high_epi16_epi32(adder.get_high()));

-        float lut_s = lut_scales[kk / ActK];
-        float lut_b = lut_biases[kk / ActK];
+        float lut_s = lut_scales[kk / (ActK * 4)];
+        float lut_b = lut_biases[kk / (ActK * 4)];

        partial_sum += lut_b;
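The tail loops added above replicate, one element at a time, what the unrolled 32-wide loop does. The per-element arithmetic reduces to combining two bit-planes, which the following scalar model shows (illustrative only; names are not from the kernel):

// For 2-bit weights, each run of 8 outputs stores its low-bit partial sums
// 8 floats ahead of its high-bit partial sums; the low plane carries weight
// 0.5 after the kernel's scaling, matching the gather code above.
inline float gather_bit2(const float* cbits, int m_inner, int bits = 2) {
  const int lo = (m_inner / 8) * 8 * bits + (m_inner % 8);
  const int hi = lo + 8;
  return cbits[lo] * 0.5f + cbits[hi];
}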
@@ -542,17 +570,20 @@ TMACComputeGemm_avx2(
    int K,
    int M,
    int N,
+    int TotalN,
    size_t BlkLen,  // Weight quantization group size (q_group_size)
    bool HasZeroPoint
)
{
-    // Validate batch size
-    if (N != 1) {
-        MLAS_THROW_EX(std::runtime_error, "N > 1 is not supported yet");
+    // Validate batch size (M)
+    // For now, TMAC AVX2 kernel processes one batch row at a time.
+    if (M != 1) {
+        MLAS_THROW_EX(std::runtime_error, "M > 1 is not supported yet in TMAC AVX2 kernel");
    }

-    // get kernel config
-    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(M, K, 2, BlkLen, HasZeroPoint);
+    // get kernel config using the total output features (TotalN)
+    // This matches the parameters used during weight packing.
+    const MlasTMACKernelParams& tmac_params = MlasGetLutGemmKernelParams(TotalN, K, 2, BlkLen, HasZeroPoint);

    // ==================== CONFIGURATION ====================
    // Fixed parameters for this kernel implementation
@@ -572,7 +603,11 @@ TMACComputeGemm_avx2(
    const int32_t actk = static_cast<int32_t>(tmac_params.actk);  // CRITICAL: = 16 for BlkLen=64, NOT BlkLen!
    const int32_t bm = static_cast<int32_t>(tmac_params.bm);

-    int32_t m = bm / bits;
+    // m is the number of output features this kernel tile produces.
+    // We clamp m by N (the number of features in the current chunk) to ensure
+    // we don't read or write past the tile boundary during the gather phase.
+    int32_t m_full = bm / bits;
+    int32_t m = std::min(m_full, N);

    // Validate configuration
    assert(bm % bits == 0);
@@ -590,8 +625,9 @@ TMACComputeGemm_avx2(
    float* CBits = new float[bm];
    float* C_global = new float[m];

-    // Reset accumulator buffer to zero
-    tbl_int32_reset(bm * sizeof(float) / sizeof(int32_t), reinterpret_cast<int32_t*>(CBits));
+    // Explicitly zero-initialize accumulation buffers to ensure determinism.
+    memset(CBits, 0, bm * sizeof(float));
+    memset(C_global, 0, m * sizeof(float));

    // ==================== CALCULATE LOOP PARAMETERS ====================
    const int32_t k_outer_max = K / (kfactor * g);
diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc
index 9b71f4ba2ebec..6d5a400be703b 100644
--- a/onnxruntime/core/platform/windows/telemetry.cc
+++ b/onnxruntime/core/platform/windows/telemetry.cc
@@ -3,6 +3,10 @@

#include "core/platform/windows/telemetry.h"
#include <windows.h>
+#include <mutex>
+#include <string>
+#include <vector>
+#include <winsvc.h>
#include "core/common/logging/logging.h"
#include "onnxruntime_config.h"

@@ -51,6 +55,80 @@ TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntim
                             // {3a26b1ff-7484-7484-7484-15261f42614d}
                             (0x3a26b1ff, 0x7484, 0x7484, 0x74, 0x84, 0x15, 0x26, 0x1f, 0x42, 0x61, 0x4d),
                             TraceLoggingOptionMicrosoftTelemetry());
+
+std::string ConvertWideStringToUtf8(const std::wstring& wide) {
+  if (wide.empty())
+    return {};
+
+  const UINT code_page = CP_UTF8;
+  const DWORD flags = 0;
+  LPCWCH const src = wide.data();
+  const int src_len = static_cast<int>(wide.size());
+  int utf8_length = ::WideCharToMultiByte(code_page, flags, src, src_len, nullptr, 0, nullptr, nullptr);
+  if (utf8_length == 0)
+    return {};
+
+  std::string utf8(utf8_length, '\0');
+  if (::WideCharToMultiByte(code_page, flags, src, src_len, utf8.data(), utf8_length, nullptr, nullptr) == 0)
+    return {};
+
+  return utf8;
+}
+
+std::string GetServiceNamesForCurrentProcess() {
+  static std::once_flag once_flag;
+  static std::string service_names;
+
+  std::call_once(once_flag, [] {
+    SC_HANDLE service_manager = ::OpenSCManagerW(nullptr, nullptr, SC_MANAGER_ENUMERATE_SERVICE);
+    if (service_manager == nullptr)
+      return;
+
+    DWORD bytes_needed = 0;
+    DWORD services_returned = 0;
+    DWORD resume_handle = 0;
+    if (!::EnumServicesStatusExW(service_manager, SC_ENUM_PROCESS_INFO, SERVICE_WIN32, SERVICE_ACTIVE, nullptr, 0, &bytes_needed,
+                                 &services_returned, &resume_handle, nullptr) &&
+        ::GetLastError() != ERROR_MORE_DATA) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    if (bytes_needed == 0) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    std::vector<BYTE> buffer(bytes_needed);
+    auto* services = reinterpret_cast<ENUM_SERVICE_STATUS_PROCESSW*>(buffer.data());
+    services_returned = 0;
+    resume_handle = 0;
+    if (!::EnumServicesStatusExW(service_manager, SC_ENUM_PROCESS_INFO, SERVICE_WIN32, SERVICE_ACTIVE, reinterpret_cast<LPBYTE>(services),
+                                 bytes_needed, &bytes_needed, &services_returned, &resume_handle, nullptr)) {
+      ::CloseServiceHandle(service_manager);
+      return;
+    }
+
+    DWORD current_pid = ::GetCurrentProcessId();
+    std::wstring aggregated;
+    bool first = true;
+    for (DWORD i = 0; i < services_returned; ++i) {
+      if (services[i].ServiceStatusProcess.dwProcessId == current_pid) {
+        if (!first) {
+          aggregated.push_back(L',');
+        }
+        aggregated.append(services[i].lpServiceName);
+        first = false;
+      }
+    }
+
+    ::CloseServiceHandle(service_manager);
+
+    service_names = ConvertWideStringToUtf8(aggregated);
+  });
+
+  return service_names;
+}
}  // namespace

#ifdef _MSC_VER
@@ -178,6 +256,7 @@ void WindowsTelemetry::LogProcessInfo() const {
#if BUILD_INBOX
  isRedist = false;
#endif
+  const std::string service_names = GetServiceNamesForCurrentProcess();
  TraceLoggingWrite(telemetry_provider_handle,
                    "ProcessInfo",
                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
                    TraceLoggingString(ORT_VERSION, "runtimeVersion"),
                    TraceLoggingBool(IsDebuggerPresent(), "isDebuggerAttached"),
                    TraceLoggingBool(isRedist, "isRedist"),
-                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"),
+                    TraceLoggingString(service_names.c_str(), "serviceNames"));

  process_info_logged = true;
}

@@ -204,7 +284,8 @@ void WindowsTelemetry::LogSessionCreationStart(uint32_t session_id) const {
                    TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
                    TraceLoggingUInt32(session_id, "sessionId"),
-                    TraceLoggingLevel(WINEVENT_LEVEL_INFO));
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
}

void WindowsTelemetry::LogEvaluationStop(uint32_t session_id) const {
@@ -278,6 +359,7 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
    execution_provider_string += i;
  }

+  const std::string service_names = GetServiceNamesForCurrentProcess();
  // Difference is MeasureEvent & isCaptureState, but keep in sync otherwise
  if (!captureState) {
    TraceLoggingWrite(telemetry_provider_handle,
@@ -304,7 +386,9 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                      TraceLoggingString(model_weight_hash.c_str(), "modelWeightHash"),
                      TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                      TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
-                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"));
+                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
  } else {
    TraceLoggingWrite(telemetry_provider_handle,
                      "SessionCreation_CaptureState",
@@ -330,7 +414,9 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                      TraceLoggingString(model_weight_hash.c_str(), "modelWeightHash"),
                      TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                      TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
-                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"));
+                      TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
  }
}

@@ -419,7 +505,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                      TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                      TraceLoggingString(file, "file"),
                      TraceLoggingString(function, "function"),
-                      TraceLoggingInt32(line, "line"));
+                      TraceLoggingInt32(line, "line"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
#else
    TraceLoggingWrite(telemetry_provider_handle,
                      "RuntimeError",
                      TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                      TraceLoggingString(file, "file"),
                      TraceLoggingString(function, "function"),
-                      TraceLoggingInt32(line, "line"));
+                      TraceLoggingInt32(line, "line"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
#endif
}

@@ -465,7 +553,8 @@ void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_s
                    TraceLoggingUInt32(session_id, "sessionId"),
                    TraceLoggingUInt32(total_runs_since_last, "totalRuns"),
                    TraceLoggingInt64(total_run_duration_since_last, "totalRunDuration"),
-                    TraceLoggingString(total_duration_per_batch_size.c_str(), 
"totalRunDurationPerBatchSize")); + TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } void WindowsTelemetry::LogExecutionProviderEvent(LUID* adapterLuid) const { @@ -541,7 +630,8 @@ void WindowsTelemetry::LogAutoEpSelection(uint32_t session_id, const std::string TraceLoggingUInt32(session_id, "sessionId"), TraceLoggingString(selection_policy.c_str(), "selectionPolicy"), TraceLoggingString(requested_execution_provider_string.c_str(), "requestedExecutionProviderIds"), - TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds")); + TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const std::string& provider_options_string, bool captureState) const { @@ -560,7 +650,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const // Telemetry info TraceLoggingUInt8(0, "schemaVersion"), TraceLoggingString(provider_id.c_str(), "providerId"), - TraceLoggingString(provider_options_string.c_str(), "providerOptions")); + TraceLoggingString(provider_options_string.c_str(), "providerOptions"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } else { TraceLoggingWrite(telemetry_provider_handle, "ProviderOptions_CaptureState", @@ -572,7 +663,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const // Telemetry info TraceLoggingUInt8(0, "schemaVersion"), TraceLoggingString(provider_id.c_str(), "providerId"), - TraceLoggingString(provider_options_string.c_str(), "providerOptions")); + TraceLoggingString(provider_options_string.c_str(), "providerOptions"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); } } diff --git a/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc b/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc index af67419f4fb91..60ebf862e1601 100644 --- a/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc +++ b/onnxruntime/core/providers/cpu/ml/array_feature_extractor.cc @@ -73,10 +73,10 @@ common::Status ArrayFeatureExtractorOp::Compute(OpKernelContext* context) con } for (int64_t i = 0; i < num_indices; ++i) { - if (y_data[i] >= stride) { + if (y_data[i] < 0 || y_data[i] >= stride) { return ORT_MAKE_STATUS( ONNXRUNTIME, INVALID_ARGUMENT, - "Invalid Y argument: index is out of range: Y[", i, "] (", y_data[i], ") >=", stride); + "Invalid Y argument: index is out of range: Y[", i, "] (", y_data[i], ") must be in [0, ", stride, ")"); } } diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 32f5c98da1585..d50a4deca3298 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -15,12 +15,17 @@ #pragma warning(push) // 'fp4_interpretation' : unreferenced parameter #pragma warning(disable : 4100) +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include #if defined(_MSC_VER) #pragma warning(pop) +#elif defined(__GNUC__) +#pragma GCC diagnostic pop #endif #endif diff --git a/onnxruntime/core/providers/cuda/cuda_type_conversion.h b/onnxruntime/core/providers/cuda/cuda_type_conversion.h index 38cdce1380fad..04e47a9930710 100644 --- a/onnxruntime/core/providers/cuda/cuda_type_conversion.h +++ 
diff --git a/onnxruntime/core/providers/cuda/cuda_type_conversion.h b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
index 38cdce1380fad..04e47a9930710 100644
--- a/onnxruntime/core/providers/cuda/cuda_type_conversion.h
+++ b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
@@ -14,12 +14,17 @@
#pragma warning(push)
// 'fp4_interpretation' : unreferenced parameter
#pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

#include <cuda_fp4.h>

#if defined(_MSC_VER)
#pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
#endif
#endif
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index 656890e796a1c..d75c6e947e09c 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {

  TArray<fast_divmod> fdm_output_strides(dimension_count);
  TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
    fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
  }
diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.cc b/onnxruntime/core/providers/openvino/ov_shared_context.cc
index b529009a205ea..900196c3f652a 100644
--- a/onnxruntime/core/providers/openvino/ov_shared_context.cc
+++ b/onnxruntime/core/providers/openvino/ov_shared_context.cc
@@ -10,9 +10,10 @@
namespace onnxruntime {
namespace openvino_ep {

-SharedContext::SharedContext(std::filesystem::path bin_path)
-    : bin_path_(std::move(bin_path)),
-      bin_manager_(bin_path_) {
+SharedContext::SharedContext(const std::filesystem::path& bin_path)
+    : bin_path_(bin_path),
+      bin_manager_(bin_path_),
+      weight_file_manager_(WeightFileManager::Get()) {
}

static bool InRange(size_t offset, size_t size, size_t total_size) {
@@ -74,7 +75,7 @@ void SharedContext::LoadTensorFromFile(
  const auto weights_location = model_dir / value.serialized.location;
  auto& weights_file = weight_files_[weights_location];
  if (!weights_file) {
-    weights_file = std::make_unique<WeightsFile>(weights_location);
+    weights_file = weight_file_manager_->GetOrCreateWeightsFile(weights_location);
  }

  ov::Tensor tensor;
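GetOrCreateWeightsFile above routes every lookup through the WeightFileManager introduced in the header below, so all SharedContext instances share one handle per weights file. A self-contained sketch of that keyed shared_ptr cache (File stands in for the real WeightsFile type):

#include <filesystem>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct File {
  explicit File(std::filesystem::path p) : path(std::move(p)) {}
  std::filesystem::path path;
};

class FileCache {
 public:
  std::shared_ptr<File> GetOrCreate(const std::filesystem::path& p) {
    const std::string key = std::filesystem::absolute(p).string();  // normalize aliases
    std::lock_guard<std::mutex> lock(mutex_);
    auto [it, inserted] = files_.try_emplace(key, nullptr);
    if (inserted) it->second = std::make_shared<File>(p);  // only the inserting thread constructs
    return it->second;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<std::string, std::shared_ptr<File>> files_;
};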
diff --git a/onnxruntime/core/providers/openvino/ov_shared_context.h b/onnxruntime/core/providers/openvino/ov_shared_context.h
index f6cfe56086517..99af8bf208805 100644
--- a/onnxruntime/core/providers/openvino/ov_shared_context.h
+++ b/onnxruntime/core/providers/openvino/ov_shared_context.h
@@ -19,10 +19,13 @@
namespace onnxruntime {
namespace openvino_ep {

+class WeightFileManager;
+
class SharedContext : public std::enable_shared_from_this<SharedContext> {
 public:
-  explicit SharedContext(std::filesystem::path bin_path);
+  explicit SharedContext(const std::filesystem::path& bin_path);
  SharedContext() : SharedContext("") {}
+  virtual ~SharedContext() {}

  struct Metadata {
    struct Value {
@@ -83,7 +86,6 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
    return BinManager::GetBinPathForModel(model_path);
  }

- private:
  struct WeightsFile {
    ORT_DISALLOW_COPY_AND_ASSIGNMENT(WeightsFile);
    WeightsFile() = delete;
@@ -104,7 +106,9 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
    std::map imported_device_tensors_;
  };

-  void LoadTensorFromFile(
+ private:
+  void
+  LoadTensorFromFile(
      Metadata::Value& value,
      const std::filesystem::path& model_dir,
      std::optional<ov::RemoteContext>& remote_context,
@@ -114,10 +118,29 @@ class SharedContext : public std::enable_shared_from_this<SharedContext> {
  mutable std::shared_mutex mutex_;
  std::filesystem::path bin_path_;
  BinManager bin_manager_;
-  std::unordered_map<std::filesystem::path, std::unique_ptr<WeightsFile>> weight_files_;
+  std::shared_ptr<WeightFileManager> weight_file_manager_;
+  std::unordered_map<std::filesystem::path, std::shared_ptr<WeightsFile>> weight_files_;
  Metadata::Map metadata_;
};

+class WeightFileManager : public WeakSingleton<WeightFileManager> {
+ public:
+  using WeightsFile = SharedContext::WeightsFile;
+  std::shared_ptr<WeightsFile> GetOrCreateWeightsFile(const std::filesystem::path& weights_path) {
+    auto absolute_path = std::filesystem::absolute(weights_path);
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto [it, inserted] = files_.try_emplace(absolute_path, nullptr);
+    if (inserted) {
+      it->second = std::make_shared<WeightsFile>(absolute_path);
+    }
+    return it->second;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  std::unordered_map<std::filesystem::path, std::shared_ptr<WeightsFile>> files_;
+};
+
class SharedContextManager : public WeakSingleton<SharedContextManager> {
 public:
  std::shared_ptr<SharedContext> GetOrCreateActiveSharedContext(const std::filesystem::path& model_path) {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 9fc1cd7f42939..eba0a8c2615aa 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() {
  return SetContextPriority(context_priority_);
}

-Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
+Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
  if (true == context_created_) {
    LOGS_DEFAULT(INFO) << "Context created already.";
    return Status::OK();
  }
@@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
  QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
  ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));

+  QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
+  QnnHtpContext_CustomConfig_t udma_custom_config;
+  udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
+  udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
+  context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+  context_config_extended_udma.customConfig = &udma_custom_config;
+
  const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                      &context_config_weight_sharing,
+                                                      &context_config_extended_udma,
                                                      nullptr};
  const QnnContext_Config_t* empty_context_configs[] = {nullptr};
@@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
                                       bool enable_vtcm_backup_buffer_sharing,
                                       bool enable_file_mapped_weights,
                                       std::shared_ptr<RpcMemLibrary> rpcmem_library,
-                                       std::unordered_map>>& context_bin_map) {
+                                       std::unordered_map>>& context_bin_map,
+                                       bool enable_htp_extended_udma_mode) {
  std::lock_guard lock(logger_recursive_mutex_);
  if (backend_setup_completed_) {
    LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
  if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
    status = vtcm_backup_buffer_sharing_enabled_ ? 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 9fc1cd7f42939..eba0a8c2615aa 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() { return SetContextPriority(context_priority_); } -Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { +Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) { if (true == context_created_) { LOGS_DEFAULT(INFO) << "Context created already."; return Status::OK(); } @@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT; ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config)); + QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT; + QnnHtpContext_CustomConfig_t udma_custom_config; + udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA; + udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode; + context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM; + context_config_extended_udma.customConfig = &udma_custom_config; + const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config, &context_config_weight_sharing, + &context_config_extended_udma, nullptr}; const QnnContext_Config_t* empty_context_configs[] = {nullptr}; @@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool enable_vtcm_backup_buffer_sharing, bool enable_file_mapped_weights, std::shared_ptr rpcmem_library, - std::unordered_map>>& context_bin_map) { + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode) { std::lock_guard lock(logger_recursive_mutex_); if (backend_setup_completed_) { LOGS(logger, VERBOSE) << "Backend setup already!"; @@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) { status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map) - : CreateContext(enable_htp_weight_sharing); + : CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode); if (status.IsOK()) { LOGS(logger, VERBOSE) << "CreateContext succeed."; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 9b573531f7c3d..dfa40a2c8aa0d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -171,7 +171,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager> bool enable_vtcm_backup_buffer_sharing, bool enable_file_mapped_weights, std::shared_ptr rpcmem_library, - std::unordered_map>>& context_bin_map); + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode); Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id); @@ -299,7 +300,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager> Status ReleaseProfilehandle(); - Status CreateContext(bool enable_htp_weight_sharing); + Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode); Status GetFileSizeIfValid(const std::string& filepath, size_t& file_size); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index a6f1d1c1681cf..c3d8328b37411 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -602,6 +602,19 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } } + static const std::string QNN_HTP_EXTENDED_UDMA_MODE = "extended_udma"; + auto htp_extended_udma_pos = provider_options_map.find(QNN_HTP_EXTENDED_UDMA_MODE); + if (htp_extended_udma_pos != provider_options_map.end()) { + if ("1" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = true; + } else if ("0" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = false; + } else { + LOGS_DEFAULT(WARNING) << "Invalid extended_udma value: " << htp_extended_udma_pos->second << ". Only '0' or '1' allowed. Set to '0'."; + } + LOGS_DEFAULT(VERBOSE) << "User specified extended_udma mode: " << enable_htp_extended_udma_mode_; + } +
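The new option only accepts the strings '0' and '1', mirroring how skip_qnn_version_check is parsed via ParseBoolOption just below. If this parsing were factored out, a helper along these lines would cover both — a hedged sketch; ParseBinaryFlag is illustrative and not an existing ORT function:

#include <stdexcept>
#include <string>
#include <unordered_map>

// Strict "0"/"1" flag parsing: missing key -> default, anything else -> error.
bool ParseBinaryFlag(const std::unordered_map<std::string, std::string>& options,
                     const std::string& key, bool default_value) {
  auto it = options.find(key);
  if (it == options.end()) return default_value;
  if (it->second == "1") return true;
  if (it->second == "0") return false;
  throw std::invalid_argument("Invalid value for '" + key + "': '" + it->second +
                              "'. Only '0' or '1' allowed.");
}

Whatever form the error takes, it should carry the rejected user value rather than a still-default member variable, so misconfigurations are diagnosable from logs.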
// Option to skip QNN API interface version check to use other QNN library other than default. static const std::string SKIP_QNN_VERSION_CHECK = "skip_qnn_version_check"; auto skip_qnn_version_check = ParseBoolOption(SKIP_QNN_VERSION_CHECK, false, provider_options_map); @@ -1006,7 +1019,8 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer enable_vtcm_backup_buffer_sharing_, enable_file_mapped_weights_, rpcmem_library_, - context_bin_map); + context_bin_map, + enable_htp_extended_udma_mode_); context_bin_map.clear(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index f7022229f6c7b..c5d41789e7a1f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -127,6 +127,7 @@ class QNNExecutionProvider : public IExecutionProvider { qnn::ModelSettings model_settings_ = {}; bool dump_json_qnn_graph_ = false; std::string json_qnn_graph_dir_ = ""; + bool enable_htp_extended_udma_mode_ = false; // Whether this is set depends on a session option enabling it and if the RPCMEM dynamic library is available. // This is potentially shared with HtpSharedMemoryAllocator which may be returned by CreatePreferredAllocators(). diff --git a/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc b/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc index 84a0afd873d23..c3842a5c875e3 100644 --- a/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/webgpu/nn/conv_transpose.cc @@ -57,6 +57,11 @@ Status ConvTranspose::ComputeInternal(ComputeContext& context) bool has_bias = context.InputCount() > 2; const auto* bias = has_bias ? context.Input<Tensor>(2) : nullptr; + // Validate bias shape if provided + if (has_bias && (bias->Shape().NumDimensions() != 1 || bias->Shape()[0] != num_output_channels)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Bias must be a 1-D tensor whose size equals the number of output channels."); + } + if (input_shape.NumDimensions() == 3 && filter_shape.NumDimensions() == 3) { // ConvTranspose1D TensorShapeVector input_shape_vector = input_shape.AsShapeVector(); diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 7cb6a852e8d7e..8b8d884a35281 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -138,10 +138,10 @@ void WebGpuContext::Initialize(const WebGpuContextConfig& config) { config.buffer_cache_config.uniform.mode, config.buffer_cache_config.query_resolve.mode); - // create initializer buffer manager. cache is always disabled for initializer buffer manager + // create initializer buffer manager. initializer_buffer_mgr_ = BufferManagerFactory::Create(*this, - BufferCacheMode::Disabled, - BufferCacheMode::Disabled, + BufferCacheMode::LazyRelease, + BufferCacheMode::LazyRelease, BufferCacheMode::Disabled); // create program manager
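Switching the initializer buffer manager from Disabled to LazyRelease changes when GPU buffers die: rather than being destroyed the moment a kernel releases them, they sit in a pending list until the commands that may still reference them have been flushed. An illustrative sketch of that lifetime rule — not the actual WebGPU BufferManager; GpuBuffer and the class below are stand-ins:

#include <memory>
#include <vector>

struct GpuBuffer { /* wraps the underlying GPU buffer handle in a real implementation */ };

class LazyReleaseList {
 public:
  // Called when a kernel is done with a buffer. Destruction is deferred:
  // commands recorded but not yet submitted may still read from it.
  void Release(std::unique_ptr<GpuBuffer> buffer) {
    pending_.push_back(std::move(buffer));
  }

  // Called after a flush submits the command buffer; nothing on the GPU
  // timeline can reference the pending buffers anymore, so freeing is safe.
  void OnRefresh() { pending_.clear(); }

 private:
  std::vector<std::unique_ptr<GpuBuffer>> pending_;
};

This is also why the WebGpuKernel::PrePack change below flushes explicitly after packing: per its own comment, the flush is what lets the initializer buffer manager actually release its temporary buffers instead of accumulating them across all initializers.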
diff --git a/onnxruntime/core/providers/webgpu/webgpu_kernel.cc b/onnxruntime/core/providers/webgpu/webgpu_kernel.cc index 8303d2ff4293f..8a52b7a188fd5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_kernel.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_kernel.cc @@ -49,6 +49,12 @@ Status WebGpuKernel::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr / Status s = PrePackInternal(context, tensor, input_idx, ep_.PrepackAllocator(), is_packed); + if (is_packed) { + // Flush pending commands to ensure GPU buffer creations are completed. + // This allows the initializer buffer manager to release temporary buffers and reduce memory usage. + webgpu_context_.Flush(webgpu_context_.InitializerBufferManager()); + } + if (webgpu_context_.ValidationMode() >= ValidationMode::Full) { ORT_RETURN_IF_ERROR(webgpu_context_.PopErrorScope()); } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 7881004671290..2806eb7a7a8d8 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4843,7 +4843,7 @@ static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.24.1", +static_assert(std::string_view(ORT_VERSION) == "1.24.2", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_24 above:
diff --git a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc index 15bce163ba16a..55e0660622f87 100644 --- a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc +++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc @@ -73,6 +73,8 @@ namespace qnnctxgen { "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" "\t Defaults to '1' (another EP (typically CPU EP) handles the graph I/O quantization and dequantization). \n" "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n" "\n" "\t-h: help\n"); @@ -253,7 +255,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); } } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || - key == "enable_htp_spill_fill_buffer") { + key == "enable_htp_spill_fill_buffer" || key == "extended_udma") { std::unordered_set<std::string> supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -266,7 +268,7 @@ static bool ParsePluginEpConfig(const std::string& json_file_path, PluginEpConfi ORT_THROW( "Wrong key type entered. Choose from options: ['backend_type', 'backend_path', 'vtcm_mb', " "'htp_performance_mode', 'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', " - "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer']"); + "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer', 'extended_udma']"); test_config.run_config.provider_options[key] = value; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 8446f88639436..f4e15c49d92f0 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -90,6 +90,8 @@ void usage() { "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" "\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_type|cpu\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" @@ -612,7 +614,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_arch. select from: " + str); } - } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") { + } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || key == "extended_udma") { std::unordered_set<std::string> supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -626,7 +628,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { "Wrong key type entered. Choose from options: ['backend_type', 'backend_path', " "'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode', " "'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'op_packages', 'qnn_context_priority', " - "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization']"); + "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'extended_udma']"); qnn_options[key] = value;
\n" " [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n" "\n" " [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 71f9050730c0b..91f0581af0633 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -258,7 +258,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device "qnn_saver_path", "htp_graph_finalization_optimization_mode", "qnn_context_priority", "htp_arch", "enable_htp_fp16_precision", "offload_graph_io_quantization", "enable_htp_spill_fill_buffer", "enable_htp_shared_memory_allocator", "dump_json_qnn_graph", - "json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing"}); + "json_qnn_graph_dir", "disable_file_mapped_weights", "htp_bf16_enable", "enable_vtcm_backup_buffer_sharing", "extended_udma"}); + for (const auto& provider_option : provider_options) { const std::string& key = provider_option.first; const std::string& value = provider_option.second; @@ -323,6 +324,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device key == "enable_htp_spill_fill_buffer" || key == "enable_htp_shared_memory_allocator" || key == "dump_json_qnn_graph" || + key == "extended_udma" || key == "disable_file_mapped_weights" || key == "enable_vtcm_backup_buffer_sharing") { std::set supported_options = {"0", "1"}; diff --git a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc index c7fc73456dcba..671ada7d36383 100644 --- a/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc +++ b/onnxruntime/test/providers/cpu/ml/array_feature_extractor_test.cc @@ -109,5 +109,13 @@ TEST_F(ArrayFeatureExtractorTest, InvalidInputOutOfBoundsY) { test_.Run(OpTester::ExpectResult::kExpectFailure); } +TEST_F(ArrayFeatureExtractorTest, InvalidInputNegativeY) { + test_.AddInput("X", {10, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + test_.AddInput("Y", {1}, {-10}); + // Should fail due to negative index -10 + test_.AddOutput("Z", {0}, {}); + test_.Run(OpTester::ExpectResult::kExpectFailure); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 418842ee0a81b..d1f43787c7717 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -1314,6 +1314,27 @@ TEST_F(QnnHTPBackendTests, DumpJsonQNNGraph) { std::filesystem::remove_all(dump_dir); } +// Test extended UDMA mode on supported hardware (should run successfully) +TEST_F(QnnHTPBackendTests, ExtendedUdmaModeTest) { + // Create provider options with extended UDMA mode enabled + ProviderOptions options; + options["backend_type"] = "htp"; + options["offload_graph_io_quantization"] = "0"; + options["htp_arch"] = "81"; + options["extended_udma"] = "1"; + + // Define a simple model with Add operation + auto input_defs = {TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f)}; + + // Run the test - this should succeed because v81 supports extended UDMA + RunQnnModelTest(BuildOpTestCase("Add", input_defs, {}, {}, kOnnxDomain), + options, + 13, + ExpectedEPNodeAssignment::All, + 0.008f); +} + // Test option for offloading quantization of graph 
// Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP. TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) { // Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU diff --git a/onnxruntime/test/unittest_util/base_tester.cc b/onnxruntime/test/unittest_util/base_tester.cc index d8bfd425f1f1a..2e0459103a7c9 100644 --- a/onnxruntime/test/unittest_util/base_tester.cc +++ b/onnxruntime/test/unittest_util/base_tester.cc @@ -424,7 +424,7 @@ void BaseTester::ExecuteModel(Model& model, SessionType& session, bool SetEpsForAllNodes(Graph& graph, const std::vector<std::unique_ptr<IExecutionProvider>>& execution_providers, const std::vector<std::shared_ptr<CustomRegistry>>* custom_registries, - const std::function<bool(const IExecutionProvider&)>& ep_uses_kernel_registry_fn) { + const std::function<bool(const IExecutionProvider&)>& ep_only_uses_kernel_registry_fn) { const OpSchemaKernelTypeStrResolver kernel_type_str_resolver{}; const KernelRegistry::TypeConstraintMap type_constraint_map{}; @@ -440,7 +440,7 @@ bool SetEpsForAllNodes(Graph& graph, node.SetExecutionProviderType(provider_type); - if (!ep_uses_kernel_registry_fn(*ep)) { + if (!ep_only_uses_kernel_registry_fn(*ep)) { found = true; break; } @@ -659,7 +659,12 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter, #endif kDnnlExecutionProvider, kTensorrtExecutionProvider, +#ifdef USE_NV + // Only include the NV TRT RTX EP when ORT is built with the provider-bridge + // version of the EP (i.e., USE_NV is defined). This allows use of the plugin EP version + // when ORT is not built with any provider-bridge EPs. kNvTensorRTRTXExecutionProvider, +#endif kOpenVINOExecutionProvider, kDmlExecutionProvider, kAclExecutionProvider, @@ -830,12 +835,15 @@ void BaseTester::ExecuteModelForEps( ASSERT_TRUE(!execution_providers.empty()) << "Empty execution providers vector."; if (try_assign_ep_for_nodes) { - auto ep_uses_kernel_registry = [](const IExecutionProvider& ep) { + auto ep_only_uses_kernel_registry = [](const IExecutionProvider& ep) { const auto& provider_type = ep.Type(); - constexpr std::array kEpsThatDoNotUseKernelRegistry{ + constexpr std::array kEpsThatCompileNodes{ kOpenVINOExecutionProvider, - kTensorrtExecutionProvider, + kTensorrtExecutionProvider, // uses kernel registry for Memcpy* nodes only +#ifdef USE_NV + kNvTensorRTRTXExecutionProvider, // uses kernel registry for Memcpy* nodes only +#endif kNnapiExecutionProvider, kVSINPUExecutionProvider, kCoreMLExecutionProvider, @@ -844,24 +852,33 @@ void BaseTester::ExecuteModelForEps( kSnpeExecutionProvider, }; - // check list of known EPs that do not use a kernel registry - if (const auto ep_it = std::find(kEpsThatDoNotUseKernelRegistry.begin(), kEpsThatDoNotUseKernelRegistry.end(), + // check list of known EPs that compile nodes + if (const auto ep_it = std::find(kEpsThatCompileNodes.begin(), kEpsThatCompileNodes.end(), provider_type); - ep_it != kEpsThatDoNotUseKernelRegistry.end()) { + ep_it != kEpsThatCompileNodes.end()) { return false; } - // assume that a dynamic plugin EP which does not return a kernel registry does not use one - if (provider_type == dynamic_plugin_ep_infra::GetEpName() && - ep.GetKernelRegistry() == nullptr) { - return false; + const OrtEp* ort_ep = ep.GetOrtEp(); + + if (ort_ep != nullptr) { // This is a plugin EP + + if (ep.GetKernelRegistry() == nullptr) { + // assume that a dynamic plugin EP which does not return a kernel registry does not use one + return false; + } + + if (ort_ep->Compile != nullptr) { + // assume that a plugin EP that compiles nodes does not use a kernel registry for all nodes + return false; + } }
// otherwise, assume that the EP uses a kernel registry return true; }; - if (!SetEpsForAllNodes(model.MainGraph(), execution_providers, custom_registries, ep_uses_kernel_registry)) { + if (!SetEpsForAllNodes(model.MainGraph(), execution_providers, custom_registries, ep_only_uses_kernel_registry)) { std::string providers; for (const auto& ep : execution_providers) { providers.append(ep->Type() + " "); diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 7242c5fe7b6a6..8d96c1ae99e0a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -104,9 +104,18 @@ stages: - template: nuget/templates/test_macos.yml parameters: - AgentPool: macOS-14 + AgentPool: 'AcesShared' + UseHostedVmImage: 'false' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' ArtifactSuffix: 'CPU' +- template: nodejs/templates/test_macos.yml + parameters: + AgentPool: 'AcesShared' + UseHostedVmImage: 'false' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' + StageSuffix: 'MacOS_ARM64' + - template: nodejs/templates/test_win.yml parameters: AgentPool: 'onnxruntime-Win-CPU-VS2022-Latest' @@ -117,10 +126,6 @@ stages: AgentPool: 'onnxruntime-Ubuntu2204-AMD-CPU' StageSuffix: 'Linux_CPU_x64' -- template: nodejs/templates/test_macos.yml - parameters: - StageSuffix: 'macOS_CPU_x64' - - template: nuget/templates/test_win.yml parameters: AgentPool: 'onnxruntime-Win2022-GPU-A10' @@ -225,7 +230,7 @@ stages: - checkout: self clean: true submodules: none - + - download: build artifact: 'Windows_Packaging_tensorrt_build_artifacts' displayName: 'Download Windows GPU Packages Build' @@ -246,7 +251,7 @@ stages: versionSpec: "17" jdkArchitectureOption: x64 jdkSourceOption: 'PreInstalled' - + - task: PythonScript@0 displayName: 'Update CTest Path References' inputs: diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index b4012b74196ee..ec3e8a9621e4c 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -23,11 +23,6 @@ parameters: type: number default: 0 -- name: PackageName - displayName: What is the package name? Override using an environment variable CustomPackageName. 
- type: string - default: 'Microsoft.ML.OnnxRuntime.Foundry' - variables: - template: templates/common-variables.yml - name: ReleaseVersionSuffix @@ -121,7 +116,7 @@ extends: buildArch: x64 msbuildPlatform: arm64 packageName: arm64 - buildparameter: --arm64ec --buildasx --caller_framework WinAI + buildparameter: --arm64 --buildasx --caller_framework WinAI runTests: false buildJava: false buildNodejs: false @@ -137,141 +132,8 @@ extends: AdditionalBuildFlags: '--use_webgpu --skip_tests' DoEsrp: true - - stage: NugetPackaging - dependsOn: [Windows_Packaging_CUDA, Windows_Packaging_CPU_arm64, ManagedNugetPackaging, MacOS_C_API_Package_Publish] - jobs: - - job: CreateNugetPackage - pool: 'Onnxruntime-Win2022-GPU-A10' - timeoutInMinutes: 120 - steps: - - checkout: self - clean: true - submodules: none - - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.12' - addToPath: true - - task: PipAuthenticate@1 - displayName: 'Pip Authenticate' - inputs: - artifactFeeds: 'Lotus' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - managed nuget' - inputs: - artifactName: 'onnxruntime-managed-nuget' - targetPath: '$(Build.BinariesDirectory)/managed-nuget' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - win-x64' - inputs: - artifactName: 'onnxruntime-win-x64-cuda' - targetPath: '$(Build.BinariesDirectory)/win-x64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - win-arm64' - inputs: - artifactName: 'onnxruntime-win-arm64' - targetPath: '$(Build.BinariesDirectory)/win-arm64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - osx' - inputs: - artifactName: 'onnxruntime-osx' - targetPath: '$(Build.BinariesDirectory)/osx' - - - task: PowerShell@2 - displayName: 'Create osx directories' - inputs: - targetType: 'inline' - script: | - mkdir -p $(Build.BinariesDirectory)/osx-arm64 - Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 - - - task: PowerShell@2 - displayName: 'List all files downloaded' - inputs: - targetType: 'inline' - script: | - $files = Get-ChildItem $(Build.BinariesDirectory) -Recurse - foreach ($file in $files) { - Write-Host "File: $($file.FullName)" - if ($file -like "*onnxruntime*") { - Write-Host "File onnxruntime: $($file.FullName) - Size: $($file.Length)" - } - } - $dirs = Get-ChildItem $(Build.BinariesDirectory) -Directory - foreach ($dir in $dirs) { - Write-Host "Directory: $($dir.FullName)" - } - $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* - if ($osx_arm64_archive.Count -eq 0) { - Write-Host "No osx-arm64 archive found." 
- } else { - Write-Host "osx-arm64 archive found: $($osx_arm64_archive[0].FullName)" - } - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Extract Nuget Package Version' - inputs: - targetType: 'inline' - script: | - $nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/managed-nuget -Filter Microsoft.ML.OnnxRuntime.Managed.*.nupkg -Recurse) - $package_name = $nupkgs[0].Name - $version_length = $package_name.Length - "Microsoft.ML.OnnxRuntime.Managed.".Length - ".nupkg".Length - $package_version = $package_name.Substring("Microsoft.ML.OnnxRuntime.Managed.".Length, $version_length) - Write-Host "##vso[task.setvariable variable=package_version;]$package_version" - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Extract Archives' - inputs: - targetType: 'inline' - script: | - Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 - Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 - $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null - $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Filter onnxruntime-win-x64-cuda*)[0].FullName - $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Filter onnxruntime-win-arm64*)[0].FullName - $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName - Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" - Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" - Write-Host "##vso[task.setvariable variable=osx_x64;]$osx_x64" - Write-Host "##vso[task.setvariable variable=osx_arm64;]$osx_arm64" - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Get Package Name' - inputs: - targetType: 'inline' - script: | - if ($env:CustomPackageName) { - Write-Host "##vso[task.setvariable variable=PackageName;]$env:CustomPackageName" - Write-Host "PackageName: $env:CustomPackageName" - } else { - Write-Host "##vso[task.setvariable variable=PackageName;]${{ parameters.PackageName }}" - Write-Host "PackageName: ${{ parameters.PackageName }}" - } - workingDirectory: $(Build.BinariesDirectory) - - - task: PythonScript@0 - displayName: 'Generate Nuget Package' - inputs: - scriptPath: '$(Build.SourcesDirectory)/tools/nuget/generate_nuspec_for_custom_nuget.py' - arguments: '--nuspec_path "$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec" --root_dir "$(Build.SourcesDirectory)" --commit_id "$(Build.SourceVersion)" --win_arm64 "$(win_arm64)" --win_x64 "$(win_x64)" --osx_arm64 "$(osx_arm64)" --osx_x64 "$(osx_x64)" --package_version "$(package_version)" --package_name "$(PackageName)"' - - - task: NuGetCommand@2 - displayName: 'Pack Nuget Package' - inputs: - command: 'pack' - packagesToPack: '$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec' - packDestination: $(Build.ArtifactStagingDirectory)\ - - - task: 1ES.PublishPipelineArtifact@1 - displayName: 'Publish Artifact: Nuget' - inputs: - artifactName: '${{ parameters.PackageName }}' - targetPath: '$(Build.ArtifactStagingDirectory)' + - template: templates/foundry-local-nuget-packaging.yml + parameters: + DependsOn: [Setup, Windows_Packaging_CUDA, 
Windows_Packaging_CPU_arm64, ManagedNugetPackaging, MacOS_C_API_Package_Publish] + DoEsrp: true + PackageName: 'Microsoft.ML.OnnxRuntime.Foundry' diff --git a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml index 9d831df54096a..275d911b7cca2 100644 --- a/tools/ci_build/github/azure-pipelines/jar_package_testing.yml +++ b/tools/ci_build/github/azure-pipelines/jar_package_testing.yml @@ -21,7 +21,8 @@ stages: - template: templates/final-jar-testing-linux.yml parameters: OS: MacOS - PoolName: 'macOS-14' + PoolName: 'AcesShared' + PoolDemands: 'ImageOverride -equals ACES_VM_SharedPool_Sequoia' - stage: GPU_JAR_Testing dependsOn: [] diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml index ae595bbf0c96b..cd41fc575020b 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml @@ -6,12 +6,20 @@ steps: - task: PowerShell@2 - displayName: 'Move Artifact Directory' + condition: and(succeeded(), eq(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Move Artifact Directory (Windows)' inputs: targetType: 'inline' script: | Move-Item -Path "$(Pipeline.Workspace)/build/NPM_packages" -Destination "$(Build.BinariesDirectory)/nodejs-artifact" +- task: CmdLine@2 + condition: and(succeeded(), ne(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Move Artifact Directory (POSIX)' + inputs: + script: | + mv "$(Pipeline.Workspace)/build/NPM_packages" "$(Build.BinariesDirectory)/nodejs-artifact" + - script: mkdir e2e_test workingDirectory: '$(Build.BinariesDirectory)' @@ -38,4 +46,4 @@ steps: npm init -y npm install $(NpmPackageFilesForTest) --onnxruntime-node-install-cuda=skip node -p "require('onnxruntime-node')" - workingDirectory: '$(Build.BinariesDirectory)/e2e_test' \ No newline at end of file + workingDirectory: '$(Build.BinariesDirectory)/e2e_test' diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml index 4dd19ce2c250c..7e184492fab59 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml @@ -1,5 +1,9 @@ parameters: StageSuffix: '' + AgentPool : 'macOS-15' + UseHostedVmImage: 'true' + PoolDemands: '' + stages: - stage: Nodejs_Test_MacOS_${{ parameters.StageSuffix }} dependsOn: @@ -11,7 +15,12 @@ stages: clean: all timeoutInMinutes: 120 pool: - vmImage: 'macOS-15' + ${{ if eq(parameters.UseHostedVmImage, 'true') }}: + vmImage: ${{ parameters.AgentPool }} + ${{ else }}: + name: ${{ parameters.AgentPool }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 02613871d61ff..2548eebeb9d42 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -49,8 +49,8 @@ stages: clean: true submodules: none - - - template: ../../templates/setup-build-tools.yml + + - template: ../../templates/setup-build-tools.yml parameters: host_cpu_arch: 'x64' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml 
b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml index 1d122d64b1211..5fc52e2c76468 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml @@ -1,6 +1,10 @@ parameters: + AgentPool : 'macOS-15' + UseHostedVmImage: 'true' IsMacOS : 'true' ArtifactSuffix: '' + PoolDemands: '' + stages: - stage: NuGet_Test_MacOS dependsOn: @@ -11,7 +15,12 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-15' + ${{ if eq(parameters.UseHostedVmImage, 'true') }}: + vmImage: ${{ parameters.AgentPool }} + ${{ else }}: + name: ${{ parameters.AgentPool }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} variables: - name: OnnxRuntimeBuildDirectory @@ -27,18 +36,36 @@ stages: - script: | mv $(Pipeline.Workspace)/build/drop-signed-nuget-${{ parameters.ArtifactSuffix }} $(Build.BinariesDirectory)/nuget-artifact - mv $(Pipeline.Workspace)/build/onnxruntime-osx $(Build.BinariesDirectory)/testdata + + # Artifact is a folder containing tgz. Extract it to testdata. + mkdir -p $(Build.BinariesDirectory)/testdata + for archive in $(Pipeline.Workspace)/build/onnxruntime-osx/*.tgz; do + tar -xzf "$archive" -C $(Build.BinariesDirectory)/testdata + done + + # Ensure libcustom_op_library.dylib is where EndToEndTests expects it (testdata/testdata) + mkdir -p $(Build.BinariesDirectory)/testdata/testdata + find $(Build.BinariesDirectory)/testdata -name "libcustom_op_library.dylib" -exec cp {} $(Build.BinariesDirectory)/testdata/testdata/ \; + - template: get-nuget-package-version-as-variable.yml parameters: packageFolder: '$(Build.BinariesDirectory)/nuget-artifact' + - script: | + git submodule update --init cmake/external/onnx + cd cmake/external/onnx + git fetch origin v1.13.1 --depth=1 + git checkout v1.13.1 + cd ../../.. + displayName: 'Initialize ONNX submodule for test data (pinned to v1.13.1 since new data types like float8 are not supported in nuget)' + - script: | $(Build.SourcesDirectory)/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/runtest.sh \ $(Build.BinariesDirectory)/nuget-artifact \ $(NuGetPackageVersionNumber) \ true - + if [ $?
-ne 0 ]; then echo "Failed to run test" exit 1 @@ -48,4 +75,5 @@ stages: OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) DisableContribOps: $(DisableContribOps) DisableMlOps: $(DisableMlOps) - IsReleaseBuild: $(IsReleaseBuild) \ No newline at end of file + IsReleaseBuild: $(IsReleaseBuild) + ORT_LOADER_VERBOSITY: 1 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 6eb7c52712671..f767ef110561a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -66,131 +66,17 @@ stages: - stage: Python_Packaging_Windows_CPU dependsOn: [] jobs: - - job: Windows_py_Wheels - pool: - name: 'onnxruntime-Win-CPU-VS2022-Latest' - os: windows - templateContext: - sdl: - codeSignValidation: - enabled: true - # TODO: check why pyd file was not signed - break: false - additionalTargetsGlobPattern: f|**\*.pyd - psscriptanalyzer: - enabled: true - binskim: - enabled: true - scanOutputDirectoryOnly: true - outputs: - - output: pipelineArtifact - targetPath: $(Build.ArtifactStagingDirectory) - artifactName: onnxruntime-win-$(PythonVersion) - strategy: - matrix: - Python311_x64: - PythonVersion: '3.11' - Python312_x64: - PythonVersion: '3.12' - Python313_x64: - PythonVersion: '3.13' - Python314_x64: - PythonVersion: '3.14' - variables: - OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' - ExtraParam: ${{ parameters.build_py_parameters }} - timeoutInMinutes: 180 - workspace: - clean: all - - steps: - - checkout: self - clean: true - submodules: recursive - - - template: ../templates/setup-build-tools.yml - parameters: - host_cpu_arch: 'x64' - python_version: $(PythonVersion) - - - template: ../templates/set-nightly-build-option-variable-step.yml - - - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt - env: - TMPDIR: "$(Agent.TempDirectory)" - - - task: PythonScript@0 - displayName: 'Build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config ${{ parameters.cmake_build_type }} - --enable_lto - --build_dir $(Build.SourcesDirectory)\build - --skip_submodule_sync - --cmake_generator "Visual Studio 17 2022" - --enable_pybind - --enable_onnx_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache - ${{ parameters.build_py_parameters }} - --parallel --use_binskim_compliant_compile_flags --update --build - $(TelemetryOption) - - - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - - template: ../templates/publish-symbolrequestprod-api.yml - parameters: - ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: - symbolExpiryTime: 60 - includePublicSymbolServer: true - symbolsArtifactName: onnxruntime_cpu_win_x64_$(PythonVersion) - symbolsVersion: $(Build.BuildId) - symbolProject: 'ONNX Runtime' - subscription: 'OnnxrunTimeCodeSign_20240611' - searchPattern: | - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime.pdb - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_providers_shared.pdb - $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_pybind11_state.pdb - - # Esrp signing - - template: 
../templates/win-esrp-dll.yml - parameters: - FolderPath: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd,*.dll' - - - task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.build_py_parameters }} $(NightlyBuildOption)' - workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\dist' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - script: | - 7z x *.whl - workingDirectory: '$(Build.ArtifactStagingDirectory)' - displayName: 'unzip the package' - + - template: ../templates/py-win-cpu.yml + parameters: + architecture: 'x64' + build_py_parameters: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} - - powershell: | - if ("$(PythonVersion)" -notcontains "3.14") { - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - if ("$(ExtraParam)" -contains "--use_azure") { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" - python onnxruntime_test_python_azure.py - } - python onnx_backend_test_series.py - } - workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' - displayName: 'Run Python Tests' + - template: ../templates/py-win-cpu.yml + parameters: + architecture: 'arm64' + build_py_parameters: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} - ${{ if eq(parameters.enable_mac_cpu, true) }}: - stage: Python_Packaging_MacOS diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 5025046a02b0e..a0f023325be04 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -203,6 +203,10 @@ stages: - input: pipelineArtifact artifactName: drop-onnxruntime-java-linux-aarch64 targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-linux-aarch64' + + - input: pipelineArtifact + artifactName: drop-onnxruntime-java-osx-arm64 + targetPath: '$(Build.BinariesDirectory)\java-artifact\onnxruntime-java-osx-arm64' outputs: - output: pipelineArtifact targetPath: $(Build.BinariesDirectory)\java-artifact\onnxruntime-java-win-x64 diff --git a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml index f5ec5be2c1557..738ac27bafde2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing-linux.yml @@ -8,6 +8,10 @@ parameters: - name: PoolName type: string +- 
name: PoolDemands + type: string + default: '' + stages: - stage: Final_Jar_Testing_${{parameters.OS}} dependsOn: [] @@ -17,7 +21,16 @@ stages: clean: all ${{ if eq(parameters.OS, 'MacOS') }}: pool: - vmImage: 'macOS-15' + # Use PoolName if provided, otherwise fallback to macOS-15 + ${{ if ne(parameters.PoolName, '') }}: + ${{ if contains(parameters.PoolName, '-') }}: + vmImage: ${{ parameters.PoolName }} + ${{ else }}: + name: ${{ parameters.PoolName }} + ${{ if ne(parameters.PoolDemands, '') }}: + demands: ${{ parameters.PoolDemands }} + ${{ else }}: + vmImage: 'macOS-15' ${{ if eq(parameters.OS, 'Linux') }}: pool: name: ${{ parameters.PoolName }} @@ -29,10 +42,15 @@ stages: - template: set-version-number-variables-step.yml - bash: | - echo "Downloading and installing Maven $(mavenVersion) for Linux..." + echo "Downloading and installing Maven $(mavenVersion)..." MAVEN_DIR="$(Agent.TempDirectory)/apache-maven-$(mavenVersion)" + # Download Maven binary - wget https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz -O $(Agent.TempDirectory)/maven.tar.gz + if command -v wget &> /dev/null; then + wget https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz -O $(Agent.TempDirectory)/maven.tar.gz + else + curl -L -o $(Agent.TempDirectory)/maven.tar.gz https://archive.apache.org/dist/maven/maven-3/$(mavenVersion)/binaries/apache-maven-$(mavenVersion)-bin.tar.gz + fi # Extract to the temp directory mkdir -p ${MAVEN_DIR} @@ -40,13 +58,25 @@ stages: # Add Maven's bin directory to the PATH for subsequent tasks in the job echo "##vso[task.prependpath]${MAVEN_DIR}/bin" - displayName: 'Install Maven (Linux)' - condition: and(succeeded(), eq(variables['Agent.OS'], 'Linux')) + displayName: 'Install Maven' + condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) - script: | echo "Maven is now on the PATH." mvn --version + - script: | + set -e -x + if ! /usr/libexec/java_home -v 17 >/dev/null 2>&1; then + brew install --cask temurin@17 + fi + JAVA_HOME=$(/usr/libexec/java_home -v 17) + echo "JAVA_HOME is set to: $JAVA_HOME" + echo "##vso[task.setvariable variable=JAVA_HOME]$JAVA_HOME" + echo "##vso[task.prependpath]$JAVA_HOME/bin" + displayName: 'Install JDK 17 (macOS)' + condition: and(succeeded(), eq(variables['Agent.OS'], 'Darwin')) + - download: build artifact: 'onnxruntime-java' displayName: 'Download Final Jar' @@ -58,16 +88,17 @@ stages: goals: 'dependency:copy-dependencies' options: '-DoutputDirectory=$(Pipeline.Workspace)/build/onnxruntime-java' publishJUnitTestResults: false - javaHomeOption: 'JDKVersion' - jdkVersionOption: '1.17' mavenVersionOption: 'Default' + ${{ if eq(parameters.OS, 'MacOS') }}: + javaHomeOption: 'Path' + jdkDirectory: '$(JAVA_HOME)' + ${{ if eq(parameters.OS, 'Linux') }}: + javaHomeOption: 'JDKVersion' + jdkVersionOption: '1.17' - task: Bash@3 - displayName: 'Run Java Tests on Linux' -# condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) - # MacOS packages have been removed from the JAR here: - # https://github.com/microsoft/onnxruntime/commit/5ed340f7a51f3cbdb62577a874daf2b3f23d6a93#diff-a14cc5ea231eb4fa49f13510a242043c47ae48516c860f8a87b0e55762632f49 - condition: and(succeeded(), in(variables['Agent.OS'], 'Linux')) + displayName: 'Run Java Tests' + condition: and(succeeded(), in(variables['Agent.OS'], 'Linux', 'Darwin')) inputs: targetType: 'inline' script: | @@ -83,24 +114,54 @@ stages: cd .. mkdir tests cd tests + # 1. 
Diagnostics + echo "System Info:" + uname -a + if [[ "$(uname)" == "Darwin" ]]; then arch; fi + echo "Java Version" + java -version + + # 2. Extract jar xf $(Pipeline.Workspace)/build/onnxruntime-java/testing.jar rm -f $(Pipeline.Workspace)/build/onnxruntime-java/testing.jar - ls $(Pipeline.Workspace)/build/tests + + # Identify main jar (avoiding sources and javadoc jars) + MAIN_JAR=$(ls $(Pipeline.Workspace)/build/onnxruntime-java/onnxruntime-*.jar | grep -v 'sources' | grep -v 'javadoc' | head -n 1) + echo "Extracting native libs from $MAIN_JAR" + jar xf $MAIN_JAR ai/onnxruntime/native + + ls -R $(Pipeline.Workspace)/build/tests/ai echo "Java Version" java -version - # Set the correct library path based on the OS + + # 3. Locate the native library (robust to jar layout differences) os_name=$(uname) - if [[ "$os_name" == "Linux" ]]; then - echo "Platform: Linux. Setting LD_LIBRARY_PATH." - export LD_LIBRARY_PATH="$(pwd):$LD_LIBRARY_PATH" - java -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ - --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" - elif [[ "$os_name" == "Darwin" ]]; then - echo "Platform: macOS. Setting DYLD_LIBRARY_PATH." - export DYLD_LIBRARY_PATH="$(pwd):$DYLD_LIBRARY_PATH" - java -DUSE_WEBGPU=1 -DUSE_COREML=1 -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ - --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + if [[ "$os_name" == "Linux" ]]; then S_FILE="libonnxruntime.so"; else S_FILE="libonnxruntime.dylib"; fi + + echo "Searching for $S_FILE in $(pwd)..." + # Exclude .dSYM paths and find actual file + NATIVE_LIB_PATH=$(find $(pwd) -name "$S_FILE" -not -path "*.dSYM*" -type f | head -n 1) + + if [[ -n "$NATIVE_LIB_PATH" ]]; then + NATIVE_LIB_DIR=$(dirname "$NATIVE_LIB_PATH") + echo "Found native lib dir: $NATIVE_LIB_DIR" + + if [[ "$os_name" == "Linux" ]]; then + echo "Platform: Linux. Setting LD_LIBRARY_PATH." + export LD_LIBRARY_PATH="$NATIVE_LIB_DIR:$(pwd):$LD_LIBRARY_PATH" + java -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ + --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + elif [[ "$os_name" == "Darwin" ]]; then + echo "Platform: macOS. Setting DYLD_LIBRARY_PATH." + export DYLD_LIBRARY_PATH="$NATIVE_LIB_DIR:$(pwd):$DYLD_LIBRARY_PATH" + java -DUSE_WEBGPU=1 -DUSE_COREML=1 -cp '$(Pipeline.Workspace)/build/tests:$(Pipeline.Workspace)/build/onnxruntime-java/*' org.junit.platform.console.ConsoleLauncher --scan-classpath=$(Pipeline.Workspace)/build/tests \ + --fail-if-no-tests --disable-banner --reports-dir "$(Build.ArtifactStagingDirectory)/TestResults" + fi + else + echo "Error: $S_FILE not found!"
+ ls -R ai + exit 1 fi diff --git a/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml b/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml new file mode 100644 index 0000000000000..0ad230f835778 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/foundry-local-nuget-packaging.yml @@ -0,0 +1,149 @@ +parameters: + DoEsrp: false + StageName: 'FoundryLocalNugetPackaging' + DependsOn: [] + PackageName: 'Microsoft.ML.OnnxRuntime.Foundry' + +stages: +- stage: ${{ parameters.StageName }} + dependsOn: ${{ parameters.DependsOn }} + jobs: + - job: ${{ parameters.StageName }} + timeoutInMinutes: 120 + pool: + name: 'onnxruntime-Win2022-GPU-A10' + os: windows + templateContext: + sdl: + codeSignValidation: + enabled: true + break: true + psscriptanalyzer: + enabled: true + binskim: + enabled: true + scanOutputDirectoryOnly: true + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: "onnxruntime-foundry-nuget" + variables: + DoEsrp: ${{ parameters.DoEsrp }} + ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + BuildDate: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] + BuildTime: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] + + steps: + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - managed nuget' + inputs: + artifactName: 'onnxruntime-managed-nuget' + targetPath: '$(Build.BinariesDirectory)/managed-nuget' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-x64' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/win-x64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - win-arm64' + inputs: + artifactName: 'onnxruntime-win-arm64' + targetPath: '$(Build.BinariesDirectory)/win-arm64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - osx' + inputs: + artifactName: 'onnxruntime-osx' + targetPath: '$(Build.BinariesDirectory)/osx' + + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.12' + addToPath: true + + - task: PipAuthenticate@1 + displayName: 'Pip Authenticate' + inputs: + artifactFeeds: 'Lotus' + + - task: PowerShell@2 + displayName: 'Create osx directories' + inputs: + targetType: 'inline' + script: | + New-Item -ItemType Directory -Force -Path "$(Build.BinariesDirectory)/osx-arm64" | Out-Null + Move-Item -Path $(Build.BinariesDirectory)/osx/onnxruntime-osx-arm64* -Destination $(Build.BinariesDirectory)/osx-arm64 + + - task: PowerShell@2 + displayName: 'List all files downloaded' + inputs: + targetType: 'inline' + script: | + $files = Get-ChildItem $(Build.BinariesDirectory) -Recurse + foreach ($file in $files) { + Write-Host "File: $($file.FullName)" + if ($file -like "*onnxruntime*") { + Write-Host "File onnxruntime: $($file.FullName) - Size: $($file.Length)" + } + } + $dirs = Get-ChildItem $(Build.BinariesDirectory) -Directory + foreach ($dir in $dirs) { + Write-Host "Directory: $($dir.FullName)" + } + $osx_arm64_archive = Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64* + if ($osx_arm64_archive.Count -eq 0) { + Write-Host "No osx-arm64 archive found." 
+ } else { + Write-Host "osx-arm64 archive found: $($osx_arm64_archive[0].FullName)" + } + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Extract Nuget Package Version' + inputs: + targetType: 'inline' + script: | + $nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/managed-nuget -Filter Microsoft.ML.OnnxRuntime.Managed.*.nupkg -Recurse) + $package_name = $nupkgs[0].Name + $version_length = $package_name.Length - "Microsoft.ML.OnnxRuntime.Managed.".Length - ".nupkg".Length + $package_version = $package_name.Substring("Microsoft.ML.OnnxRuntime.Managed.".Length, $version_length) + Write-Host "##vso[task.setvariable variable=package_version;]$package_version" + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Extract Archives' + inputs: + targetType: 'inline' + script: | + Expand-Archive -Path $(Build.BinariesDirectory)/win-x64/onnxruntime-win-x64-cuda*.zip -DestinationPath $(Build.BinariesDirectory)/win-x64 + Expand-Archive -Path $(Build.BinariesDirectory)/win-arm64/onnxruntime-win-arm64*.zip -DestinationPath $(Build.BinariesDirectory)/win-arm64 + $osx_arm64_archive = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Filter onnxruntime-osx-arm64*)[0].FullName + tar -xzf $osx_arm64_archive -C $(Build.BinariesDirectory)/osx-arm64 2>$null + $win_x64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-x64 -Directory -Filter onnxruntime-win-x64-cuda*)[0].FullName + $win_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/win-arm64 -Directory -Filter onnxruntime-win-arm64*)[0].FullName + $osx_arm64 = (Get-ChildItem -Path $(Build.BinariesDirectory)/osx-arm64 -Directory -Filter onnxruntime-osx-arm64*)[0].FullName + Write-Host "##vso[task.setvariable variable=win_x64;]$win_x64" + Write-Host "##vso[task.setvariable variable=win_arm64;]$win_arm64" + Write-Host "##vso[task.setvariable variable=osx_arm64;]$osx_arm64" + workingDirectory: $(Build.BinariesDirectory) + + - task: PythonScript@0 + displayName: 'Generate Nuget Package' + inputs: + scriptPath: '$(Build.SourcesDirectory)/tools/nuget/generate_nuspec_for_custom_nuget.py' + arguments: '--nuspec_path "$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec" --root_dir "$(Build.SourcesDirectory)" --commit_id "$(Build.SourceVersion)" --win_arm64 "$(win_arm64)" --win_x64 "$(win_x64)" --osx_arm64 "$(osx_arm64)" --package_version "$(package_version)" --package_name "${{ parameters.PackageName }}"' + + - task: NuGetCommand@2 + displayName: 'Pack Nuget Package' + inputs: + command: 'pack' + packagesToPack: '$(Build.BinariesDirectory)/${{ parameters.PackageName }}.nuspec' + packDestination: $(Build.ArtifactStagingDirectory)\ + + - template: esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index 8e454f2137ce8..795945a8581ba 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -26,6 +26,15 @@ steps: args: '-r $(Build.BinariesDirectory) -a onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion) -l libonnxruntime.$(OnnxRuntimeVersion).dylib -c Release -s $(Build.SourcesDirectory) -t $(Build.SourceVersion)' workingDirectory: '$(Build.BinariesDirectory)/Release' +- 
bash: | + mkdir -p $(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)/testdata + cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)/testdata/libcustom_op_library.dylib + # Copy to testdata/testdata so EndToEndTests can find it when running in Debug configuration + mkdir -p $(Build.BinariesDirectory)/testdata/testdata + cp $(Build.BinariesDirectory)/Release/libcustom_op_library.dylib $(Build.BinariesDirectory)/testdata/testdata/libcustom_op_library.dylib + displayName: 'Copy custom op library' + condition: succeeded() + - task: ArchiveFiles@2 inputs: rootFolderOrFile: '$(Build.BinariesDirectory)/onnxruntime-osx-${{ parameters.MacosArch }}-$(OnnxRuntimeVersion)' @@ -40,6 +49,14 @@ steps: targetPath: '$(Build.ArtifactStagingDirectory)' artifactName: 'onnxruntime-osx-${{ parameters.MacosArch }}' +- template: java-api-artifacts-package-and-publish-steps-posix.yml + parameters: + arch: 'osx-${{ parameters.MacosArch }}' + buildConfig: 'Release' + artifactName: 'onnxruntime-java-osx-${{ parameters.MacosArch }}' + libraryName: 'libonnxruntime.dylib' + nativeLibraryName: 'libonnxruntime4j_jni.dylib' + - template: nodejs-artifacts-package-and-publish-steps-posix.yml parameters: arch: arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index bfccaef1c9852..de16ce483a9f4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -45,9 +45,20 @@ jobs: set -e -x export ONNX_ML=1 export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=ON -DONNX_WERROR=OFF" - python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' + python3 -m pip install -r '$(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/requirements.txt' + + - script: | + set -e -x + if ! /usr/libexec/java_home -v 17 >/dev/null 2>&1; then + brew install --cask temurin@17 + fi + JAVA_HOME=$(/usr/libexec/java_home -v 17) + echo "JAVA_HOME is set to: $JAVA_HOME" + echo "##vso[task.setvariable variable=JAVA_HOME]$JAVA_HOME" + echo "##vso[task.prependpath]$JAVA_HOME/bin" + displayName: 'Install JDK 17' - template: mac-cpu-packaging-steps.yml parameters: MacosArch: arm64 - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_java --build_nodejs --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml new file mode 100644 index 0000000000000..09603f2350657 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml @@ -0,0 +1,168 @@ +parameters: +- name: architecture + type: string + default: 'x64' + values: + - x64 + - arm64 + +- name: build_py_parameters + displayName: 'Specify extra build parameters' + type: string + default: '--use_azure' + +- name: cmake_build_type + type: string + displayName: 'CMake build type for Windows. Only for Windows CPU packages.' 
+ default: 'RelWithDebInfo' + values: + - Debug + - Release + - RelWithDebInfo + - MinSizeRel + +jobs: +- job: Windows_py_Wheels_${{parameters.architecture}} + ${{ if eq(parameters.architecture, 'arm64') }}: + pool: + name: 'onnxruntime-qnn-windows-vs-2022-arm64' + os: windows + hostArchitecture: Arm64 + demands: + - Agent.Version -equals 4.264.2 + ${{ else }}: + pool: + name: 'onnxruntime-Win-CPU-VS2022-Latest' + os: windows + templateContext: + sdl: + codeSignValidation: + enabled: true + # TODO: check why pyd file was not signed + break: false + additionalTargetsGlobPattern: f|**\*.pyd + psscriptanalyzer: + enabled: true + binskim: + enabled: true + scanOutputDirectoryOnly: true + ${{ if eq(parameters.architecture, 'arm64') }}: + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: onnxruntime-win-$(PythonVersion)-arm64 + ${{ else }}: + outputs: + - output: pipelineArtifact + targetPath: $(Build.ArtifactStagingDirectory) + artifactName: onnxruntime-win-$(PythonVersion) + strategy: + matrix: + Python311_${{parameters.architecture}}: + PythonVersion: '3.11' + Python312_${{parameters.architecture}}: + PythonVersion: '3.12' + Python313_${{parameters.architecture}}: + PythonVersion: '3.13' + Python314_${{parameters.architecture}}: + PythonVersion: '3.14' + variables: + OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' + ExtraParam: ${{ parameters.build_py_parameters }} + timeoutInMinutes: 180 + workspace: + clean: all + + steps: + - checkout: self + clean: true + submodules: recursive + + - template: setup-build-tools.yml + parameters: + host_cpu_arch: ${{parameters.architecture}} + python_version: $(PythonVersion) + + - template: set-nightly-build-option-variable-step.yml + + - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt + env: + TMPDIR: "$(Agent.TempDirectory)" + + - task: PythonScript@0 + displayName: 'Build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config ${{ parameters.cmake_build_type }} + --enable_lto + --build_dir $(Build.SourcesDirectory)\build + --skip_submodule_sync + --cmake_generator "Visual Studio 17 2022" + --enable_pybind + --enable_onnx_tests --use_vcpkg --use_vcpkg_ms_internal_asset_cache --build + ${{ parameters.build_py_parameters }} + --parallel --use_binskim_compliant_compile_flags --update + $(TelemetryOption) + + - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: + - template: publish-symbolrequestprod-api.yml + parameters: + ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: + symbolExpiryTime: 60 + includePublicSymbolServer: true + symbolsArtifactName: onnxruntime_cpu_win_${{ parameters.architecture }}_$(PythonVersion) + symbolsVersion: $(Build.BuildId) + symbolProject: 'ONNX Runtime' + subscription: 'OnnxrunTimeCodeSign_20240611' + searchPattern: | + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime.pdb + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_providers_shared.pdb + $(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\onnxruntime_pybind11_state.pdb + + # Esrp signing + - template: win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ 
parameters.cmake_build_type }}\onnxruntime\capi' + DisplayName: 'ESRP - Sign Native dlls' + DoEsrp: true + Pattern: '*.pyd,*.dll' + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel ${{ parameters.build_py_parameters }} $(NightlyBuildOption)' + workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}\dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - script: | + 7z x *.whl + workingDirectory: '$(Build.ArtifactStagingDirectory)' + displayName: 'Unzip the package' + + - powershell: | + # Skip all tests on Python 3.14 + if ("$(PythonVersion)" -ne "3.14") { + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + Remove-Item -Recurse -Force onnxruntime + if ("$(ExtraParam)".Split() -contains "--use_azure") { + + if ("${{parameters.architecture}}" -eq 'arm64') { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" + } else { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" + } + python onnxruntime_test_python_azure.py + } + python onnx_backend_test_series.py + } + workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' + displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index f5b4c38c85d4c..88eff3ebff86a 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -27,6 +27,17 @@ if [[ $LIB_NAME == *.dylib ]] then dsymutil $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME -o $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME.dSYM strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME + + # ORT NuGet packaging expects the unversioned library (libonnxruntime.dylib) to contain the binary content, + # because the versioned library is excluded by the nuspec generation script. + # We explicitly overwrite the symlink with the real file to ensure 'nuget pack' (especially on Windows) + # doesn't pack an empty/broken symlink. + # Only applies to versioned libonnxruntime libraries (e.g. libonnxruntime.1.24.0.dylib). 
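+  # Illustrative example (hypothetical version number): with LIB_NAME=libonnxruntime.1.24.0.dylib,
+  # the block below deletes the libonnxruntime.dylib symlink and replaces it with a real copy of
+  # the versioned binary, so both names resolve to regular files in the packaged artifact.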
+ if [[ "$LIB_NAME" =~ ^libonnxruntime\..*\.dylib$ && -L "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" ]]; then + rm "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" + cp "$BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME" "$BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib" + fi + # copy the CoreML EP header for macOS build (libs with .dylib ext) cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include else diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu index 766a2c8a8b73b..0c63b7775256a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2404_gpu @@ -49,7 +49,9 @@ RUN apt-get update && \ libnvonnxparsers-dev=${TRT_VERSION} \ libnvonnxparsers10=${TRT_VERSION} \ tensorrt-dev=${TRT_VERSION} \ - libnvinfer-bin=${TRT_VERSION} && \ + libnvinfer-bin=${TRT_VERSION} \ + libnvinfer-headers-python-plugin-dev=${TRT_VERSION} \ + libnvinfer-win-builder-resource10=${TRT_VERSION} && \ rm -rf /var/lib/apt/lists/* COPY scripts /tmp/scripts diff --git a/tools/ci_build/github/windows/jar_packaging.py b/tools/ci_build/github/windows/jar_packaging.py index 8ec380a5d2523..f4bc6899260c1 100644 --- a/tools/ci_build/github/windows/jar_packaging.py +++ b/tools/ci_build/github/windows/jar_packaging.py @@ -232,6 +232,7 @@ def run_packaging(package_type: str, build_dir: str): "platforms": [ {"path": "onnxruntime-java-linux-x64", "lib": "libcustom_op_library.so", "archive_lib": True}, {"path": "onnxruntime-java-linux-aarch64", "lib": "libcustom_op_library.so", "archive_lib": False}, + {"path": "onnxruntime-java-osx-arm64", "lib": "libcustom_op_library.dylib", "archive_lib": True}, ] }, "gpu": { diff --git a/tools/ci_build/github/windows/jar_packaging_test.py b/tools/ci_build/github/windows/jar_packaging_test.py index 2dd61cf9c3088..e4f7e4945442c 100644 --- a/tools/ci_build/github/windows/jar_packaging_test.py +++ b/tools/ci_build/github/windows/jar_packaging_test.py @@ -52,14 +52,19 @@ def _setup_test_directory(package_type: str, version_string: str): create_empty_file(linux_native_dir / "libonnxruntime_providers_cuda.so") (linux_dir / "_manifest" / "spdx_2.2").mkdir(parents=True, exist_ok=True) - # --- Additional platforms (for CPU test) --- + # --- macOS and other platforms (for CPU test) --- if package_type == "cpu": - # Add linux-aarch64 for CPU test + # Add linux-aarch64 and osx-arm64 for CPU test linux_aarch64_dir = java_artifact_dir / "onnxruntime-java-linux-aarch64" linux_aarch64_native_dir = linux_aarch64_dir / "ai" / "onnxruntime" / "native" / "linux-aarch64" linux_aarch64_native_dir.mkdir(parents=True, exist_ok=True) create_empty_file(linux_aarch64_dir / "libcustom_op_library.so") + osx_arm64_dir = java_artifact_dir / "onnxruntime-java-osx-arm64" + osx_arm64_native_dir = osx_arm64_dir / "ai" / "onnxruntime" / "native" / "osx-arm64" + osx_arm64_native_dir.mkdir(parents=True, exist_ok=True) + create_empty_file(osx_arm64_dir / "libcustom_op_library.dylib") + return tmp_path return _setup_test_directory @@ -128,9 +133,12 @@ def test_cpu_packaging(directory_setup_factory, version_string): with zipfile.ZipFile(testing_jar_path, "r") as zf: jar_contents = zf.namelist() assert "libcustom_op_library.so" in jar_contents + assert "libcustom_op_library.dylib" in jar_contents # 3. 
Verify the custom op libraries were removed from the source directories linux_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-x64" linux_aarch64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-linux-aarch64" + osx_arm64_dir = temp_build_dir / "java-artifact" / "onnxruntime-java-osx-arm64" assert not (linux_dir / "libcustom_op_library.so").exists() assert not (linux_aarch64_dir / "libcustom_op_library.so").exists() + assert not (osx_arm64_dir / "libcustom_op_library.dylib").exists() diff --git a/tools/nuget/generate_nuspec_for_custom_nuget.py b/tools/nuget/generate_nuspec_for_custom_nuget.py index 3abd03119cbc5..6e51c51895191 100644 --- a/tools/nuget/generate_nuspec_for_custom_nuget.py +++ b/tools/nuget/generate_nuspec_for_custom_nuget.py @@ -14,7 +14,6 @@ def generate_files(lines, args): platform_map = { "win-arm64": args.win_arm64, "win-x64": args.win_x64, - "osx-x64": args.osx_x64, "osx-arm64": args.osx_arm64, } @@ -116,7 +115,6 @@ def parse_arguments(): parser.add_argument("--win_arm64", required=True, help="Ort win-arm64 directory") parser.add_argument("--win_x64", required=True, help="Ort win-x64 directory") parser.add_argument("--osx_arm64", required=True, help="Ort osx-arm64 directory") - parser.add_argument("--osx_x64", required=True, help="Ort osx-x64 directory") parser.add_argument("--package_version", required=True, help="Version of the package") parser.add_argument("--package_name", required=True, help="Name of the package") diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 9884cbf5793df..1f882c847c707 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -238,6 +238,9 @@ def add_common_dependencies(xml_text, package_name, version): xml_text.append('') xml_text.append('') + if package_name == "Microsoft.ML.OnnxRuntime.Foundry": + xml_text.append('') + def generate_dependencies(xml_text, package_name, version): dml_dependency = ''
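For context, a minimal sketch of the dependency-append pattern used by add_common_dependencies, assuming xml_text is the list of nuspec XML lines being accumulated; the dependency ids and versions below are illustrative placeholders, not values taken from the script:

    # Sketch only: dependency ids/versions are hypothetical placeholders.
    def add_common_dependencies(xml_text: list[str], package_name: str, version: str) -> None:
        # Dependencies shared by every native package.
        xml_text.append(f'<dependency id="Example.Common.Dependency" version="{version}"/>')
        # The new Foundry package appends one extra dependency on top of the common set.
        if package_name == "Microsoft.ML.OnnxRuntime.Foundry":
            xml_text.append('<dependency id="Example.Foundry.Dependency" version="1.0.0"/>')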