36 changes: 0 additions & 36 deletions csharp/src/Microsoft.ML.OnnxRuntime/targets/netstandard/props.xml
@@ -28,14 +28,7 @@
</Link>
</ItemDefinitionGroup>

<ItemDefinitionGroup Condition="'$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')">
<Link>
<AdditionalDependencies>$(MSBuildThisFileDirectory)../../runtimes/win-x86/native/onnxruntime.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

<PropertyGroup>
<EnginePlatform Condition="'$(Platform)' == 'Win32'">x86</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM64'">arm64</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM'">arm</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' != 'Win32' AND '$(Platform)' != 'ARM64'">$(Platform)</EnginePlatform>
@@ -147,34 +140,5 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

<!-- x86 -->
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\onnxruntime.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true'))">
<Link>onnxruntime.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\dnnl.dll')">
<Link>dnnl.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mklml.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\mklml.dll')">
<Link>mklml.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\libiomp5md.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')) AND
Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\libiomp5md.dll')">
<Link>libiomp5md.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
</ItemGroup>
</Project>
@@ -28,14 +28,7 @@
</Link>
</ItemDefinitionGroup>

<ItemDefinitionGroup Condition="'$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true')">
<Link>
<AdditionalDependencies>$(MSBuildThisFileDirectory)../../runtimes/win-x86/native/onnxruntime.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>

<PropertyGroup>
<EnginePlatform Condition="'$(Platform)' == 'Win32'">x86</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM64'">arm64</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' == 'ARM'">arm</EnginePlatform>
<EnginePlatform Condition="'$(Platform)' != 'Win32' AND '$(Platform)' != 'ARM64'">$(Platform)</EnginePlatform>
@@ -91,13 +84,5 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>

<!-- x86 -->
<None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x86\native\onnxruntime.dll"
Condition="('$(PlatformTarget)' == 'x86' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' == 'true'))">
<Link>onnxruntime.dll</Link>
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
<Visible>false</Visible>
</None>
</ItemGroup>
</Project>
4 changes: 3 additions & 1 deletion onnxruntime/contrib_ops/webgpu/moe/moe.h
@@ -3,6 +3,8 @@

#pragma once

#include <limits>

#include "core/providers/webgpu/program.h"
#include "core/providers/webgpu/webgpu_kernel.h"

@@ -31,7 +33,7 @@ class MoE : public WebGpuKernel {
activation_alpha_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_alpha", 1.0));
activation_beta_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_beta", 1.0));
swiglu_fusion_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("swiglu_fusion", 0));
swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", 0);
swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", std::numeric_limits<float>::infinity());
k_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("k", 4));
normalize_routing_weights_ = info.GetAttrOrDefault<int64_t>("normalize_routing_weights", 0) == 1;
use_sparse_mixer_ = info.GetAttrOrDefault<int64_t>("use_sparse_mixer", 0) == 1;
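The behavioral change above: `swiglu_limit` previously defaulted to `0`, and the limit is typically applied as a clamp on the SwiGLU inputs, so an absent attribute would flatten every activation; defaulting to `+inf` makes the clamp a no-op instead. A minimal sketch of that clamp semantics, assuming the gpt-oss-style convention (the WebGPU kernel body is not shown in this hunk, so `apply_swiglu_limit` is illustrative):

```python
import numpy as np

def apply_swiglu_limit(gate, linear, limit=np.inf):
    # Illustrative clamp only: with limit=np.inf (the new default) both
    # operations are no-ops; with the old default of 0 every gate value
    # would be forced down to <= 0 before the activation.
    return np.minimum(gate, limit), np.clip(linear, -limit, limit)

g, l = apply_swiglu_limit(np.array([1.5, -0.3]), np.array([2.0, -2.0]))
# limit omitted -> inputs pass through unchanged
```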
35 changes: 33 additions & 2 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
@@ -395,6 +395,12 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
auto lhs_ptrs = std::shared_ptr<const void*[]>(new const void*[lhs_ptrs_k * lhs_ptrs_m],
std::default_delete<const void*[]>());

// Initialize all padding entries. For partial tiles (m < m_step),
// the kai LHS packing kernel may still read pointer entries beyond the logically
// filled 'm' positions. Leaving these uninitialized can cause non-deterministic
// reads and corrupt packed LHS data.
auto lhs_ptrs_ = lhs_ptrs.get();
std::fill(lhs_ptrs_, lhs_ptrs_ + (lhs_ptrs_k * lhs_ptrs_m), reinterpret_cast<const void*>(&pad_ptr[0]));

auto ih_out_size = ComputeConvOutSize(ih, kh, padding, 1);
auto iw_out_size = ComputeConvOutSize(iw, kw, padding, 1);
@@ -430,7 +436,6 @@
};

size_t m_{0};
auto lhs_ptrs_ = lhs_ptrs.get();
for (size_t ih_ = 0; ih_ < ih_out_size; ih_ += sh) {
for (size_t iw_ = 0; iw_ < iw_out_size; iw_ += sw, ++m_) {
size_t k_{0};
@@ -460,7 +465,23 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
// figure out how many blocks needed to correctly fill padding
padsize = ((ci + padsize - 1) / padsize) * padsize;
}
static std::vector<float>pad_ptr(padsize, 0.f);

// pad_ptr must be at least 'ci' floats for padding pixels.
// Using a thread_local grow-only buffer to avoid cross-thread interference and ensure sizing is correct.
thread_local std::vector<float> pad_ptr;
const float* old_pad_ptr = pad_ptr.data();
bool has_pad_ptr_changed = false;

if (pad_ptr.size() < padsize) {
pad_ptr.resize(padsize, 0.f);
if (pad_ptr.data() != old_pad_ptr) {
has_pad_ptr_changed = true;
}
} else {
// Ensure any previously-used region remains zeroed (grow-only means it should already be zeros,
// but keep this explicit for safety).
std::fill(pad_ptr.begin(), pad_ptr.end(), 0.f);
}

LhsCacheKey key = {
ci, ih, iw,
@@ -481,6 +502,16 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
// Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions.
thread_local std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>> lhs_ptrs_cache;

if (has_pad_ptr_changed)
{
// If the pad buffer was resized and a re-allocation has occurred, the cached lhs ptrs are invalid as they
// would be referencing the old pad buffer.
// See discussion in https://github.com/microsoft/onnxruntime/pull/27214.
// TODO(hasesh / JonathanC-ARM): A better approach would be to include the pad buffer address in the cache key
// or any other approach that would reduce unnecessary cache invalidations.
lhs_ptrs_cache.clear();
}

std::shared_ptr<const void*[]> lhs_ptrs;
if (auto found = lhs_ptrs_cache.find(key); found != lhs_ptrs_cache.end()) {
lhs_ptrs = found->second;
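The invariant enforced above: entries in the cached pointer tables point into `pad_ptr`'s storage, so a reallocation of that storage makes every cached table dangle, and the cache must be dropped wholesale. A rough Python model of the same invariant (illustrative names, not the MLAS API; views stand in for raw pointers):

```python
lhs_ptrs_cache: dict = {}   # models the thread_local lhs_ptrs_cache
pad_buffer = bytearray(16)  # models the thread_local pad_ptr vector

def build_ptr_table(key, buf):
    # Stand-in for LhsPtrFill: every table entry references the pad buffer.
    return [memoryview(buf)] * 4

def get_lhs_ptrs(key, required_pad_size):
    global pad_buffer
    if required_pad_size > len(pad_buffer):
        # Grow-only "reallocation". In C++ the old storage is freed, so any
        # cached table still pointing at it dangles; mirror the fix by
        # clearing the whole cache. (Python keeps the old bytearray alive,
        # but cached views would still reference the wrong buffer.)
        pad_buffer = bytearray(required_pad_size)
        lhs_ptrs_cache.clear()
    if key not in lhs_ptrs_cache:
        lhs_ptrs_cache[key] = build_ptr_table(key, pad_buffer)
    return lhs_ptrs_cache[key]

table = get_lhs_ptrs(("conv", 3, 3), required_pad_size=64)
```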
36 changes: 18 additions & 18 deletions onnxruntime/core/mlas/lib/rotary_embedding_kernel_neon_fp16.cpp
@@ -150,8 +150,8 @@ RopeKernel_Fp16_Impl<true>(
if (i + 15 < dim) {
float16x8_t x0 = MlasLoadFloat16x8(input + i);
float16x8_t x1 = MlasLoadFloat16x8(input + i + 8);
float16x8_t sin_val = MlasLoadFloat16x8(sin + i);
float16x8_t cos_val = MlasLoadFloat16x8(cos + i);
float16x8_t sin_val = MlasLoadFloat16x8(sin + i / 2);
float16x8_t cos_val = MlasLoadFloat16x8(cos + i / 2);
for (; i + 31 < dim; i += 16) {
float16x8_t real = vuzp1q_f16(x0, x1);
float16x8_t imag = vuzp2q_f16(x0, x1);
@@ -163,8 +163,8 @@ RopeKernel_Fp16_Impl<true>(
MlasStoreFloat16x8(output + i + 8, y1);
x0 = MlasLoadFloat16x8(input + i + 16);
x1 = MlasLoadFloat16x8(input + i + 24);
sin_val = MlasLoadFloat16x8(sin + i + 16);
cos_val = MlasLoadFloat16x8(cos + i + 16);
sin_val = MlasLoadFloat16x8(sin + (i + 16) / 2);
cos_val = MlasLoadFloat16x8(cos + (i + 16) / 2);
}
float16x8_t real = vuzp1q_f16(x0, x1);
float16x8_t imag = vuzp2q_f16(x0, x1);
@@ -181,8 +181,8 @@ RopeKernel_Fp16_Impl<true>(
float16x4_t x1 = MlasLoadFloat16x4(input + i + 4);
float16x4_t real = vuzp1_f16(x0, x1);
float16x4_t imag = vuzp2_f16(x0, x1);
float16x4_t sin_val = MlasLoadFloat16x4(sin + i);
float16x4_t cos_val = MlasLoadFloat16x4(cos + i);
float16x4_t sin_val = MlasLoadFloat16x4(sin + i / 2);
float16x4_t cos_val = MlasLoadFloat16x4(cos + i / 2);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
float16x4_t y0 = vzip1_f16(real_out, imag_out);
@@ -201,12 +201,12 @@ RopeKernel_Fp16_Impl<true>(
imag = MlasLoadLaneFloat16x4<1>(input + i + 3, imag);
real = MlasLoadLaneFloat16x4<2>(input + i + 4, real);
imag = MlasLoadLaneFloat16x4<2>(input + i + 5, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i + 1, sin_val);
sin_val = MlasLoadLaneFloat16x4<2>(sin + i + 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i + 1, cos_val);
cos_val = MlasLoadLaneFloat16x4<2>(cos + i + 2, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i / 2 + 1, sin_val);
sin_val = MlasLoadLaneFloat16x4<2>(sin + i / 2 + 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i / 2 + 1, cos_val);
cos_val = MlasLoadLaneFloat16x4<2>(cos + i / 2 + 2, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
@@ -224,10 +224,10 @@ RopeKernel_Fp16_Impl<true>(
imag = MlasLoadLaneFloat16x4<0>(input + i + 1, imag);
real = MlasLoadLaneFloat16x4<1>(input + i + 2, real);
imag = MlasLoadLaneFloat16x4<1>(input + i + 3, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i + 1, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i + 1, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
sin_val = MlasLoadLaneFloat16x4<1>(sin + i / 2 + 1, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
cos_val = MlasLoadLaneFloat16x4<1>(cos + i / 2 + 1, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
@@ -241,8 +241,8 @@ RopeKernel_Fp16_Impl<true>(
float16x4_t cos_val = MlasZeroFloat16x4();
real = MlasLoadLaneFloat16x4<0>(input + i, real);
imag = MlasLoadLaneFloat16x4<0>(input + i + 1, imag);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i, cos_val);
sin_val = MlasLoadLaneFloat16x4<0>(sin + i / 2, sin_val);
cos_val = MlasLoadLaneFloat16x4<0>(cos + i / 2, cos_val);
float16x4_t real_out = vfms_f16(vmul_f16(real, cos_val), imag, sin_val);
float16x4_t imag_out = vfma_f16(vmul_f16(real, sin_val), imag, cos_val);
MlasStoreLaneFloat16x4<0>(output + i, real_out);
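The pattern in every hunk above is the same: `sin + i` / `cos + i` becomes `sin + i / 2` / `cos + i / 2`. In the interleaved layout the input holds `dim` values as (real, imag) pairs while the sin/cos caches hold only `dim / 2` angles, one per pair, so indexing them with `i` read past the valid data. A scalar reference of the rotation, as a sketch (`sin_half` / `cos_half` stand in for the half-length caches):

```python
import numpy as np

def rope_interleaved_ref(x, sin_half, cos_half):
    # x: length-dim vector of interleaved (real, imag) pairs;
    # sin_half/cos_half: length dim/2, one angle per pair, which is
    # exactly why the SIMD kernel indexes them with i/2.
    out = np.empty_like(x)
    for j in range(x.shape[-1] // 2):
        re, im = x[2 * j], x[2 * j + 1]
        out[2 * j] = re * cos_half[j] - im * sin_half[j]      # vfms path
        out[2 * j + 1] = re * sin_half[j] + im * cos_half[j]  # vfma path
    return out

x = np.arange(8, dtype=np.float32)
theta = np.arange(4, dtype=np.float32)
y = rope_interleaved_ref(x, np.sin(theta), np.cos(theta))
```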
31 changes: 24 additions & 7 deletions onnxruntime/python/onnxruntime_inference_collection.py
@@ -521,8 +521,25 @@ def __init__(
def _create_inference_session(self, providers, provider_options, disabled_optimizers=None):
available_providers = C.get_available_providers()

# Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
if "TensorrtExecutionProvider" in available_providers:
# Validate that TensorrtExecutionProvider and NvTensorRTRTXExecutionProvider are not both specified
if providers:
has_tensorrt = any(
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
for provider in providers
)
has_tensorrt_rtx = any(
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
for provider in providers
)
if has_tensorrt and has_tensorrt_rtx:
raise ValueError(
"Cannot enable both 'TensorrtExecutionProvider' and 'NvTensorRTRTXExecutionProvider' "
"in the same session."
)
# Tensorrt and TensorRT RTX can fall back to CUDA if it's explicitly assigned. All others fall back to CPU.
if "NvTensorRTRTXExecutionProvider" in available_providers:
if (
providers
and any(
@@ -531,15 +548,15 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
for provider in providers
)
and any(
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvTensorRTRTXExecutionProvider")
for provider in providers
)
):
self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
self._fallback_providers = ["CPUExecutionProvider"]
if "NvTensorRTRTXExecutionProvider" in available_providers:
elif "TensorrtExecutionProvider" in available_providers:
if (
providers
and any(
@@ -548,8 +565,8 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
for provider in providers
)
and any(
provider == "NvTensorRTRTXExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "NvExecutionProvider")
provider == "TensorrtExecutionProvider"
or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider")
for provider in providers
)
):
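A usage sketch of the new validation above: requesting both TensorRT variants now raises immediately instead of producing an ambiguously configured session (`model.onnx` is a placeholder path; provider availability depends on the local build):

```python
import onnxruntime as ort

try:
    sess = ort.InferenceSession(
        "model.onnx",  # placeholder path
        providers=["TensorrtExecutionProvider", "NvTensorRTRTXExecutionProvider"],
    )
except ValueError as err:
    print(err)  # "Cannot enable both 'TensorrtExecutionProvider' and ..."
```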
14 changes: 8 additions & 6 deletions onnxruntime/python/tools/transformers/machine_info.py
@@ -6,6 +6,7 @@
# It is used to dump machine information for Notebooks

import argparse
import importlib.metadata
import json
import logging
import platform
@@ -122,10 +123,7 @@ def get_gpu_info_by_nvml(self) -> dict:
return result

def get_related_packages(self) -> list[str]:
import pkg_resources # noqa: PLC0415

installed_packages = pkg_resources.working_set
related_packages = [
related_packages = {
"onnxruntime-gpu",
"onnxruntime",
"onnx",
@@ -137,8 +135,12 @@ def get_related_packages(self) -> list[str]:
"flatbuffers",
"numpy",
"onnxconverter-common",
]
related_packages_list = {i.key: i.version for i in installed_packages if i.key in related_packages}
}
related_packages_list = {}
for dist in importlib.metadata.distributions():
if dist.metadata["Name"].lower() in related_packages:
related_packages_list[dist.metadata["Name"].lower()] = dist.version

return related_packages_list

def get_onnxruntime_info(self) -> dict:
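The rewrite replaces the deprecated `pkg_resources` API with the standard-library `importlib.metadata`, and the helper now builds a name-to-version mapping. The same lookup pattern in isolation, as a minimal sketch:

```python
import importlib.metadata

wanted = {"numpy", "onnx", "onnxruntime"}
versions = {}
for dist in importlib.metadata.distributions():
    name = dist.metadata["Name"]
    if name and name.lower() in wanted:
        versions[name.lower()] = dist.version
print(versions)  # e.g. {'numpy': '2.1.0'}, depending on the environment
```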