
Commit 23303da

Merge remote-tracking branch 'origin' into hari/einsum_fix_1
2 parents 3bedf32 + 892b2f1

36 files changed: +568 −309 lines

js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json

Lines changed: 215 additions & 219 deletions
Some generated files are not rendered by default.

js/web/test/e2e/exports/testcases/nextjs-default/package.json

Lines changed: 1 addition & 1 deletion
@@ -11,6 +11,6 @@
   "dependencies": {
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
-    "next": "15.4.10"
+    "next": "16.1.5"
   }
 }

onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu

Lines changed: 3 additions & 1 deletion
@@ -258,7 +258,9 @@ Status PrepareQkv_MHA_NoPast(contrib::AttentionParameters& parameters,
   assert(data.past_value == nullptr);
   assert(data.present_key == nullptr);
   assert(data.present_value == nullptr);
-  assert(!parameters.is_unidirectional);
+  // Note: is_unidirectional (causal) is supported by flash attention, memory efficient attention,
+  // cuDNN flash attention, and unfused kernel. TRT fused runner is only used when !is_unidirectional
+  // (enforced in MultiHeadAttention::ComputeInternal).
   assert(data.has_qkv_workspace == !NoQkvWorkspace_MHA_NoPast(data));

   if (parameters.qkv_format == AttentionQkvFormat::Q_K_V_BSNH) {
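
For context, a minimal sketch of the selection rule the new comment refers to. This is illustrative only; the helper name below is hypothetical, and the real decision in MultiHeadAttention::ComputeInternal weighs more factors than this:

// Hypothetical helper, not the actual ONNX Runtime code: the TRT fused runner is only eligible
// when attention is not causal, so causal (unidirectional) inputs fall through to flash,
// memory efficient, cuDNN flash, or the unfused kernel, which is why the assert above is no
// longer needed in this no-past QKV preparation path.
inline bool CanUseTrtFusedRunner(bool has_fused_runner, bool is_unidirectional) {
  return has_fused_runner && !is_unidirectional;
}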

onnxruntime/contrib_ops/webgpu/moe/moe.h

Lines changed: 3 additions & 1 deletion
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <limits>
+
 #include "core/providers/webgpu/program.h"
 #include "core/providers/webgpu/webgpu_kernel.h"
 
@@ -31,7 +33,7 @@ class MoE : public WebGpuKernel {
     activation_alpha_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_alpha", 1.0));
     activation_beta_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_beta", 1.0));
     swiglu_fusion_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("swiglu_fusion", 0));
-    swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", 0);
+    swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", std::numeric_limits<float>::infinity());
    k_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("k", 4));
    normalize_routing_weights_ = info.GetAttrOrDefault<int64_t>("normalize_routing_weights", 0) == 1;
    use_sparse_mixer_ = info.GetAttrOrDefault<int64_t>("use_sparse_mixer", 0) == 1;
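
A small sketch of why infinity is the natural "no clamping" default. This assumes swiglu_limit acts as an upper clamp on the activation value (the WebGPU shader itself is not shown in this commit), and the function name is illustrative:

#include <algorithm>
#include <limits>

// With the old default of 0, any positive value would be clamped to zero when the model
// omits the attribute; with +infinity, std::min leaves the value untouched unless the model
// provides an explicit limit.
inline float ApplySwigluLimit(float value,
                              float swiglu_limit = std::numeric_limits<float>::infinity()) {
  return std::min(value, swiglu_limit);
}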

onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp

Lines changed: 33 additions & 2 deletions
@@ -395,6 +395,12 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
   auto lhs_ptrs = std::shared_ptr<const void*[]>(new const void*[lhs_ptrs_k * lhs_ptrs_m],
                                                  std::default_delete<const void*[]>());
 
+  // Initialize all padding entries. For partial tiles (m < m_step),
+  // the kai LHS packing kernel may still read pointer entries beyond the logically
+  // filled 'm' positions. Leaving these uninitialized can cause non-deterministic
+  // reads and corrupt packed LHS data.
+  auto lhs_ptrs_ = lhs_ptrs.get();
+  std::fill(lhs_ptrs_, lhs_ptrs_ + (lhs_ptrs_k * lhs_ptrs_m), reinterpret_cast<const void*>(&pad_ptr[0]));
 
   auto ih_out_size = ComputeConvOutSize(ih, kh, padding, 1);
   auto iw_out_size = ComputeConvOutSize(iw, kw, padding, 1);
@@ -430,7 +436,6 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
   };
 
   size_t m_{0};
-  auto lhs_ptrs_ = lhs_ptrs.get();
   for (size_t ih_ = 0; ih_ < ih_out_size; ih_ += sh) {
     for (size_t iw_ = 0; iw_ < iw_out_size; iw_ += sw, ++m_) {
       size_t k_{0};
@@ -460,7 +465,23 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
     // figure out how many blocks needed to correctly fill padding
     padsize = ((ci + padsize - 1) / padsize) * padsize;
   }
-  static std::vector<float> pad_ptr(padsize, 0.f);
+
+  // pad_ptr must be at least 'ci' floats for padding pixels.
+  // Using a thread_local grow-only buffer to avoid cross-thread interference and ensure sizing is correct.
+  thread_local std::vector<float> pad_ptr;
+  const float* old_pad_ptr = pad_ptr.data();
+  bool has_pad_ptr_changed = false;
+
+  if (pad_ptr.size() < padsize) {
+    pad_ptr.resize(padsize, 0.f);
+    if (pad_ptr.data() != old_pad_ptr) {
+      has_pad_ptr_changed = true;
+    }
+  } else {
+    // Ensure any previously-used region remains zeroed (grow-only means it should already be zeros,
+    // but keep this explicit for safety).
+    std::fill(pad_ptr.begin(), pad_ptr.end(), 0.f);
+  }
 
   LhsCacheKey key = {
       ci, ih, iw,
@@ -481,6 +502,16 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
   // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions.
   thread_local std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>> lhs_ptrs_cache;
 
+  if (has_pad_ptr_changed) {
+    // If the pad buffer was resized and a re-allocation has occurred, the cached lhs ptrs are invalid as they
+    // would be referencing the old pad buffer.
+    // See discussion in https://github.com/microsoft/onnxruntime/pull/27214.
+    // TODO(hasesh / JonathanC-ARM): A better approach would be to include the pad buffer address in the cache key
+    // or any other approach that would reduce unnecessary cache invalidations.
+    lhs_ptrs_cache.clear();
+  }
+
   std::shared_ptr<const void*[]> lhs_ptrs;
   if (auto found = lhs_ptrs_cache.find(key); found != lhs_ptrs_cache.end()) {
     lhs_ptrs = found->second;
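
A standalone sketch of the pattern introduced here: a thread_local, grow-only zero buffer whose reallocation is detected so the caller can invalidate any cached raw pointers into it. The function and variable names below are illustrative, not the real MLAS/KleidiAI symbols:

#include <vector>

// Returns a zeroed buffer of at least 'padsize' floats that lives for the thread's lifetime.
// 'reallocated' is set when growth moved the underlying storage, signalling that any cached
// pointers into the old buffer are now dangling.
inline const float* GetZeroPadBuffer(size_t padsize, bool& reallocated) {
  thread_local std::vector<float> pad;       // grows but never shrinks within this thread
  const float* old_data = pad.data();
  reallocated = false;
  if (pad.size() < padsize) {
    pad.resize(padsize, 0.f);                // new elements are zero-initialized
    reallocated = (pad.data() != old_data);  // growth may move the storage
  }
  return pad.data();
}

When the flag comes back true, caches holding raw pointers into the buffer (like lhs_ptrs_cache above) must be cleared, which is exactly what the new has_pad_ptr_changed flag triggers in the diff.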

onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h

Lines changed: 12 additions & 0 deletions
@@ -99,6 +99,18 @@ struct ConvTransposeAttributes : public ConvAttributes {
                              " group: ", group);
     }
 
+    // Bias shape validation (It should be a 1D tensor with size M)
+    // See https://github.com/microsoft/onnxruntime/issues/26144
+    if (B != nullptr) {
+      if (B->Shape().NumDimensions() != 1 || B->Shape()[0] != num_output_channels) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Bias shape is not compatible with number of output channels."
+                               " It should be a 1-D tensor with size num_output_channels(M).",
+                               " Bias: ", B->Shape(),
+                               " num_output_channels: ", num_output_channels);
+      }
+    }
+
     TensorShapeVector kernel_shape;
     ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc));

onnxruntime/core/providers/cuda/llm/attention.cc

Lines changed: 1 addition & 2 deletions
@@ -191,7 +191,6 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
     ORT_THROW("softmax_precision is not supported yet in Attention op (CUDA).");
   }
 
-  // TODO(titaiwang): Continue on these parameters
   // Construct AttentionData to pass to QkvToContext
   typedef typename ToCudaType<T>::MappedType CudaT;
   onnxruntime::contrib::cuda::AttentionData<CudaT> data;
@@ -220,12 +219,12 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   }
   data.qkv_format = contribop_parameters.qkv_format;
 
-  // TODO: Determine which kernel to use (Flash Attention, Memory Efficient Attention, etc.)
   // For now, set flags to false and let QkvToContext use the unfused path
   data.use_flash_attention = false;
   data.use_memory_efficient_attention = false;
   data.fused_runner = nullptr;
   data.fused_cross_attention_kernel = nullptr;
+  data.kernel_type = onnxruntime::contrib::AttentionKernelType::AttentionKernel_Unfused;
 
   // Allocate workspace for Q, K, V processing and scratch buffer
   const bool no_qkv_workspace = onnxruntime::contrib::cuda::NoQkvWorkspace(contribop_parameters, data);

onnxruntime/core/providers/cuda/nn/conv_transpose.cc

Lines changed: 12 additions & 0 deletions
@@ -311,6 +311,18 @@ Status ConvTranspose<T, Layout>::UpdateState(OpKernelContext* context, bool dyna
                            " group: ", conv_transpose_attrs_.group);
   }
 
+  // Bias shape validation (It should be a 1D tensor with size M)
+  // See https://github.com/microsoft/onnxruntime/issues/26144
+  if (B != nullptr) {
+    if (B->Shape().NumDimensions() != 1 || B->Shape()[0] != num_output_channels) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Bias shape is not compatible with number of output channels."
+                             " It should be a 1-D tensor with size num_output_channels(M).",
+                             " Bias: ", B->Shape(),
+                             " num_output_channels: ", num_output_channels);
+    }
+  }
+
   TensorShapeVector kernel_shape;
   ORT_RETURN_IF_ERROR(conv_transpose_attrs_.ComputeKernelShape(w_shape, kernel_shape, w_in_nhwc));

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 12 additions & 3 deletions
@@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() {
   return SetContextPriority(context_priority_);
 }
 
-Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
+Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
   if (true == context_created_) {
     LOGS_DEFAULT(INFO) << "Context created already.";
     return Status::OK();
@@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool ena
   QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
   ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));
 
+  QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
+  QnnHtpContext_CustomConfig_t udma_custom_config;
+  udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
+  udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
+  context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+  context_config_extended_udma.customConfig = &udma_custom_config;
+
   const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                       &context_config_weight_sharing,
+                                                      &context_config_extended_udma,
                                                       nullptr};
 
   const QnnContext_Config_t* empty_context_configs[] = {nullptr};
@@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
                                        bool enable_vtcm_backup_buffer_sharing,
                                        bool enable_file_mapped_weights,
                                        std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
-                                       std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map) {
+                                       std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
+                                       bool enable_htp_extended_udma_mode) {
   std::lock_guard<std::recursive_mutex> lock(logger_recursive_mutex_);
   if (backend_setup_completed_) {
     LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
 
   if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
     status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map)
                                                  : CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode);
 
     if (status.IsOK()) {
       LOGS(logger, VERBOSE) << "CreateContext succeed.";
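
For reference, the config plumbing in isolation: each HTP feature becomes one custom config entry, and the context receives a null-terminated array of config pointers, so the new extended-UDMA entry simply slots in alongside the priority and weight-sharing configs. This fragment is a sketch reusing only identifiers visible in the hunk above; it assumes the QNN SDK headers and the surrounding CreateContext locals (context_priority_config, context_config_weight_sharing) are in scope:

// Sketch, not a standalone translation unit.
QnnHtpContext_CustomConfig_t udma_custom_config;
udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;  // provider-level toggle

QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
context_config_extended_udma.customConfig = &udma_custom_config;

// The array handed to context creation is terminated with nullptr.
const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                    &context_config_weight_sharing,
                                                    &context_config_extended_udma,
                                                    nullptr};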

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 3 additions & 2 deletions
@@ -171,7 +171,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
                       bool enable_vtcm_backup_buffer_sharing,
                       bool enable_file_mapped_weights,
                       std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
-                      std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map);
+                      std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
+                      bool enable_htp_extended_udma_mode);
 
   Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);
 
@@ -299,7 +300,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
 
   Status ReleaseProfilehandle();
 
-  Status CreateContext(bool enable_htp_weight_sharing);
+  Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode);
 
   Status GetFileSizeIfValid(const std::string& filepath, size_t& file_size);
 
