
Commit 23303da

Merge remote-tracking branch 'origin' into hari/einsum_fix_1
2 parents 3bedf32 + 892b2f1

36 files changed: +568 −309 lines

js/web/test/e2e/exports/testcases/nextjs-default/package-lock.json

Lines changed: 215 additions & 219 deletions
Some generated files are not rendered by default.

js/web/test/e2e/exports/testcases/nextjs-default/package.json

Lines changed: 1 addition & 1 deletion
@@ -11,6 +11,6 @@
   "dependencies": {
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
-    "next": "15.4.10"
+    "next": "16.1.5"
   }
 }

onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu

Lines changed: 3 additions & 1 deletion
@@ -258,7 +258,9 @@ Status PrepareQkv_MHA_NoPast(contrib::AttentionParameters& parameters,
   assert(data.past_value == nullptr);
   assert(data.present_key == nullptr);
   assert(data.present_value == nullptr);
-  assert(!parameters.is_unidirectional);
+  // Note: is_unidirectional (causal) is supported by flash attention, memory efficient attention,
+  // cuDNN flash attention, and unfused kernel. TRT fused runner is only used when !is_unidirectional
+  // (enforced in MultiHeadAttention::ComputeInternal).
   assert(data.has_qkv_workspace == !NoQkvWorkspace_MHA_NoPast(data));

   if (parameters.qkv_format == AttentionQkvFormat::Q_K_V_BSNH) {
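
For context, a minimal sketch of the selection rule the new comment refers to. This is illustrative only; the helper name below is hypothetical, and the real decision in MultiHeadAttention::ComputeInternal weighs more factors than this:

// Hypothetical helper, not the actual ONNX Runtime code: the TRT fused runner is only eligible
// when attention is not causal, so causal (unidirectional) inputs fall through to flash,
// memory efficient, cuDNN flash, or the unfused kernel, which is why the assert above is no
// longer needed in this no-past QKV preparation path.
inline bool CanUseTrtFusedRunner(bool has_fused_runner, bool is_unidirectional) {
  return has_fused_runner && !is_unidirectional;
}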

onnxruntime/contrib_ops/webgpu/moe/moe.h

Lines changed: 3 additions & 1 deletion
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <limits>
+
 #include "core/providers/webgpu/program.h"
 #include "core/providers/webgpu/webgpu_kernel.h"
 
@@ -31,7 +33,7 @@ class MoE : public WebGpuKernel {
     activation_alpha_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_alpha", 1.0));
     activation_beta_ = static_cast<float>(info.GetAttrOrDefault<float>("activation_beta", 1.0));
     swiglu_fusion_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("swiglu_fusion", 0));
-    swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", 0);
+    swiglu_limit_ = info.GetAttrOrDefault<float>("swiglu_limit", std::numeric_limits<float>::infinity());
    k_ = static_cast<int>(info.GetAttrOrDefault<int64_t>("k", 4));
    normalize_routing_weights_ = info.GetAttrOrDefault<int64_t>("normalize_routing_weights", 0) == 1;
    use_sparse_mixer_ = info.GetAttrOrDefault<int64_t>("use_sparse_mixer", 0) == 1;
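
A small sketch of why infinity is the natural "no clamping" default. This assumes swiglu_limit acts as an upper clamp on the activation value (the WebGPU shader itself is not shown in this commit), and the function name is illustrative:

#include <algorithm>
#include <limits>

// With the old default of 0, any positive value would be clamped to zero when the model
// omits the attribute; with +infinity, std::min leaves the value untouched unless the model
// provides an explicit limit.
inline float ApplySwigluLimit(float value,
                              float swiglu_limit = std::numeric_limits<float>::infinity()) {
  return std::min(value, swiglu_limit);
}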

onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp

Lines changed: 33 additions & 2 deletions
@@ -395,6 +395,12 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
   auto lhs_ptrs = std::shared_ptr<const void*[]>(new const void*[lhs_ptrs_k * lhs_ptrs_m],
                                                  std::default_delete<const void*[]>());
 
+  // Initialize all padding entries. For partial tiles (m < m_step),
+  // the kai LHS packing kernel may still read pointer entries beyond the logically
+  // filled 'm' positions. Leaving these uninitialized can cause non-deterministic
+  // reads and corrupt packed LHS data.
+  auto lhs_ptrs_ = lhs_ptrs.get();
+  std::fill(lhs_ptrs_, lhs_ptrs_ + (lhs_ptrs_k * lhs_ptrs_m), reinterpret_cast<const void*>(&pad_ptr[0]));
 
   auto ih_out_size = ComputeConvOutSize(ih, kh, padding, 1);
   auto iw_out_size = ComputeConvOutSize(iw, kw, padding, 1);
@@ -430,7 +436,6 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
   };
 
   size_t m_{0};
-  auto lhs_ptrs_ = lhs_ptrs.get();
   for (size_t ih_ = 0; ih_ < ih_out_size; ih_ += sh) {
     for (size_t iw_ = 0; iw_ < iw_out_size; iw_ += sw, ++m_) {
       size_t k_{0};
@@ -460,7 +465,23 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
     // figure out how many blocks needed to correctly fill padding
     padsize = ((ci + padsize - 1) / padsize) * padsize;
   }
-  static std::vector<float> pad_ptr(padsize, 0.f);
+
+  // pad_ptr must be at least 'ci' floats for padding pixels.
+  // Using a thread_local grow-only buffer to avoid cross-thread interference and ensure sizing is correct.
+  thread_local std::vector<float> pad_ptr;
+  const float* old_pad_ptr = pad_ptr.data();
+  bool has_pad_ptr_changed = false;
+
+  if (pad_ptr.size() < padsize) {
+    pad_ptr.resize(padsize, 0.f);
+    if (pad_ptr.data() != old_pad_ptr) {
+      has_pad_ptr_changed = true;
+    }
+  } else {
+    // Ensure any previously-used region remains zeroed (grow-only means it should already be zeros,
+    // but keep this explicit for safety).
+    std::fill(pad_ptr.begin(), pad_ptr.end(), 0.f);
+  }
 
   LhsCacheKey key = {
       ci, ih, iw,
@@ -481,6 +502,16 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
   // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions.
   thread_local std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>> lhs_ptrs_cache;
 
+  if (has_pad_ptr_changed) {
+    // If the pad buffer was resized and a re-allocation has occurred, the cached lhs ptrs are invalid as they
+    // would be referencing the old pad buffer.
+    // See discussion in https://github.com/microsoft/onnxruntime/pull/27214.
+    // TODO(hasesh / JonathanC-ARM): A better approach would be to include the pad buffer address in the cache key
+    // or any other approach that would reduce unnecessary cache invalidations.
+    lhs_ptrs_cache.clear();
+  }
+
   std::shared_ptr<const void*[]> lhs_ptrs;
   if (auto found = lhs_ptrs_cache.find(key); found != lhs_ptrs_cache.end()) {
     lhs_ptrs = found->second;
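
A standalone sketch of the pattern introduced here: a thread_local, grow-only zero buffer whose reallocation is detected so the caller can invalidate any cached raw pointers into it. The function and variable names below are illustrative, not the real MLAS/KleidiAI symbols:

#include <vector>

// Returns a zeroed buffer of at least 'padsize' floats that lives for the thread's lifetime.
// 'reallocated' is set when growth moved the underlying storage, signalling that any cached
// pointers into the old buffer are now dangling.
inline const float* GetZeroPadBuffer(size_t padsize, bool& reallocated) {
  thread_local std::vector<float> pad;       // grows but never shrinks within this thread
  const float* old_data = pad.data();
  reallocated = false;
  if (pad.size() < padsize) {
    pad.resize(padsize, 0.f);                // new elements are zero-initialized
    reallocated = (pad.data() != old_data);  // growth may move the storage
  }
  return pad.data();
}

When the flag comes back true, caches holding raw pointers into the buffer (like lhs_ptrs_cache above) must be cleared, which is exactly what the new has_pad_ptr_changed flag triggers in the diff.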

onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h

Lines changed: 12 additions & 0 deletions
@@ -99,6 +99,18 @@ struct ConvTransposeAttributes : public ConvAttributes {
                              " group: ", group);
     }
 
+    // Bias shape validation (It should be a 1D tensor with size M)
+    // See https://github.com/microsoft/onnxruntime/issues/26144
+    if (B != nullptr) {
+      if (B->Shape().NumDimensions() != 1 || B->Shape()[0] != num_output_channels) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "Bias shape is not compatible with number of output channels."
+                               " It should be a 1-D tensor with size num_output_channels(M).",
+                               " Bias: ", B->Shape(),
+                               " num_output_channels: ", num_output_channels);
+      }
+    }
+
     TensorShapeVector kernel_shape;
     ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc));

onnxruntime/core/providers/cuda/llm/attention.cc

Lines changed: 1 addition & 2 deletions
@@ -191,7 +191,6 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
     ORT_THROW("softmax_precision is not supported yet in Attention op (CUDA).");
   }
 
-  // TODO(titaiwang): Continue on these parameters
   // Construct AttentionData to pass to QkvToContext
   typedef typename ToCudaType<T>::MappedType CudaT;
   onnxruntime::contrib::cuda::AttentionData<CudaT> data;
@@ -220,12 +219,12 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
   }
   data.qkv_format = contribop_parameters.qkv_format;
 
-  // TODO: Determine which kernel to use (Flash Attention, Memory Efficient Attention, etc.)
   // For now, set flags to false and let QkvToContext use the unfused path
   data.use_flash_attention = false;
   data.use_memory_efficient_attention = false;
   data.fused_runner = nullptr;
   data.fused_cross_attention_kernel = nullptr;
+  data.kernel_type = onnxruntime::contrib::AttentionKernelType::AttentionKernel_Unfused;
 
   // Allocate workspace for Q, K, V processing and scratch buffer
   const bool no_qkv_workspace = onnxruntime::contrib::cuda::NoQkvWorkspace(contribop_parameters, data);

onnxruntime/core/providers/cuda/nn/conv_transpose.cc

Lines changed: 12 additions & 0 deletions
@@ -311,6 +311,18 @@ Status ConvTranspose<T, Layout>::UpdateState(OpKernelContext* context, bool dyna
                            " group: ", conv_transpose_attrs_.group);
   }
 
+  // Bias shape validation (It should be a 1D tensor with size M)
+  // See https://github.com/microsoft/onnxruntime/issues/26144
+  if (B != nullptr) {
+    if (B->Shape().NumDimensions() != 1 || B->Shape()[0] != num_output_channels) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Bias shape is not compatible with number of output channels."
+                             " It should be a 1-D tensor with size num_output_channels(M).",
+                             " Bias: ", B->Shape(),
+                             " num_output_channels: ", num_output_channels);
+    }
+  }
+
   TensorShapeVector kernel_shape;
   ORT_RETURN_IF_ERROR(conv_transpose_attrs_.ComputeKernelShape(w_shape, kernel_shape, w_in_nhwc));

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 12 additions & 3 deletions
@@ -1168,7 +1168,7 @@ Status QnnBackendManager::ResetContextPriority() {
   return SetContextPriority(context_priority_);
 }
 
-Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
+Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
   if (true == context_created_) {
     LOGS_DEFAULT(INFO) << "Context created already.";
     return Status::OK();
@@ -1184,8 +1184,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool ena
   QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
   ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));
 
+  QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
+  QnnHtpContext_CustomConfig_t udma_custom_config;
+  udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
+  udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
+  context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
+  context_config_extended_udma.customConfig = &udma_custom_config;
+
   const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                       &context_config_weight_sharing,
+                                                      &context_config_extended_udma,
                                                       nullptr};
 
   const QnnContext_Config_t* empty_context_configs[] = {nullptr};
@@ -1568,7 +1576,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
                                        bool enable_vtcm_backup_buffer_sharing,
                                        bool enable_file_mapped_weights,
                                        std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
-                                       std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map) {
+                                       std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
+                                       bool enable_htp_extended_udma_mode) {
   std::lock_guard<std::recursive_mutex> lock(logger_recursive_mutex_);
   if (backend_setup_completed_) {
     LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -1679,7 +1688,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
 
   if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
     status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map)
                                                  : CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode);
 
     if (status.IsOK()) {
       LOGS(logger, VERBOSE) << "CreateContext succeed.";
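
For reference, the config plumbing in isolation: each HTP feature becomes one custom config entry, and the context receives a null-terminated array of config pointers, so the new extended-UDMA entry simply slots in alongside the priority and weight-sharing configs. This fragment is a sketch reusing only identifiers visible in the hunk above; it assumes the QNN SDK headers and the surrounding CreateContext locals (context_priority_config, context_config_weight_sharing) are in scope:

// Sketch, not a standalone translation unit.
QnnHtpContext_CustomConfig_t udma_custom_config;
udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;  // provider-level toggle

QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
context_config_extended_udma.customConfig = &udma_custom_config;

// The array handed to context creation is terminated with nullptr.
const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
                                                    &context_config_weight_sharing,
                                                    &context_config_extended_udma,
                                                    nullptr};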

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 3 additions & 2 deletions
@@ -171,7 +171,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
                       bool enable_vtcm_backup_buffer_sharing,
                       bool enable_file_mapped_weights,
                       std::shared_ptr<qnn::RpcMemLibrary> rpcmem_library,
-                      std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map);
+                      std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
+                      bool enable_htp_extended_udma_mode);
 
   Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);
 
@@ -299,7 +300,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
 
   Status ReleaseProfilehandle();
 
-  Status CreateContext(bool enable_htp_weight_sharing);
+  Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode);
 
   Status GetFileSizeIfValid(const std::string& filepath, size_t& file_size);
 
