Commit 20c02d3
Merge branch 'main' into copilot/support-group-query-attention
2 parents: cd992ee + a3749f1
33 files changed: +3084 -423 lines

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1441,7 +1441,7 @@ get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_
 if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_STANDARD 17)
   if(onnxruntime_CUDA_HOME)
-    file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
+    file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
   endif()
   find_package(CUDAToolkit REQUIRED)
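The fix swaps the arguments into the order that `file(TO_CMAKE_PATH <path> <out-var>)` actually expects: input path first, output variable second. The old call converted the literal string `CUDAToolkit_ROOT` and wrote the result into a variable named after the path's value, so the `CUDAToolkit_ROOT` hint for `find_package(CUDAToolkit)` was never set. A minimal stand-alone sketch of the corrected call, runnable with `cmake -P` (the path value is hypothetical):

```cmake
# sketch.cmake -- run with: cmake -P sketch.cmake
set(onnxruntime_CUDA_HOME "C:\\CUDA\\v12.8")  # hypothetical CUDA install path
file(TO_CMAKE_PATH "${onnxruntime_CUDA_HOME}" CUDAToolkit_ROOT)
# Backslashes are normalized into CMake-style forward slashes:
message(STATUS "CUDAToolkit_ROOT = ${CUDAToolkit_ROOT}")  # -> C:/CUDA/v12.8
```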

cmake/external/cuda_configuration.cmake

Lines changed: 15 additions & 7 deletions
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
   # * Always use accelerated (`-a` suffix) target for supported real architectures.
   # cmake-format: on

+  # Allow override via CUDAARCHS environment variable (standard CMake variable)
+  if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
+    set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
+  endif()
+
   if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     # Detect highest available compute capability
     set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
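The new guard honors `CUDAARCHS`, the standard environment variable that CMake 3.20+ itself uses to initialize `CMAKE_CUDA_ARCHITECTURES`, but only when no architecture list was set explicitly. A usage sketch under that assumption (the architecture list is illustrative):

```cmake
# override.cmake -- run with: CUDAARCHS="80;90a-real" cmake -P override.cmake
# An explicit -DCMAKE_CUDA_ARCHITECTURES=... still wins; otherwise the
# CUDAARCHS environment variable seeds the list, as in the hunk above.
if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
  set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
endif()
message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
```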
@@ -142,12 +147,12 @@ macro(setup_cuda_architectures)
       continue()
     endif()

-    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
+    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
       set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
-      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
       list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
     else()
       message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
     endif()
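The widened patterns accept an optional `a` (architecture-specific) or `f` (family-specific) suffix, and the new fourth capture group keeps a bare `f` suffix such as `100f` in the cleaned list instead of stripping it. A stand-alone sketch of how the three branches classify a few sample tokens, runnable with `cmake -P` (the token list is illustrative):

```cmake
# classify.cmake -- run with: cmake -P classify.cmake
foreach(CUDA_ARCH IN ITEMS "120a-virtual" "90a-real" "100f" "86" "9x")
  if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
    message(STATUS "${CUDA_ARCH}: virtual target")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
    message(STATUS "${CUDA_ARCH}: real target -> cleaned to ${CMAKE_MATCH_1}")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
    message(STATUS "${CUDA_ARCH}: bare -> cleaned to ${CMAKE_MATCH_1}${CMAKE_MATCH_4}")
  else()
    message(STATUS "${CUDA_ARCH}: unrecognized")  # would hit the FATAL_ERROR above
  endif()
endforeach()
```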
@@ -159,7 +164,7 @@ macro(setup_cuda_architectures)
   set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
   message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

-  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
+  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")

@@ -168,10 +173,13 @@ macro(setup_cuda_architectures)
   endforeach()

   # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
-  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
+  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
   unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
   foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
+    if(CUDA_ARCH MATCHES "^([0-9]+)f$")
+      # Family code, no -real suffix
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
+    elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
     else()
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
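Family codes such as `100f` already name a whole SASS family, so the normalization keeps them as-is, while plain numeric entries still get a `-real` suffix and entries in the accel list get `a-real`. A small sketch of the resulting mapping, runnable with `cmake -P` (the input list is illustrative):

```cmake
# normalize.cmake -- run with: cmake -P normalize.cmake
set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
foreach(CUDA_ARCH IN ITEMS "86" "90" "100f")
  if(CUDA_ARCH MATCHES "^([0-9]+)f$")
    set(normalized "${CUDA_ARCH}")        # family code: pass through untouched
  elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
    set(normalized "${CUDA_ARCH}a-real")  # accelerated real target (SM >= 90)
  else()
    set(normalized "${CUDA_ARCH}-real")   # plain real target
  endif()
  message(STATUS "${CUDA_ARCH} -> ${normalized}")  # 86 -> 86-real, 90 -> 90a-real, 100f -> 100f
endforeach()
```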

cmake/onnxruntime_providers_nv.cmake

Lines changed: 6 additions & 3 deletions
@@ -1,7 +1,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # Licensed under the MIT License.
-find_package(CUDAToolkit REQUIRED 12.8)
+if(onnxruntime_CUDA_HOME)
+  file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
+endif()
+find_package(CUDAToolkit REQUIRED)
 enable_language(CUDA)
 if(onnxruntime_DISABLE_CONTRIB_OPS)
   message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )

@@ -146,9 +149,9 @@ endif ()
 target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen)
 add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
 if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
-  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 else()
-  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 endif()
 target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
                            PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
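`CUDA::cudart` covers only the CUDA runtime; code that calls driver-API entry points (`cuDeviceGet`, `cuCtxGetCurrent`, ...) also needs the `CUDA::cuda_driver` imported target that the hunk above adds. A minimal sketch of the same linking pattern for a hypothetical target (`my_ep` and `my_ep.cc` are placeholders):

```cmake
cmake_minimum_required(VERSION 3.20)
project(driver_link_demo LANGUAGES CXX)
find_package(CUDAToolkit REQUIRED)  # provides CUDA::cudart and CUDA::cuda_driver
add_library(my_ep SHARED my_ep.cc)
# PUBLIC: consumers of my_ep inherit both the runtime and the driver stub.
target_link_libraries(my_ep PUBLIC CUDA::cudart CUDA::cuda_driver)
```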

js/web/docs/webnn-operators.md

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s
 | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 |
 | Greater | ai.onnx(7-8, 9-12, 13+) | greater | |
 | GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | |
-| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. Does not support cos_cache and sin_cache inputs |
+| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. |
 | GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
 | HardSigmoid | ai.onnx(7+) | hardSigmoid | |
 | HardSwish | ai.onnx(14+) | hardSwish | |

js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ export const parseConvTransposeAttributes = (attributes: Record<string, unknown>
     typeof attributes.autoPad == 'undefined' ? 0 : (attributes.autoPad as number)
   ];
   const dilations = attributes.dilations as [number, number];
-  const group = attributes.group as number;
+  const group = (attributes.group as number) ?? 1; // default to 1 per ONNX spec
   const kernelShape = attributes.kernelShape as [number, number];
   const pads = attributes.pads as [number, number, number, number];
   const strides = attributes.strides as [number, number];

onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc

Lines changed: 6 additions & 6 deletions
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }

@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");

@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.

@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   DeviceMesh dst_device_mesh;
   std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else if (dst_shape[decomposition_axis_in_dst] == 1) {
       // S[0] -> RS[0]

@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // Source tensor is sharded on non-decomposed axis.
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else {
       // R -> RR

onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc

Lines changed: 3 additions & 3 deletions
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE

onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

Lines changed: 7 additions & 0 deletions
@@ -29,7 +29,14 @@

 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif

 namespace onnxruntime::llm {

onnxruntime/core/platform/windows/telemetry.cc

Lines changed: 18 additions & 9 deletions
@@ -284,7 +284,8 @@ void WindowsTelemetry::LogSessionCreationStart(uint32_t session_id) const {
                     TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
                     TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
                     TraceLoggingUInt32(session_id, "sessionId"),
-                    TraceLoggingLevel(WINEVENT_LEVEL_INFO));
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogEvaluationStop(uint32_t session_id) const {

@@ -386,7 +387,8 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                       TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                       TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
                       TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
-                      TraceLoggingString(service_names.c_str(), "serviceNames"));
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   } else {
     TraceLoggingWrite(telemetry_provider_handle,
                       "SessionCreation_CaptureState",

@@ -413,7 +415,8 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                       TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                       TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
                       TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
-                      TraceLoggingString(service_names.c_str(), "serviceNames"));
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   }
 }

@@ -502,7 +505,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                     TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                     TraceLoggingString(file, "file"),
                     TraceLoggingString(function, "function"),
-                    TraceLoggingInt32(line, "line"));
+                    TraceLoggingInt32(line, "line"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 #else
   TraceLoggingWrite(telemetry_provider_handle,
                     "RuntimeError",

@@ -518,7 +522,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                     TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                     TraceLoggingString(file, "file"),
                     TraceLoggingString(function, "function"),
-                    TraceLoggingInt32(line, "line"));
+                    TraceLoggingInt32(line, "line"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 #endif
 }

@@ -548,7 +553,8 @@ void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_s
                     TraceLoggingUInt32(session_id, "sessionId"),
                     TraceLoggingUInt32(total_runs_since_last, "totalRuns"),
                     TraceLoggingInt64(total_run_duration_since_last, "totalRunDuration"),
-                    TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"));
+                    TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogExecutionProviderEvent(LUID* adapterLuid) const {

@@ -624,7 +630,8 @@ void WindowsTelemetry::LogAutoEpSelection(uint32_t session_id, const std::string
                     TraceLoggingUInt32(session_id, "sessionId"),
                     TraceLoggingString(selection_policy.c_str(), "selectionPolicy"),
                     TraceLoggingString(requested_execution_provider_string.c_str(), "requestedExecutionProviderIds"),
-                    TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"));
+                    TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const std::string& provider_options_string, bool captureState) const {

@@ -643,7 +650,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const
                       // Telemetry info
                       TraceLoggingUInt8(0, "schemaVersion"),
                       TraceLoggingString(provider_id.c_str(), "providerId"),
-                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"));
+                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   } else {
     TraceLoggingWrite(telemetry_provider_handle,
                       "ProviderOptions_CaptureState",

@@ -655,7 +663,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const
                       // Telemetry info
                       TraceLoggingUInt8(0, "schemaVersion"),
                       TraceLoggingString(provider_id.c_str(), "providerId"),
-                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"));
+                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   }
 }