Commit 20c02d3
Merge branch 'main' into copilot/support-group-query-attention
2 parents: cd992ee + a3749f1
33 files changed: +3084 -423 lines

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1441,7 +1441,7 @@ get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_
 if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_STANDARD 17)
   if(onnxruntime_CUDA_HOME)
-    file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
+    file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
   endif()
   find_package(CUDAToolkit REQUIRED)
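The fix swaps the arguments into the order that `file(TO_CMAKE_PATH <path> <out-var>)` actually expects: input path first, output variable second. The old call converted the literal string `CUDAToolkit_ROOT` and wrote the result into a variable named after the path's value, so the `CUDAToolkit_ROOT` hint for `find_package(CUDAToolkit)` was never set. A minimal stand-alone sketch of the corrected call, runnable with `cmake -P` (the path value is hypothetical):

```cmake
# sketch.cmake -- run with: cmake -P sketch.cmake
set(onnxruntime_CUDA_HOME "C:\\CUDA\\v12.8")  # hypothetical CUDA install path
file(TO_CMAKE_PATH "${onnxruntime_CUDA_HOME}" CUDAToolkit_ROOT)
# Backslashes are normalized into CMake-style forward slashes:
message(STATUS "CUDAToolkit_ROOT = ${CUDAToolkit_ROOT}")  # -> C:/CUDA/v12.8
```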

cmake/external/cuda_configuration.cmake

Lines changed: 15 additions & 7 deletions
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
   # * Always use accelerated (`-a` suffix) target for supported real architectures.
   # cmake-format: on

+  # Allow override via CUDAARCHS environment variable (standard CMake variable)
+  if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
+    set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
+  endif()
+
   if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
     # Detect highest available compute capability
     set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
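The new guard honors `CUDAARCHS`, the standard environment variable that CMake 3.20+ itself uses to initialize `CMAKE_CUDA_ARCHITECTURES`, but only when no architecture list was set explicitly. A usage sketch under that assumption (the architecture list is illustrative):

```cmake
# override.cmake -- run with: CUDAARCHS="80;90a-real" cmake -P override.cmake
# An explicit -DCMAKE_CUDA_ARCHITECTURES=... still wins; otherwise the
# CUDAARCHS environment variable seeds the list, as in the hunk above.
if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
  set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
endif()
message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
```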
@@ -142,12 +147,12 @@ macro(setup_cuda_architectures)
       continue()
     endif()

-    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
+    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
       set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
-      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
-    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
       list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
+    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
     else()
       message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
     endif()
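The widened patterns accept an optional `a` (architecture-specific) or `f` (family-specific) suffix, and the new fourth capture group keeps a bare `f` suffix such as `100f` in the cleaned list instead of stripping it. A stand-alone sketch of how the three branches classify a few sample tokens, runnable with `cmake -P` (the token list is illustrative):

```cmake
# classify.cmake -- run with: cmake -P classify.cmake
foreach(CUDA_ARCH IN ITEMS "120a-virtual" "90a-real" "100f" "86" "9x")
  if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
    message(STATUS "${CUDA_ARCH}: virtual target")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
    message(STATUS "${CUDA_ARCH}: real target -> cleaned to ${CMAKE_MATCH_1}")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
    message(STATUS "${CUDA_ARCH}: bare -> cleaned to ${CMAKE_MATCH_1}${CMAKE_MATCH_4}")
  else()
    message(STATUS "${CUDA_ARCH}: unrecognized")  # would hit the FATAL_ERROR above
  endif()
endforeach()
```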
@@ -159,7 +164,7 @@ macro(setup_cuda_architectures)
   set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
   message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

-  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
+  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
       add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")

@@ -168,10 +173,13 @@ macro(setup_cuda_architectures)
   endforeach()

   # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
-  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
+  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
   unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
   foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
+    if(CUDA_ARCH MATCHES "^([0-9]+)f$")
+      # Family code, no -real suffix
+      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
+    elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
     else()
       list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
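Family codes such as `100f` already name a whole SASS family, so the normalization keeps them as-is, while plain numeric entries still get a `-real` suffix and entries in the accel list get `a-real`. A small sketch of the resulting mapping, runnable with `cmake -P` (the input list is illustrative):

```cmake
# normalize.cmake -- run with: cmake -P normalize.cmake
set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
foreach(CUDA_ARCH IN ITEMS "86" "90" "100f")
  if(CUDA_ARCH MATCHES "^([0-9]+)f$")
    set(normalized "${CUDA_ARCH}")        # family code: pass through untouched
  elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
    set(normalized "${CUDA_ARCH}a-real")  # accelerated real target (SM >= 90)
  else()
    set(normalized "${CUDA_ARCH}-real")   # plain real target
  endif()
  message(STATUS "${CUDA_ARCH} -> ${normalized}")  # 86 -> 86-real, 90 -> 90a-real, 100f -> 100f
endforeach()
```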

cmake/onnxruntime_providers_nv.cmake

Lines changed: 6 additions & 3 deletions
@@ -1,7 +1,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # Licensed under the MIT License.
-find_package(CUDAToolkit REQUIRED 12.8)
+if(onnxruntime_CUDA_HOME)
+  file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
+endif()
+find_package(CUDAToolkit REQUIRED)
 enable_language(CUDA)
 if(onnxruntime_DISABLE_CONTRIB_OPS)
   message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )

@@ -146,9 +149,9 @@ endif ()
 target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen)
 add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
 if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
-  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 else()
-  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 endif()
 target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
                            PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
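`CUDA::cudart` covers only the CUDA runtime; code that calls driver-API entry points (`cuDeviceGet`, `cuCtxGetCurrent`, ...) also needs the `CUDA::cuda_driver` imported target that the hunk above adds. A minimal sketch of the same linking pattern for a hypothetical target (`my_ep` and `my_ep.cc` are placeholders):

```cmake
cmake_minimum_required(VERSION 3.20)
project(driver_link_demo LANGUAGES CXX)
find_package(CUDAToolkit REQUIRED)  # provides CUDA::cudart and CUDA::cuda_driver
add_library(my_ep SHARED my_ep.cc)
# PUBLIC: consumers of my_ep inherit both the runtime and the driver stub.
target_link_libraries(my_ep PUBLIC CUDA::cudart CUDA::cuda_driver)
```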

js/web/docs/webnn-operators.md

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s
 | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 |
 | Greater | ai.onnx(7-8, 9-12, 13+) | greater | |
 | GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | |
-| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. Does not support cos_cache and sin_cache inputs |
+| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. |
 | GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
 | HardSigmoid | ai.onnx(7+) | hardSigmoid | |
 | HardSwish | ai.onnx(14+) | hardSwish | |

js/web/lib/wasm/jsep/webgpu/ops/conv-transpose.ts

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ export const parseConvTransposeAttributes = (attributes: Record<string, unknown>
     typeof attributes.autoPad == 'undefined' ? 0 : (attributes.autoPad as number)
   ];
   const dilations = attributes.dilations as [number, number];
-  const group = attributes.group as number;
+  const group = (attributes.group as number) ?? 1; // default to 1 per ONNX spec
   const kernelShape = attributes.kernelShape as [number, number];
   const pads = attributes.pads as [number, number, number, number];
   const strides = attributes.strides as [number, number];

onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc

Lines changed: 6 additions & 6 deletions
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }

@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");

@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.

@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   DeviceMesh dst_device_mesh;
   std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else if (dst_shape[decomposition_axis_in_dst] == 1) {
       // S[0] -> RS[0]

@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // Source tensor is sharded on non-decomposed axis.
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else {
       // R -> RR

onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc

Lines changed: 3 additions & 3 deletions
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE

onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

Lines changed: 7 additions & 0 deletions
@@ -29,7 +29,14 @@

 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif

 namespace onnxruntime::llm {

onnxruntime/core/platform/windows/telemetry.cc

Lines changed: 18 additions & 9 deletions
@@ -284,7 +284,8 @@ void WindowsTelemetry::LogSessionCreationStart(uint32_t session_id) const {
                     TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
                     TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
                     TraceLoggingUInt32(session_id, "sessionId"),
-                    TraceLoggingLevel(WINEVENT_LEVEL_INFO));
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogEvaluationStop(uint32_t session_id) const {

@@ -386,7 +387,8 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                       TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                       TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
                       TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
-                      TraceLoggingString(service_names.c_str(), "serviceNames"));
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   } else {
     TraceLoggingWrite(telemetry_provider_handle,
                       "SessionCreation_CaptureState",

@@ -413,7 +415,8 @@ void WindowsTelemetry::LogSessionCreation(uint32_t session_id, int64_t ir_versio
                       TraceLoggingString(model_metadata_string.c_str(), "modelMetaData"),
                       TraceLoggingString(loaded_from.c_str(), "loadedFrom"),
                       TraceLoggingString(execution_provider_string.c_str(), "executionProviderIds"),
-                      TraceLoggingString(service_names.c_str(), "serviceNames"));
+                      TraceLoggingString(service_names.c_str(), "serviceNames"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   }
 }

@@ -502,7 +505,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                     TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                     TraceLoggingString(file, "file"),
                     TraceLoggingString(function, "function"),
-                    TraceLoggingInt32(line, "line"));
+                    TraceLoggingInt32(line, "line"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 #else
   TraceLoggingWrite(telemetry_provider_handle,
                     "RuntimeError",

@@ -518,7 +522,8 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
                     TraceLoggingString(status.ErrorMessage().c_str(), "errorMessage"),
                     TraceLoggingString(file, "file"),
                     TraceLoggingString(function, "function"),
-                    TraceLoggingInt32(line, "line"));
+                    TraceLoggingInt32(line, "line"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 #endif
 }

@@ -548,7 +553,8 @@ void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_s
                     TraceLoggingUInt32(session_id, "sessionId"),
                     TraceLoggingUInt32(total_runs_since_last, "totalRuns"),
                     TraceLoggingInt64(total_run_duration_since_last, "totalRunDuration"),
-                    TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"));
+                    TraceLoggingString(total_duration_per_batch_size.c_str(), "totalRunDurationPerBatchSize"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogExecutionProviderEvent(LUID* adapterLuid) const {

@@ -624,7 +630,8 @@ void WindowsTelemetry::LogAutoEpSelection(uint32_t session_id, const std::string
                     TraceLoggingUInt32(session_id, "sessionId"),
                     TraceLoggingString(selection_policy.c_str(), "selectionPolicy"),
                     TraceLoggingString(requested_execution_provider_string.c_str(), "requestedExecutionProviderIds"),
-                    TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"));
+                    TraceLoggingString(available_execution_provider_string.c_str(), "availableExecutionProviderIds"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
 }

 void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const std::string& provider_options_string, bool captureState) const {

@@ -643,7 +650,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const
                       // Telemetry info
                       TraceLoggingUInt8(0, "schemaVersion"),
                       TraceLoggingString(provider_id.c_str(), "providerId"),
-                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"));
+                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   } else {
     TraceLoggingWrite(telemetry_provider_handle,
                       "ProviderOptions_CaptureState",

@@ -655,7 +663,8 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const
                       // Telemetry info
                       TraceLoggingUInt8(0, "schemaVersion"),
                       TraceLoggingString(provider_id.c_str(), "providerId"),
-                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"));
+                      TraceLoggingString(provider_options_string.c_str(), "providerOptions"),
+                      TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
   }
 }