
Commit 0a04afc

Merge branch 'microsoft:main' into main
2 parents 878dff6 + 4295524 commit 0a04afc

38 files changed: +3423 −518 lines

.github/workflows/linux_ci.yml

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ jobs:
 dockerfile_path: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
 docker_image_repo: onnxruntimecpubuildciaarch64
 # ASan disabled due to excessive runtime (>4hr). Includes wheel build for basic checks.
-extra_build_flags: '--use_binskim_compliant_compile_flags --build_shared_lib'
+extra_build_flags: '--use_binskim_compliant_compile_flags --build_shared_lib --enable_arm_neon_nchwc'
 job_identifier: build-linux-arm64-debug
 secrets:
 GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/linux_cuda_ci.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
 dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda
 docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_almalinux8_gcc14:20251017.1'
 docker_image_repo: onnxruntimecuda12manylinuxbuild
-extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
+extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --parallel --nvcc_threads 1 --cuda_version=12.8 --cuda_home=/usr/local/cuda-12.8 --cudnn_home=/usr/local/cuda-12.8 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON'
 python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH'
 run_tests: false # <<< Do not run tests in this job
 upload_build_output: true # <<< Upload the build/Release directory

.github/workflows/windows_cuda.yml

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@ jobs:
 exit $lastExitCode
 }
 # Execute the build process
-python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
 if ($lastExitCode -ne 0) {
 exit $lastExitCode
 }
@@ -235,7 +235,7 @@ jobs:
 exit $lastExitCode
 }

-python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
 if ($lastExitCode -ne 0) {
 exit $lastExitCode
 }

.github/workflows/windows_tensorrt.yml

Lines changed: 2 additions & 2 deletions
@@ -121,7 +121,7 @@ jobs:
 exit $lastExitCode
 }
 # Execute the build process
-python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
 if ($lastExitCode -ne 0) {
 exit $lastExitCode
 }
@@ -247,7 +247,7 @@ jobs:
 exit $lastExitCode
 }

-python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --parallel --nvcc_threads 1 --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
 if ($lastExitCode -ne 0) {
 exit $lastExitCode
 }

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1441,7 +1441,7 @@ get_property(onnxruntime_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_
 if (onnxruntime_USE_CUDA)
 set(CMAKE_CUDA_STANDARD 17)
 if(onnxruntime_CUDA_HOME)
-file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
+file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
 endif()
 find_package(CUDAToolkit REQUIRED)
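Note: the change above puts the arguments in the order CMake expects. The signature is file(TO_CMAKE_PATH <path> <out-var>), input path first and result variable second, so the old call converted the literal string "CUDAToolkit_ROOT" instead of the CUDA home path. A minimal standalone sketch of the corrected pattern (the sample path is an assumption, not taken from this commit):

# Hypothetical CUDA home; in the real build it comes from --cuda_home via build.py.
set(onnxruntime_CUDA_HOME "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8")

# file(TO_CMAKE_PATH <path> <out-var>): converts backslashes to forward slashes.
file(TO_CMAKE_PATH "${onnxruntime_CUDA_HOME}" CUDAToolkit_ROOT)
message(STATUS "CUDAToolkit_ROOT hint: ${CUDAToolkit_ROOT}")

# CUDAToolkit_ROOT then acts as a search hint; requires a CUDA Toolkit install.
find_package(CUDAToolkit REQUIRED)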

cmake/external/cuda_configuration.cmake

Lines changed: 15 additions & 7 deletions
@@ -85,6 +85,11 @@ macro(setup_cuda_architectures)
 # * Always use accelerated (`-a` suffix) target for supported real architectures.
 # cmake-format: on

+# Allow override via CUDAARCHS environment variable (standard CMake variable)
+if(NOT CMAKE_CUDA_ARCHITECTURES AND DEFINED ENV{CUDAARCHS})
+set(CMAKE_CUDA_ARCHITECTURES "$ENV{CUDAARCHS}")
+endif()
+
 if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
 # Detect highest available compute capability
 set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
@@ -142,12 +147,12 @@ macro(setup_cuda_architectures)
 continue()
 endif()

-if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
+if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
 set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
-elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
-list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
-elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
+elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
 list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
+elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
+list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1}${CMAKE_MATCH_4})
 else()
 message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
 endif()
@@ -159,7 +164,7 @@ macro(setup_cuda_architectures)
 set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
 message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

-set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
+set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
 foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
 if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
 add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
@@ -168,10 +173,13 @@ macro(setup_cuda_architectures)
 endforeach()

 # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
-set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
+set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "110" "120")
 unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
 foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
-if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
+if(CUDA_ARCH MATCHES "^([0-9]+)f$")
+# Family code, no -real suffix
+list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}")
+elseif("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
 list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
 else()
 list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
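For reference, the reworked patterns above accept an optional "a" or "f" suffix on each architecture entry, and entries ending in "f" are kept as-is during normalization rather than gaining a "-real" suffix. A minimal sketch of how a list of tokens would be classified under those same regexes (the sample list is illustrative only; run with cmake -P):

# classify_arch.cmake -- hypothetical demo of the parsing logic in the diff above.
# The sample input is an assumption; the real macro reads CMAKE_CUDA_ARCHITECTURES / CUDAARCHS.
set(SAMPLE_ARCHS "86" "90a" "100f" "90-virtual" "120a-real")

foreach(CUDA_ARCH IN LISTS SAMPLE_ARCHS)
  if(CUDA_ARCH MATCHES "^([1-9])([0-9])+[af]?-virtual$")
    message(STATUS "${CUDA_ARCH}: virtual architecture")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)[af]?-real$")
    message(STATUS "${CUDA_ARCH}: real architecture -> ${CMAKE_MATCH_1}")
  elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)([af]?)$")
    # Bare entry; an "f" suffix (family code) is preserved in the cleaned value.
    message(STATUS "${CUDA_ARCH}: bare entry -> ${CMAKE_MATCH_1}${CMAKE_MATCH_4}")
  else()
    message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
  endif()
endforeach()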

cmake/onnxruntime_providers_nv.cmake

Lines changed: 6 additions & 3 deletions
@@ -1,7 +1,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # Licensed under the MIT License.
-find_package(CUDAToolkit REQUIRED 12.8)
+if(onnxruntime_CUDA_HOME)
+file(TO_CMAKE_PATH ${onnxruntime_CUDA_HOME} CUDAToolkit_ROOT)
+endif()
+find_package(CUDAToolkit REQUIRED)
 enable_language(CUDA)
 if(onnxruntime_DISABLE_CONTRIB_OPS)
 message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )
@@ -146,9 +149,9 @@ endif ()
 target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen)
 add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
 if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
-target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 else()
-target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
+target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart CUDA::cuda_driver)
 endif()
 target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${TENSORRT_RTX_INCLUDE_DIR} ${onnx_tensorrt_SOURCE_DIR}
 PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
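Note: this file now applies the same CUDAToolkit_ROOT hint before find_package and additionally links the driver API through the CUDA::cuda_driver imported target, which FindCUDAToolkit defines alongside CUDA::cudart. A minimal sketch of that pattern for a hypothetical target (target and source names are assumptions, not from this commit):

# onnxruntime_CUDA_HOME would normally arrive from build.py's --cuda_home option.
if(onnxruntime_CUDA_HOME)
  file(TO_CMAKE_PATH "${onnxruntime_CUDA_HOME}" CUDAToolkit_ROOT)  # search hint
endif()
find_package(CUDAToolkit REQUIRED)  # provides CUDA::cudart, CUDA::cuda_driver, ...

add_library(my_cuda_provider SHARED provider.cc)  # placeholder target/source
target_link_libraries(my_cuda_provider
  PUBLIC CUDA::cudart       # CUDA runtime library
  PUBLIC CUDA::cuda_driver  # CUDA driver API (cu* entry points)
)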

js/web/docs/webnn-operators.md

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-s
 | GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 |
 | Greater | ai.onnx(7-8, 9-12, 13+) | greater | |
 | GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | |
-| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. Does not support cos_cache and sin_cache inputs |
+| GroupQueryAttention | com.microsoft(1+) | add, cast, concat, constant, cumulativeSum, div, expand, lesser, matmul, reshape, scatterND, softmax, transpose, where | Only supports input total_sequence_length is constant and past_sequence_length of past kv equals to present_sequence_length of present kv. |
 | GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
 | HardSigmoid | ai.onnx(7+) | hardSigmoid | |
 | HardSwish | ai.onnx(14+) | hardSwish | |
