Skip to content

Commit 242ee30

Browse files
committed
Merge branch 'main' into tlwu/gqa_xqa
2 parents 0acfe30 + 1817f4a commit 242ee30

40 files changed

+1365
-526
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
[submodule "cmake/external/emsdk"]
88
path = cmake/external/emsdk
99
url = https://github.com/emscripten-core/emsdk.git
10-
branch = 4.0.21
10+
branch = 4.0.23

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1208,7 +1208,7 @@ function(onnxruntime_configure_target target_name)
12081208

12091209
# Keep BinSkim happy
12101210
if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
1211-
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>" "$<$<LINK_LANGUAGE:CUDA>:-Xlinker=/CETCOMPAT>")
1211+
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>")
12121212
endif()
12131213

12141214
endfunction()

cmake/external/cuDNN.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
33
find_path(
44
CUDNN_INCLUDE_DIR cudnn.h
55
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
6-
PATH_SUFFIXES include
6+
PATH_SUFFIXES include include/${onnxruntime_CUDA_VERSION}
77
REQUIRED
88
)
99

@@ -15,7 +15,7 @@ function(find_cudnn_library NAME)
1515
find_library(
1616
${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
1717
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
18-
PATH_SUFFIXES lib64 lib/x64 lib
18+
PATH_SUFFIXES lib64 lib/x64 lib lib/${onnxruntime_CUDA_VERSION}/x64
1919
REQUIRED
2020
)
2121

cmake/external/cuda_configuration.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ macro(setup_cuda_architectures)
126126
set(CMAKE_CUDA_ARCHITECTURES "37;50;52;60;70;75;80;86;89")
127127
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
128128
set(CMAKE_CUDA_ARCHITECTURES "52;60;70;75;80;86;89;90")
129+
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
130+
# CUDA 13.x drops support for compute capabilities 60 and 70
131+
set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90;100;120")
129132
else()
130133
set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;89;90;100;120")
131134
endif()

cmake/onnxruntime_python.cmake

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,22 @@ if (WIN32)
297297
if (onnxruntime_USE_CUDA)
298298
file(WRITE "${VERSION_INFO_FILE}" "use_cuda = True\n")
299299
if(onnxruntime_CUDNN_HOME)
300-
file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll")
301-
if (NOT CUDNN_DLL_PATH)
300+
# may have x64 in the path
301+
# may have a path with the CUDA toolkit version if multiple versions are installed on the machine
302+
set(CUDNN_SEARCH_PATHS
303+
"${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll"
304+
"${onnxruntime_CUDNN_HOME}/bin/x64/cudnn64_*.dll"
305+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/cudnn64_*.dll"
306+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/x64/cudnn64_*.dll"
307+
)
308+
set(CUDNN_DLL_PATH "")
309+
foreach(search_path ${CUDNN_SEARCH_PATHS})
310+
file(GLOB CUDNN_DLL_PATH "${search_path}")
311+
if(CUDNN_DLL_PATH)
312+
break()
313+
endif()
314+
endforeach()
315+
if(NOT CUDNN_DLL_PATH)
302316
message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDNN_HOME}")
303317
endif()
304318
else()

cmake/onnxruntime_unittests.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -949,9 +949,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
949949
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
950950
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
951951
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
952-
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common)
952+
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils)
953953
target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
954-
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common)
954+
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils)
955955
if (MSVC)
956956
# Cutlass code has an issue with the following:
957957
# warning C4100: 'magic': unreferenced formal parameter
@@ -1233,15 +1233,15 @@ block()
12331233
DEPENDS ${onnxruntime_provider_test_deps}
12341234
)
12351235

1236-
# Expose QNN SDK headers to unit tests via an interface target
1236+
# Expose QNN SDK headers to unit tests via an interface target
12371237
if(onnxruntime_USE_QNN)
12381238
add_library(qnn_sdk_headers_include INTERFACE)
12391239
target_include_directories(qnn_sdk_headers_include INTERFACE
12401240
${onnxruntime_QNN_HOME}/include
12411241
${onnxruntime_QNN_HOME}/include/QNN)
12421242
target_link_libraries(onnxruntime_provider_test PRIVATE qnn_sdk_headers_include)
12431243
endif()
1244-
1244+
12451245
if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
12461246
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
12471247
# simply ignore them for TensorRT EP build

docs/OperatorKernels.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ Do not modify directly.*
653653
|ArgMin|*in* data:**T**<br> *out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
654654
|||12|**T** = tensor(double), tensor(float), tensor(float16)|
655655
|||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
656+
|Attention|*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *in* nonpad_kv_seqlen:**tensor(int64)**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**<br><br>or<br><br>*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)|
656657
|AveragePool|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
657658
|||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
658659
|||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)|

include/onnxruntime/core/session/onnxruntime_ep_c_api.h

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2046,6 +2046,65 @@ struct OrtEpFactory {
20462046
ORT_API2_STATUS(CreateExternalResourceImporterForDevice, _In_ OrtEpFactory* this_ptr,
20472047
_In_ const OrtEpDevice* ep_device,
20482048
_Outptr_result_maybenull_ OrtExternalResourceImporterImpl** out_importer);
2049+
2050+
/** \brief Returns the number of OrtCustomOpDomains that this factory provides.
2051+
*
2052+
* \param[in] this_ptr The OrtEpFactory instance.
2053+
* \param[out] num_domains Output parameter set to the number of provided OrtCustomOpDomain instances.
2054+
*
2055+
* \snippet{doc} snippets.dox OrtStatus Return Value
2056+
*
2057+
* \since Version 1.24.
2058+
*/
2059+
ORT_API2_STATUS(GetNumCustomOpDomains, _In_ OrtEpFactory* this_ptr, _Out_ size_t* num_domains);
2060+
2061+
/** \brief Gets the EP-specific OrtCustomOpDomains.
2062+
*
2063+
* This function is used when running inference on a model that contains EP-specific custom operations.
2064+
*
2065+
* Workflow:
2066+
* 1. The EP factory implements this function to supply a list of OrtCustomOpDomain instances.
2067+
* 2. The application either 1) calls SessionOptionsAppendExecutionProvider_V2() with an OrtEpDevice containing
2068+
* the plugin EP's factory or 2) enables auto ep selection.
2069+
* 3. 1) SessionOptionsAppendExecutionProvider_V2() appends the provided OrtCustomOpDomains to the
2070+
* session options or 2) ORT registers the OrtCustomOpDomains provided by the EP devices
2071+
* that could be potentially selected.
2072+
*
2073+
* As a result, any session created from these session options will have these custom op domains registered
2074+
* in ORT, ensuring that the custom ops are properly recognized and validated when the model is loaded.
2075+
*
2076+
* Plugin EPs can provide two types of custom ops:
2077+
* 1. A full OrtCustomOp with a concrete kernel implementation
2078+
* - A Plugin EP can supply an OrtCustomOp and a corresponding CustomKernel::Compute() implementation.
2079+
* - In GetCapability(), it calls EpGraphSupportInfo_AddSingleNode() to inform ORT
2080+
* that the custom node should NOT be fused or compiled. Instead, ORT should invoke
2081+
* the custom node's Compute() function at runtime.
2082+
*
2083+
* 2. A "placeholder" OrtCustomOp with an empty kernel implementation
2084+
* - A compile-based Plugin EP can supply an OrtCustomOp whose CustomKernel::Compute()
2085+
* does nothing. The purpose is to satisfy model validation during model loading by
2086+
* registering the custom op as a valid operator in the session.
2087+
* - In GetCapability(), the EP should call EpGraphSupportInfo_AddNodesToFuse() to
2088+
* notify ORT that this custom node should be fused and compiled by the EP.
2089+
* - In Compile(), the EP executes its compiled bits to perform inference for
2090+
* the fused custom node.
2091+
*
2092+
* Note: The OrtCustomOpDomain instances must be valid while any session is using them.
2093+
* The EP factory is responsible for releasing the OrtCustomOpDomain instances it creates. This happens
2094+
* automatically if using the C++ Ort::CustomOpDomain class.
2095+
*
2096+
* \param[in] this_ptr The OrtEpFactory instance.
2097+
* \param[out] domains Array of `num_domains` elements pre-allocated by ORT that should be filled with
2098+
* OrtCustomOpDomain instances created by the EP. `num_domains` is the value returned by
2099+
* GetNumCustomOpDomains().
2100+
* \param[in] num_domains The size of the `domains` array pre-allocated by ORT.
2101+
*
2102+
* \snippet{doc} snippets.dox OrtStatus Return Value
2103+
*
2104+
* \since Version 1.24.
2105+
*/
2106+
ORT_API2_STATUS(GetCustomOpDomains, _In_ OrtEpFactory* this_ptr,
2107+
_Out_writes_all_(num_domains) OrtCustomOpDomain** domains, _In_ size_t num_domains);
20492108
};
20502109

20512110
#ifdef __cplusplus

onnxruntime/contrib_ops/cpu/bert/attention_parameters.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ struct AttentionParameters {
3434
float mask_filter_value;
3535
float scale;
3636
bool use_tf32 = false;
37+
bool is_output_bnsh = false; // whether the output format is BNSH
3738
AttentionMaskType mask_type;
3839
AttentionQkvFormat qkv_format;
3940
};

0 commit comments

Comments
 (0)