Skip to content

Commit 9ddab63

Browse files
committed
Merge remote-tracking branch 'origin/main' into fs-eire/test-target-settings
2 parents bc41889 + 50e3362 commit 9ddab63

File tree

166 files changed

+11749
-1159
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

166 files changed

+11749
-1159
lines changed

.github/workflows/mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
matrix:
7373
target_arch: [x86_64, arm64]
7474

75-
timeout-minutes: 90
75+
timeout-minutes: 120
7676

7777
steps:
7878
- name: Checkout code

cmake/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
9191
option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)
9292

9393
option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
94+
option(onnxruntime_USE_QMX_KLEIDIAI_COEXIST "Build with QMX and Arm KLEIDIAI libraries" OFF)
9495
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
9596
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
9697
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
@@ -1206,7 +1207,7 @@ function(onnxruntime_configure_target target_name)
12061207

12071208
# Keep BinSkim happy
12081209
if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
1209-
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>" "$<$<LINK_LANGUAGE:CUDA>:-Xlinker=/CETCOMPAT>")
1210+
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>")
12101211
endif()
12111212

12121213
endfunction()

cmake/deps.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,8 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0
5656
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
5757
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
5858
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
59-
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
59+
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.20.0.tar.gz;6895e72b3d5cf1173358164cb3d64c9d7d33cc84
60+
# kleidiai-qmx is pinned to a specific commit as there are no tagged releases. When an appropriate tagged release becomes available,
61+
# this entry will be updated to use refs/tags/<version> instead of the raw commit hash.
62+
kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/2f10c9a8d32f81ffeeb6d4885a29cc35d2b0da87.zip;5e855730a2d69057a569f43dd7532db3b2d2a05c
6063
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794

cmake/external/cuDNN.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
33
find_path(
44
CUDNN_INCLUDE_DIR cudnn.h
55
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
6-
PATH_SUFFIXES include
6+
PATH_SUFFIXES include include/${onnxruntime_CUDA_VERSION}
77
REQUIRED
88
)
99

@@ -15,7 +15,7 @@ function(find_cudnn_library NAME)
1515
find_library(
1616
${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
1717
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
18-
PATH_SUFFIXES lib64 lib/x64 lib
18+
PATH_SUFFIXES lib64 lib/x64 lib lib/${onnxruntime_CUDA_VERSION}/x64
1919
REQUIRED
2020
)
2121

cmake/external/cuda_configuration.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ macro(setup_cuda_architectures)
126126
set(CMAKE_CUDA_ARCHITECTURES "37;50;52;60;70;75;80;86;89")
127127
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
128128
set(CMAKE_CUDA_ARCHITECTURES "52;60;70;75;80;86;89;90")
129+
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
130+
# CUDA 13.x drops support for architectures 60 and 70, so they are omitted from the list below.
131+
set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90;100;120")
129132
else()
130133
set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;89;90;100;120")
131134
endif()

cmake/external/onnxruntime_external_deps.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)
845845

846846
onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
847847
onnxruntime_fetchcontent_makeavailable(kleidiai)
848+
# Fetch Qualcomm's kleidiai library (kleidiai-qmx) when onnxruntime_USE_QMX_KLEIDIAI_COEXIST is enabled.
849+
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
850+
onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
851+
EXCLUDE_FROM_ALL)
852+
onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
853+
endif()
848854
endif()
849855

850856
set(onnxruntime_LINK_DIRS)

cmake/onnxruntime_mlas.cmake

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ onnxruntime_add_static_library(onnxruntime_mlas
4545
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
4646
${MLAS_SRC_DIR}/qnbitgemm.h
4747
${MLAS_SRC_DIR}/qnbitgemm.cpp
48+
${MLAS_SRC_DIR}/qlutgemm.h
49+
${MLAS_SRC_DIR}/qlutgemm.cpp
4850
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
4951
${MLAS_SRC_DIR}/flashattn.cpp
5052
${MLAS_SRC_DIR}/cast.cpp
@@ -113,6 +115,7 @@ function(setup_mlas_source_for_windows)
113115
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
114116
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
115117
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
118+
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
116119
)
117120

118121
set(mlas_platform_preprocess_srcs
@@ -209,6 +212,8 @@ function(setup_mlas_source_for_windows)
209212
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
210213
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
211214
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
215+
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.h
216+
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.cpp
212217
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
213218
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
214219
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
@@ -284,6 +289,11 @@ function(setup_kleidiai)
284289
)
285290
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
286291
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
292+
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
293+
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
294+
target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
295+
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
296+
endif()
287297
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)
288298

289299
# If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
@@ -302,13 +312,21 @@ function(setup_kleidiai)
302312
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
303313
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
304314
endif()
315+
316+
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
317+
install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
318+
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
319+
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
320+
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
321+
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
322+
endif()
305323
endfunction()
306324

307325
function (setup_arm_neon_nchwc)
308326
target_sources(onnxruntime_mlas PRIVATE
309-
${MLAS_SRC_DIR}/sconv.h
310-
${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
311-
${MLAS_SRC_DIR}/spool_kernel_neon.cpp
327+
${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.h
328+
${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.cpp
329+
${MLAS_SRC_DIR}/spool_nchwc_kernel_neon.cpp
312330
)
313331
list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC)
314332
set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE)
@@ -460,6 +478,7 @@ else()
460478
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
461479
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
462480
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
481+
${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
463482
)
464483

465484
# Conditionally add the SVE implementation if compiler supports it
@@ -496,6 +515,7 @@ else()
496515
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
497516
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
498517
${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
518+
${MLAS_SRC_DIR}/sbconv_kernel_neon.cpp
499519
${MLAS_SRC_DIR}/cast_kernel_neon.cpp
500520
${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
501521
${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
@@ -511,6 +531,7 @@ else()
511531
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
512532
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
513533
set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
534+
set_source_files_properties(${MLAS_SRC_DIR}/sbconv_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
514535
set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
515536
set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
516537
set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
@@ -693,6 +714,8 @@ else()
693714
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
694715
${MLAS_SRC_DIR}/intrinsics/avx2/saturation_check_avx2.cpp
695716
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
717+
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.h
718+
${MLAS_SRC_DIR}/sqnbitgemm_lut_kernel_avx2.cpp
696719
${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.h
697720
${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.cpp
698721
${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.cpp

cmake/onnxruntime_python.cmake

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,22 @@ if (WIN32)
297297
if (onnxruntime_USE_CUDA)
298298
file(WRITE "${VERSION_INFO_FILE}" "use_cuda = True\n")
299299
if(onnxruntime_CUDNN_HOME)
300-
file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll")
301-
if (NOT CUDNN_DLL_PATH)
300+
# The cuDNN DLLs may be located in an x64 subdirectory of the bin directory.
301+
# The cuDNN DLL path may also contain a CUDA toolkit version directory when multiple versions are installed on the machine.
302+
set(CUDNN_SEARCH_PATHS
303+
"${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll"
304+
"${onnxruntime_CUDNN_HOME}/bin/x64/cudnn64_*.dll"
305+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/cudnn64_*.dll"
306+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/x64/cudnn64_*.dll"
307+
)
308+
set(CUDNN_DLL_PATH "")
309+
foreach(search_path ${CUDNN_SEARCH_PATHS})
310+
file(GLOB CUDNN_DLL_PATH "${search_path}")
311+
if(CUDNN_DLL_PATH)
312+
break()
313+
endif()
314+
endforeach()
315+
if(NOT CUDNN_DLL_PATH)
302316
message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDNN_HOME}")
303317
endif()
304318
else()

cmake/onnxruntime_unittests.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -949,9 +949,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
949949
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
950950
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
951951
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
952-
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common)
952+
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils)
953953
target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
954-
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common)
954+
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils)
955955
if (MSVC)
956956
# Cutlass code has an issue with the following:
957957
# warning C4100: 'magic': unreferenced formal parameter

0 commit comments

Comments
 (0)