Fix some issues when building with the latest CUDA and cuDNN versions (#27049)

skottmckay · web-flow · commit 50e3362d5695 · 2026-01-19T07:41:33.000+10:00
### Description
<!-- Describe your changes. -->
Fix some issues when building with the latest CUDA and cuDNN versions on
Windows.

* Latest cuDNN install has the CUDA toolkit version in the path. 
  * Adjust cmake files to support that.
* CUDA 13.x drops support for compute capability 6.0 and 7.0. 
  * Remove from CMAKE_CUDA_ARCHITECTURES.
* Remove a LINK_LANGUAGE:CUDA flag for CETCOMPAT
  * Syntax doesn't seem to be supported with MSVC. Build is successful
    without this (CUDA 13.1, cuDNN 9.17).
  * `LINK : warning LNK4044: unrecognized option '/Xlinker=/CETCOMPAT';
    ignored
    [D:\src\github\ort.cuda\build\Windows.CUDA\Debug\onnxruntime_providers_cuda_ut.vcxproj]`
* Memory leak checker fixes
  * onnxruntime_providers_cuda_ut was incorrectly linking against ORT
    common, causing a duplicate symbol when the debug leak checker is enabled
    (multiple overrides of `new` and `delete`).
  * As the CUDA EP is built as a separate library it shouldn't need to
    link against `common`.
  * Use the debug alloc/free for provider bridge when leak checker is
    enabled
  * Ignore EtwEventWriteNoRegistration in leak checker output as we don't
    control that.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1207,7 +1207,7 @@ function(onnxruntime_configure_target target_name)
 
   # Keep BinSkim happy
   if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
-    target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>" "$<$<LINK_LANGUAGE:CUDA>:-Xlinker=/CETCOMPAT>")
+    target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>")
   endif()
 
 endfunction()
diff --git a/cmake/external/cuDNN.cmake b/cmake/external/cuDNN.cmake
@@ -3,7 +3,7 @@ add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
 find_path(
     CUDNN_INCLUDE_DIR cudnn.h
     HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
-    PATH_SUFFIXES include
+    PATH_SUFFIXES include include/${onnxruntime_CUDA_VERSION}
     REQUIRED
 )
 
@@ -15,7 +15,7 @@ function(find_cudnn_library NAME)
     find_library(
         ${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
         HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
-        PATH_SUFFIXES lib64 lib/x64 lib
+        PATH_SUFFIXES lib64 lib/x64 lib lib/${onnxruntime_CUDA_VERSION}/x64
         REQUIRED
     )
 
diff --git a/cmake/external/cuda_configuration.cmake b/cmake/external/cuda_configuration.cmake
@@ -126,6 +126,9 @@ macro(setup_cuda_architectures)
         set(CMAKE_CUDA_ARCHITECTURES "37;50;52;60;70;75;80;86;89")
       elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
         set(CMAKE_CUDA_ARCHITECTURES "52;60;70;75;80;86;89;90")
+      elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+        # 13.x drops support for 60 and 70
+        set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90;100;120")
       else()
         set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;89;90;100;120")
       endif()
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -297,8 +297,22 @@ if (WIN32)
   if (onnxruntime_USE_CUDA)
     file(WRITE "${VERSION_INFO_FILE}" "use_cuda = True\n")
     if(onnxruntime_CUDNN_HOME)
-      file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll")
-      if (NOT CUDNN_DLL_PATH)
+      # may have x64 in the path
+      # may have a path with CUDA toolkit version if multiple installed on the machine
+      set(CUDNN_SEARCH_PATHS
+        "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll"
+        "${onnxruntime_CUDNN_HOME}/bin/x64/cudnn64_*.dll"
+        "${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/cudnn64_*.dll"
+        "${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/x64/cudnn64_*.dll"
+      )
+      set(CUDNN_DLL_PATH "")
+      foreach(search_path ${CUDNN_SEARCH_PATHS})
+        file(GLOB CUDNN_DLL_PATH "${search_path}")
+        if(CUDNN_DLL_PATH)
+          break()
+        endif()
+      endforeach()
+      if(NOT CUDNN_DLL_PATH)
         message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDNN_HOME}")
       endif()
     else()
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -949,9 +949,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
   onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
   config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
   onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
-  add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common)
+  add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils)
   target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
-  target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common)
+  target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils)
   if (MSVC)
     # Cutlass code has an issue with the following:
     # warning C4100: 'magic': unreferenced formal parameter
@@ -1233,15 +1233,15 @@ block()
     DEPENDS ${onnxruntime_provider_test_deps}
   )
 
-  # Expose QNN SDK headers to unit tests via an interface target 
+  # Expose QNN SDK headers to unit tests via an interface target
   if(onnxruntime_USE_QNN)
     add_library(qnn_sdk_headers_include INTERFACE)
     target_include_directories(qnn_sdk_headers_include INTERFACE
       ${onnxruntime_QNN_HOME}/include
       ${onnxruntime_QNN_HOME}/include/QNN)
     target_link_libraries(onnxruntime_provider_test PRIVATE qnn_sdk_headers_include)
   endif()
-  
+
   if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
     # The test_main.cc includes NvInfer.h where it has many deprecated declarations
     # simply ignore them for TensorRT EP build
diff --git a/onnxruntime/core/platform/windows/debug_alloc.cc b/onnxruntime/core/platform/windows/debug_alloc.cc
@@ -254,7 +254,8 @@ Memory_LeakCheck::~Memory_LeakCheck() {
         string.find("testing::internal::ThreadLocalRegistryImpl::GetThreadLocalsMapLocked") == std::string::npos &&
         string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos &&
         string.find("PyInit_onnxruntime_pybind11_state") == std::string::npos &&
-        string.find("google::protobuf::internal::InitProtobufDefaultsSlow") == std::string::npos) {
+        string.find("google::protobuf::internal::InitProtobufDefaultsSlow") == std::string::npos &&
+        string.find("EtwEventWriteNoRegistration") == std::string::npos) {
       if (leaked_bytes == 0)
         DebugPrint("\n-----Starting Heap Trace-----\n\n");
 
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -92,9 +92,9 @@ using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwIn
 
 #include "core/common/cpuid_info.h"
 #include "core/common/logging/logging.h"
+
 #include "core/providers/shared_library/provider_interfaces.h"
 #include "core/providers/partitioning_utils.h"
-
 #include "core/providers/cuda/cuda_provider_factory_creator.h"
 #include "core/providers/cann/cann_provider_factory_creator.h"
 #include "core/providers/dnnl/dnnl_provider_factory_creator.h"
@@ -118,6 +118,10 @@ using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwIn
 #include "core/providers/nv_tensorrt_rtx/nv_provider_factory.h"
 #include "core/providers/nv_tensorrt_rtx/nv_provider_options.h"
 
+#if defined(_WIN32) && !defined(NDEBUG) && defined(ONNXRUNTIME_ENABLE_MEMLEAK_CHECK)
+#include "core/platform/windows/debug_alloc.h"
+#endif
+
 #if !defined(ORT_MINIMAL_BUILD) &&                                        \
     (defined(USE_TENSORRT) || defined(USE_TENSORRT_PROVIDER_INTERFACE) || \
      defined(USE_NV) || defined(USE_NV_PROVIDER_INTERFACE))
@@ -279,8 +283,13 @@ struct ProviderHostImpl : ProviderHost {
     return Status::OK();
   };
 
+#if defined(_WIN32) && !defined(NDEBUG) && defined(ONNXRUNTIME_ENABLE_MEMLEAK_CHECK)
+  void* HeapAllocate(size_t size) override { return DebugHeapAlloc(size, 1); }
+  void HeapFree(void* p) override { DebugHeapFree(p); }
+#else
   void* HeapAllocate(size_t size) override { return new uint8_t[size]; }
   void HeapFree(void* p) override { delete[] reinterpret_cast<uint8_t*>(p); }
+#endif
 
   logging::Logger* LoggingManager_GetDefaultLogger() override {
     return const_cast<logging::Logger*>(&logging::LoggingManager::DefaultLogger());