
Commit 9f1b0a8

[Cherry-pick] CUDA 13 support on Linux (#75878)
* CUDA 13 support on Linux (#75372)
* cuda13 linux
* cuda_graph
* multi archs
* fix
* windows cpu fix
* windows cpu fix
* cuda13 test problem (#75509)
* cuda13
* fix
* Cuda13 linux nvshmem (#75557)
* nvshmem cuda13
* cuda13
* templete bypass
* cuda13 almalinux trt (#75695)
1 parent e22e2f9 commit 9f1b0a8

12 files changed

Lines changed: 572 additions & 21 deletions


.github/workflows/CheckPRTemplate.yml

Lines changed: 8 additions & 0 deletions
@@ -16,7 +16,15 @@ jobs:
       - name: Clone paddle
         uses: actions/checkout@v4

+      - name: Check bypass
+        id: check-bypass
+        uses: ./.github/actions/check-bypass
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          workflow-name: template
+
       - name: Check PR Template
+        if: steps.check-bypass.outputs.can-skip != 'true'
         env:
           AGILE_PULL_ID: ${{ github.event.pull_request.number }}
           AGILE_COMPILE_BRANCH: ${{ github.base_ref }}

cmake/external/dgc.cmake

Lines changed: 10 additions & 3 deletions
@@ -29,10 +29,17 @@ set(DGC_INCLUDE_DIR
 set(DGC_LIBRARIES
     "${DGC_INSTALL_DIR}/lib/libdgc.a"
     CACHE FILEPATH "dgc library." FORCE)
-set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
 include_directories(${DGC_INCLUDE_DIR})
-set(DGC_CACHE_FILENAME "collective_7369ff.tgz")
-set(DGC_URL_MD5 ede459281a0f979da8d84f81287369ff)
+
+if(CUDA_VERSION LESS 13.0)
+  set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
+  set(DGC_CACHE_FILENAME "collective_7369ff.tgz")
+  set(DGC_URL_MD5 ede459281a0f979da8d84f81287369ff)
+else()
+  set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_250918cuda13.tgz")
+  set(DGC_CACHE_FILENAME "collective_250918cuda13.tgz")
+  set(DGC_URL_MD5 82ea96cfca668b8f8731613827658444)
+endif()

 function(download_dgc)
   message(

cmake/external/nvshmem.cmake

Lines changed: 6 additions & 1 deletion
@@ -53,7 +53,12 @@ else()
     extern_nvshmem)
 endif()

-set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+if(CUDA_VERSION VERSION_GREATER_EQUAL 13)
+  set(NVSHMEM_PATCH_PATH
+      ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem_cuda13.patch)
+else()
+  set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+endif()
 set(NVSHMEM_PATCH_COMMAND
     git init && git config --global --add safe.directory ${NVSHMEM_SOURCE_DIR}
     && git config user.name "PaddlePaddle" && git config user.email

paddle/cinn/backends/nvrtc/nvrtc_util.cc

Lines changed: 27 additions & 0 deletions
@@ -51,6 +51,23 @@ static std::vector<std::string> GetNvidiaAllIncludePath(
   std::vector<std::string> include_paths;
   const std::string delimiter = "/";
   // Expand this list if necessary.
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  const std::vector<std::string> sub_modules = {"cu13",
+                                                "cublas",
+                                                "cuda_cupti",
+                                                "cudnn",
+                                                "cufft",
+                                                "cufile",
+                                                "cusparse",
+                                                "cusparselt",
+                                                "cusolver",
+                                                "cuda_nvrtc",
+                                                "curand",
+                                                "nccl",
+                                                "nvjitlink",
+                                                "nvtx",
+                                                "cuda_runtime"};
+#else
   const std::vector<std::string> sub_modules = {"cuda_cccl",
                                                 "cublas",
                                                 "cudnn",
@@ -60,11 +77,17 @@ static std::vector<std::string> GetNvidiaAllIncludePath(
                                                 "cuda_nvrtc",
                                                 "curand",
                                                 "cuda_runtime"};
+#endif
   for (auto& sub_module : sub_modules) {
     std::string path =
         nvidia_package_dir + delimiter + sub_module + delimiter + "include";
     include_paths.push_back(path);
   }
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  include_paths.push_back(nvidia_package_dir + delimiter + "cu13/include/cccl");
+  include_paths.push_back(nvidia_package_dir + delimiter +
+                          "cu13/include/nvtx3");
+#endif
   return include_paths;
 }

@@ -153,7 +176,11 @@ std::string Compiler::CompileCudaSource(const std::string& code,
   } else {
     compile_options.push_back("-arch=compute_" + cc);
   }
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  compile_options.push_back("-std=c++17");
+#else
   compile_options.push_back("-std=c++14");
+#endif
   compile_options.push_back("-default-device");

   if (include_headers) {  // prepare include headers
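
For context, here is a minimal standalone sketch (not Paddle's actual Compiler wrapper; the helper name and flow are assumptions) of how the include directories and the -std flag assembled above are typically handed to NVRTC:

#include <cuda.h>   // CUDA_VERSION
#include <nvrtc.h>
#include <string>
#include <vector>

// Hypothetical helper: builds NVRTC options the same way the hunks above do,
// then compiles a source string.
nvrtcResult CompileWithOptions(const std::string& source,
                               const std::vector<std::string>& include_paths) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, source.c_str(), "kernel.cu", 0, nullptr, nullptr);

  std::vector<std::string> options;
#if CUDA_VERSION >= 13000 && defined(__linux__)
  options.push_back("-std=c++17");  // this commit bumps the dialect for CUDA 13 on Linux
#else
  options.push_back("-std=c++14");
#endif
  for (const auto& path : include_paths) {
    // e.g. .../nvidia/cu13/include/cccl collected by GetNvidiaAllIncludePath
    options.push_back("--include-path=" + path);
  }

  std::vector<const char*> raw;
  for (const auto& opt : options) raw.push_back(opt.c_str());
  return nvrtcCompileProgram(prog, static_cast<int>(raw.size()), raw.data());
}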

paddle/fluid/distributed/collective/deep_ep/kernels/utils.cuh

Lines changed: 18 additions & 0 deletions
@@ -231,11 +231,29 @@ __device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
 #define DISABLE_AGGRESSIVE_PTX_INSTRS
 #endif

+// swgu98: cuda13 strictly limits graphics cards below 80 architecture from
+// using ".L2::256B" optimization
+#if (__CUDACC_VER_MAJOR__ >= 13)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
 #ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
 #define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
 #else
 #define LD_NC_FUNC "ld.volatile.global.L2::256B"
 #endif
+#else
+#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
+#define LD_NC_FUNC "ld.global.nc.L1::no_allocate"
+#else
+#define LD_NC_FUNC "ld.volatile.global"
+#endif
+#endif
+#else
+#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
+#define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
+#else
+#define LD_NC_FUNC "ld.volatile.global.L2::256B"
+#endif
+#endif

 // `ld.global.nc.L1::no_allocate` will be translated into
 // `LDG.E.NA.[width].CONSTANT` in SASS
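
For context, a hedged sketch of how a string macro like LD_NC_FUNC is typically spliced into an inline-PTX load (the helper below is illustrative and assumes the LD_NC_FUNC definition above; it is not necessarily the exact helper in this header). Because only the macro string changes, the CUDA 13 / pre-SM80 branches drop the ".L2::256B" hint without touching any call sites:

// Illustrative device helper: concatenates the LD_NC_FUNC opcode string with
// the operand suffix to form one PTX instruction, e.g.
// "ld.global.nc.L1::no_allocate.L2::256B.s32 %0, [%1];".
__device__ __forceinline__ int ld_nc_global_example(const int* ptr) {
  int ret;
  asm volatile(LD_NC_FUNC ".s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
  return ret;
}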

paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu

Lines changed: 7 additions & 7 deletions
@@ -163,7 +163,7 @@ size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const
   return 0;
 }

-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
 template <typename T>
 __global__ void GenAnchors(T* out,
                            const T* aspect_ratios,
@@ -233,7 +233,7 @@ int AnchorGeneratorPlugin::enqueue_impl(int batch_size,
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -258,7 +258,7 @@ int AnchorGeneratorPlugin::enqueue_impl(int batch_size,
                                                        offset_);
 #endif
   const int var_grid = (box_num_ * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num_ * 4);
 #else
@@ -592,7 +592,7 @@ int AnchorGeneratorPluginDynamic::enqueue_impl(
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -617,7 +617,7 @@ int AnchorGeneratorPluginDynamic::enqueue_impl(
                                                        offset_);
 #endif
   const int var_grid = (box_num * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num * 4);
 #else
@@ -894,7 +894,7 @@ int PIRAnchorGeneratorPluginDynamic::enqueue_impl(
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -919,7 +919,7 @@ int PIRAnchorGeneratorPluginDynamic::enqueue_impl(
                                                        offset_);
 #endif
   const int var_grid = (box_num * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num * 4);
 #else
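
All seven changed guards follow one pattern: the direct <<<...>>> kernel-launch path that was previously Windows-only is now also taken when building with CUDA 13.0 or newer. Note that && binds tighter than || in the preprocessor, so the condition reads "(CUDA_VERSION is defined and >= 13000) or _WIN32". A purely illustrative, self-contained sketch of the guard shape (the kernel and launch sizes below are made up; only the #if condition mirrors the diff):

#include <cuda.h>          // CUDA_VERSION
#include <cuda_runtime.h>

__global__ void FillOnes(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = 1.0f;
}

void LaunchExample(float* device_out, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
  // CUDA 13+ (and Windows) builds take the plain __global__ kernel path.
  FillOnes<<<grid, block, 0, stream>>>(device_out, n);
#else
  // Pre-13 Linux builds keep the plugin's original launch path, which this
  // diff leaves unchanged (and does not show).
  FillOnes<<<grid, block, 0, stream>>>(device_out, n);
#endif
}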

paddle/phi/backends/dynload/dynamic_loader.cc

Lines changed: 49 additions & 10 deletions
@@ -467,10 +467,16 @@ void* GetCublasDsoHandle() {
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -497,10 +503,16 @@ void* GetCublasLtDsoHandle() {
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -518,10 +530,17 @@ void* GetCublasLtDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -619,10 +638,18 @@ void* GetCUPTIDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so.13", false, {cupti_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -695,12 +722,22 @@ void* GetCusolverDsoHandle() {
 #endif
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so");
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
 #else
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+#endif
+  } else {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.12");
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
 #endif
+  }
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
 #endif
 }

@@ -737,15 +774,15 @@ void* GetCusparseDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 14000) {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer.");
     return nullptr;
   }
@@ -979,10 +1016,12 @@ void* GetCUFFTDsoHandle() {
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11");
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.12");
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
-        "temporarily no longer.");
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
     return nullptr;
   }
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
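
The recurring pattern above: when Paddle is built against pip-installed CUDA wheels (PADDLE_WITH_PIP_CUDA_LIBRARIES), only the fully versioned sonames exist on disk, so each loader branch must request the exact major version, e.g. libcublas.so.13 for CUDA 13.x. A minimal standalone sketch of that idea (this is not Paddle's GetDsoHandleFromSearchPath; the helper name is illustrative):

#include <dlfcn.h>
#include <string>

// Hypothetical helper: picks the versioned soname for the toolkit Paddle was
// built against, then loads it with dlopen, mirroring the branches above.
void* LoadCublasFor(int cuda_version /* e.g. CUDA_VERSION = 13000 */) {
  std::string name;
  if (cuda_version >= 12000 && cuda_version < 13000) {
    name = "libcublas.so.12";
  } else if (cuda_version >= 13000 && cuda_version < 14000) {
    name = "libcublas.so.13";
  } else {
    name = "libcublas.so";  // fall back to the unversioned development symlink
  }
  return dlopen(name.c_str(), RTLD_LAZY | RTLD_GLOBAL);
}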

paddle/phi/backends/gpu/cuda/cuda_graph.cc

Lines changed: 8 additions & 0 deletions
@@ -42,11 +42,19 @@ static std::vector<cudaGraphNode_t> ToposortCUDAGraph(cudaGraph_t graph) {
       cudaGraphGetNodes(graph, nodes.data(), &num_nodes));

   size_t num_edges;
+#if CUDA_VERSION < 13000
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges));
   std::vector<cudaGraphNode_t> from(num_edges), to(num_edges);
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaGraphGetEdges(graph, from.data(), to.data(), &num_edges));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, nullptr, nullptr, nullptr, &num_edges));
+  std::vector<cudaGraphNode_t> from(num_edges), to(num_edges);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, from.data(), to.data(), nullptr, &num_edges));
+#endif

   std::unordered_map<cudaGraphNode_t, std::unordered_set<cudaGraphNode_t>>
       in_edges, out_edges;
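
As the hunk shows, CUDA 13's cudaGraphGetEdges takes an extra edge-data argument, which this commit passes as nullptr. A hedged sketch (not Paddle's code; the wrapper name is illustrative) of a thin compatibility shim that keeps call sites identical across toolkits:

#include <cuda.h>          // CUDA_VERSION
#include <cuda_runtime.h>

// Queries graph edges with the signature appropriate to the toolkit; passing
// nullptr for the CUDA 13 edge-data parameter means per-edge data is not requested.
inline cudaError_t GetGraphEdgesCompat(cudaGraph_t graph,
                                       cudaGraphNode_t* from,
                                       cudaGraphNode_t* to,
                                       size_t* num_edges) {
#if CUDA_VERSION < 13000
  return cudaGraphGetEdges(graph, from, to, num_edges);
#else
  return cudaGraphGetEdges(graph, from, to, nullptr, num_edges);
#endif
}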
