
Commit 9f1b0a8

[Cherry-pick] CUDA 13 support on Linux (#75878)
* CUDA 13 support on Linux (#75372)
* cuda13 linux
* cuda_graph
* multi archs
* fix
* windows cpu fix
* windows cpu fix
* cuda13 test problem (#75509)
* cuda13
* fix
* Cuda13 linux nvshmem (#75557)
* nvshmem cuda13
* cuda13
* templete bypass
* cuda13 almalinux trt (#75695)
1 parent e22e2f9 commit 9f1b0a8

12 files changed

Lines changed: 572 additions & 21 deletions


.github/workflows/CheckPRTemplate.yml

Lines changed: 8 additions & 0 deletions
@@ -16,7 +16,15 @@ jobs:
       - name: Clone paddle
         uses: actions/checkout@v4

+      - name: Check bypass
+        id: check-bypass
+        uses: ./.github/actions/check-bypass
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          workflow-name: template
+
       - name: Check PR Template
+        if: steps.check-bypass.outputs.can-skip != 'true'
         env:
           AGILE_PULL_ID: ${{ github.event.pull_request.number }}
           AGILE_COMPILE_BRANCH: ${{ github.base_ref }}

cmake/external/dgc.cmake

Lines changed: 10 additions & 3 deletions
@@ -29,10 +29,17 @@ set(DGC_INCLUDE_DIR
 set(DGC_LIBRARIES
     "${DGC_INSTALL_DIR}/lib/libdgc.a"
     CACHE FILEPATH "dgc library." FORCE)
-set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
 include_directories(${DGC_INCLUDE_DIR})
-set(DGC_CACHE_FILENAME "collective_7369ff.tgz")
-set(DGC_URL_MD5 ede459281a0f979da8d84f81287369ff)
+
+if(CUDA_VERSION LESS 13.0)
+  set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_7369ff.tgz")
+  set(DGC_CACHE_FILENAME "collective_7369ff.tgz")
+  set(DGC_URL_MD5 ede459281a0f979da8d84f81287369ff)
+else()
+  set(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_250918cuda13.tgz")
+  set(DGC_CACHE_FILENAME "collective_250918cuda13.tgz")
+  set(DGC_URL_MD5 82ea96cfca668b8f8731613827658444)
+endif()

 function(download_dgc)
   message(

cmake/external/nvshmem.cmake

Lines changed: 6 additions & 1 deletion
@@ -53,7 +53,12 @@ else()
     extern_nvshmem)
 endif()

-set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+if(CUDA_VERSION VERSION_GREATER_EQUAL 13)
+  set(NVSHMEM_PATCH_PATH
+      ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem_cuda13.patch)
+else()
+  set(NVSHMEM_PATCH_PATH ${PADDLE_SOURCE_DIR}/patches/nvshmem/nvshmem.patch)
+endif()
 set(NVSHMEM_PATCH_COMMAND
     git init && git config --global --add safe.directory ${NVSHMEM_SOURCE_DIR}
     && git config user.name "PaddlePaddle" && git config user.email

paddle/cinn/backends/nvrtc/nvrtc_util.cc

Lines changed: 27 additions & 0 deletions
@@ -51,6 +51,23 @@ static std::vector<std::string> GetNvidiaAllIncludePath(
   std::vector<std::string> include_paths;
   const std::string delimiter = "/";
   // Expand this list if necessary.
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  const std::vector<std::string> sub_modules = {"cu13",
+                                                "cublas",
+                                                "cuda_cupti",
+                                                "cudnn",
+                                                "cufft",
+                                                "cufile",
+                                                "cusparse",
+                                                "cusparselt",
+                                                "cusolver",
+                                                "cuda_nvrtc",
+                                                "curand",
+                                                "nccl",
+                                                "nvjitlink",
+                                                "nvtx",
+                                                "cuda_runtime"};
+#else
   const std::vector<std::string> sub_modules = {"cuda_cccl",
                                                 "cublas",
                                                 "cudnn",
@@ -60,11 +77,17 @@ static std::vector<std::string> GetNvidiaAllIncludePath(
                                                 "cuda_nvrtc",
                                                 "curand",
                                                 "cuda_runtime"};
+#endif
   for (auto& sub_module : sub_modules) {
     std::string path =
         nvidia_package_dir + delimiter + sub_module + delimiter + "include";
     include_paths.push_back(path);
   }
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  include_paths.push_back(nvidia_package_dir + delimiter + "cu13/include/cccl");
+  include_paths.push_back(nvidia_package_dir + delimiter +
+                          "cu13/include/nvtx3");
+#endif
   return include_paths;
 }

@@ -153,7 +176,11 @@ std::string Compiler::CompileCudaSource(const std::string& code,
   } else {
     compile_options.push_back("-arch=compute_" + cc);
   }
+#if CUDA_VERSION >= 13000 && defined(__linux__)
+  compile_options.push_back("-std=c++17");
+#else
   compile_options.push_back("-std=c++14");
+#endif
   compile_options.push_back("-default-device");

   if (include_headers) {  // prepare include headers
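
For context, here is a minimal standalone sketch (not Paddle's actual Compiler wrapper; the helper name and flow are assumptions) of how the include directories and the -std flag assembled above are typically handed to NVRTC:

#include <cuda.h>   // CUDA_VERSION
#include <nvrtc.h>
#include <string>
#include <vector>

// Hypothetical helper: builds NVRTC options the same way the hunks above do,
// then compiles a source string.
nvrtcResult CompileWithOptions(const std::string& source,
                               const std::vector<std::string>& include_paths) {
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, source.c_str(), "kernel.cu", 0, nullptr, nullptr);

  std::vector<std::string> options;
#if CUDA_VERSION >= 13000 && defined(__linux__)
  options.push_back("-std=c++17");  // this commit bumps the dialect for CUDA 13 on Linux
#else
  options.push_back("-std=c++14");
#endif
  for (const auto& path : include_paths) {
    // e.g. .../nvidia/cu13/include/cccl collected by GetNvidiaAllIncludePath
    options.push_back("--include-path=" + path);
  }

  std::vector<const char*> raw;
  for (const auto& opt : options) raw.push_back(opt.c_str());
  return nvrtcCompileProgram(prog, static_cast<int>(raw.size()), raw.data());
}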

paddle/fluid/distributed/collective/deep_ep/kernels/utils.cuh

Lines changed: 18 additions & 0 deletions
@@ -231,11 +231,29 @@ __device__ __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
 #define DISABLE_AGGRESSIVE_PTX_INSTRS
 #endif

+// swgu98: cuda13 strictly limits graphics cards below 80 architecture from
+// using ".L2::256B" optimization
+#if (__CUDACC_VER_MAJOR__ >= 13)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
 #ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
 #define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
 #else
 #define LD_NC_FUNC "ld.volatile.global.L2::256B"
 #endif
+#else
+#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
+#define LD_NC_FUNC "ld.global.nc.L1::no_allocate"
+#else
+#define LD_NC_FUNC "ld.volatile.global"
+#endif
+#endif
+#else
+#ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
+#define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
+#else
+#define LD_NC_FUNC "ld.volatile.global.L2::256B"
+#endif
+#endif

 // `ld.global.nc.L1::no_allocate` will be translated into
 // `LDG.E.NA.[width].CONSTANT` in SASS
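
For context, a hedged sketch of how a string macro like LD_NC_FUNC is typically spliced into an inline-PTX load (the helper below is illustrative and assumes the LD_NC_FUNC definition above; it is not necessarily the exact helper in this header). Because only the macro string changes, the CUDA 13 / pre-SM80 branches drop the ".L2::256B" hint without touching any call sites:

// Illustrative device helper: concatenates the LD_NC_FUNC opcode string with
// the operand suffix to form one PTX instruction, e.g.
// "ld.global.nc.L1::no_allocate.L2::256B.s32 %0, [%1];".
__device__ __forceinline__ int ld_nc_global_example(const int* ptr) {
  int ret;
  asm volatile(LD_NC_FUNC ".s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
  return ret;
}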

paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu

Lines changed: 7 additions & 7 deletions
@@ -163,7 +163,7 @@ size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const
   return 0;
 }

-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
 template <typename T>
 __global__ void GenAnchors(T* out,
                            const T* aspect_ratios,
@@ -233,7 +233,7 @@ int AnchorGeneratorPlugin::enqueue_impl(int batch_size,
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -258,7 +258,7 @@ int AnchorGeneratorPlugin::enqueue_impl(int batch_size,
                                                        offset_);
 #endif
   const int var_grid = (box_num_ * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num_ * 4);
 #else
@@ -592,7 +592,7 @@ int AnchorGeneratorPluginDynamic::enqueue_impl(
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -617,7 +617,7 @@ int AnchorGeneratorPluginDynamic::enqueue_impl(
                                                        offset_);
 #endif
   const int var_grid = (box_num * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num * 4);
 #else
@@ -894,7 +894,7 @@ int PIRAnchorGeneratorPluginDynamic::enqueue_impl(
   const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
   const T* stride_device = static_cast<const T*>(stride_device_);
   const T* variances_device = static_cast<const T*>(variances_device_);
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(anchors,
                                                        aspect_ratios_device,
                                                        aspect_ratios_.size(),
@@ -919,7 +919,7 @@ int PIRAnchorGeneratorPluginDynamic::enqueue_impl(
                                                        offset_);
 #endif
   const int var_grid = (box_num * 4 + block - 1) / block;
-#ifdef _WIN32
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
   SetVariance<T><<<var_grid, block, 0, stream>>>(
       vars, variances_device, variances_.size(), box_num * 4);
 #else
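
All seven changed guards follow one pattern: the direct <<<...>>> kernel-launch path that was previously Windows-only is now also taken when building with CUDA 13.0 or newer. Note that && binds tighter than || in the preprocessor, so the condition reads "(CUDA_VERSION is defined and >= 13000) or _WIN32". A purely illustrative, self-contained sketch of the guard shape (the kernel and launch sizes below are made up; only the #if condition mirrors the diff):

#include <cuda.h>          // CUDA_VERSION
#include <cuda_runtime.h>

__global__ void FillOnes(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = 1.0f;
}

void LaunchExample(float* device_out, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 13000 || defined(_WIN32)
  // CUDA 13+ (and Windows) builds take the plain __global__ kernel path.
  FillOnes<<<grid, block, 0, stream>>>(device_out, n);
#else
  // Pre-13 Linux builds keep the plugin's original launch path, which this
  // diff leaves unchanged (and does not show).
  FillOnes<<<grid, block, 0, stream>>>(device_out, n);
#endif
}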

paddle/phi/backends/dynload/dynamic_loader.cc

Lines changed: 49 additions & 10 deletions
@@ -467,10 +467,16 @@ void* GetCublasDsoHandle() {
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -497,10 +503,16 @@ void* GetCublasLtDsoHandle() {
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.13");
+#else
+    return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -518,10 +530,17 @@ void* GetCublasLtDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_13.dll");
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path});
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -619,10 +638,18 @@ void* GetCUPTIDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(
         FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
+#endif
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so.13", false, {cupti_lib_path});
+#else
+    return GetDsoHandleFromSearchPath(
+        FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path});
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer supports");
     return nullptr;
   }
@@ -695,12 +722,22 @@ void* GetCusolverDsoHandle() {
 #endif
 #elif defined(PADDLE_WITH_HIP)
   return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so");
+#elif defined(__linux__) && defined(PADDLE_WITH_CUDA)
+  if (CUDA_VERSION < 13000) {
+#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
 #else
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+#endif
+  } else {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.12");
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so");
 #endif
+  }
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11");
 #endif
 }

@@ -737,15 +774,15 @@ void* GetCusparseDsoHandle() {
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
 #endif
-  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
+  } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 14000) {
 #ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12");
 #else
     return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so");
 #endif
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 12, paddle "
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
         "temporarily no longer.");
     return nullptr;
   }
@@ -979,10 +1016,12 @@ void* GetCUFFTDsoHandle() {
 #endif
   } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) {
     return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11");
+  } else if (CUDA_VERSION >= 13000 && CUDA_VERSION < 14000) {
+    return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.12");
   } else {
     std::string warning_msg(
-        "Your CUDA_VERSION is less than 11 or greater than 13, paddle "
-        "temporarily no longer.");
+        "Your CUDA_VERSION is less than 11 or greater than 14, paddle "
+        "temporarily no longer supports");
     return nullptr;
   }
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
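
The recurring pattern above: when Paddle is built against pip-installed CUDA wheels (PADDLE_WITH_PIP_CUDA_LIBRARIES), only the fully versioned sonames exist on disk, so each loader branch must request the exact major version, e.g. libcublas.so.13 for CUDA 13.x. A minimal standalone sketch of that idea (this is not Paddle's GetDsoHandleFromSearchPath; the helper name is illustrative):

#include <dlfcn.h>
#include <string>

// Hypothetical helper: picks the versioned soname for the toolkit Paddle was
// built against, then loads it with dlopen, mirroring the branches above.
void* LoadCublasFor(int cuda_version /* e.g. CUDA_VERSION = 13000 */) {
  std::string name;
  if (cuda_version >= 12000 && cuda_version < 13000) {
    name = "libcublas.so.12";
  } else if (cuda_version >= 13000 && cuda_version < 14000) {
    name = "libcublas.so.13";
  } else {
    name = "libcublas.so";  // fall back to the unversioned development symlink
  }
  return dlopen(name.c_str(), RTLD_LAZY | RTLD_GLOBAL);
}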

paddle/phi/backends/gpu/cuda/cuda_graph.cc

Lines changed: 8 additions & 0 deletions
@@ -42,11 +42,19 @@ static std::vector<cudaGraphNode_t> ToposortCUDAGraph(cudaGraph_t graph) {
       cudaGraphGetNodes(graph, nodes.data(), &num_nodes));

   size_t num_edges;
+#if CUDA_VERSION < 13000
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaGraphGetEdges(graph, nullptr, nullptr, &num_edges));
   std::vector<cudaGraphNode_t> from(num_edges), to(num_edges);
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaGraphGetEdges(graph, from.data(), to.data(), &num_edges));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, nullptr, nullptr, nullptr, &num_edges));
+  std::vector<cudaGraphNode_t> from(num_edges), to(num_edges);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      cudaGraphGetEdges(graph, from.data(), to.data(), nullptr, &num_edges));
+#endif

   std::unordered_map<cudaGraphNode_t, std::unordered_set<cudaGraphNode_t>>
       in_edges, out_edges;
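
As the hunk shows, CUDA 13's cudaGraphGetEdges takes an extra edge-data argument, which this commit passes as nullptr. A hedged sketch (not Paddle's code; the wrapper name is illustrative) of a thin compatibility shim that keeps call sites identical across toolkits:

#include <cuda.h>          // CUDA_VERSION
#include <cuda_runtime.h>

// Queries graph edges with the signature appropriate to the toolkit; passing
// nullptr for the CUDA 13 edge-data parameter means per-edge data is not requested.
inline cudaError_t GetGraphEdgesCompat(cudaGraph_t graph,
                                       cudaGraphNode_t* from,
                                       cudaGraphNode_t* to,
                                       size_t* num_edges) {
#if CUDA_VERSION < 13000
  return cudaGraphGetEdges(graph, from, to, num_edges);
#else
  return cudaGraphGetEdges(graph, from, to, nullptr, num_edges);
#endif
}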
