|
| 1 | +From 712fbe0f6e491a2edd7388f99ea4124f25cda774 Mon Sep 17 00:00:00 2001 |
| 2 | +From: "M. Chornyi" <99709299+mc-nv@users.noreply.github.com> |
| 3 | +Date: Fri, 1 May 2026 21:48:54 +0000 |
| 4 | +Subject: [PATCH] Fix CUDA 13.2 (CUB 3.2.0) build failure: invalid C++ in |
| 5 | + device_transform.cuh |
| 6 | +MIME-Version: 1.0 |
| 7 | +Content-Type: text/plain; charset=UTF-8 |
| 8 | +Content-Transfer-Encoding: 8bit |
| 9 | + |
| 10 | +CUB 3.2.0 ships device_transform.cuh with an invalid template specialisation: |
| 11 | + struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {}; |
| 12 | +A globally-qualified class name in a specialisation is rejected by the compiler |
| 13 | +under -std=c++20. device_copy.cuh transitively pulls device_transform.cuh in |
| 14 | +via dispatch_copy_mdspan.cuh, so it fails for the same reason. |
| 15 | + |
| 16 | +Fix: two shadow stubs under onnxruntime/cub/device/, resolved first via -I |
| 17 | +ahead of the -isystem CUDA toolkit path. |
| 18 | + |
| 19 | + device_transform.cuh — re-emits the parts Thrust uses internally |
| 20 | + (cub::detail::__return_constant and the proclaims_copyable_arguments |
| 21 | + specialisation) with the specialisation written inside the cuda namespace |
| 22 | + so the class name is unqualified. cub::DeviceTransform is omitted. |
| 23 | + |
| 24 | + device_copy.cuh — empty stub. ORT does not use cub::DeviceCopy. |
| 25 | + |
| 26 | +cub.cuh is unchanged. |
| 27 | +--- |
| 28 | + onnxruntime/cub/device/device_copy.cuh | 9 +++++ |
| 29 | + onnxruntime/cub/device/device_transform.cuh | 42 +++++++++++++++++++++ |
| 30 | + 2 files changed, 51 insertions(+) |
| 31 | + create mode 100644 onnxruntime/cub/device/device_copy.cuh |
| 32 | + create mode 100644 onnxruntime/cub/device/device_transform.cuh |
| 33 | + |
| 34 | +diff --git a/onnxruntime/cub/device/device_copy.cuh b/onnxruntime/cub/device/device_copy.cuh |
| 35 | +new file mode 100644 |
| 36 | +index 0000000000000..14e9f1772a3ef |
| 37 | +--- /dev/null |
| 38 | ++++ b/onnxruntime/cub/device/device_copy.cuh |
| 39 | +@@ -0,0 +1,9 @@ |
| 40 | ++// Copyright (c) Microsoft Corporation. All rights reserved. |
| 41 | ++// Licensed under the MIT License. |
| 42 | ++ |
| 43 | ++// Shadow stub for <cub/device/device_copy.cuh> (found via -I before the -isystem |
| 44 | ++// toolkit path). The real header transitively includes dispatch_copy_mdspan.cuh, |
| 45 | ++// which references cub::DeviceTransform — a type our device_transform.cuh stub |
| 46 | ++// intentionally omits. ORT does not use cub::DeviceCopy, so an empty stub suffices. |
| 47 | ++ |
| 48 | ++#pragma once |
| 49 | +diff --git a/onnxruntime/cub/device/device_transform.cuh b/onnxruntime/cub/device/device_transform.cuh |
| 50 | +new file mode 100644 |
| 51 | +index 0000000000000..378bd8f0b5be8 |
| 52 | +--- /dev/null |
| 53 | ++++ b/onnxruntime/cub/device/device_transform.cuh |
| 54 | +@@ -0,0 +1,42 @@ |
| 55 | ++// Copyright (c) Microsoft Corporation. All rights reserved. |
| 56 | ++// Licensed under the MIT License. |
| 57 | ++ |
| 58 | ++// Shadow stub for <cub/device/device_transform.cuh>. Resolved first via -I, |
| 59 | ++// ahead of the -isystem CUDA toolkit path. |
| 60 | ++// |
| 61 | ++// CUB 3.2.0 (CUDA 13.2) ships an invalid template specialisation: |
| 62 | ++// struct ::cuda::proclaims_copyable_arguments<...> : ::cuda::std::true_type {}; |
| 63 | ++// A globally-qualified class name in a specialisation is rejected by the compiler under -std=c++20. |
| 64 | ++// We re-emit the parts Thrust needs internally with the fixed syntax (the |
| 65 | ++// specialisation is written inside the cuda namespace so the name is unqualified). |
| 66 | ++// cub::DeviceTransform itself is not used by ORT and is intentionally omitted. |
| 67 | ++ |
| 68 | ++#pragma once |
| 69 | ++ |
| 70 | ++#include <cub/version.cuh> |
| 71 | ++ |
| 72 | ++#if CUB_VERSION >= 300200 |
| 73 | + |
| 74 | ++#include <cub/device/dispatch/dispatch_transform.cuh> // provides cub::detail::transform::dispatch_t, which Thrust uses |
| 75 | ++#include <cuda/__functional/address_stability.h> // primary template of cuda::proclaims_copyable_arguments, specialised below |
| 76 | + |
| 77 | ++CUB_NAMESPACE_BEGIN |
| 78 | ++namespace detail |
| 79 | ++{ |
| 80 | ++template <typename T> |
| 81 | ++struct __return_constant |
| 82 | ++{ |
| 83 | ++ T value; |
| 84 | ++ template <typename... Args> |
| 85 | ++ _CCCL_HOST_DEVICE T operator()(Args&&...) const { return value; } |
| 86 | ++}; |
| 87 | ++} // namespace detail |
| 88 | ++CUB_NAMESPACE_END |
| 89 | + |
| 90 | ++_CCCL_BEGIN_NAMESPACE_CUDA |
| 91 | ++template <typename T> |
| 92 | ++struct proclaims_copyable_arguments<CUB_NS_QUALIFIER::detail::__return_constant<T>> |
| 93 | ++ : ::cuda::std::true_type {}; |
| 94 | ++_CCCL_END_NAMESPACE_CUDA |
| 95 | + |
| 96 | ++#endif // CUB_VERSION >= 300200 |
1 | 97 | diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu b/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu |
2 | 98 | index 638c7d6637..73063765d7 100644 |
3 | 99 | --- a/orttraining/orttraining/training_ops/cuda/reduction/all_impl.cu |
|
0 commit comments