Skip to content

Commit d0befe5

Browse files
gonidelisbernhardmgruberpre-commit-ci[bot]
authored
Add env based DeviceFor::* algorithms (#7798)
* Add env DeviceFor * Unify env overloads with old non-temp-storage overloads * Fix disregarding stream in old APIs introduced bug and add ForEachInLayout/Extents * Revert return codes as they were to conform to upstream/main * Remove nodiscard as it breaks backwards compatibility * Check return code in non env api test and fix stream calls * Remove 'above header needs to be included first' comment regarding NVTX range guard in unit tests * Add environment literalinclude examples in the docs * Support non-copyable stream types * Apply suggestion from @bernhardmgruber * [pre-commit.ci] auto code formatting --------- Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 5f2e26f commit d0befe5

10 files changed

+667
-57
lines changed

cub/cub/device/device_for.cuh

Lines changed: 280 additions & 37 deletions
Large diffs are not rendered by default.

cub/test/catch2_test_device_for_api.cu

Lines changed: 70 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ C2H_TEST("Device bulk works with temporary storage", "[bulk][device]")
6464
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
6565

6666
// 3) Perform bulk operation
67-
cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);
67+
auto result = cub::DeviceFor::Bulk(d_temp_storage, temp_storage_bytes, vec.size(), op);
68+
if (result != cudaSuccess)
69+
{
70+
std::cerr << "Bulk operation failed with error code: " << result << std::endl;
71+
}
6872

6973
c2h::device_vector<int> expected = {1, 4, 9, 16};
7074
// example-end bulk-temp-storage
@@ -78,7 +82,11 @@ C2H_TEST("Device bulk works without temporary storage", "[bulk][device]")
7882
c2h::device_vector<int> vec = {1, 2, 3, 4};
7983
square_t op{thrust::raw_pointer_cast(vec.data())};
8084

81-
cub::DeviceFor::Bulk(vec.size(), op);
85+
auto result = cub::DeviceFor::Bulk(vec.size(), op);
86+
if (result != cudaSuccess)
87+
{
88+
std::cerr << "Bulk operation failed with error code: " << result << std::endl;
89+
}
8290

8391
c2h::device_vector<int> expected = {1, 4, 9, 16};
8492
// example-end bulk-wo-temp-storage
@@ -95,14 +103,22 @@ C2H_TEST("Device for each n works with temporary storage", "[for_each][device]")
95103
// 1) Get temp storage size
96104
std::uint8_t* d_temp_storage{};
97105
std::size_t temp_storage_bytes{};
98-
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
106+
auto result = cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
107+
if (result != cudaSuccess)
108+
{
109+
std::cerr << "ForEachN operation failed with error code: " << result << std::endl;
110+
}
99111

100112
// 2) Allocate temp storage
101113
c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
102114
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
103115

104116
// 3) Perform for each n operation
105-
cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
117+
result = cub::DeviceFor::ForEachN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
118+
if (result != cudaSuccess)
119+
{
120+
std::cerr << "ForEachN operation failed with error code: " << result << std::endl;
121+
}
106122

107123
c2h::device_vector<int> expected = {1, 4, 9, 16};
108124
// example-end for-each-n-temp-storage
@@ -116,7 +132,11 @@ C2H_TEST("Device for each n works without temporary storage", "[for_each][device
116132
c2h::device_vector<int> vec = {1, 2, 3, 4};
117133
square_ref_t op{};
118134

119-
cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);
135+
auto result = cub::DeviceFor::ForEachN(vec.begin(), vec.size(), op);
136+
if (result != cudaSuccess)
137+
{
138+
std::cerr << "ForEachN operation failed with error code: " << result << std::endl;
139+
}
120140

121141
c2h::device_vector<int> expected = {1, 4, 9, 16};
122142
// example-end for-each-n-wo-temp-storage
@@ -133,14 +153,22 @@ C2H_TEST("Device for each works with temporary storage", "[for_each][device]")
133153
// 1) Get temp storage size
134154
std::uint8_t* d_temp_storage{};
135155
std::size_t temp_storage_bytes{};
136-
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
156+
auto result = cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
157+
if (result != cudaSuccess)
158+
{
159+
std::cerr << "ForEach operation failed with error code: " << result << std::endl;
160+
}
137161

138162
// 2) Allocate temp storage
139163
c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
140164
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
141165

142166
// 3) Perform for each operation
143-
cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
167+
result = cub::DeviceFor::ForEach(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
168+
if (result != cudaSuccess)
169+
{
170+
std::cerr << "ForEach operation failed with error code: " << result << std::endl;
171+
}
144172

145173
c2h::device_vector<int> expected = {1, 4, 9, 16};
146174
// example-end for-each-temp-storage
@@ -154,7 +182,11 @@ C2H_TEST("Device for each works without temporary storage", "[for_each][device]"
154182
c2h::device_vector<int> vec = {1, 2, 3, 4};
155183
square_ref_t op{};
156184

157-
cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);
185+
auto result = cub::DeviceFor::ForEach(vec.begin(), vec.end(), op);
186+
if (result != cudaSuccess)
187+
{
188+
std::cerr << "ForEach operation failed with error code: " << result << std::endl;
189+
}
158190

159191
c2h::device_vector<int> expected = {1, 4, 9, 16};
160192
// example-end for-each-wo-temp-storage
@@ -172,14 +204,22 @@ C2H_TEST("Device for each n copy works with temporary storage", "[for_each][devi
172204
// 1) Get temp storage size
173205
std::uint8_t* d_temp_storage{};
174206
std::size_t temp_storage_bytes{};
175-
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
207+
auto result = cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
208+
if (result != cudaSuccess)
209+
{
210+
std::cerr << "ForEachCopyN operation failed with error code: " << result << std::endl;
211+
}
176212

177213
// 2) Allocate temp storage
178214
c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
179215
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
180216

181217
// 3) Perform for each n operation
182-
cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
218+
result = cub::DeviceFor::ForEachCopyN(d_temp_storage, temp_storage_bytes, vec.begin(), vec.size(), op);
219+
if (result != cudaSuccess)
220+
{
221+
std::cerr << "ForEachCopyN operation failed with error code: " << result << std::endl;
222+
}
183223

184224
c2h::device_vector<int> expected = {2};
185225
// example-end for-each-copy-n-temp-storage
@@ -194,7 +234,11 @@ C2H_TEST("Device for each n copy works without temporary storage", "[for_each][d
194234
c2h::device_vector<int> count(1);
195235
odd_count_t op{thrust::raw_pointer_cast(count.data())};
196236

197-
cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);
237+
auto result = cub::DeviceFor::ForEachCopyN(vec.begin(), vec.size(), op);
238+
if (result != cudaSuccess)
239+
{
240+
std::cerr << "ForEachCopyN operation failed with error code: " << result << std::endl;
241+
}
198242

199243
c2h::device_vector<int> expected = {2};
200244
// example-end for-each-copy-n-wo-temp-storage
@@ -212,14 +256,22 @@ C2H_TEST("Device for each copy works with temporary storage", "[for_each][device
212256
// 1) Get temp storage size
213257
std::uint8_t* d_temp_storage{};
214258
std::size_t temp_storage_bytes{};
215-
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
259+
auto result = cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
260+
if (result != cudaSuccess)
261+
{
262+
std::cerr << "ForEachCopy operation failed with error code: " << result << std::endl;
263+
}
216264

217265
// 2) Allocate temp storage
218266
c2h::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
219267
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
220268

221269
// 3) Perform for each n operation
222-
cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
270+
result = cub::DeviceFor::ForEachCopy(d_temp_storage, temp_storage_bytes, vec.begin(), vec.end(), op);
271+
if (result != cudaSuccess)
272+
{
273+
std::cerr << "ForEachCopy operation failed with error code: " << result << std::endl;
274+
}
223275

224276
c2h::device_vector<int> expected = {2};
225277
// example-end for-each-copy-temp-storage
@@ -234,7 +286,11 @@ C2H_TEST("Device for each copy works without temporary storage", "[for_each][dev
234286
c2h::device_vector<int> count(1);
235287
odd_count_t op{thrust::raw_pointer_cast(count.data())};
236288

237-
cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);
289+
auto result = cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op);
290+
if (result != cudaSuccess)
291+
{
292+
std::cerr << "ForEachCopy operation failed with error code: " << result << std::endl;
293+
}
238294

239295
c2h::device_vector<int> expected = {2};
240296
// example-end for-each-copy-wo-temp-storage
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#include "insert_nested_NVTX_range_guard.h"
5+
6+
#include <cub/device/device_for.cuh>
7+
8+
#include <thrust/detail/raw_pointer_cast.h>
9+
#include <thrust/device_vector.h>
10+
11+
#include <cuda/devices>
12+
#include <cuda/stream>
13+
14+
#include <c2h/catch2_test_helper.h>
15+
16+
struct square_ref_op
17+
{
18+
__device__ void operator()(int& i)
19+
{
20+
i *= i;
21+
}
22+
};
23+
24+
struct square_idx_op
25+
{
26+
int* d_ptr;
27+
28+
__device__ void operator()(int i)
29+
{
30+
d_ptr[i] *= d_ptr[i];
31+
}
32+
};
33+
34+
struct odd_count_op
35+
{
36+
int* d_count;
37+
38+
__device__ void operator()(int i)
39+
{
40+
if (i % 2 == 1)
41+
{
42+
atomicAdd(d_count, 1);
43+
}
44+
}
45+
};
46+
47+
// -----------------------------------------------------------------------
48+
// Bulk
49+
// -----------------------------------------------------------------------
50+
51+
C2H_TEST("DeviceFor::Bulk env uses custom stream", "[for][env]")
52+
{
53+
auto vec = c2h::device_vector<int>{1, 2, 3, 4};
54+
square_idx_op op{thrust::raw_pointer_cast(vec.data())};
55+
56+
cuda::stream stream{cuda::devices[0]};
57+
auto env = cuda::std::execution::env{cuda::stream_ref{stream}};
58+
59+
auto error = cub::DeviceFor::Bulk(4, op, env);
60+
REQUIRE(error == cudaSuccess);
61+
REQUIRE(cudaStreamSynchronize(stream.get()) == cudaSuccess);
62+
63+
c2h::device_vector<int> expected{1, 4, 9, 16};
64+
REQUIRE(vec == expected);
65+
}
66+
67+
// -----------------------------------------------------------------------
68+
// ForEachN
69+
// -----------------------------------------------------------------------
70+
71+
C2H_TEST("DeviceFor::ForEachN env uses custom stream", "[for][env]")
72+
{
73+
auto vec = c2h::device_vector<int>{1, 2, 3, 4};
74+
square_ref_op op{};
75+
76+
cuda::stream stream{cuda::devices[0]};
77+
auto env = cuda::std::execution::env{cuda::stream_ref{stream}};
78+
79+
auto error = cub::DeviceFor::ForEachN(vec.begin(), static_cast<int>(vec.size()), op, env);
80+
REQUIRE(error == cudaSuccess);
81+
REQUIRE(cudaStreamSynchronize(stream.get()) == cudaSuccess);
82+
83+
c2h::device_vector<int> expected{1, 4, 9, 16};
84+
REQUIRE(vec == expected);
85+
}
86+
87+
// -----------------------------------------------------------------------
88+
// ForEach
89+
// -----------------------------------------------------------------------
90+
91+
C2H_TEST("DeviceFor::ForEach env uses custom stream", "[for][env]")
92+
{
93+
auto vec = c2h::device_vector<int>{1, 2, 3, 4};
94+
square_ref_op op{};
95+
96+
cuda::stream stream{cuda::devices[0]};
97+
auto env = cuda::std::execution::env{cuda::stream_ref{stream}};
98+
99+
auto error = cub::DeviceFor::ForEach(vec.begin(), vec.end(), op, env);
100+
REQUIRE(error == cudaSuccess);
101+
REQUIRE(cudaStreamSynchronize(stream.get()) == cudaSuccess);
102+
103+
c2h::device_vector<int> expected{1, 4, 9, 16};
104+
REQUIRE(vec == expected);
105+
}
106+
107+
// -----------------------------------------------------------------------
108+
// ForEachCopyN
109+
// -----------------------------------------------------------------------
110+
111+
C2H_TEST("DeviceFor::ForEachCopyN env uses custom stream", "[for][env]")
112+
{
113+
auto vec = c2h::device_vector<int>{1, 2, 3, 4};
114+
auto count = c2h::device_vector<int>(1);
115+
odd_count_op op{thrust::raw_pointer_cast(count.data())};
116+
117+
cuda::stream stream{cuda::devices[0]};
118+
auto env = cuda::std::execution::env{cuda::stream_ref{stream}};
119+
120+
auto error = cub::DeviceFor::ForEachCopyN(vec.begin(), static_cast<int>(vec.size()), op, env);
121+
REQUIRE(error == cudaSuccess);
122+
REQUIRE(cudaStreamSynchronize(stream.get()) == cudaSuccess);
123+
124+
c2h::device_vector<int> expected_count{2};
125+
REQUIRE(count == expected_count);
126+
}
127+
128+
// -----------------------------------------------------------------------
129+
// ForEachCopy
130+
// -----------------------------------------------------------------------
131+
132+
C2H_TEST("DeviceFor::ForEachCopy env uses custom stream", "[for][env]")
133+
{
134+
auto vec = c2h::device_vector<int>{1, 2, 3, 4};
135+
auto count = c2h::device_vector<int>(1);
136+
odd_count_op op{thrust::raw_pointer_cast(count.data())};
137+
138+
cuda::stream stream{cuda::devices[0]};
139+
auto env = cuda::std::execution::env{cuda::stream_ref{stream}};
140+
141+
auto error = cub::DeviceFor::ForEachCopy(vec.begin(), vec.end(), op, env);
142+
REQUIRE(error == cudaSuccess);
143+
REQUIRE(cudaStreamSynchronize(stream.get()) == cudaSuccess);
144+
145+
c2h::device_vector<int> expected_count{2};
146+
REQUIRE(count == expected_count);
147+
}

0 commit comments

Comments
 (0)