Skip to content

Commit 0883fc1

Browse files
committed
Implement cccl-rt kernel launch patterns example
1 parent 956d3ba commit 0883fc1

File tree

3 files changed

+276
-0
lines changed

3 files changed

+276
-0
lines changed

cudax/examples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,5 @@ if (cudax_ENABLE_CUDASTF AND
8585
# STF examples are handled separately:
8686
add_subdirectory(stf)
8787
endif()
88+
89+
add_subdirectory(cccl-rt)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
find_package(cudax) # already found, bring in version info.
2+
3+
# Adds one cccl-rt example executable for the given cudax configuration target.
#
# target_name_var - out-variable receiving the created target's name
# example_src     - example source file (relative path)
# cudax_target    - the cudax configuration target providing PREFIX/DIALECT
function(cudax_add_cccl_rt_example target_name_var example_src cudax_target)
  cudax_get_target_property(config_prefix ${cudax_target} PREFIX)
  cudax_get_target_property(config_dialect ${cudax_target} DIALECT)

  get_filename_component(example_name "${example_src}" NAME_WE)

  # The actual name of the example's target:
  set(example_target ${config_prefix}.example.cccl-rt.${example_name})
  set(${target_name_var} ${example_target} PARENT_SCOPE)

  # Related target names:
  set(config_meta_target ${config_prefix}.examples)
  set(example_meta_target cudax.all.example.cccl-rt.${example_name})

  add_executable(${example_target} "${example_src}")
  cccl_configure_target(${example_target} DIALECT ${config_dialect})
  target_link_libraries(${example_target} PRIVATE
    ${cudax_target}
    libcudacxx::libcudacxx
  )
  # The kernel-lambda example requires extended lambda support from nvcc:
  target_compile_options(${example_target} PRIVATE
    $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
  )
  target_compile_definitions(${example_target} PRIVATE
    "LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE"
  )

  cudax_clone_target_properties(${example_target} ${cudax_target})
  # NOTE(review): this points at CUB's examples dir for a cudax example —
  # looks like a copy-paste leftover; confirm whether it should be
  # "${cudax_SOURCE_DIR}/examples" or removed entirely.
  target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

  # Add to the active configuration's meta target:
  add_dependencies(${config_meta_target} ${example_target})

  # Also populate the per-configuration cccl-rt meta target created by the
  # caller's foreach loop; previously it was never given any dependencies
  # and so remained empty.
  if (TARGET ${config_prefix}.examples.cccl-rt)
    add_dependencies(${config_prefix}.examples.cccl-rt ${example_target})
  endif()

  # Meta target that builds examples with this name for all configurations:
  if (NOT TARGET ${example_meta_target})
    add_custom_target(${example_meta_target})
  endif()
  add_dependencies(${example_meta_target} ${example_target})

  # Register the example as a test so it runs under CTest:
  add_test(NAME ${example_target}
    COMMAND "$<TARGET_FILE:${example_target}>"
  )
endfunction()
46+
47+
# Collect every example source in this directory. CONFIGURE_DEPENDS makes
# CMake re-run the glob at build time when files are added or removed.
file(GLOB cccl_rt_example_srcs
  RELATIVE "${cudax_SOURCE_DIR}/examples/cccl-rt"
  CONFIGURE_DEPENDS
  *.cu *.cpp
)

# Create one executable per (example source, cudax configuration) pair.
foreach(cudax_target IN LISTS cudax_TARGETS)
  cudax_get_target_property(config_prefix ${cudax_target} PREFIX)

  # Metatarget for the current configuration's cccl-rt examples; building
  # ${config_prefix}.all builds it too.
  set(config_meta_target ${config_prefix}.examples.cccl-rt)
  add_custom_target(${config_meta_target})
  add_dependencies(${config_prefix}.all ${config_meta_target})

  foreach (example_src IN LISTS cccl_rt_example_srcs)
    cudax_add_cccl_rt_example(example_target "${example_src}" ${cudax_target})
  endforeach()
endforeach()
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <cuda/devices>
12+
#include <cuda/std/cassert>
13+
#include <cuda/std/cstddef>
14+
#include <cuda/std/cstring>
15+
#include <cuda/stream>
16+
17+
#include <cuda/experimental/hierarchy.cuh>
18+
#include <cuda/experimental/kernel.cuh>
19+
#include <cuda/experimental/launch.cuh>
20+
21+
#include <cstdio>
22+
#include <stdexcept>
23+
24+
#include <cuda.h>
25+
26+
// Alias the experimental namespace to shorten the code below.
namespace cudax = cuda::experimental;
28+
29+
// A helper type for storing a kernel launch pattern name. It is trivially
// copyable, so it can be passed by value as a kernel argument.
struct name_buffer
{
  // Fixed capacity of the buffer, in bytes.
  static constexpr cuda::std::size_t size = 128;

  // The stored, null-terminated name.
  char data[size];

  // Constructs the buffer from a string literal, copying the literal
  // (including its null terminator). Bytes past the terminator are left
  // uninitialized.
  template <cuda::std::size_t N>
  name_buffer(const char (&literal)[N])
  {
    static_assert(N <= size, "string literal is too long");
    for (cuda::std::size_t i = 0; i < N; ++i)
    {
      data[i] = literal[i];
    }
  }
};
46+
47+
// Prints a "Hello world" greeting identifying the calling thread and the
// launch pattern used, then lets one thread emit a separating blank line.
__device__ void say_hello(dim3 tid, const name_buffer& name)
{
  printf("Hello world from thread [%u, %u] launched as %s!\n", tid.x, tid.y, name.data);

  // Block until every thread in the block has printed its greeting.
  // Unconditional: all threads must reach this barrier.
  __syncthreads();

  // Exactly one thread prints the trailing blank line.
  const bool is_first_thread = (tid.x == 0) && (tid.y == 0);
  if (is_first_thread)
  {
    printf("\n");
  }
}
61+
62+
// The traditional way to define a kernel: a void function decorated with the
// __global__ attribute.
__global__ void kernel(name_buffer name)
{
  const dim3 tid = threadIdx;
  say_hello(tid, name);
}
67+
68+
// A kernel functor: a callable object whose operator() carries the
// __device__ attribute. When launched, the object itself is copied to the
// device and operator() is invoked there.
struct kernel_functor
{
  // State configured on the host before launch. The functor — and therefore
  // all of its members — must be trivially copyable.
  int member;

  // Device-side entry point; it could also be a template.
  __device__ void operator()(name_buffer name)
  {
    const dim3 tid = threadIdx;
    say_hello(tid, name);

    // Verify the host-initialized member value survived the copy to device.
    assert(member == 42);
  }
};
85+
86+
// This is again a kernel functor, but this time the operator() takes the implicit kernel configuration parameter. This
// parameter is a cudax::kernel_config object that contains the launch configuration.
struct kernel_functor_with_config
{
  // A type that represents the layout of the dynamic shared memory used by this kernel functor.
  struct dynamic_smem_layout
  {
    int value;
  };

  // The operator() must be again decorated with __device__ attribute. It can also be a template or take additional
  // parameters after the kernel configuration parameter. Return type must be void.
  template <class Dims, class... Opts>
  __device__ void operator()(cudax::kernel_config<Dims, Opts...> config, name_buffer name)
  {
    // dims.index(entity, in_level) queries the index of an entity in a hierarchy level. Query of the thread entity
    // index in the block hierarchy level results in the same value as threadIdx, as the asserts below confirm.
    const auto thread_idx = config.dims.index(cudax::thread, cudax::block);
    assert(thread_idx.x == threadIdx.x);
    assert(thread_idx.y == threadIdx.y);

    say_hello(thread_idx, name);

    // Similarly dims.extents(entity, level) queries the extents of an entity in a hierarchy level. Query of the thread
    // entity extent in the block hierarchy results in the same value as blockDim.
    const auto block_dim = config.dims.extents(cudax::thread, cudax::block);
    assert(block_dim.x == blockDim.x);
    assert(block_dim.y == blockDim.y);

    // todo: show that we can static_assert on grid dim

    // If the config contains a cudax::dynamic_shared_memory_option option, the cudax::dynamic_smem_ref function can be
    // used to get a reference to the dynamic shared memory type. Keep in mind that the object is not constructed, so
    // one of the threads must construct it before it is used.
    dynamic_smem_layout& dyn_smem = cudax::dynamic_smem_ref(config);

    // Construct the dynamic_smem_layout object in the shared memory by the first thread in the block
    // (rank 0 of the thread entity within the block level) via placement new.
    if (config.dims.rank(cudax::thread, cudax::block) == 0)
    {
      new (&dyn_smem) dynamic_smem_layout{42};
    }

    // Wait until the write is finished before any thread reads the shared object.
    __syncthreads();

    // All threads should now observe the value written by the first thread.
    assert(dyn_smem.value == 42);
  }
};
135+
136+
#if defined(__CUDACC_EXTENDED_LAMBDA__)
// Kernel lambda is another form of the kernel functor. It can optionally take the kernel_config as the first argument
// (here received as the generic `config` parameter). Extended lambdas (nvcc's --extended-lambda flag) are required to
// use this feature, hence the guard above.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambdas for more info.
const auto kernel_lambda = [] __device__(auto config, name_buffer name) {
  say_hello(dim3{config.dims.index(cudax::thread, cudax::block)}, name);
};
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)
144+
145+
// Demonstrates the kernel launch patterns accepted by cudax::launch:
// a __global__ function, a kernel functor, a functor taking the implicit
// kernel_config parameter, an extended device lambda, and a kernel_ref.
int main()
try
{
  // Check we have at least one device.
  if (cuda::devices.size() == 0)
  {
    std::fprintf(stderr, "No CUDA devices found\n");
    return 1;
  }

  // We will use the first device.
  cuda::device_ref device = cuda::devices[0];

  // cudax::launch always requires a work submitter, so let's create a CUDA stream.
  cuda::stream stream{device};

  // Create a custom hierarchy to be used in cudax::launch. We will be launching a 1D grid of 1 block. The block will be
  // a 2D grid of 2 threads in x and y axis.
  //
  // Note that the grid dimensions are passed as template parameters in this example. That means the value can be used
  // in constexpr context inside the kernel. Block dimensions will be constructed at runtime as usually.
  const auto hierarchy = cudax::make_hierarchy(cudax::grid_dims<1>(), cudax::block_dims(dim3{2, 2}));

  // Launch an ordinary kernel. cudax::launch takes a stream as the first argument followed by the kernel configuration,
  // kernel and kernel parameters.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel, "kernel");

  // Launch a kernel functor. Here, we use cudax::distribute to create the kernel_config for us. This function creates
  // a simple 1D grid of 1D blocks of a given size.
  cudax::launch(stream, cudax::distribute<4>(4), kernel_functor{42}, name_buffer{"kernel functor"});

  // Launch a kernel functor that takes a cudax::kernel_config. Note that the kernel config is passed automatically as
  // the first argument by the cudax::launch function.
  const auto config =
    cudax::make_config(hierarchy, cudax::dynamic_shared_memory<kernel_functor_with_config::dynamic_smem_layout>());
  cudax::launch(stream, config, kernel_functor_with_config{}, name_buffer{"kernel functor with config"});

#if defined(__CUDACC_EXTENDED_LAMBDA__)
  // Launch a kernel lambda.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel_lambda, name_buffer{"kernel lambda"});
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)

#if CUDA_VERSION >= 12010
  // Launch a cudax::kernel_ref object which is a wrapper of cudaKernel_t. The type is available since CUDA 12.0, but
  // the cudaGetKernel function used to get the handle of a CUDA Runtime kernel is available since CUDA 12.1.
  cudax::launch(stream, cudax::make_config(hierarchy), cudax::kernel_ref{kernel}, "kernel reference");
#endif // CUDA_VERSION >= 12010

  // Wait for all of the tasks in the stream to complete.
  stream.sync();
}
catch (const cuda::cuda_error& e)
{
  // Fix: terminate diagnostics with '\n' (the originals lacked it, gluing
  // the message to any subsequent stderr output).
  std::fprintf(stderr, "CUDA error: %s\n", e.what());
  return 1;
}
catch (const std::exception& e)
{
  std::fprintf(stderr, "Error: %s\n", e.what());
  return 1;
}
catch (...)
{
  std::fprintf(stderr, "An unknown error was encountered\n");
  return 1;
}

0 commit comments

Comments
 (0)