//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
| 11 | +#include <cuda/devices> |
| 12 | +#include <cuda/std/cassert> |
| 13 | +#include <cuda/std/cstddef> |
| 14 | +#include <cuda/std/cstring> |
| 15 | +#include <cuda/stream> |
| 16 | + |
| 17 | +#include <cuda/experimental/hierarchy.cuh> |
| 18 | +#include <cuda/experimental/kernel.cuh> |
| 19 | +#include <cuda/experimental/launch.cuh> |
| 20 | + |
| 21 | +#include <cstdio> |
| 22 | +#include <stdexcept> |
| 23 | + |
| 24 | +#include <cuda.h> |
| 25 | + |
| 26 | +// Create an alias for the experimental namespace to shorten the code. |
| 27 | +namespace cudax = cuda::experimental; |
| 28 | + |
// A helper type that carries the name of a kernel launch pattern to the device.
struct name_buffer
{
  // Fixed capacity of the character buffer.
  static constexpr cuda::std::size_t size = 128;

  // Storage for the NUL-terminated name.
  char data[size];

  // Construct from a string literal; the literal, including its terminator, is copied in.
  template <cuda::std::size_t N>
  name_buffer(const char (&str)[N])
  {
    static_assert(N <= size, "string literal is too long");
    for (cuda::std::size_t i = 0; i < N; ++i)
    {
      data[i] = str[i];
    }
  }
};
| 46 | + |
// Prints one "Hello world" line per thread, then a single blank separator line per block.
__device__ void say_hello(dim3 tid, const name_buffer& name)
{
  printf("Hello world from thread [%u, %u] launched as %s!\n", tid.x, tid.y, name.data);

  // Make sure every thread in the block has printed before emitting the separator.
  __syncthreads();

  // Exactly one thread of the block prints the trailing blank line.
  const bool is_first_thread = (tid.x == 0) && (tid.y == 0);
  if (is_first_thread)
  {
    printf("\n");
  }
}
| 61 | + |
// The traditional kernel form: a void function decorated with the __global__ attribute.
__global__ void kernel(name_buffer name)
{
  // Forward the built-in thread index together with the launch-pattern name.
  say_hello(threadIdx, name);
}
| 67 | + |
// A kernel functor: a callable object whose operator() is decorated with __device__.
// On launch the whole object is copied to the device and operator() is invoked there.
struct kernel_functor
{
  // State configured on the host before launch. The functor (and therefore every
  // member) must remain trivially copyable for the device copy to be valid.
  int member;

  // Device-side call operator; it could equally well be a template.
  __device__ void operator()(name_buffer name)
  {
    say_hello(threadIdx, name);

    // The host-side value must have survived the copy to the device.
    assert(member == 42);
  }
};
| 85 | + |
// A kernel functor whose operator() additionally receives the implicit kernel
// configuration parameter: a cudax::kernel_config describing the launch.
struct kernel_functor_with_config
{
  // Layout of the dynamic shared memory this functor expects.
  struct dynamic_smem_layout
  {
    int value;
  };

  // operator() must be __device__; it may be a template and may take further
  // parameters after the configuration parameter. The return type must be void.
  template <class Dims, class... Opts>
  __device__ void operator()(cudax::kernel_config<Dims, Opts...> config, name_buffer name)
  {
    // dims.index(entity, level) yields the index of an entity within a hierarchy
    // level; for threads within a block it matches the builtin threadIdx.
    const auto tidx = config.dims.index(cudax::thread, cudax::block);
    assert(tidx.x == threadIdx.x);
    assert(tidx.y == threadIdx.y);

    say_hello(tidx, name);

    // dims.extents(entity, level) yields the extents of an entity within a level;
    // for threads within a block it matches the builtin blockDim.
    const auto bdim = config.dims.extents(cudax::thread, cudax::block);
    assert(bdim.x == blockDim.x);
    assert(bdim.y == blockDim.y);

    // todo: show that we can static_assert on grid dim

    // When the config carries a cudax::dynamic_shared_memory_option,
    // cudax::dynamic_smem_ref returns a reference to the dynamic shared memory
    // object. It is NOT constructed yet; one thread must construct it before use.
    dynamic_smem_layout& smem = cudax::dynamic_smem_ref(config);

    // The first thread of the block placement-constructs the object.
    if (config.dims.rank(cudax::thread, cudax::block) == 0)
    {
      new (&smem) dynamic_smem_layout{42};
    }

    // Publish the construction to every thread of the block.
    __syncthreads();

    // All threads now observe the value written above.
    assert(smem.value == 42);
  }
};
| 135 | + |
#if defined(__CUDACC_EXTENDED_LAMBDA__)
// A kernel lambda is the lambda form of a kernel functor; it may optionally take
// the kernel_config as its first argument. Extended lambda support is required.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambdas for more info.
const auto kernel_lambda = [] __device__(auto config, name_buffer name) {
  const dim3 tid{config.dims.index(cudax::thread, cudax::block)};
  say_hello(tid, name);
};
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)
| 144 | + |
// Demonstrates the different kernel forms accepted by cudax::launch: a classic
// __global__ function, a functor, a functor taking the kernel configuration, an
// extended lambda, and a cudax::kernel_ref. Returns 0 on success, 1 on error.
int main()
try
{
  // Bail out early when no CUDA device is present.
  if (cuda::devices.size() == 0)
  {
    std::fprintf(stderr, "No CUDA devices found\n");
    return 1;
  }

  // We will use the first device.
  cuda::device_ref device = cuda::devices[0];

  // cudax::launch always requires a work submitter, so let's create a CUDA stream.
  cuda::stream stream{device};

  // Create a custom hierarchy to be used in cudax::launch: a 1D grid of 1 block,
  // where each block is a 2D arrangement of 2 threads in the x and y axes.
  //
  // Note that the grid dimensions are passed as template parameters in this example. That means the value can be used
  // in constexpr context inside the kernel. Block dimensions will be constructed at runtime as usually.
  const auto hierarchy = cudax::make_hierarchy(cudax::grid_dims<1>(), cudax::block_dims(dim3{2, 2}));

  // Launch an ordinary kernel. cudax::launch takes a stream as the first argument followed by the kernel configuration,
  // kernel and kernel parameters.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel, "kernel");

  // Launch a kernel functor. Here, we use cudax::distribute to create the kernel_config for us. This function creates
  // a simple 1D grid of 1D blocks of a given size.
  cudax::launch(stream, cudax::distribute<4>(4), kernel_functor{42}, name_buffer{"kernel functor"});

  // Launch a kernel functor that takes a cudax::kernel_config. Note that the kernel config is passed automatically as
  // the first argument by the cudax::launch function.
  const auto config =
    cudax::make_config(hierarchy, cudax::dynamic_shared_memory<kernel_functor_with_config::dynamic_smem_layout>());
  cudax::launch(stream, config, kernel_functor_with_config{}, name_buffer{"kernel functor with config"});

#if defined(__CUDACC_EXTENDED_LAMBDA__)
  // Launch a kernel lambda.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel_lambda, name_buffer{"kernel lambda"});
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)

#if CUDA_VERSION >= 12010
  // Launch a cudax::kernel_ref object which is a wrapper of cudaKernel_t. The type is available since CUDA 12.0, but
  // the cudaGetKernel function used to get the handle of a CUDA Runtime kernel is available since CUDA 12.1.
  cudax::launch(stream, cudax::make_config(hierarchy), cudax::kernel_ref{kernel}, "kernel reference");
#endif // CUDA_VERSION >= 12010

  // Wait for all of the tasks in the stream to complete.
  stream.sync();
}
catch (const cuda::cuda_error& e)
{
  // Fix: terminate diagnostics with '\n' (was missing, unlike the message above).
  std::fprintf(stderr, "CUDA error: %s\n", e.what());
  return 1;
}
catch (const std::exception& e)
{
  std::fprintf(stderr, "Error: %s\n", e.what());
  return 1;
}
catch (...)
{
  std::fprintf(stderr, "An unknown error was encountered\n");
  return 1;
}