//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
| 11 | +#include <cuda/devices> |
| 12 | +#include <cuda/std/cassert> |
| 13 | +#include <cuda/std/cstddef> |
| 14 | +#include <cuda/std/cstring> |
| 15 | +#include <cuda/stream> |
| 16 | + |
| 17 | +#include <cuda/experimental/hierarchy.cuh> |
| 18 | +#include <cuda/experimental/kernel.cuh> |
| 19 | +#include <cuda/experimental/launch.cuh> |
| 20 | + |
| 21 | +#include <cstdio> |
| 22 | +#include <stdexcept> |
| 23 | + |
| 24 | +#include <cuda.h> |
| 25 | + |
| 26 | +// Create an alias for the experimental namespace to shorten the code. |
| 27 | +namespace cudax = cuda::experimental; |
| 28 | + |
// A helper type that carries the name of a kernel launch pattern to the device.
struct name_buffer
{
  // Fixed capacity of the character buffer.
  static constexpr cuda::std::size_t size = 128;

  // Storage for the NUL-terminated name.
  char data[size];

  // Construct from a string literal; the literal, including its terminator, is copied in.
  template <cuda::std::size_t N>
  name_buffer(const char (&str)[N])
  {
    static_assert(N <= size, "string literal is too long");
    for (cuda::std::size_t i = 0; i < N; ++i)
    {
      data[i] = str[i];
    }
  }
};
| 46 | + |
// Prints one "Hello world" line per thread, then a single blank separator line per block.
__device__ void say_hello(dim3 tid, const name_buffer& name)
{
  printf("Hello world from thread [%u, %u] launched as %s!\n", tid.x, tid.y, name.data);

  // Make sure every thread in the block has printed before emitting the separator.
  __syncthreads();

  // Exactly one thread of the block prints the trailing blank line.
  const bool is_first_thread = (tid.x == 0) && (tid.y == 0);
  if (is_first_thread)
  {
    printf("\n");
  }
}
| 61 | + |
// The traditional kernel form: a void function decorated with the __global__ attribute.
__global__ void kernel(name_buffer name)
{
  // Forward the built-in thread index together with the launch-pattern name.
  say_hello(threadIdx, name);
}
| 67 | + |
// A kernel functor: a callable object whose operator() is decorated with __device__.
// On launch the whole object is copied to the device and operator() is invoked there.
struct kernel_functor
{
  // State configured on the host before launch. The functor (and therefore every
  // member) must remain trivially copyable for the device copy to be valid.
  int member;

  // Device-side call operator; it could equally well be a template.
  __device__ void operator()(name_buffer name)
  {
    say_hello(threadIdx, name);

    // The host-side value must have survived the copy to the device.
    assert(member == 42);
  }
};
| 85 | + |
// A kernel functor whose operator() additionally receives the implicit kernel
// configuration parameter: a cudax::kernel_config describing the launch.
struct kernel_functor_with_config
{
  // Layout of the dynamic shared memory this functor expects.
  struct dynamic_smem_layout
  {
    int value;
  };

  // operator() must be __device__; it may be a template and may take further
  // parameters after the configuration parameter. The return type must be void.
  template <class Dims, class... Opts>
  __device__ void operator()(cudax::kernel_config<Dims, Opts...> config, name_buffer name)
  {
    // dims.index(entity, level) yields the index of an entity within a hierarchy
    // level; for threads within a block it matches the builtin threadIdx.
    const auto tidx = config.dims.index(cudax::thread, cudax::block);
    assert(tidx.x == threadIdx.x);
    assert(tidx.y == threadIdx.y);

    say_hello(tidx, name);

    // dims.extents(entity, level) yields the extents of an entity within a level;
    // for threads within a block it matches the builtin blockDim.
    const auto bdim = config.dims.extents(cudax::thread, cudax::block);
    assert(bdim.x == blockDim.x);
    assert(bdim.y == blockDim.y);

    // todo: show that we can static_assert on grid dim

    // When the config carries a cudax::dynamic_shared_memory_option,
    // cudax::dynamic_smem_ref returns a reference to the dynamic shared memory
    // object. It is NOT constructed yet; one thread must construct it before use.
    dynamic_smem_layout& smem = cudax::dynamic_smem_ref(config);

    // The first thread of the block placement-constructs the object.
    if (config.dims.rank(cudax::thread, cudax::block) == 0)
    {
      new (&smem) dynamic_smem_layout{42};
    }

    // Publish the construction to every thread of the block.
    __syncthreads();

    // All threads now observe the value written above.
    assert(smem.value == 42);
  }
};
| 135 | + |
#if defined(__CUDACC_EXTENDED_LAMBDA__)
// A kernel lambda is the lambda form of a kernel functor; it may optionally take
// the kernel_config as its first argument. Extended lambda support is required.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambdas for more info.
const auto kernel_lambda = [] __device__(auto config, name_buffer name) {
  const dim3 tid{config.dims.index(cudax::thread, cudax::block)};
  say_hello(tid, name);
};
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)
| 144 | + |
// Demonstrates the different kernel forms accepted by cudax::launch: a classic
// __global__ function, a functor, a functor taking the kernel configuration, an
// extended lambda, and a cudax::kernel_ref. Returns 0 on success, 1 on error.
int main()
try
{
  // Bail out early when no CUDA device is present.
  if (cuda::devices.size() == 0)
  {
    std::fprintf(stderr, "No CUDA devices found\n");
    return 1;
  }

  // We will use the first device.
  cuda::device_ref device = cuda::devices[0];

  // cudax::launch always requires a work submitter, so let's create a CUDA stream.
  cuda::stream stream{device};

  // Create a custom hierarchy to be used in cudax::launch: a 1D grid of 1 block,
  // where each block is a 2D arrangement of 2 threads in the x and y axes.
  //
  // Note that the grid dimensions are passed as template parameters in this example. That means the value can be used
  // in constexpr context inside the kernel. Block dimensions will be constructed at runtime as usually.
  const auto hierarchy = cudax::make_hierarchy(cudax::grid_dims<1>(), cudax::block_dims(dim3{2, 2}));

  // Launch an ordinary kernel. cudax::launch takes a stream as the first argument followed by the kernel configuration,
  // kernel and kernel parameters.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel, "kernel");

  // Launch a kernel functor. Here, we use cudax::distribute to create the kernel_config for us. This function creates
  // a simple 1D grid of 1D blocks of a given size.
  cudax::launch(stream, cudax::distribute<4>(4), kernel_functor{42}, name_buffer{"kernel functor"});

  // Launch a kernel functor that takes a cudax::kernel_config. Note that the kernel config is passed automatically as
  // the first argument by the cudax::launch function.
  const auto config =
    cudax::make_config(hierarchy, cudax::dynamic_shared_memory<kernel_functor_with_config::dynamic_smem_layout>());
  cudax::launch(stream, config, kernel_functor_with_config{}, name_buffer{"kernel functor with config"});

#if defined(__CUDACC_EXTENDED_LAMBDA__)
  // Launch a kernel lambda.
  cudax::launch(stream, cudax::make_config(hierarchy), kernel_lambda, name_buffer{"kernel lambda"});
#endif // defined(__CUDACC_EXTENDED_LAMBDA__)

#if CUDA_VERSION >= 12010
  // Launch a cudax::kernel_ref object which is a wrapper of cudaKernel_t. The type is available since CUDA 12.0, but
  // the cudaGetKernel function used to get the handle of a CUDA Runtime kernel is available since CUDA 12.1.
  cudax::launch(stream, cudax::make_config(hierarchy), cudax::kernel_ref{kernel}, "kernel reference");
#endif // CUDA_VERSION >= 12010

  // Wait for all of the tasks in the stream to complete.
  stream.sync();
}
catch (const cuda::cuda_error& e)
{
  // Fix: terminate diagnostics with '\n' (was missing, unlike the message above).
  std::fprintf(stderr, "CUDA error: %s\n", e.what());
  return 1;
}
catch (const std::exception& e)
{
  std::fprintf(stderr, "Error: %s\n", e.what());
  return 1;
}
catch (...)
{
  std::fprintf(stderr, "An unknown error was encountered\n");
  return 1;
}