diff --git a/cudax/include/cuda/experimental/__hierarchy/fwd.cuh b/cudax/include/cuda/experimental/__hierarchy/fwd.cuh index 97580c487c9..c070ded06e0 100644 --- a/cudax/include/cuda/experimental/__hierarchy/fwd.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/fwd.cuh @@ -21,49 +21,59 @@ # pragma system_header #endif // no system header +// Q: Do we want to enable this by default, or do we want the user to define some macro to get the interoperability with +// cooperative groups? +#if __has_include() +# define _CCCL_HAS_COOPERATIVE_GROUPS() 1 +#else // ^^^ has cooperative groups ^^^ / vvv no cooperative groups vvv +# define _CCCL_HAS_COOPERATIVE_GROUPS() 0 +#endif // ^^^ no cooperative groups ^^^ + +#include +#include + #include namespace cuda::experimental { -// hierarchy group kinds - -class __this_hierarchy_group_kind -{}; +using __implicit_hierarchy_t = + hierarchy>, + hierarchy_level_desc>, + hierarchy_level_desc>>; -// hierarchy group base +// group bases -template -class __hierarchy_group_base; template -using __this_hierarchy_group_base = __hierarchy_group_base<_Level, _Hierarchy, __this_hierarchy_group_kind>; +class __this_group_base; -// hierarchy groups +// groups -template -class thread_group; -template -class warp_group; -template -class block_group; -template -class cluster_group; -template -class grid_group; +template +class this_thread; +template +class this_warp; +template +class this_block; +template +class this_cluster; +template +class this_grid; // traits template inline constexpr bool __is_this_hierarchy_group_v = false; template -inline constexpr bool __is_this_hierarchy_group_v> = true; +inline constexpr bool __is_this_hierarchy_group_v> = true; template -inline constexpr bool __is_this_hierarchy_group_v> = true; +inline constexpr bool __is_this_hierarchy_group_v> = true; template -inline constexpr bool __is_this_hierarchy_group_v> = true; +inline constexpr bool __is_this_hierarchy_group_v> = true; template -inline constexpr bool 
__is_this_hierarchy_group_v> = true; +inline constexpr bool __is_this_hierarchy_group_v> = true; template -inline constexpr bool __is_this_hierarchy_group_v> = true; +inline constexpr bool __is_this_hierarchy_group_v> = true; } // namespace cuda::experimental #include diff --git a/cudax/include/cuda/experimental/__hierarchy/implicit_hierarchy.cuh b/cudax/include/cuda/experimental/__hierarchy/implicit_hierarchy.cuh new file mode 100644 index 00000000000..6507f3589f6 --- /dev/null +++ b/cudax/include/cuda/experimental/__hierarchy/implicit_hierarchy.cuh @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___HIERARCHY_IMPLICIT_HIERARCHY_CUH +#define _CUDA_EXPERIMENTAL___HIERARCHY_IMPLICIT_HIERARCHY_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#include + +#include + +namespace cuda::experimental +{ +[[nodiscard]] _CCCL_DEVICE_API inline __implicit_hierarchy_t __implicit_hierarchy() noexcept +{ + return __implicit_hierarchy_t{ + cuda::gpu_thread, + hierarchy_level_desc>{cluster.extents(grid)}, + hierarchy_level_desc>{block.extents(cluster)}, + hierarchy_level_desc>{gpu_thread.extents(block)}}; +} +} // namespace cuda::experimental + +#include + +#endif // _CUDA_EXPERIMENTAL___HIERARCHY_IMPLICIT_HIERARCHY_CUH diff --git 
a/cudax/include/cuda/experimental/__hierarchy/group.cuh b/cudax/include/cuda/experimental/__hierarchy/this_group.cuh similarity index 62% rename from cudax/include/cuda/experimental/__hierarchy/group.cuh rename to cudax/include/cuda/experimental/__hierarchy/this_group.cuh index ca86895e9d5..caf332787ce 100644 --- a/cudax/include/cuda/experimental/__hierarchy/group.cuh +++ b/cudax/include/cuda/experimental/__hierarchy/this_group.cuh @@ -8,8 +8,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDA_EXPERIMENTAL___HIERARCHY_GROUP_CUH -#define _CUDA_EXPERIMENTAL___HIERARCHY_GROUP_CUH +#ifndef _CUDA_EXPERIMENTAL___THIS_GROUP_CUH +#define _CUDA_EXPERIMENTAL___THIS_GROUP_CUH #include @@ -28,6 +28,11 @@ #include #include +#include + +#if _CCCL_HAS_COOPERATIVE_GROUPS() +# include +#endif // _CCCL_HAS_COOPERATIVE_GROUPS() #include @@ -39,19 +44,23 @@ using __hierarchy_type_of = // todo: use __hier_ in queries template -class __hierarchy_group_base<_Level, _Hierarchy, __this_hierarchy_group_kind> +class __this_group_base { static_assert(__is_hierarchy_level_v<_Level>); static_assert(__is_hierarchy_v<_Hierarchy>); - const _Hierarchy& __hier_; + _Hierarchy __hier_; public: using hierarchy_type = _Hierarchy; + _CCCL_DEVICE_API explicit __this_group_base() noexcept + : __hier_{::cuda::experimental::__implicit_hierarchy()} + {} + _CCCL_TEMPLATE(class _HierarchyLike) _CCCL_REQUIRES(::cuda::std::is_same_v<_Hierarchy, __hierarchy_type_of<_HierarchyLike>>) - _CCCL_DEVICE_API __hierarchy_group_base(const _HierarchyLike& __hier_like) noexcept + _CCCL_DEVICE_API __this_group_base(const _HierarchyLike& __hier_like) noexcept : __hier_{::cuda::__unpack_hierarchy_if_needed(__hier_like)} {} @@ -94,9 +103,9 @@ public: }; template -class thread_group<_Hierarchy, __this_hierarchy_group_kind> : __this_hierarchy_group_base +class this_thread : __this_group_base { - using __base_type = __this_hierarchy_group_base; + using __base_type = 
__this_group_base; public: using level_type = thread_level; @@ -109,19 +118,29 @@ public: using __base_type::rank; using __base_type::rank_as; +# if _CCCL_HAS_COOPERATIVE_GROUPS() + template + _CCCL_DEVICE_API this_thread(const ::cooperative_groups::thread_block_tile<1, _Parent>&) noexcept + : __base_type{::cuda::experimental::__implicit_hierarchy()} + {} +# endif // _CCCL_HAS_COOPERATIVE_GROUPS() + _CCCL_DEVICE_API void sync() noexcept {} #endif // _CCCL_CUDA_COMPILATION() }; _CCCL_TEMPLATE(class _Hierarchy) _CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_Hierarchy>) -_CCCL_HOST_DEVICE thread_group(const _Hierarchy&) - -> thread_group<__hierarchy_type_of<_Hierarchy>, __this_hierarchy_group_kind>; +_CCCL_HOST_DEVICE this_thread(const _Hierarchy&) -> this_thread<__hierarchy_type_of<_Hierarchy>>; + +#if _CCCL_HAS_COOPERATIVE_GROUPS() +_CCCL_HOST_DEVICE this_thread(const ::cooperative_groups::thread_block_tile<1, void>&) -> this_thread<>; +#endif // _CCCL_HAS_COOPERATIVE_GROUPS() template -class warp_group<_Hierarchy, __this_hierarchy_group_kind> : __this_hierarchy_group_base +class this_warp : __this_group_base { - using __base_type = __this_hierarchy_group_base; + using __base_type = __this_group_base; public: using level_type = warp_level; @@ -143,13 +162,12 @@ public: _CCCL_TEMPLATE(class _Hierarchy) _CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_Hierarchy>) -_CCCL_HOST_DEVICE warp_group(const _Hierarchy&) - -> warp_group<__hierarchy_type_of<_Hierarchy>, __this_hierarchy_group_kind>; +_CCCL_HOST_DEVICE this_warp(const _Hierarchy&) -> this_warp<__hierarchy_type_of<_Hierarchy>>; template -class block_group<_Hierarchy, __this_hierarchy_group_kind> : __this_hierarchy_group_base +class this_block : __this_group_base { - using __base_type = __this_hierarchy_group_base; + using __base_type = __this_group_base; public: using level_type = block_level; @@ -163,22 +181,31 @@ public: using __base_type::rank; using __base_type::rank_as; +# if _CCCL_HAS_COOPERATIVE_GROUPS() + 
_CCCL_DEVICE_API this_block(const ::cooperative_groups::thread_block&) noexcept + : __base_type{::cuda::experimental::__implicit_hierarchy()} + {} +# endif // _CCCL_HAS_COOPERATIVE_GROUPS() + _CCCL_DEVICE_API void sync() noexcept { - ::__syncthreads(); + ::__barrier_sync(0); } #endif // _CCCL_CUDA_COMPILATION() }; _CCCL_TEMPLATE(class _Hierarchy) _CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_Hierarchy>) -_CCCL_HOST_DEVICE block_group(const _Hierarchy&) - -> block_group<__hierarchy_type_of<_Hierarchy>, __this_hierarchy_group_kind>; +_CCCL_HOST_DEVICE this_block(const _Hierarchy&) -> this_block<__hierarchy_type_of<_Hierarchy>>; + +#if _CCCL_HAS_COOPERATIVE_GROUPS() +_CCCL_HOST_DEVICE this_block(const ::cooperative_groups::thread_block&) -> this_block<>; +#endif // _CCCL_HAS_COOPERATIVE_GROUPS() template -class cluster_group<_Hierarchy, __this_hierarchy_group_kind> : __this_hierarchy_group_base +class this_cluster : __this_group_base { - using __base_type = __this_hierarchy_group_base; + using __base_type = __this_group_base; public: using level_type = cluster_level; @@ -192,6 +219,12 @@ public: using __base_type::rank; using __base_type::rank_as; +# if _CCCL_HAS_COOPERATIVE_GROUPS() && defined(_CG_HAS_CLUSTER_GROUP) + _CCCL_DEVICE_API this_cluster(const ::cooperative_groups::cluster_group&) noexcept + : __base_type{::cuda::experimental::__implicit_hierarchy()} + {} +# endif // _CCCL_HAS_COOPERATIVE_GROUPS() && defined(_CG_HAS_CLUSTER_GROUP) + _CCCL_DEVICE_API void sync() noexcept { NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, @@ -199,20 +232,23 @@ public: ::__cluster_barrier_arrive(); ::__cluster_barrier_wait(); }), - ({ ::__syncthreads(); })) + ({ ::__barrier_sync(0); })) } #endif // _CCCL_CUDA_COMPILATION() }; _CCCL_TEMPLATE(class _Hierarchy) _CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_Hierarchy>) -_CCCL_HOST_DEVICE cluster_group(const _Hierarchy&) - -> cluster_group<__hierarchy_type_of<_Hierarchy>, __this_hierarchy_group_kind>; +_CCCL_HOST_DEVICE this_cluster(const 
_Hierarchy&) -> this_cluster<__hierarchy_type_of<_Hierarchy>>; + +#if _CCCL_HAS_COOPERATIVE_GROUPS() && defined(_CG_HAS_CLUSTER_GROUP) +_CCCL_HOST_DEVICE this_cluster(const ::cooperative_groups::cluster_group&) -> this_cluster<>; +#endif // _CCCL_HAS_COOPERATIVE_GROUPS() && defined(_CG_HAS_CLUSTER_GROUP) template -class grid_group<_Hierarchy, __this_hierarchy_group_kind> : __this_hierarchy_group_base +class this_grid : __this_group_base { - using __base_type = __this_hierarchy_group_base; + using __base_type = __this_group_base; public: using level_type = grid_level; @@ -221,6 +257,12 @@ public: using __base_type::hierarchy; #if _CCCL_CUDA_COMPILATION() +# if _CCCL_HAS_COOPERATIVE_GROUPS() + _CCCL_DEVICE_API this_grid(const ::cooperative_groups::grid_group&) noexcept + : __base_type{::cuda::experimental::__implicit_hierarchy()} + {} +# endif // _CCCL_HAS_COOPERATIVE_GROUPS() + _CCCL_DEVICE_API void sync() noexcept { ::cuda::experimental::__cg_imported::__grid_sync(); @@ -230,45 +272,13 @@ public: _CCCL_TEMPLATE(class _Hierarchy) _CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_Hierarchy>) -_CCCL_HOST_DEVICE grid_group(const _Hierarchy&) - -> grid_group<__hierarchy_type_of<_Hierarchy>, __this_hierarchy_group_kind>; - -_CCCL_TEMPLATE(class _HierarchyLike) -_CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_HierarchyLike>) -[[nodiscard]] _CCCL_DEVICE_API auto this_thread(const _HierarchyLike& __hier_like) noexcept -{ - return thread_group{__hier_like}; -} - -_CCCL_TEMPLATE(class _HierarchyLike) -_CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_HierarchyLike>) -[[nodiscard]] _CCCL_DEVICE_API auto this_warp(const _HierarchyLike& __hier_like) noexcept -{ - return warp_group{__hier_like}; -} +_CCCL_HOST_DEVICE this_grid(const _Hierarchy&) -> this_grid<__hierarchy_type_of<_Hierarchy>>; -_CCCL_TEMPLATE(class _HierarchyLike) -_CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_HierarchyLike>) -[[nodiscard]] _CCCL_DEVICE_API auto this_block(const _HierarchyLike& __hier_like) noexcept -{ - 
return block_group{__hier_like}; -} - -_CCCL_TEMPLATE(class _HierarchyLike) -_CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_HierarchyLike>) -[[nodiscard]] _CCCL_DEVICE_API auto this_cluster(const _HierarchyLike& __hier_like) noexcept -{ - return cluster_group{__hier_like}; -} - -_CCCL_TEMPLATE(class _HierarchyLike) -_CCCL_REQUIRES(__is_or_has_hierarchy_member_v<_HierarchyLike>) -[[nodiscard]] _CCCL_DEVICE_API auto this_grid(const _HierarchyLike& __hier_like) noexcept -{ - return grid_group{__hier_like}; -} +#if _CCCL_HAS_COOPERATIVE_GROUPS() +_CCCL_HOST_DEVICE this_grid(const ::cooperative_groups::grid_group&) -> this_grid<>; +#endif // _CCCL_HAS_COOPERATIVE_GROUPS() } // namespace cuda::experimental #include -#endif // _CUDA_EXPERIMENTAL___HIERARCHY_GROUP_CUH +#endif // _CUDA_EXPERIMENTAL___THIS_GROUP_CUH diff --git a/cudax/include/cuda/experimental/hierarchy.cuh b/cudax/include/cuda/experimental/hierarchy.cuh index 474d251b1c0..448a83bf17e 100644 --- a/cudax/include/cuda/experimental/hierarchy.cuh +++ b/cudax/include/cuda/experimental/hierarchy.cuh @@ -23,6 +23,7 @@ #include #include -#include +#include +#include #endif // _CUDA_EXPERIMENTAL_HIERARCHY diff --git a/cudax/test/hierarchy/group.cu b/cudax/test/hierarchy/group.cu index f346b9a44db..3af313d66c5 100644 --- a/cudax/test/hierarchy/group.cu +++ b/cudax/test/hierarchy/group.cu @@ -21,33 +21,73 @@ #include +#include + #include "testing.cuh" -template -[[nodiscard]] __device__ T sum(cudax::thread_group group, T (&array)[N]) +template +__device__ T sum(cudax::this_thread group, T (&array)[N]) { return cub::ThreadReduce(array, cuda::std::plus{}); } -template -[[nodiscard]] __device__ T sum(cudax::warp_group group, T (&array)[N]) +template +__device__ T sum(cudax::this_warp group, T (&array)[N]) { using WarpReduce = cub::WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; + __shared__ typename WarpReduce::TempStorage scratch; const auto partial = cub::ThreadReduce(array, cuda::std::plus{}); - 
return WarpReduce{temp_storage}.Sum(partial); + return WarpReduce{scratch}.Sum(partial); } -template -[[nodiscard]] __device__ T sum(cudax::block_group group, T (&array)[N]) +template +__device__ T sum(cudax::this_block group, T (&array)[N]) { - // todo: Replace 32 with value from group. + // todo: support other block sizes using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - return BlockReduce{temp_storage}.Sum(array); + __shared__ typename BlockReduce::TempStorage scratch; + return BlockReduce{scratch}.Sum(array); +} + +template +__device__ T sum(cudax::this_cluster group, T (&array)[N]) +{ + // todo: support other block sizes + using BlockReduce = cub::BlockReduce; + union SMem + { + typename BlockReduce::TempStorage block_scratch; + T cluster_scratch; + }; + + __shared__ SMem smem; + T result = BlockReduce{smem.block_scratch}.Sum(array); + + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + const auto dsmem = static_cast(__cluster_map_shared_rank(&smem.cluster_scratch, 0)); + + if (cuda::gpu_thread.rank(group) == 0) + { + smem.cluster_scratch = 0; + } + group.sync(); + + if (cuda::gpu_thread.rank(cuda::block, group.hierarchy()) == 0) + { + atomicAdd(dsmem, result); + } + group.sync(); + + if (cuda::gpu_thread.rank(group) == 0) + { + result = smem.cluster_scratch; + } + })) + + return result; } struct TestKernel @@ -58,8 +98,7 @@ struct TestKernel { unsigned array[]{1, 2, 3}; - auto this_thread = cudax::this_thread(config); - + cudax::this_thread this_thread{config}; this_thread.sync(); const auto result = sum(this_thread, array); @@ -79,7 +118,7 @@ struct TestKernel { unsigned array[]{1, 2, 3}; - auto this_warp = cudax::this_warp(config); + cudax::this_warp this_warp{config}; this_warp.sync(); const auto result = sum(this_warp, array); @@ -102,7 +141,7 @@ struct TestKernel { unsigned array[]{1, 2, 3}; - auto this_block = cudax::this_block(config); + cudax::this_block this_block{config}; this_block.sync(); const auto result = 
sum(this_block, array); @@ -123,11 +162,17 @@ struct TestKernel CUDAX_REQUIRE(this_block.rank(cuda::grid) == cuda::block.rank(cuda::grid)); } { - auto this_cluster = cudax::this_cluster(config); - CUDAX_REQUIRE(this_cluster.count(cuda::grid) == cuda::cluster.count(cuda::grid)); - CUDAX_REQUIRE(this_cluster.rank(cuda::grid) == cuda::cluster.rank(cuda::grid)); + unsigned array[]{1, 2, 3}; + + cudax::this_cluster this_cluster{config}; this_cluster.sync(); + const auto result = sum(this_cluster, array); + if (cuda::gpu_thread.rank(cuda::cluster) == 0) + { + CUDAX_REQUIRE(result == 6 * cuda::gpu_thread.count(cuda::cluster)); + } + CUDAX_REQUIRE(cuda::gpu_thread.count(this_cluster) == cuda::gpu_thread.count(cuda::cluster)); CUDAX_REQUIRE(cuda::gpu_thread.rank(this_cluster) == cuda::gpu_thread.rank(cuda::cluster)); CUDAX_REQUIRE(cuda::warp.count(this_cluster) == cuda::warp.count(cuda::cluster)); @@ -140,7 +185,7 @@ struct TestKernel CUDAX_REQUIRE(this_cluster.rank(cuda::grid) == cuda::cluster.rank(cuda::grid)); } { - auto this_grid = cudax::this_grid(config); + cudax::this_grid this_grid{config}; this_grid.sync(); CUDAX_REQUIRE(cuda::gpu_thread.count(this_grid) == cuda::gpu_thread.count(cuda::grid)); @@ -163,9 +208,66 @@ C2H_TEST("Hierarchy groups", "[hierarchy]") const cuda::stream stream{device}; - const auto config = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims<32>(), cuda::cooperative_launch{}); + if (cuda::device_attributes::compute_capability(device) >= cuda::compute_capability{90}) + { + const auto config = cuda::make_config( + cuda::grid_dims<2>(), cuda::cluster_dims<3>(), cuda::block_dims<32>(), cuda::cooperative_launch{}); + cuda::launch(stream, config, TestKernel{}); + } + else + { + const auto config = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims<32>(), cuda::cooperative_launch{}); + cuda::launch(stream, config, TestKernel{}); + } - cuda::launch(stream, config, TestKernel{}); + stream.sync(); +} + +struct CgInteropKernel +{ + template + 
__device__ void operator()(const Config& config) + { + { + cudax::this_thread g{cooperative_groups::this_thread()}; + g.sync(); + } + { + cudax::this_block g{cooperative_groups::this_thread_block()}; + g.sync(); + } +#if defined(_CG_HAS_CLUSTER_GROUP) + { + NV_IF_TARGET(NV_PROVIDES_SM_90, ({ + cudax::this_cluster g{cooperative_groups::this_cluster()}; + g.sync(); + })) + } +#endif // _CG_HAS_CLUSTER_GROUP + { + cudax::this_grid g{cooperative_groups::this_grid()}; + g.sync(); + } + } +}; + +C2H_TEST("Groups interoperability with cooperative groups", "[hierarchy][cg_interop]") +{ + const auto device = cuda::devices[0]; + + const cuda::stream stream{device}; + + if (cuda::device_attributes::compute_capability(device) >= cuda::compute_capability{90}) + { + const auto config = cuda::make_config( + cuda::grid_dims<2>(), cuda::cluster_dims<3>(), cuda::block_dims<32>(), cuda::cooperative_launch{}); + cuda::launch(stream, config, CgInteropKernel{}); + } + else + { + const auto config = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims<32>(), cuda::cooperative_launch{}); + cuda::launch(stream, config, CgInteropKernel{}); + } stream.sync(); }