
Commit 61c2a17

Merge branch 'main' into stf_token_checks
2 parents a925958 + aa3463d commit 61c2a17

File tree

4 files changed: +221 −181 lines

cudax/include/cuda/experimental/__stf/internal/interpreted_execution_policy_impl.cuh

Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
 * @file
 *
 * @brief Deferred definition of the interpreted_execution_policy constructor.
 *
 * The constructor maps a thread_hierarchy_spec onto an exec_place, which
 * requires both exec_place (from places.cuh) and compute_kernel_limits
 * (from occupancy.cuh) to be fully defined. This separate file breaks the
 * circular dependency between interpreted_execution_policy.cuh and places.cuh.
 */

#pragma once

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/experimental/__stf/internal/interpreted_execution_policy.cuh>
#include <cuda/experimental/__stf/places/places.cuh>
#include <cuda/experimental/__stf/utility/occupancy.cuh>

namespace cuda::experimental::stf
{
template <auto... spec>
template <typename Fun>
interpreted_execution_policy<spec...>::interpreted_execution_policy(
  const thread_hierarchy_spec<spec...>& p, const exec_place& where, const Fun& f)
{
  constexpr size_t pdepth = sizeof...(spec) / 2;

  if (where == exec_place::host())
  {
    // XXX this may not match the type of the spec if we are not using the default spec ...
    for (size_t d = 0; d < pdepth; d++)
    {
      this->add_level({::std::make_pair(hw_scope::thread, 1)});
    }
    return;
  }

  size_t ndevs = where.size();

  if constexpr (pdepth == 1)
  {
    size_t l0_size = p.get_width(0);
    bool l0_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;

    size_t shared_mem_bytes = 0;

    auto kernel_limits = reserved::compute_kernel_limits(f, shared_mem_bytes, l0_sync);

    int grid_size = 0;
    int block_size;

    if (l0_size == 0)
    {
      grid_size = kernel_limits.min_grid_size;
      // Maximum occupancy without exceeding limits
      block_size = ::std::min(kernel_limits.max_block_size, kernel_limits.block_size_limit);
      l0_size = ndevs * grid_size * block_size;
    }
    else
    {
      // Find grid_size and block_size such that grid_size*block_size = l0_size and block_size <= max_block_size
      for (block_size = kernel_limits.max_block_size; block_size >= 1; block_size--)
      {
        if (l0_size % block_size == 0)
        {
          grid_size = l0_size / block_size;
          break;
        }
      }
    }

    // Make sure we have computed the width if that was implicit
    _CCCL_ASSERT(l0_size > 0, "invalid level 0 size");

    _CCCL_ASSERT(grid_size > 0, "invalid grid size");
    _CCCL_ASSERT(block_size <= kernel_limits.max_block_size, "invalid block size");

    _CCCL_ASSERT(l0_size % ndevs == 0, "invalid level 0 size");
    _CCCL_ASSERT(l0_size % (ndevs * block_size) == 0, "invalid level 0 size");

    _CCCL_ASSERT(ndevs * grid_size * block_size == l0_size, "invalid level 0 size");

    this->add_level({::std::make_pair(hw_scope::device, ndevs),
                     ::std::make_pair(hw_scope::block, grid_size),
                     ::std::make_pair(hw_scope::thread, block_size)});
    this->set_level_mem(0, size_t(p.get_mem(0)));
    this->set_level_sync(0, l0_sync);
  }
  else if constexpr (pdepth == 2)
  {
    size_t l0_size = p.get_width(0);
    size_t l1_size = p.get_width(1);
    bool l0_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;
    bool l1_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<1>;

    /* level 1 will be mapped on threads, level 0 on blocks and above */
    size_t shared_mem_bytes = size_t(p.get_mem(1));
    auto kernel_limits = reserved::compute_kernel_limits(f, shared_mem_bytes, l0_sync);

    // For implicit widths, use sizes suggested by CUDA occupancy calculator
    if (l1_size == 0)
    {
      // Maximum occupancy without exceeding limits
      l1_size = ::std::min(kernel_limits.max_block_size, kernel_limits.block_size_limit);
    }
    else
    {
      if (int(l1_size) > kernel_limits.block_size_limit)
      {
        fprintf(stderr,
                "Unsatisfiable spec: Maximum block size %d threads, requested %zu (level 1)\n",
                kernel_limits.block_size_limit,
                l1_size);
        abort();
      }
    }

    if (l0_size == 0)
    {
      l0_size = kernel_limits.min_grid_size * ndevs;
    }

    // Enforce the resource limits in the number of threads per block
    _CCCL_ASSERT(int(l1_size) <= kernel_limits.block_size_limit, "invalid level 1 size");
    _CCCL_ASSERT(l0_size % ndevs == 0, "invalid level 0 size");

    /* Merge blocks and devices */
    this->add_level({::std::make_pair(hw_scope::device, ndevs), ::std::make_pair(hw_scope::block, l0_size / ndevs)});
    this->set_level_mem(0, size_t(p.get_mem(0)));
    this->set_level_sync(0, l0_sync);

    this->add_level({::std::make_pair(hw_scope::thread, l1_size)});
    this->set_level_mem(1, size_t(p.get_mem(1)));
    this->set_level_sync(1, l1_sync);
  }
  else if constexpr (pdepth == 3)
  {
    size_t l0_size = p.get_width(0);
    size_t l1_size = p.get_width(1);
    size_t l2_size = p.get_width(2);
    bool l0_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<0>;
    bool l1_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<1>;
    bool l2_sync = thread_hierarchy_spec<spec...>::template is_synchronizable<2>;

    /* level 2 will be mapped on threads, level 1 on blocks, level 0 on devices */
    size_t shared_mem_bytes = size_t(p.get_mem(2));
    auto kernel_limits = reserved::compute_kernel_limits(f, shared_mem_bytes, l0_sync || l1_sync);

    // For implicit widths, use sizes suggested by CUDA occupancy calculator
    if (l2_size == 0)
    {
      // Maximum occupancy without exceeding limits
      l2_size = ::std::min(kernel_limits.max_block_size, kernel_limits.block_size_limit);
    }
    else
    {
      if (int(l2_size) > kernel_limits.block_size_limit)
      {
        fprintf(stderr,
                "Unsatisfiable spec: Maximum block size %d threads, requested %zu (level 2)\n",
                kernel_limits.block_size_limit,
                l2_size);
        abort();
      }
    }

    if (l1_size == 0)
    {
      l1_size = kernel_limits.min_grid_size;
    }

    if (l0_size == 0)
    {
      l0_size = ndevs;
    }

    // Enforce the resource limits in the number of threads per block
    _CCCL_ASSERT(int(l2_size) <= kernel_limits.block_size_limit, "invalid level 2 size");
    _CCCL_ASSERT(int(l0_size) <= ndevs, "invalid level 0 size");

    /* Merge blocks and devices */
    this->add_level({::std::make_pair(hw_scope::device, l0_size)});
    this->set_level_mem(0, size_t(p.get_mem(0)));
    this->set_level_sync(0, l0_sync);

    this->add_level({::std::make_pair(hw_scope::block, l1_size)});
    this->set_level_mem(1, size_t(p.get_mem(1)));
    this->set_level_sync(1, l1_sync);

    this->add_level({::std::make_pair(hw_scope::thread, l2_size)});
    this->set_level_mem(2, size_t(p.get_mem(2)));
    this->set_level_sync(2, l2_sync);
  }
  else
  {
    static_assert(pdepth == 3);
  }
}
} // end namespace cuda::experimental::stf
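
To make the sizing logic above easier to follow: when a level width is left implicit (0), the constructor takes the grid and block sizes suggested by the occupancy limits in reserved::compute_kernel_limits; when the width is explicit, it factors it into grid_size * block_size by picking the largest block size that divides the requested width and stays within the block-size limit. The standalone sketch below illustrates both paths. It assumes compute_kernel_limits is ultimately backed by the CUDA occupancy calculator (cudaOccupancyMaxPotentialBlockSize); dummy_kernel and the 3072-thread width are illustrative placeholders, not part of the commit.

// Sketch only: mirrors the depth-1 sizing logic of the constructor above.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel() {} // stand-in for the user functor passed as `f`

int main()
{
  // Implicit width: ask the occupancy calculator for a launch shape
  // (assumed to be what reserved::compute_kernel_limits wraps).
  int min_grid_size  = 0;
  int max_block_size = 0;
  if (cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, dummy_kernel, /* dynamic smem */ 0)
      != cudaSuccess)
  {
    fprintf(stderr, "occupancy query failed\n");
    return 1;
  }
  printf("implicit width -> %d blocks x %d threads per device\n", min_grid_size, max_block_size);

  // Explicit width: factor it into grid_size * block_size, taking the largest
  // block size that divides the width and does not exceed max_block_size.
  size_t l0_size = 3072; // hypothetical requested width
  int grid_size  = 0;
  int block_size = 0;
  for (block_size = max_block_size; block_size >= 1; block_size--)
  {
    if (l0_size % block_size == 0)
    {
      grid_size = int(l0_size / block_size);
      break;
    }
  }
  printf("explicit width %zu -> %d blocks x %d threads\n", l0_size, grid_size, block_size);
  return 0;
}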

cudax/include/cuda/experimental/__stf/internal/launch.cuh

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@
 #endif // no system header

 #include <cuda/experimental/__stf/internal/execution_policy.cuh> // launch_impl() uses execution_policy
+#include <cuda/experimental/__stf/internal/interpreted_execution_policy_impl.cuh>
 #include <cuda/experimental/__stf/internal/task_dep.cuh>
 #include <cuda/experimental/__stf/internal/task_statistics.cuh>
 #include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
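
The one-line addition above is how the deferred constructor definition reaches a translation unit that can actually instantiate it. Below is a minimal sketch of the declaration/definition split that the new *_impl header relies on, using hypothetical names (place, widget, widget_impl.cuh) rather than the real CUDASTF headers; everything in it is illustrative.

// Sketch only: the three pieces would normally live in separate headers/files.

// widget.cuh -- declaration; a forward declaration of `place` is enough here,
// so this header does not need to include place.cuh.
struct place;

template <typename T>
struct widget
{
  widget(const place& where); // definition deferred to widget_impl.cuh
};

// widget_impl.cuh -- the deferred definition; free to depend on the full `place` type.
struct place
{
  int device_id;
};

template <typename T>
widget<T>::widget(const place& where)
{
  (void) where.device_id; // real code would inspect the execution place here
}

// consumer.cu (playing the role of launch.cuh) -- includes widget_impl.cuh
// before instantiating widget, so the constructor definition is visible.
int main()
{
  place p{0};
  widget<int> w{p};
  return 0;
}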

cudax/include/cuda/experimental/__stf/internal/task_dep.cuh

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@
 #include <cuda/experimental/__stf/internal/void_interface.cuh>
 #include <cuda/experimental/__stf/utility/core.cuh>

+#include <variant>
+
 namespace cuda::experimental::stf
 {
 ::std::shared_ptr<void> pack_state(const logical_data_untyped&);
0 commit comments