forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplace_partition.cuh
More file actions
326 lines (292 loc) · 11.2 KB
/
place_partition.cuh
File metadata and controls
326 lines (292 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
/**
* @file
* @brief Facilities to manipulate subset of places
*/
#pragma once
#include <cuda/__cccl_config>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include <cuda/experimental/__places/exec/cuda_stream.cuh>
#include <cuda/experimental/__places/exec/green_context.cuh>
#include <cuda/experimental/__places/places.cuh>
#include <cuda/experimental/__stf/internal/async_resources_handle.cuh>
namespace cuda::experimental::stf
{
/**
 * @brief Defines a partitioning granularity
 *
 * This should be used in combination with `place_partition`
 */
enum class place_partition_scope
{
  cuda_device, // one subplace per CUDA device (grid elements taken as-is)
  green_context, // one subplace per green context carved out of each device (requires CUDA 12.4+)
  cuda_stream, // one subplace per CUDA stream taken from the place's stream pool
};
/**
 * @brief Convert a place_partition_scope value to a string (for debugging purpose)
 * @param scope The partitioning granularity to convert
 * @return A string representation of `scope` (e.g. "cuda_device", "green_context", "cuda_stream")
 */
inline ::std::string place_partition_scope_to_string(place_partition_scope scope)
{
  if (scope == place_partition_scope::cuda_device)
  {
    return "cuda_device";
  }
  if (scope == place_partition_scope::green_context)
  {
    return "green_context";
  }
  if (scope == place_partition_scope::cuda_stream)
  {
    return "cuda_stream";
  }
  // Only reachable with an out-of-range enum value (e.g. produced by an
  // invalid cast); treated as a fatal internal error.
  abort();
  return "unknown";
}
// TODO method to get the scope of an exec place
/**
 * @brief Get subsets of an execution place.
 *
 * Computes a vector of execution places that partition the input place at a
 * given granularity (see `place_partition_scope`). For example, a grid place
 * can be partitioned into devices, or into green contexts, or into CUDA streams.
 *
 * Use the constructors that take `async_resources_handle&` when partitioning at
 * `cuda_stream` or `green_context` scope (stream and green-context resources
 * are obtained from the handle). The constructors without a handle support only
 * `cuda_device` scope. Green context scope requires CUDA 12.4 or later.
 *
 * Iteration over subplaces is provided via `begin()` / `end()`; `to_exec_place()` builds
 * an `exec_place` grid from the subplaces.
 */
class place_partition
{
public:
  /** @brief Partition an execution place into a vector of subplaces (with async resource handle).
   * @param place The execution place to partition (e.g. grid or device)
   * @param handle Handle used to obtain stream or green-context resources when scope is cuda_stream or green_context
   * @param scope Partitioning granularity (cuda_device, green_context, or cuda_stream)
   */
  place_partition(exec_place place, async_resources_handle& handle, place_partition_scope scope)
  {
#if _CCCL_CTK_BELOW(12, 4)
    _CCCL_ASSERT(scope != place_partition_scope::green_context, "Green contexts unsupported.");
#endif // _CCCL_CTK_BELOW(12, 4)
    compute_subplaces(handle, mv(place), scope);
  }

  /** @brief Partition an execution place into a vector of subplaces (no async handle).
   * Only `cuda_device` scope is supported; green_context and cuda_stream require a handle.
   * @param place The execution place to partition
   * @param scope Partitioning granularity (must be cuda_device when no handle is provided)
   */
  place_partition(const exec_place& place, place_partition_scope scope)
  {
#if _CCCL_CTK_BELOW(12, 4)
    _CCCL_ASSERT(scope != place_partition_scope::green_context, "Green contexts need an async resource handle.");
#endif // _CCCL_CTK_BELOW(12, 4)
    compute_subplaces_no_handle(place, scope);
  }

  /** @brief Partition a vector of execution places into a single vector of subplaces (with async handle).
   * Subplaces of all input places are appended, in order, to one flat vector.
   * @param handle Handle for stream or green-context resources when scope is cuda_stream or green_context
   * @param places Input execution places to partition
   * @param scope Partitioning granularity
   */
  place_partition(async_resources_handle& handle,
                  const ::std::vector<::std::shared_ptr<exec_place>>& places,
                  place_partition_scope scope)
  {
    for (const auto& place : places)
    {
      compute_subplaces(handle, *place, scope);
    }
  }

  /** @brief Partition a grid of execution places into a single vector of subplaces (with async handle).
   * @param handle Handle for stream or green-context resources when scope is cuda_stream or green_context
   * @param grid Input execution place grid to partition
   * @param scope Partitioning granularity
   */
  place_partition(async_resources_handle& handle, const exec_place& grid, place_partition_scope scope)
  {
    // Iterate over the grid elements directly. The previous implementation
    // first copied every element into a vector of shared_ptr<exec_place>,
    // paying one heap allocation per element only to dereference each
    // pointer immediately afterwards.
    for (size_t i = 0; i < grid.size(); ++i)
    {
      compute_subplaces(handle, grid.get_place(i), scope);
    }
  }

  /** @brief Partition a vector of execution places into a single vector of subplaces (no async handle).
   * Only cuda_device scope is supported.
   * @param places Input execution places to partition
   * @param scope Partitioning granularity (must be cuda_device)
   */
  place_partition(const ::std::vector<::std::shared_ptr<exec_place>>& places, place_partition_scope scope)
  {
    for (const auto& place : places)
    {
      compute_subplaces_no_handle(*place, scope);
    }
  }

  ~place_partition() = default;

  /** Iteration over subplaces. */
  using iterator       = ::std::vector<exec_place>::iterator;
  using const_iterator = ::std::vector<exec_place>::const_iterator;

  /** @brief Iterator to the first subplace. @return Begin iterator. */
  iterator begin()
  {
    return sub_places.begin();
  }

  /** @brief Past-the-end iterator for subplaces. @return End iterator. */
  iterator end()
  {
    return sub_places.end();
  }

  /** @brief Const iterator to the first subplace. @return Begin const iterator. */
  const_iterator begin() const
  {
    return sub_places.begin();
  }

  /** @brief Past-the-end const iterator. @return End const iterator. */
  const_iterator end() const
  {
    return sub_places.end();
  }

  /** @brief Number of subplaces in the partition. @return Size of the partition. */
  size_t size() const
  {
    return sub_places.size();
  }

  /** @brief Get the i-th subplace (mutable).
   * @param i Index in [0, size()).
   * @return Reference to the i-th exec_place.
   */
  exec_place& get(size_t i)
  {
    return sub_places[i];
  }

  /** @brief Get the i-th subplace (const).
   * @param i Index in [0, size()).
   * @return Const reference to the i-th exec_place.
   */
  const exec_place& get(size_t i) const
  {
    return sub_places[i];
  }

  /** @brief Build an exec_place from the subplaces.
   * @return The result of `make_grid` over the subplaces (presumably a grid
   *         view, or the single place when there is only one subplace —
   *         see `make_grid` for the exact contract).
   */
  exec_place to_exec_place() const
  {
    return make_grid(sub_places);
  }

private:
  /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */
  void compute_subplaces(async_resources_handle& handle, exec_place place, place_partition_scope scope)
  {
    // Handle multi-element grids by recursively partitioning: first into
    // devices, then each device into streams.
    if (place.size() > 1 && scope == place_partition_scope::cuda_stream)
    {
      for (auto& device_p : place_partition(mv(place), handle, place_partition_scope::cuda_device))
      {
        auto device_p_places = place_partition(device_p, handle, place_partition_scope::cuda_stream).sub_places;
        sub_places.insert(sub_places.end(), device_p_places.begin(), device_p_places.end());
      }
      return;
    }

    // Handle scalar places (including 1-element grids) for cuda_stream scope
    if (place.size() == 1 && scope == place_partition_scope::cuda_stream)
    {
      // Get the underlying scalar place (for 1-element grids, get the single element)
      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
      if (!scalar_place.is_device())
      {
        // Host or other non-device place - no streams to partition into
        sub_places.push_back(mv(place));
        return;
      }
      // One cuda_stream subplace per entry in the place's stream pool.
      auto& pool = scalar_place.get_stream_pool(true);
      for (size_t i = 0; i < pool.size(); i++)
      {
        sub_places.push_back(exec_place::cuda_stream(pool.next(scalar_place)));
      }
      return;
    }

    // Green contexts are only supported since CUDA 12.4
#if _CCCL_CTK_AT_LEAST(12, 4)
    if (place.size() > 1 && scope == place_partition_scope::green_context)
    {
      // Recursively partition grid into devices, then into green contexts
      for (auto& device_p : place_partition(mv(place), handle, place_partition_scope::cuda_device))
      {
        auto device_p_places = place_partition(device_p, handle, place_partition_scope::green_context).sub_places;
        sub_places.insert(sub_places.end(), device_p_places.begin(), device_p_places.end());
      }
      return;
    }

    // Handle scalar places (including 1-element grids) for green_context scope
    if (place.size() == 1 && scope == place_partition_scope::green_context)
    {
      exec_place scalar_place = place.is_device() ? place : place.get_place(0);
      if (!scalar_place.is_device())
      {
        // Non-device place: no green contexts to carve out, keep as-is.
        sub_places.push_back(mv(place));
        return;
      }
      int dev_id = device_ordinal(scalar_place.affine_data_place());
      // Number of SMs per green context; defaults to 8, overridable via env.
      // NOTE(review): atoi yields 0 on parse failure and negative values pass
      // through unchecked — confirm get_gc_helper rejects non-positive counts.
      const char* env = getenv("CUDASTF_GREEN_CONTEXT_SIZE");
      int sm_cnt      = env ? atoi(env) : 8;
      auto h          = handle.get_gc_helper(dev_id, sm_cnt);
      size_t cnt      = h->get_count();
      for (size_t i = 0; i < cnt; i++)
      {
        sub_places.push_back(exec_place::green_ctx(h->get_view(i)));
      }
      return;
    }
#endif // _CCCL_CTK_AT_LEAST(12, 4)

    // If the scope requires no handle
    compute_subplaces_no_handle(place, scope);
  }

  /** @brief Compute the subplaces of a place at the specified granularity (scope) into the sub_places vector */
  void compute_subplaces_no_handle(const exec_place& place, place_partition_scope scope)
  {
#if _CCCL_CTK_BELOW(12, 4)
    _CCCL_ASSERT(scope != place_partition_scope::green_context, "Green contexts scope need an async resource handle.");
#endif // _CCCL_CTK_BELOW(12, 4)
    _CCCL_ASSERT(scope != place_partition_scope::cuda_stream, "CUDA stream scope needs an async resource handle.");
    if (scope == place_partition_scope::cuda_device)
    {
      // Each element of the place (a single element for a scalar place)
      // becomes one subplace.
      for (size_t i = 0; i < place.size(); ++i)
      {
        sub_places.push_back(place.get_place(i));
      }
      return;
    }
    assert(!"Internal error: unreachable code.");
  }

  /** A vector with all subplaces (computed once in compute_subplaces) */
  ::std::vector<exec_place> sub_places;
};
// Deferred implementation because we need place_partition
template <typename... Args>
auto exec_place::partition_by_scope(Args&&... args)
{
  // Build the partition of this place, then collapse the subplaces back into
  // a single exec_place via to_exec_place().
  place_partition partition(*this, ::std::forward<Args>(args)...);
  return partition.to_exec_place();
}
} // end namespace cuda::experimental::stf