-
Notifications
You must be signed in to change notification settings - Fork 113
Expand file tree
/
Copy pathcomm_quda.h
More file actions
444 lines (379 loc) · 16.7 KB
/
comm_quda.h
File metadata and controls
444 lines (379 loc) · 16.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#pragma once
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <quda_constants.h>
#include <quda_api.h>
#include <array.h>
#ifdef __cplusplus
extern "C" {
#endif
/* defined in quda.h; redefining here to avoid circular references */
typedef int (*QudaCommsMap)(const int *coords, void *fdata);
#ifdef __cplusplus
}
#endif
/** Maximum length in bytes of the host string */
#define QUDA_MAX_HOSTNAME_STRING 128
namespace quda
{
/** Opaque handle for a persistent point-to-point message (defined by the comms backend) */
typedef struct MsgHandle_s MsgHandle;
/** Opaque handle describing the process-grid topology (defined by the comms backend) */
typedef struct Topology_s Topology;
/**
   @return Hostname string of this process
*/
char *comm_hostname(void);
/**
   @return A random double from the per-process generator (range/seeding defined in the implementation)
*/
double comm_drand(void);
/**
   @brief Create a topology describing an ndim-dimensional grid of processes
   @param ndim Number of grid dimensions
   @param dims Extent of the process grid in each dimension
   @param rank_from_coords Callback that maps grid coordinates to a rank
   @param map_data Opaque user data passed through to the callback
   @return Newly created topology (destroy with comm_destroy_topology)
*/
Topology *comm_create_topology(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data);
/** @brief Destroy a topology previously created with comm_create_topology */
void comm_destroy_topology(Topology *topo);
/** @return Number of dimensions in the topology */
int comm_ndim(const Topology *topo);
/** @return Array of process-grid extents, one per dimension */
const int *comm_dims(const Topology *topo);
/** @return Grid coordinates of this process within the topology */
const int *comm_coords(const Topology *topo);
/** @return Grid coordinates of the given rank within the topology */
const int *comm_coords_from_rank(const Topology *topo, int rank);
/** @return Rank of the process at the given grid coordinates */
int comm_rank_from_coords(const Topology *topo, const int *coords);
/** @return Rank of the process displaced from this one by the given per-dimension offsets */
int comm_rank_displaced(const Topology *topo, const int displacement[]);
/** @brief Set the default topology used when none is given explicitly */
void comm_set_default_topology(Topology *topo);
/** @return The current default topology */
Topology *comm_default_topology(void);
// routines related to direct peer-2-peer access
/** @brief Cache the neighbor ranks of this process (uses the default topology when topo is NULL) */
void comm_set_neighbor_ranks(Topology *topo = NULL);
/** @return Rank of the neighbor in direction dir (0 - backwards, 1 - forwards) and dimension dim */
int comm_neighbor_rank(int dir, int dim);
/**
   Return the number of processes in the dimension dim
   @param dim Dimension which we are querying
   @return Length of process dimensions
*/
int comm_dim(int dim);
/**
   Return the coordinate of this process in the dimension dim
   @param dim Dimension which we are querying
   @return Coordinate of this process
*/
int comm_coord(int dim);
/**
 * Declare a message handle for sending `nbytes` to the `rank` with `tag`.
 */
MsgHandle *comm_declare_send_rank(void *buffer, int rank, int tag, size_t nbytes);
/**
 * Declare a message handle for receiving `nbytes` from the `rank` with `tag`.
 */
MsgHandle *comm_declare_recv_rank(void *buffer, int rank, int tag, size_t nbytes);
/**
   Create a persistent message handler for a relative send.  This
   should not be called directly; instead the helper macro
   (without the trailing underscore) should be called.
   @param buffer Buffer from which the message will be sent
   @param dim Dimension in which the message will be sent
   @param dir Direction in which the message will be sent (0 - backwards, 1 - forwards)
   @param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line, void *buffer, int dim, int dir,
size_t nbytes);
#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)
/**
   Create a persistent message handler for a relative receive.  This
   should not be called directly; instead the helper macro
   (without the trailing underscore) should be called.
   @param buffer Buffer into which the message will be received
   @param dim Dimension from which the message will be received
   @param dir Direction from which the message will be received (0 - backwards, 1 - forwards)
   @param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim,
int dir, size_t nbytes);
#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)
/**
   Create a persistent strided message handler for a relative send.
   This should not be called directly; instead the helper macro
   (without the trailing underscore) should be called.
   @param buffer Buffer from which the message will be sent
   @param dim Dimension in which the message will be sent
   @param dir Direction in which the message will be sent (0 - backwards, 1 - forwards)
   @param blksize Size of block in bytes
   @param nblocks Number of blocks
   @param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line, void *buffer, int dim,
int dir, size_t blksize, int nblocks, size_t stride);
#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)
/**
   Create a persistent strided message handler for a relative receive.
   This should not be called directly; instead the helper macro
   (without the trailing underscore) should be called.
   @param buffer Buffer into which the message will be received
   @param dim Dimension from which the message will be received
   @param dir Direction from which the message will be received (0 - backwards, 1 - forwards)
   @param blksize Size of block in bytes
   @param nblocks Number of blocks
   @param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line, void *buffer, int dim,
int dir, size_t blksize, int nblocks, size_t stride);
#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)
/** @brief Shut down the communications layer */
void comm_finalize(void);
/** @brief Mark the dimension dim as partitioned across processes */
void comm_dim_partitioned_set(int dim);
/** @return Whether the dimension dim is partitioned across processes */
int comm_dim_partitioned(int dim);
/**
   @brief Loop over comm_dim_partitioned(dim) for all comms dimensions
   @return Whether any communications dimensions are partitioned
*/
int comm_partitioned();
/**
   @brief Create the topology and partition strings that are used in tuneKeys
*/
void comm_set_tunekey_string();
/**
   @brief Return a string that defines the comm partitioning (used as a tuneKey)
   @param comm_dim_override Optional override for partitioning
   @return String specifying comm partitioning
*/
const char *comm_dim_partitioned_string(const int *comm_dim_override = 0);
/**
   @brief Return a string that defines the comm topology (for use as a tuneKey)
   @return String specifying comm topology
*/
const char *comm_dim_topology_string();
/**
   @brief Return a string that defines the P2P/GDR environment
   variable configuration (for use as a tuneKey to enable unique
   policies).
   @return String specifying comm config
*/
const char *comm_config_string();
/**
   @brief Initialize the communications, implemented in comm_single.cpp, comm_qmp.cpp, and comm_mpi.cpp
*/
void comm_init(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data,
bool user_set_comm_handle = false, void *user_comm = nullptr);
/**
   @brief Initialize the communications common to all communications abstractions
*/
void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data);
/**
   @return Rank id of this process
*/
int comm_rank(void);
/**
   @return The default rank id of this process.
   This doesn't go through the communicator route, so it can be called without initializing the communicator stack.
*/
int comm_rank_global(void);
/**
   @return Number of processes
*/
size_t comm_size(void);
/**
   @return GPU id associated with this process
*/
int comm_gpuid(void);
/**
   @return Whether we are doing deterministic multi-process reductions or not
*/
bool comm_deterministic_reduce();
/**
   @brief Gather all hostnames
   @param[out] hostname_recv_buf char array of length
   QUDA_MAX_HOSTNAME_STRING*comm_size() that will be filled with the
   hostname of every process.  Each hostname is in rank order, with
   QUDA_MAX_HOSTNAME_STRING bytes for each.
*/
void comm_gather_hostname(char *hostname_recv_buf);
/**
   @brief Gather all GPU ids
   @param[out] gpuid_recv_buf int array of length comm_size() that
   will be filled with the GPU ids of all processes (in rank order).
*/
void comm_gather_gpuid(int *gpuid_recv_buf);
/**
   @brief Enable peer-to-peer communication.
   @param hostname_recv_buf Array that holds all process hostnames
*/
void comm_peer2peer_init(const char *hostname_recv_buf);
/**
   @brief Query if peer-to-peer communication is possible between two GPUs
   @param[in] local_gpuid GPU associated with this process
   @param[in] neighbor_gpuid GPU associated with neighboring process
   (assumed on same node)
   @return True/false if peer-to-peer is possible
*/
bool comm_peer2peer_possible(int local_gpuid, int neighbor_gpuid);
/**
   @brief Query the performance of peer-to-peer communication between two GPUs
   @param[in] local_gpuid GPU associated with this process
   @param[in] neighbor_gpuid GPU associated with neighboring process
   (assumed on same node)
   @return Relative performance ranking between this pair of GPUs
*/
int comm_peer2peer_performance(int local_gpuid, int neighbor_gpuid);
/**
   @brief Symmetric exchange of local memory addresses between
   logically neighboring processes on the lattice.  The remote
   addresses that are returned are directly addressable by the local
   process and can be read or written to by a kernel, or can be
   copied to and from.  This exchange is only defined between
   devices that are peer-to-peer enabled.
   @param[out] remote Array of remote memory pointers to neighboring
   pointers
   @param[in] local The process-local memory pointer to be exchanged
   from this process
*/
void comm_create_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote, void *local);
/**
   @brief Deallocate the remote addresses to logically neighboring
   processes on the lattice.
   @param[in] remote Array of remote memory pointers to neighboring
   pointers
*/
void comm_destroy_neighbor_memory(array_2d<void *, QUDA_MAX_DIM, 2> &remote);
/**
   @brief Create unique events shared between each logical pair of
   neighboring processes, e.g., the event in the forwards direction
   in a given dimension on a given process aliases the event in the
   backward direction in the same dimension, and is unique
   between that process pair.  This exchange is only defined between
   devices that are peer-to-peer enabled.
   @param[out] remote Array of remote events to neighboring processes
   @param[in] local Array of local events to neighboring processes
*/
void comm_create_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &remote,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local);
/**
   @brief Destroy the coupled events
   @param[out] remote Array of remote events to neighboring processes
   @param[in] local Array of local events to neighboring processes
*/
void comm_destroy_neighbor_event(array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &remote,
array_2d<qudaEvent_t, QUDA_MAX_DIM, 2> &local);
/**
   @brief Returns true if any peer-to-peer capability is present on
   this system (regardless of whether it has been disabled or not).
   We use this, for example, to determine if we need to allocate
   pinned device memory or not.
*/
bool comm_peer2peer_present();
/**
   Query what peer-to-peer communication is enabled globally
   @return 2-bit number reporting 1 for copy engine, 2 for remote writes
*/
int comm_peer2peer_enabled_global();
/**
   Query if peer-to-peer communication is enabled
   @param dir Direction (0 - backwards, 1 - forwards)
   @param dim Dimension (0-3)
   @return Whether peer-to-peer is enabled
*/
bool comm_peer2peer_enabled(int dir, int dim);
/**
   @brief Enable / disable peer-to-peer communication: used for dslash
   policies that do not presently support peer-to-peer communication
   @param[in] enable Boolean flag to enable / disable peer-to-peer communication
*/
void comm_enable_peer2peer(bool enable);
/**
   Query if intra-node (non-peer-to-peer) communication is enabled
   in a given dimension and direction
   @param dir Direction (0 - backwards, 1 - forwards)
   @param dim Dimension (0-3)
   @return Whether intra-node communication is enabled
*/
bool comm_intranode_enabled(int dir, int dim);
/**
   @brief Enable / disable intra-node (non-peer-to-peer)
   communication
   @param[in] enable Boolean flag to enable / disable intra-node
   (non peer-to-peer) communication
*/
void comm_enable_intranode(bool enable);
/**
   @brief Query if GPU Direct RDMA communication is enabled (global setting)
*/
bool comm_gdr_enabled();
/**
   @brief Return if zero-copy policy kernels have been enabled.  By
   default kernels that read their communication halos directly from
   host memory are disabled to reduce tuning time, since on
   PCIe-based architectures, these kernels underperform and can take
   excessive tuning time.  They can be enabled with the environment
   variable QUDA_ENABLE_ZERO_COPY=1
   @return Return if zero-copy policy halos are enabled
*/
bool comm_zero_copy_enabled();
/**
   @brief Query if NVSHMEM communication is enabled (global setting)
*/
bool comm_nvshmem_enabled();
/**
   @brief Query if GPU Direct RDMA communication is blacklisted for this GPU
*/
bool comm_gdr_blacklist();
/**
   Create a persistent message handler for a displaced send
   @param buffer Buffer from which the message will be sent
   @param displacement Array of offsets specifying the relative node to which we are sending
   @param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_displaced(void *buffer, const int displacement[], size_t nbytes);
/**
   Create a persistent message handler for a displaced receive
   @param buffer Buffer into which the message will be received
   @param displacement Array of offsets specifying the relative node from which we are receiving
   @param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_displaced(void *buffer, const int displacement[], size_t nbytes);
/**
   Create a persistent strided message handler for a displaced send
   @param buffer Buffer from which message will be sent
   @param displacement Array of offsets specifying the relative node to which we are sending
   @param blksize Size of block in bytes
   @param nblocks Number of blocks
   @param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks,
size_t stride);
/**
   Create a persistent strided message handler for a displaced receive
   @param buffer Buffer into which message will be received
   @param displacement Array of offsets specifying the relative node from which we are receiving
   @param blksize Size of block in bytes
   @param nblocks Number of blocks
   @param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_displaced(void *buffer, const int displacement[], size_t blksize, int nblocks,
size_t stride);
/** @brief Release a persistent message handle (the reference is reset by the backend) */
void comm_free(MsgHandle *&mh);
/** @brief Initiate the communication described by the message handle */
void comm_start(MsgHandle *mh);
/** @brief Block until the communication described by the message handle has completed */
void comm_wait(MsgHandle *mh);
/** @brief Non-blocking test for completion of the communication (return convention defined by the backend — see comm_*.cpp) */
int comm_query(MsgHandle *mh);
/** @brief In-place global sum-reduction of v across all processes */
template <typename T> void comm_allreduce_sum(T &v);
/** @brief In-place global max-reduction of v across all processes */
template <typename T> void comm_allreduce_max(T &v);
/** @brief In-place global min-reduction of v across all processes */
template <typename T> void comm_allreduce_min(T &v);
/** @brief In-place global reduction of an int across all processes (reduction op defined by the backend) */
void comm_allreduce_int(int &data);
/** @brief In-place global bitwise-XOR reduction of data across all processes */
void comm_allreduce_xor(uint64_t &data);
/**
   @brief Broadcast from the root rank
   @param[in,out] data The data to be read from on the root rank, and
   written to on all other ranks
   @param[in] nbytes The size in bytes of data to be broadcast
   @param[in] root The process that will be broadcasting
*/
void comm_broadcast(void *data, size_t nbytes, int root = 0);
/** @brief Synchronization barrier across all processes */
void comm_barrier(void);
/** @brief Abort the job with the given exit status */
void comm_abort(int status);
/** @brief Backend-specific abort implementation (called by comm_abort — see comm_*.cpp) */
void comm_abort_(int status);
// legacy camelCase query wrappers (see implementation for their mapping onto the comm_* routines above)
int commDim(int);
int commCoords(int);
int commDimPartitioned(int dir);
void commDimPartitionedSet(int dir);
/**
 * @brief Reset the comm dim partioned array to zero,
 * @details This should only be needed for automated testing
 * when different partitioning is applied within a single run.
 */
void commDimPartitionedReset();
/** @return Whether global reductions are currently enabled */
bool commGlobalReduction();
/** @brief Push a new global-reduction state onto the stack */
void commGlobalReductionPush(bool global_reduce);
/** @brief Pop the most recently pushed global-reduction state */
void commGlobalReductionPop();
/** @return Whether asynchronous reductions are enabled */
bool commAsyncReduction();
/** @brief Enable / disable asynchronous reductions */
void commAsyncReductionSet(bool global_reduce);
} // namespace quda