-
Notifications
You must be signed in to change notification settings - Fork 113
Expand file tree
/
Copy pathquda_api.h
More file actions
294 lines (240 loc) · 12.4 KB
/
quda_api.h
File metadata and controls
294 lines (240 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#pragma once
#include <quda_define.h>
#include <string>
#include <enum_quda.h>
#include <quda_ptr.h>
/* We have to override some CUDA-isms here, even in the public interface;
otherwise we can't compile the tests.
*/
#ifdef QUDA_TARGET_OMPTARGET
#include "targets/omptarget/quda_api.h"
#endif
/**
@file quda_api.h
Wrappers around CUDA API function calls allowing us to easily
profile and switch between using the CUDA runtime and driver APIs.
*/
/**
@brief Kind of memory copy requested from the qudaMemcpy* wrappers
(passed as their `kind` argument). The enumerator names spell out the
source and the destination of the transfer.
NOTE(review): presumably mirrors CUDA's cudaMemcpyKind - confirm the
mapping in the target-specific implementation.
*/
enum qudaMemcpyKind {
qudaMemcpyHostToHost, // host source, host destination
qudaMemcpyHostToDevice, // host source, device destination
qudaMemcpyDeviceToHost, // device source, host destination
qudaMemcpyDeviceToDevice, // device source, device destination
qudaMemcpyDefault // NOTE(review): presumably the direction is inferred from the pointers - confirm
};
namespace quda
{
// Forward declaration of TuneParam (defined elsewhere in QUDA).
// NOTE(review): nothing visible in this header uses it - confirm whether it is still needed here.
struct TuneParam;
/**
@brief Handle identifying a stream; this is the type taken by the
stream arguments of the asynchronous wrappers declared in this header.
*/
struct qudaStream_t {
int idx; // integer identifier of the stream (presumably an index into a target-side stream table - confirm)
};
/**
@brief Handle for an event, as returned by qudaEventCreate_ and
qudaChronoEventCreate_ and consumed by the other qudaEvent* wrappers.
*/
struct qudaEvent_t {
void *event; // type-erased pointer (presumably to the target runtime's native event object - confirm)
};
/**
@brief Wrapper around cudaMemcpy or driver API equivalent.
Normally invoked through the qudaMemcpy macro, which supplies the
call-site arguments (func, file, line) automatically.
@param[out] dst Destination pointer
@param[in] src Source pointer
@param[in] count Size of transfer
@param[in] kind Type of memory copy
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file,
const char *line);
/**
@brief Overload of qudaMemcpy_ that takes quda_ptr arguments instead
of raw pointers. Normally invoked through the qudaMemcpy macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[out] dst Destination pointer (the quda_ptr object itself is
passed by const reference and so is not modified)
@param[in] src Source pointer
@param[in] count Size of transfer
@param[in] kind Type of memory copy
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func,
const char *file, const char *line);
/**
@brief Wrapper around cudaMemcpyAsync or driver API equivalent.
Normally invoked through the qudaMemcpyAsync macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[out] dst Destination pointer
@param[in] src Source pointer
@param[in] count Size of transfer
@param[in] kind Type of memory copy
@param[in] stream Stream to issue copy
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream,
const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaMemcpyAsync or driver API equivalent for peer-to-peer copies.
Unlike qudaMemcpyAsync_, this takes no copy-kind argument. Normally
invoked through the qudaMemcpyP2PAsync macro, which supplies the
call-site arguments (func, file, line) automatically.
@param[out] dst Destination pointer
@param[in] src Source pointer
@param[in] count Size of transfer
@param[in] stream Stream to issue copy
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemcpyP2PAsync_(void *dst, const void *src, size_t count, const qudaStream_t &stream, const char *func,
const char *file, const char *line);
/**
@brief Heterogeneous memset function (quda_ptr overload of qudaMemset_).
Normally invoked through the qudaMemset macro, which supplies the
call-site arguments (func, file, line) automatically.
@param[out] ptr Heterogeneous pointer
@param[in] value Value to set for each byte of specified memory
@param[in] count Size in bytes to set
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaMemset or driver API equivalent.
Normally invoked through the qudaMemset macro, which supplies the
call-site arguments (func, file, line) automatically.
@param[out] ptr Starting address pointer
@param[in] value Value to set for each byte of specified memory
@param[in] count Size in bytes to set
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaMemsetAsync or driver API equivalent.
Normally invoked through the qudaMemsetAsync macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[out] ptr Starting address pointer
@param[in] value Value to set for each byte of specified memory
@param[in] count Size in bytes to set
@param[in] stream Stream to issue memset
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
const char *file, const char *line);
/**
@brief Overload of qudaMemsetAsync_ that takes a quda_ptr instead of
a raw pointer. Normally invoked through the qudaMemsetAsync macro,
which supplies the call-site arguments (func, file, line) automatically.
@param[out] ptr Starting address pointer
@param[in] value Value to set for each byte of specified memory
@param[in] count Size in bytes to set
@param[in] stream Stream to issue memset
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func,
const char *file, const char *line);
/**
@brief Asynchronous heterogeneous memset2d function.
Normally invoked through the qudaMemset2DAsync macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[out] ptr Starting address pointer
@param[in] offset Initial offset from pointer
@param[in] pitch Pitch in bytes
@param[in] value Value to set for each byte of specified memory
@param[in] width Width in bytes
@param[in] height Height in bytes
(NOTE(review): CUDA's cudaMemset2DAsync counts height in rows, not
bytes - confirm which unit the implementation expects)
@param[in] stream Stream to issue memset
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height,
const qudaStream_t &stream, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaMemPrefetchAsync or driver API equivalent.
Normally invoked through the qudaMemPrefetchAsync macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[out] ptr Starting address pointer to be prefetched
@param[in] count Size in bytes to prefetch
@param[in] mem_space Memory space to prefetch to
@param[in] stream Stream to issue prefetch
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream,
const char *func, const char *file, const char *line);
/**
@brief Return instance of an event.
Normally invoked through the qudaEventCreate macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
@return The event instance, by value
*/
qudaEvent_t qudaEventCreate_(const char *func, const char *file, const char *line);
/**
@brief Return instance of an event that can be used for timing.
Normally invoked through the qudaChronoEventCreate macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
@return The event instance, by value
*/
qudaEvent_t qudaChronoEventCreate_(const char *func, const char *file, const char *line);
/**
@brief Return elapsed time in seconds between two events.
Normally invoked through the qudaEventElapsedTime macro, which
supplies the call-site arguments (func, file, line) automatically.
NOTE(review): CUDA's cudaEventElapsedTime reports milliseconds, so
the implementation presumably converts to seconds - confirm.
@param[in] start Event marking the start of the interval
@param[in] stop Event marking the end of the interval
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
@return Elapsed time between start and stop
*/
float qudaEventElapsedTime_(const qudaEvent_t &start, const qudaEvent_t &stop, const char *func, const char *file,
const char *line);
/**
@brief Destroy the event.
Normally invoked through the qudaEventDestroy macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[in,out] event Event to destroy (taken by non-const reference)
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaEventDestroy_(qudaEvent_t &event, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaEventQuery or cuEventQuery with built-in error checking.
Normally invoked through the qudaEventQuery macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[in] event Event we are querying
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
@return true if event has been reached
*/
bool qudaEventQuery_(qudaEvent_t &event, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaEventRecord or cuEventRecord with
built-in error checking.
Normally invoked through the qudaEventRecord macro, which supplies
the call-site arguments (func, file, line) automatically.
@param[in,out] event Event we are recording
@param[in,out] stream Stream where to record the event (the handle
is passed by value, so the caller's qudaStream_t object is not modified)
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaEventRecord_(qudaEvent_t &event, qudaStream_t stream, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaStreamWaitEvent or cuStreamWaitEvent
with built-in error checking.
Normally invoked through the qudaStreamWaitEvent macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in,out] stream Stream which we are instructing to wait
(the handle is passed by value)
@param[in] event Event we are waiting on (the handle is passed by value)
@param[in] flags Flags to pass to function
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaStreamWaitEvent_(qudaStream_t stream, qudaEvent_t event, unsigned int flags, const char *func,
const char *file, const char *line);
/**
@brief Wrapper around cudaEventSynchronize or cuEventSynchronize
with built-in error checking.
Normally invoked through the qudaEventSynchronize macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in] event Event which we are synchronizing with respect to
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaEventSynchronize_(const qudaEvent_t &event, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaStreamSynchronize or
cuStreamSynchronize with built-in error checking.
Normally invoked through the qudaStreamSynchronize macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in] stream Stream which we are synchronizing
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaStreamSynchronize_(const qudaStream_t &stream, const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaDeviceSynchronize or
cuDeviceSynchronize with built-in error checking.
Normally invoked through the qudaDeviceSynchronize macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
*/
void qudaDeviceSynchronize_(const char *func, const char *file, const char *line);
/**
@brief Wrapper around cudaGetSymbolAddress with built in error
checking. Returns the address of symbol on the device; symbol
is a variable that resides in global memory space.
Normally invoked through the qudaGetSymbolAddress macro, which
supplies the call-site arguments (func, file, line) automatically.
@param[in] symbol Global variable or string symbol to search for
@param[in] func Calling function name (the macro passes __func__)
@param[in] file Calling file name (the macro passes quda::file_name(__FILE__))
@param[in] line Calling line number as a string (the macro passes __STRINGIFY__(__LINE__))
@return Return device pointer associated with symbol
*/
void *qudaGetSymbolAddress_(const char *symbol, const char *func, const char *file, const char *line);
/**
@brief Get the last error recorded by the target runtime. By
calling this, we reset the last error.
Called directly: unlike the wrappers with a trailing underscore,
there is no macro and no call-site arguments.
@return The last recorded error (qudaError_t is declared outside this header)
*/
qudaError_t qudaGetLastError();
/**
@brief Get the error string associated with the last error that
was thrown by the target runtime.
Called directly: there is no macro and no call-site arguments.
@return The error string, returned by value as a std::string
*/
std::string qudaGetLastErrorString();
/**
@brief Print out the timer profile for CUDA API calls.
Takes no arguments and returns nothing.
NOTE(review): the output destination is not visible from this header - see the implementation.
*/
void printAPIProfile();
} // namespace quda
/*
  Two-level stringification. STRINGIFY__ applies the preprocessor's #
  operator directly, so its argument is NOT macro-expanded first.
  __STRINGIFY__ adds one level of indirection so that the argument is
  expanded before being turned into a string literal; this is what makes
  __STRINGIFY__(__LINE__) produce e.g. "42" rather than "__LINE__".
  NOTE: identifiers containing a double underscore are reserved to the
  implementation in C++; the names are kept unchanged because callers rely on them.
*/
#define STRINGIFY__(token) #token
#define __STRINGIFY__(token) STRINGIFY__(token)
/*
  Public entry points for the API wrappers in namespace quda. Each macro
  forwards its arguments to the wrapper of the same name with a trailing
  underscore and appends the call site: __func__, the file name produced by
  ::quda::file_name(__FILE__), and the line number as a string literal
  (__STRINGIFY__(__LINE__)).

  Every name is qualified from the global namespace (::quda::...), including
  file_name, so that the expansion resolves to the same entities no matter
  which scope the macro is used in.

  NOTE(review): file_name is not declared in this header itself; it is
  presumably provided by another QUDA header that must be visible wherever
  these macros are expanded - confirm.
*/
#define qudaMemcpy(dst, src, count, kind) \
  ::quda::qudaMemcpy_(dst, src, count, kind, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaMemcpyAsync(dst, src, count, kind, stream) \
  ::quda::qudaMemcpyAsync_(dst, src, count, kind, stream, __func__, ::quda::file_name(__FILE__), \
                           __STRINGIFY__(__LINE__))

#define qudaMemcpyP2PAsync(dst, src, count, stream) \
  ::quda::qudaMemcpyP2PAsync_(dst, src, count, stream, __func__, ::quda::file_name(__FILE__), \
                              __STRINGIFY__(__LINE__))

#define qudaMemset(ptr, value, count) \
  ::quda::qudaMemset_(ptr, value, count, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaMemsetAsync(ptr, value, count, stream) \
  ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, ::quda::file_name(__FILE__), \
                           __STRINGIFY__(__LINE__))

#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream) \
  ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, \
                             ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaMemPrefetchAsync(ptr, count, mem_space, stream) \
  ::quda::qudaMemPrefetchAsync_(ptr, count, mem_space, stream, __func__, ::quda::file_name(__FILE__), \
                                __STRINGIFY__(__LINE__))

#define qudaEventCreate() \
  ::quda::qudaEventCreate_(__func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaChronoEventCreate() \
  ::quda::qudaChronoEventCreate_(__func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaEventElapsedTime(start, stop) \
  ::quda::qudaEventElapsedTime_(start, stop, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaEventDestroy(event) \
  ::quda::qudaEventDestroy_(event, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaEventQuery(event) \
  ::quda::qudaEventQuery_(event, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaEventRecord(event, stream) \
  ::quda::qudaEventRecord_(event, stream, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaStreamWaitEvent(stream, event, flags) \
  ::quda::qudaStreamWaitEvent_(stream, event, flags, __func__, ::quda::file_name(__FILE__), \
                               __STRINGIFY__(__LINE__))

#define qudaEventSynchronize(event) \
  ::quda::qudaEventSynchronize_(event, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaStreamSynchronize(stream) \
  ::quda::qudaStreamSynchronize_(stream, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaDeviceSynchronize() \
  ::quda::qudaDeviceSynchronize_(__func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))

#define qudaGetSymbolAddress(symbol) \
  ::quda::qudaGetSymbolAddress_(symbol, __func__, ::quda::file_name(__FILE__), __STRINGIFY__(__LINE__))