Skip to content

Commit 5f6a6b3

Browse files
committed
gpu - allow running shared kernels on stream
1 parent ff9bb46 commit 5f6a6b3

File tree

6 files changed

+39
-38
lines changed

6 files changed

+39
-38
lines changed

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
9898
//------------------------------------------------------------------------------
9999
// Apply and add to output
100100
//------------------------------------------------------------------------------
101-
static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
101+
static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
102102
CeedRequest *request) {
103103
bool is_at_points, is_tensor;
104104
Ceed ceed;
@@ -219,7 +219,7 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, const CeedScalar *
219219
}
220220
CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);
221221

222-
CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
222+
CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, stream, grid, block[0], block[1], block[2], shared_mem, is_run_good, opargs));
223223

224224
// Restore input arrays
225225
for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -278,7 +278,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
278278
// Try to run kernel
279279
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
280280
if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
281-
CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
281+
CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
282282
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
283283
if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
284284

backends/cuda/ceed-cuda-compile.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,13 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con
222222
//------------------------------------------------------------------------------
223223
// Run CUDA kernel for spatial dimension with shared memory
224224
//------------------------------------------------------------------------------
225-
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
226-
const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
227-
void **args) {
225+
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
226+
const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
227+
bool *is_good_run, void **args) {
228228
#if CUDA_VERSION >= 9000
229229
cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
230230
#endif
231-
CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
231+
CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
232232

233233
if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
234234
*is_good_run = false;
@@ -246,19 +246,19 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const i
246246
return CEED_ERROR_SUCCESS;
247247
}
248248

249-
int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
249+
int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
250250
const int block_size_z, const int shared_mem_size, void **args) {
251251
bool is_good_run = true;
252252

253-
CeedCallBackend(
254-
CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
253+
CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
254+
&is_good_run, args));
255255
return CEED_ERROR_SUCCESS;
256256
}
257257

258-
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
258+
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
259259
const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
260-
CeedCallBackend(
261-
CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
260+
CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
261+
is_good_run, args));
262262
return CEED_ERROR_SUCCESS;
263263
}
264264

backends/cuda/ceed-cuda-compile.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ CEED_INTERN int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t
2323

2424
CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, void **args);
2525

26-
CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
27-
int shared_mem_size, void **args);
28-
CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
29-
int shared_mem_size, bool *is_good_run, void **args);
26+
CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
27+
int block_size_z, int shared_mem_size, void **args);
28+
CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, int grid_size, int block_size_x, int block_size_y,
29+
int block_size_z, int shared_mem_size, bool *is_good_run, void **args);

backends/hip-gen/ceed-hip-gen-operator.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) {
3434
//------------------------------------------------------------------------------
3535
// Apply and add to output
3636
//------------------------------------------------------------------------------
37-
static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, const CeedScalar *input_arr, CeedScalar *output_arr, bool *is_run_good,
38-
CeedRequest *request) {
37+
static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream, const CeedScalar *input_arr, CeedScalar *output_arr,
38+
bool *is_run_good, CeedRequest *request) {
3939
bool is_at_points, is_tensor;
4040
Ceed ceed;
4141
CeedInt num_elem, num_input_fields, num_output_fields;
@@ -153,19 +153,19 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, const CeedScalar *i
153153
CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
154154

155155
CeedCallBackend(
156-
CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
156+
CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
157157
} else if (dim == 2) {
158158
CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
159159
CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
160160

161161
CeedCallBackend(
162-
CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
162+
CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
163163
} else if (dim == 3) {
164164
CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
165165
CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
166166

167167
CeedCallBackend(
168-
CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
168+
CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
169169
}
170170

171171
// Restore input arrays
@@ -225,7 +225,7 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C
225225
// Try to run kernel
226226
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(input_vec, CEED_MEM_DEVICE, &input_arr));
227227
if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArray(output_vec, CEED_MEM_DEVICE, &output_arr));
228-
CeedCallBackend(CeedOperatorApplyAddCore_Cuda_gen(op, input_arr, output_arr, &is_run_good, request));
228+
CeedCallBackend(CeedOperatorApplyAddCore_Hip_gen(op, NULL, input_arr, output_arr, &is_run_good, request));
229229
if (input_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArrayRead(input_vec, &input_arr));
230230
if (output_vec != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorRestoreArray(output_vec, &output_arr));
231231

backends/hip/ceed-hip-compile.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -205,28 +205,29 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c
205205
//------------------------------------------------------------------------------
206206
// Run HIP kernel for spatial dimension with shared memory
207207
//------------------------------------------------------------------------------
208-
static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
209-
const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, void **args) {
210-
hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
208+
static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
209+
const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
210+
bool *is_good_run, void **args) {
211+
hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
211212

212213
*is_good_run = result == hipSuccess;
213214
if (throw_error) CeedCallHip(ceed, result);
214215
return CEED_ERROR_SUCCESS;
215216
}
216217

217-
int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
218-
const int block_size_z, const int shared_mem_size, void **args) {
218+
int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
219+
const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) {
219220
bool is_good_run = true;
220221

221-
CeedCallBackend(
222-
CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
222+
CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
223+
&is_good_run, args));
223224
return CEED_ERROR_SUCCESS;
224225
}
225226

226-
int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y,
227-
const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
228-
CeedCallBackend(
229-
CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
227+
int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, const int grid_size, const int block_size_x,
228+
const int block_size_y, const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
229+
CeedCallBackend(CeedRunKernelDimSharedCore_Hip(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
230+
is_good_run, args));
230231
return CEED_ERROR_SUCCESS;
231232
}
232233

backends/hip/ceed-hip-compile.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ CEED_INTERN int CeedRunKernel_Hip(Ceed ceed, hipFunction_t kernel, int grid_size
2222
CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
2323
void **args);
2424

25-
CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
26-
int shared_mem_size, void **args);
27-
CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
28-
int shared_mem_size, bool *is_good_run, void **args);
25+
CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
26+
int block_size_z, int shared_mem_size, void **args);
27+
CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, hipStream_t stream, int grid_size, int block_size_x, int block_size_y,
28+
int block_size_z, int shared_mem_size, bool *is_good_run, void **args);

0 commit comments

Comments
 (0)