81 changes: 75 additions & 6 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -916,7 +916,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
bool is_tensor = true, is_at_points = false, use_3d_slices = false;
Ceed ceed;
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
@@ -927,18 +927,77 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
CeedOperator_Cuda_gen *data;
std::ostringstream code;

CeedCallBackend(CeedOperatorGetData(op, &data));
{
bool is_setup_done;

CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
if (is_setup_done) return CEED_ERROR_SUCCESS;
if (is_setup_done) {
*is_good_build = !data->use_fallback;
return CEED_ERROR_SUCCESS;
}
}

// Check field compatibility
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
{
bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;

for (CeedInt i = 0; i < num_input_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;
CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}

for (CeedInt i = 0; i < num_output_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;

CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}
// -- Fallback to ref if not all bases are shared
if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
*is_good_build = false;
return CEED_ERROR_SUCCESS;
}
}
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedCallBackend(CeedOperatorGetData(op, &data));
CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

// Get operator data
@@ -1207,8 +1266,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << "// -----------------------------------------------------------------------------\n\n";

// Compile
CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
{
bool is_compile_good = false;

CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
if (is_compile_good) {
*is_good_build = true;
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
} else {
*is_good_build = false;
data->use_fallback = true;
}
}
CeedCallBackend(CeedOperatorSetSetupDone(op));
CeedCallBackend(CeedDestroy(&ceed));
CeedCallBackend(CeedQFunctionDestroy(&qf));
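
Net effect of this file's changes: the kernel builder now front-loads a basis-compatibility check and reports the outcome through `is_good_build` instead of erroring. A condensed sketch of the predicate it enforces (a reading aid using the hunk's own variable names, not code from the PR):

    // Every active basis must come from /gpu/cuda/shared, and the bases must
    // be uniformly tensor or uniformly non-tensor; anything else is reported
    // as a bad build so the caller can dispatch to /gpu/cuda/ref.
    bool is_supported = has_shared_bases && (is_all_tensor || is_all_nontensor);

    if (!is_supported) {
      *is_good_build = false;
      return CEED_ERROR_SUCCESS;  // not an error -- a fallback is expected
    }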
2 changes: 1 addition & 1 deletion backends/cuda-gen/ceed-cuda-gen-operator-build.h
@@ -6,4 +6,4 @@
// This file is part of CEED: http://github.com/ceed
#pragma once

CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op);
CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build);
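
One contract detail worth noting: a failed build still returns CEED_ERROR_SUCCESS, so callers must test the out-parameter rather than the return code. A minimal sketch of the intended call pattern (mirroring the ApplyAdd change below):

    bool is_good_build = false;

    CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build));
    if (!is_good_build) {
      // JIT failed or the bases are unsupported; apply through the
      // /gpu/cuda/ref fallback operator instead
    }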
78 changes: 21 additions & 57 deletions backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
// Apply and add to output
//------------------------------------------------------------------------------
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
bool is_at_points, is_tensor;
bool is_at_points, is_tensor, is_run_good = true;
Ceed ceed;
Ceed_Cuda *cuda_data;
CeedInt num_elem, num_input_fields, num_output_fields;
@@ -111,62 +111,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
CeedOperatorField *op_input_fields, *op_output_fields;
CeedOperator_Cuda_gen *data;

// Check for shared bases
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
// Creation of the operator
{
bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;

for (CeedInt i = 0; i < num_input_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;
CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}
bool is_good_build = false;

for (CeedInt i = 0; i < num_output_fields; i++) {
CeedBasis basis;

CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
if (basis != CEED_BASIS_NONE) {
bool is_tensor = true;
const char *resource;
char *resource_root;
Ceed basis_ceed;

CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
is_all_tensor &= is_tensor;
is_all_nontensor &= !is_tensor;

CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
CeedCallBackend(CeedGetResource(basis_ceed, &resource));
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared");
CeedCallBackend(CeedFree(&resource_root));
CeedCallBackend(CeedDestroy(&basis_ceed));
}
CeedCallBackend(CeedBasisDestroy(&basis));
}
// -- Fallback to ref if not all bases are shared
if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build));
if (!is_good_build) {
CeedOperator op_fallback;

CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases");
CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
return CEED_ERROR_SUCCESS;
@@ -179,11 +132,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

// Creation of the operator
CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op));

// Input vectors
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
@@ -293,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
}
CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar);

CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs));
CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_run_good, opargs));

// Restore input arrays
for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -349,8 +300,21 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,

// Restore context data
CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));

// Cleanup
CeedCallBackend(CeedDestroy(&ceed));
CeedCallBackend(CeedQFunctionDestroy(&qf));

// Fallback if run was bad (out of resources)
if (!is_run_good) {
CeedOperator op_fallback;

data->use_fallback = true;
CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue");
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback));
CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request));
return CEED_ERROR_SUCCESS;
}
return CEED_ERROR_SUCCESS;
}

1 change: 1 addition & 0 deletions backends/cuda-gen/ceed-cuda-gen.h
@@ -12,6 +12,7 @@
#include <cuda.h>

typedef struct {
bool use_fallback;
CeedInt dim;
CeedInt Q_1d;
CeedInt max_P_1d;
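
The new `use_fallback` flag makes a failed build or launch sticky: the builder consults it once setup is done (reporting `!data->use_fallback`), and the apply path sets it after an out-of-resources launch, so subsequent applies go straight to /gpu/cuda/ref without retrying the broken kernel.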
68 changes: 53 additions & 15 deletions backends/cuda/ceed-cuda-compile.cpp
@@ -34,7 +34,8 @@
//------------------------------------------------------------------------------
// Compile CUDA kernel
//------------------------------------------------------------------------------
int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
const CeedInt num_defines, va_list args) {
size_t ptx_size;
char *ptx;
const int num_opts = 4;
@@ -50,8 +51,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed

// Get kernel specific options, such as kernel constants
if (num_defines > 0) {
va_list args;
va_start(args, num_defines);
char *name;
int val;

@@ -60,7 +59,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
val = va_arg(args, int);
code << "#define " << name << " " << val << "\n";
}
va_end(args);
}

// Standard libCEED definitions for CUDA backends
@@ -133,7 +131,8 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
}
CeedCallBackend(CeedFree(&opts));
if (result != NVRTC_SUCCESS) {
*is_compile_good = result == NVRTC_SUCCESS;
if (!*is_compile_good && throw_error) {
char *log;
size_t log_size;

Expand All @@ -159,6 +158,25 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed
return CEED_ERROR_SUCCESS;
}

int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
bool is_compile_good = true;
va_list args;

va_start(args, num_defines);
CeedCallBackend(CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args));
va_end(args);
return CEED_ERROR_SUCCESS;
}

int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
va_list args;

va_start(args, num_defines);
CeedCallBackend(CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args));
va_end(args);
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Get CUDA kernel
//------------------------------------------------------------------------------
@@ -200,24 +218,44 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con
//------------------------------------------------------------------------------
// Run CUDA kernel for spatial dimension with shared memory
//------------------------------------------------------------------------------
int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, void **args) {
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
void **args) {
#if CUDA_VERSION >= 9000
cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
#endif
CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);

if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
int max_threads_per_block, shared_size_bytes, num_regs;

cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
return CeedError(ceed, CEED_ERROR_BACKEND,
"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
*is_good_run = false;
if (throw_error) {
int max_threads_per_block, shared_size_bytes, num_regs;

cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
return CeedError(ceed, CEED_ERROR_BACKEND,
"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
}
} else CeedChk_Cu(ceed, result);
return CEED_ERROR_SUCCESS;
}

int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, void **args) {
bool is_good_run = true;

CeedCallBackend(
CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
return CEED_ERROR_SUCCESS;
}

int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
CeedCallBackend(
CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
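
Design note: the throwing and non-throwing compile entry points are thin wrappers over `CeedCompileCore_Cuda`, which takes the `va_list` plus a `throw_error` switch; `va_start`/`va_end` stay in the wrappers because a `va_list` must be opened and closed in the function that declares the `...` parameters. The kernel launcher gets the same treatment, with `CeedRunKernelDimSharedCore_Cuda` carrying the switch and an `is_good_run` out-parameter.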
3 changes: 3 additions & 0 deletions backends/cuda/ceed-cuda-compile.h
@@ -13,6 +13,7 @@
static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; }

CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...);
CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...);

CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel);

@@ -24,3 +25,5 @@ CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_siz

CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
int shared_mem_size, void **args);
CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z,
int shared_mem_size, bool *is_good_run, void **args);
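
Taken together, the two Try entry points let a backend attempt JIT and launch without aborting on failure. A sketch of the combined flow, condensed from the cuda-gen changes above (`source`, `module`, `kernel`, and the launch geometry are stand-ins for whatever the caller has in scope):

    bool is_compile_good = false, is_run_good = true;

    CeedCallBackend(CeedTryCompile_Cuda(ceed, source, &is_compile_good, &module, 0));
    if (is_compile_good) {
      CeedCallBackend(CeedGetKernel_Cuda(ceed, module, "my_kernel", &kernel));
      CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_x, block_y, block_z, shared_mem, &is_run_good, args));
    }
    if (!is_compile_good || !is_run_good) {
      // Mark the operator (data->use_fallback = true) and re-apply through
      // the /gpu/cuda/ref fallback, as CeedOperatorApplyAdd_Cuda_gen does.
    }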