Describe the bug
With NCHW layout, a convolution followed by a sigmoid activation built through the graph API runs about 20 times slower than the same operation through the legacy API.
Expected behavior
The graph API matches the performance of the legacy API without requiring workarounds.
System Environment (please complete the following information):
- cudnn_frontend version: 1.12
- cudnn_backend version: 9.11 or 9.12
- GPU arch: RTX A2000 or RTX A4000
- cuda runtime version: 12.8
- cuda driver version: 570.172.08
- host compiler: clang 19
- OS: Ubuntu 22.04
API logs
Logs are attached.
To Reproduce
struct Shape
{
int64_t batch_size;
int64_t depth;
int64_t height;
int64_t width;
};
static void SetTensorShape(
std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
Shape const& shape)
{
tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
.set_stride({shape.height * shape.width * shape.depth,
shape.height * shape.width, shape.width, 1});
}
static std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>
TensorFromShape(cudnn_frontend::graph::Graph* graph, Shape const& shape,
std::string const& name)
{
namespace fe = cudnn_frontend;
std::shared_ptr<fe::graph::Tensor_attributes> tensor =
graph->tensor(fe::graph::Tensor_attributes().set_name(name));
SetTensorShape(tensor, shape);
return tensor;
}
TEST(ConvSigmoid, Slow)
{
// ... code to obtain a cudnn handle (cudnn_handle) ...
namespace fe = cudnn_frontend;
auto fwd_graph = std::make_unique<fe::graph::Graph>();
fe::graph::Graph* graph = fwd_graph.get();
graph->set_io_data_type(fe::DataType_t::FLOAT)
.set_compute_data_type(fe::DataType_t::FLOAT)
.set_intermediate_data_type(fe::DataType_t::FLOAT);
Shape input_shape{1, 256, 80, 128};
auto tensor_x = TensorFromShape(graph, input_shape, "input");
int64_t const num_kernel = 90;
Shape weight_shape{num_kernel, input_shape.depth, 3, 3};
auto tensor_w = TensorFromShape(graph, weight_shape, "weights");
auto conv_options = fe::graph::Conv_fprop_attributes()
.set_padding({1, 1})
.set_stride({1, 1})
.set_dilation({1, 1});
auto out_tensor = graph->conv_fprop(tensor_x, tensor_w, conv_options);
Shape bias_shape{1, num_kernel, 1, 1};
auto tensor_b = TensorFromShape(graph, bias_shape, "bias");
out_tensor =
graph->pointwise(out_tensor, tensor_b,
fe::graph::Pointwise_attributes()
.set_mode(fe::PointwiseMode_t::ADD)
.set_compute_data_type(fe::DataType_t::FLOAT));
out_tensor =
graph->pointwise(out_tensor, fe::graph::Pointwise_attributes().set_mode(
fe::PointwiseMode_t::SIGMOID_FWD));
Shape output_shape{1, 90, 80, 128};
SetTensorShape(out_tensor, output_shape);
out_tensor->set_output(true);
if (auto err = graph->validate(); err.is_bad())
{
GTEST_FAIL();
}
if (auto err = graph->build_operation_graph(cudnn_handle); err.is_bad())
{
GTEST_FAIL();
}
if (auto err = graph->create_execution_plans(
{fe::HeurMode_t::B, fe::HeurMode_t::FALLBACK});
err.is_bad())
{
GTEST_FAIL();
}
if (auto err = graph->check_support(cudnn_handle); err.is_bad())
{
GTEST_FAIL();
}
if (auto err = graph->build_plans(cudnn_handle, fe::BuildPlanPolicy_t::ALL);
err.is_bad())
{
GTEST_FAIL();
}
int64_t workspace_size;
if (auto err = graph->get_workspace_size(workspace_size); err.is_bad())
{
GTEST_FAIL();
}
Surface<uint8_t> cu_workspace{workspace_size, false};
Surface<float> cu_x{tensor_x->get_volume(), false};
Surface<float> cu_w{tensor_w->get_volume(), false};
Surface<float> cu_b{tensor_b->get_volume(), false};
Surface<float> cu_y{out_tensor->get_volume(), false};
std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
variant_pack{{tensor_x, cu_x.devPtr},
{tensor_w, cu_w.devPtr},
{tensor_b, cu_b.devPtr},
{out_tensor, cu_y.devPtr}};
if (auto err =
graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
err.is_bad())
{
GTEST_FAIL();
}
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaDeviceSynchronize();
cudaEventRecord(start);
if (auto err =
graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
err.is_bad())
{
GTEST_FAIL();
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float time_in_ms;
cudaEventElapsedTime(&time_in_ms, start, stop);
printf("%f\n", time_in_ms);
}
This reports 16 ms on an RTX A2000.
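For comparison, the legacy-API path referenced above is the plain conv → bias add → sigmoid sequence in NCHW. The sketch below is only my approximation of that baseline (the fixed algorithm choice, descriptor setup, and omitted error checking are simplifications, not the exact benchmark code):

// Sketch of the legacy-API NCHW baseline: conv -> bias add -> sigmoid.
#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    // Same shapes as the graph-API reproducer: x = 1x256x80x128, w = 90x256x3x3.
    const int n = 1, c = 256, h = 80, w = 128, k = 90, r = 3, s = 3;

    cudnnTensorDescriptor_t x_desc, y_desc, b_desc;
    cudnnFilterDescriptor_t w_desc;
    cudnnConvolutionDescriptor_t conv_desc;
    cudnnActivationDescriptor_t act_desc;
    cudnnCreateTensorDescriptor(&x_desc);
    cudnnCreateTensorDescriptor(&y_desc);
    cudnnCreateTensorDescriptor(&b_desc);
    cudnnCreateFilterDescriptor(&w_desc);
    cudnnCreateConvolutionDescriptor(&conv_desc);
    cudnnCreateActivationDescriptor(&act_desc);

    cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);
    cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, k, h, w);
    cudnnSetTensor4dDescriptor(b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, k, 1, 1);
    cudnnSetFilter4dDescriptor(w_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, k, c, r, s);
    cudnnSetConvolution2dDescriptor(conv_desc, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
    cudnnSetActivationDescriptor(act_desc, CUDNN_ACTIVATION_SIGMOID,
                                 CUDNN_NOT_PROPAGATE_NAN, 0.0);

    // Fixed algorithm for simplicity; a real run would pick one via heuristics.
    cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    size_t workspace_size = 0;
    cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, conv_desc, y_desc,
                                            algo, &workspace_size);

    float *d_x, *d_w, *d_b, *d_y;
    void* d_workspace;
    cudaMalloc((void**)&d_x, sizeof(float) * n * c * h * w);
    cudaMalloc((void**)&d_w, sizeof(float) * k * c * r * s);
    cudaMalloc((void**)&d_b, sizeof(float) * k);
    cudaMalloc((void**)&d_y, sizeof(float) * n * k * h * w);
    cudaMalloc(&d_workspace, workspace_size);

    // conv -> add broadcast bias -> in-place sigmoid, all in NCHW.
    const float one = 1.0f, zero = 0.0f;
    cudnnConvolutionForward(handle, &one, x_desc, d_x, w_desc, d_w, conv_desc, algo,
                            d_workspace, workspace_size, &zero, y_desc, d_y);
    cudnnAddTensor(handle, &one, b_desc, d_b, &one, y_desc, d_y);
    cudnnActivationForward(handle, act_desc, &one, y_desc, d_y, &zero, y_desc, d_y);

    cudaDeviceSynchronize();
    printf("done\n");
    return 0;
}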
Additional context
Splitting the convolution and the activation into two separate graphs mitigates the regression. Using NHWC layout also mitigates it, but at the cost of extra time (for the layout conversion) and memory (temporary buffers for the converted input and output); a sketch of the NHWC stride setup follows.
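A minimal sketch of the NHWC workaround, for reference: only the stride helper changes while the dims stay in {N, C, H, W} order. This is my reading of the channels-last stride convention, not code from the benchmark:

static void SetTensorShapeNHWC(
    std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
    Shape const& shape)
{
    // Channels-last strides: the innermost dimension is depth (channels).
    tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
        .set_stride({shape.height * shape.width * shape.depth, // N
                     1,                                        // C
                     shape.width * shape.depth,                // H
                     shape.depth});                            // W
}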