
Convolution with sigmoid performance with NCHW layout #169

@swimvtec

Description


Describe the bug
A convolution fused with a sigmoid activation runs about 20 times slower than the equivalent legacy-API path when the tensors use NCHW layout.

Expected behavior
The frontend API should match the performance of the legacy API without workarounds.

System Environment (please complete the following information):

  • cudnn_frontend version: 1.12
  • cudnn_backend version: 9.11 or 9.12
  • GPU arch: RTX A2000 or RTX A4000
  • cuda runtime version: 12.8
  • cuda driver version: 570.172.08
  • host compiler: clang 19
  • OS: Ubuntu 22.04

API logs
Logs are attached:

be.log
fe.log

To Reproduce

#include <cudnn_frontend.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

#include <cstdint>
#include <cstdio>
#include <memory>
#include <string>
#include <unordered_map>

struct Shape
{
  int64_t batch_size;
  int64_t depth;
  int64_t height;
  int64_t width;
};

// Packed NCHW strides for dims {N, C, H, W}: {C*H*W, H*W, W, 1}
// (the Shape member "depth" holds the channel count).
static void SetTensorShape(
    std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
    Shape const&                                               shape)
{
  tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
      .set_stride({shape.height * shape.width * shape.depth,
                   shape.height * shape.width, shape.width, 1});
}

static std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>
TensorFromShape(cudnn_frontend::graph::Graph* graph, Shape const& shape,
                std::string const& name)
{
  namespace fe = cudnn_frontend;

  std::shared_ptr<fe::graph::Tensor_attributes> tensor =
      graph->tensor(fe::graph::Tensor_attributes().set_name(name));

  SetTensorShape(tensor, shape);

  return tensor;
}

TEST(ConvSigmoid, Slow)
{
  // ... code to obtain cudnn_handle ...

  namespace fe                = cudnn_frontend;
  auto              fwd_graph = std::make_unique<fe::graph::Graph>();
  fe::graph::Graph* graph     = fwd_graph.get();

  graph->set_io_data_type(fe::DataType_t::FLOAT)
      .set_compute_data_type(fe::DataType_t::FLOAT)
      .set_intermediate_data_type(fe::DataType_t::FLOAT);

  Shape input_shape{1, 256, 80, 128};

  auto tensor_x = TensorFromShape(graph, input_shape, "input");

  int64_t const num_kernel = 90;
  Shape         weight_shape{num_kernel, input_shape.depth, 3, 3};
  auto          tensor_w = TensorFromShape(graph, weight_shape, "weights");

  auto conv_options = fe::graph::Conv_fprop_attributes()
                          .set_padding({1, 1})
                          .set_stride({1, 1})
                          .set_dilation({1, 1});

  auto out_tensor = graph->conv_fprop(tensor_x, tensor_w, conv_options);

  Shape bias_shape{1, num_kernel, 1, 1};
  auto  tensor_b = TensorFromShape(graph, bias_shape, "bias");

  out_tensor =
      graph->pointwise(out_tensor, tensor_b,
                       fe::graph::Pointwise_attributes()
                           .set_mode(fe::PointwiseMode_t::ADD)
                           .set_compute_data_type(fe::DataType_t::FLOAT));

  out_tensor =
      graph->pointwise(out_tensor, fe::graph::Pointwise_attributes().set_mode(
                                       fe::PointwiseMode_t::SIGMOID_FWD));

  Shape output_shape{1, 90, 80, 128};

  SetTensorShape(out_tensor, output_shape);
  out_tensor->set_output(true);

  if (auto err = graph->validate(); err.is_bad())
  {
    GTEST_FAIL();
  }
  if (auto err = graph->build_operation_graph(cudnn_handle); err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->create_execution_plans(
          {fe::HeurMode_t::B, fe::HeurMode_t::FALLBACK});
      err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->check_support(cudnn_handle); err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->build_plans(cudnn_handle, fe::BuildPlanPolicy_t::ALL);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  int64_t workspace_size;
  if (auto err = graph->get_workspace_size(workspace_size); err.is_bad())
  {
    GTEST_FAIL();
  }

  // Surface<> is the device-buffer helper from the cudnn_frontend samples.
  Surface<uint8_t> cu_workspace{workspace_size, false};

  Surface<float> cu_x{tensor_x->get_volume(), false};
  Surface<float> cu_w{tensor_w->get_volume(), false};
  Surface<float> cu_b{tensor_b->get_volume(), false};
  Surface<float> cu_y{out_tensor->get_volume(), false};

  std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
      variant_pack{{tensor_x, cu_x.devPtr},
                   {tensor_w, cu_w.devPtr},
                   {tensor_b, cu_b.devPtr},
                   {out_tensor, cu_y.devPtr}};

  // Warm-up run; this also surfaces execution errors before timing.
  if (auto err =
          graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  cudaEvent_t start, stop;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaDeviceSynchronize();
  cudaEventRecord(start);

  // Timed run.
  if (auto err =
          graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float time_in_ms;
  cudaEventElapsedTime(&time_in_ms, start, stop);

  printf("%f\n", time_in_ms);
}

This reports 16 ms on an RTX A2000.
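
The figure comes from a single timed execute() after one warm-up run. For a steadier number, the same CUDA events can bracket a loop and the elapsed time be averaged; a minimal sketch reusing the handle, variant_pack, workspace, and events from the test above (the iteration count kIters is made up here):

  constexpr int kIters = 100;  // hypothetical iteration count

  cudaEventRecord(start);
  for (int i = 0; i < kIters; ++i)
  {
    if (auto err =
            graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
        err.is_bad())
    {
      GTEST_FAIL();
    }
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  cudaEventElapsedTime(&time_in_ms, start, stop);
  printf("avg: %f ms per iteration\n", time_in_ms / kIters);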

Additional context
Separating the convolution and the activation into two separate graphs mitigates the regression. Using NHWC layout also mitigates it, but at the cost of converting the data: extra time for the layout conversion plus temporary buffers for the converted input and output. A sketch of the NHWC stride setup follows.
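
A minimal sketch of that NHWC workaround, assuming the same Shape struct as in the reproducer (the helper name SetTensorShapeNHWC is invented here): the dims stay in NCHW order and only the strides describe an NHWC physical layout, so the buffers bound at execute() must hold NHWC-transposed data.

static void SetTensorShapeNHWC(
    std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
    Shape const&                                               shape)
{
  // Dims remain {N, C, H, W}; strides {C*H*W, 1, W*C, C} select an NHWC
  // physical layout ("depth" is the channel count, as above).
  tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
      .set_stride({shape.depth * shape.height * shape.width, 1,
                   shape.width * shape.depth, shape.depth});
}

Swapping this helper in for SetTensorShape is what the NHWC mitigation above refers to; transposing the input and output to and from NHWC is the extra conversion cost mentioned.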
