
Convolution with sigmoid performance with NCHW layout #169

@swimvtec

Description


Describe the bug
A convolution fused with a sigmoid activation runs about 20 times slower than the equivalent legacy-API path when the tensors use NCHW layout.

Expected behavior
The frontend API should match the performance of the legacy API without workarounds.

System Environment (please complete the following information):

  • cudnn_frontend version: 1.12
  • cudnn_backend version: 9.11 or 9.12
  • GPU arch: RTX A2000 or RTX A4000
  • cuda runtime version: 12.8
  • cuda driver version: 570.172.08
  • host compiler: clang 19
  • OS: Ubuntu 22.04

API logs
Logs are attached:

be.log
fe.log

To Reproduce

#include <cudnn_frontend.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>

#include <cstdint>
#include <cstdio>
#include <memory>
#include <string>
#include <unordered_map>

struct Shape
{
  int64_t batch_size;
  int64_t depth;
  int64_t height;
  int64_t width;
};

// Packed NCHW strides for dims {N, C, H, W}: {C*H*W, H*W, W, 1}
// (the Shape member "depth" holds the channel count).
static void SetTensorShape(
    std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
    Shape const&                                               shape)
{
  tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
      .set_stride({shape.height * shape.width * shape.depth,
                   shape.height * shape.width, shape.width, 1});
}

static std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>
TensorFromShape(cudnn_frontend::graph::Graph* graph, Shape const& shape,
                std::string const& name)
{
  namespace fe = cudnn_frontend;

  std::shared_ptr<fe::graph::Tensor_attributes> tensor =
      graph->tensor(fe::graph::Tensor_attributes().set_name(name));

  SetTensorShape(tensor, shape);

  return tensor;
}

TEST(ConvSigmoid, Slow)
{
  // ... code to obtain cudnn_handle ...

  namespace fe                = cudnn_frontend;
  auto              fwd_graph = std::make_unique<fe::graph::Graph>();
  fe::graph::Graph* graph     = fwd_graph.get();

  graph->set_io_data_type(fe::DataType_t::FLOAT)
      .set_compute_data_type(fe::DataType_t::FLOAT)
      .set_intermediate_data_type(fe::DataType_t::FLOAT);

  Shape input_shape{1, 256, 80, 128};

  auto tensor_x = TensorFromShape(graph, input_shape, "input");

  int64_t const num_kernel = 90;
  Shape         weight_shape{num_kernel, input_shape.depth, 3, 3};
  auto          tensor_w = TensorFromShape(graph, weight_shape, "weights");

  auto conv_options = fe::graph::Conv_fprop_attributes()
                          .set_padding({1, 1})
                          .set_stride({1, 1})
                          .set_dilation({1, 1});

  auto out_tensor = graph->conv_fprop(tensor_x, tensor_w, conv_options);

  Shape bias_shape{1, num_kernel, 1, 1};
  auto  tensor_b = TensorFromShape(graph, bias_shape, "bias");

  out_tensor =
      graph->pointwise(out_tensor, tensor_b,
                       fe::graph::Pointwise_attributes()
                           .set_mode(fe::PointwiseMode_t::ADD)
                           .set_compute_data_type(fe::DataType_t::FLOAT));

  out_tensor =
      graph->pointwise(out_tensor, fe::graph::Pointwise_attributes().set_mode(
                                       fe::PointwiseMode_t::SIGMOID_FWD));

  Shape output_shape{1, 90, 80, 128};

  SetTensorShape(out_tensor, output_shape);
  out_tensor->set_output(true);

  if (auto err = graph->validate(); err.is_bad())
  {
    GTEST_FAIL();
  }
  if (auto err = graph->build_operation_graph(cudnn_handle); err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->create_execution_plans(
          {fe::HeurMode_t::B, fe::HeurMode_t::FALLBACK});
      err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->check_support(cudnn_handle); err.is_bad())
  {
    GTEST_FAIL();
  }

  if (auto err = graph->build_plans(cudnn_handle, fe::BuildPlanPolicy_t::ALL);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  int64_t workspace_size;
  if (auto err = graph->get_workspace_size(workspace_size); err.is_bad())
  {
    GTEST_FAIL();
  }

  // Surface<> is the device-buffer helper from the cudnn_frontend samples.
  Surface<uint8_t> cu_workspace{workspace_size, false};

  Surface<float> cu_x{tensor_x->get_volume(), false};
  Surface<float> cu_w{tensor_w->get_volume(), false};
  Surface<float> cu_b{tensor_b->get_volume(), false};
  Surface<float> cu_y{out_tensor->get_volume(), false};

  std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*>
      variant_pack{{tensor_x, cu_x.devPtr},
                   {tensor_w, cu_w.devPtr},
                   {tensor_b, cu_b.devPtr},
                   {out_tensor, cu_y.devPtr}};

  // Warm-up run; this also surfaces execution errors before timing.
  if (auto err =
          graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  cudaEvent_t start, stop;

  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaDeviceSynchronize();
  cudaEventRecord(start);

  // Timed run.
  if (auto err =
          graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
      err.is_bad())
  {
    GTEST_FAIL();
  }

  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float time_in_ms;
  cudaEventElapsedTime(&time_in_ms, start, stop);

  printf("%f\n", time_in_ms);
}

This reports 16 ms on an RTX A2000.
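
The figure comes from a single timed execute() after one warm-up run. For a steadier number, the same CUDA events can bracket a loop and the elapsed time be averaged; a minimal sketch reusing the handle, variant_pack, workspace, and events from the test above (the iteration count kIters is made up here):

  constexpr int kIters = 100;  // hypothetical iteration count

  cudaEventRecord(start);
  for (int i = 0; i < kIters; ++i)
  {
    if (auto err =
            graph->execute(cudnn_handle, variant_pack, cu_workspace.devPtr);
        err.is_bad())
    {
      GTEST_FAIL();
    }
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  cudaEventElapsedTime(&time_in_ms, start, stop);
  printf("avg: %f ms per iteration\n", time_in_ms / kIters);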

Additional context
Separating the convolution and the activation into two separate graphs mitigates the regression. Using NHWC layout also mitigates it, but at the cost of converting the data: extra time for the layout conversion plus temporary buffers for the converted input and output. A sketch of the NHWC stride setup follows.
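
A minimal sketch of that NHWC workaround, assuming the same Shape struct as in the reproducer (the helper name SetTensorShapeNHWC is invented here): the dims stay in NCHW order and only the strides describe an NHWC physical layout, so the buffers bound at execute() must hold NHWC-transposed data.

static void SetTensorShapeNHWC(
    std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>& tensor,
    Shape const&                                               shape)
{
  // Dims remain {N, C, H, W}; strides {C*H*W, 1, W*C, C} select an NHWC
  // physical layout ("depth" is the channel count, as above).
  tensor->set_dim({shape.batch_size, shape.depth, shape.height, shape.width})
      .set_stride({shape.depth * shape.height * shape.width, 1,
                   shape.width * shape.depth, shape.depth});
}

Swapping this helper in for SetTensorShape is what the NHWC mitigation above refers to; transposing the input and output to and from NHWC is the extra conversion cost mentioned.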
