diff --git a/paddle/phi/backends/gpu/rocm/miopen_desc.h b/paddle/phi/backends/gpu/rocm/miopen_desc.h index 15276c61ef8ddb..b2119d16c4a88f 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_desc.h +++ b/paddle/phi/backends/gpu/rocm/miopen_desc.h @@ -62,6 +62,9 @@ inline miopenDataType_t ToCudnnDataType(const DataType& t) { case DataType::FLOAT32: type = miopenFloat; break; + case DataType::BFLOAT16: + type = miopenBFloat16; + break; default: break; } diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index ac390524b19a45..fd0c69da3b1be6 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -1443,34 +1443,39 @@ PD_REGISTER_KERNEL(conv2d_grad, ALL_LAYOUT, phi::ConvCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv2d_double_grad, GPUDNN, ALL_LAYOUT, phi::ConvCudnnGradGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(conv3d_double_grad, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnDoubleGradKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d_double_grad, GPU, ALL_LAYOUT, phi::DepthwiseConvDoubleGradGPUDNNKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(conv2d_grad, diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index fcc1a2fff7029a..89def35da14830 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -561,18 +561,29 @@ void Conv3DCudnnKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL( - conv2d, GPUDNN, ALL_LAYOUT, phi::ConvCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv2d, + GPUDNN, + ALL_LAYOUT, + phi::ConvCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} -PD_REGISTER_KERNEL( - conv3d, GPUDNN, ALL_LAYOUT, phi::Conv3DCudnnKernel, float, phi::float16) {} +PD_REGISTER_KERNEL(conv3d, + GPUDNN, + ALL_LAYOUT, + phi::Conv3DCudnnKernel, + float, + phi::float16, + phi::bfloat16) {} PD_REGISTER_KERNEL(depthwise_conv2d, GPUDNN, ALL_LAYOUT, phi::DepthwiseConvCudnnKernel, float, - phi::float16) {} + phi::float16, + phi::bfloat16) {} #else #if CUDNN_VERSION_MIN(8, 1, 0) @@ -623,5 +634,3 @@ PD_REGISTER_KERNEL(conv3d, #endif #endif - -// todo register bfloat16 diff --git a/test/legacy_test/test_hip_bf16_conv_kernel.py b/test/legacy_test/test_hip_bf16_conv_kernel.py new file mode 100644 index 00000000000000..1bd6a11aeb3370 --- /dev/null +++ b/test/legacy_test/test_hip_bf16_conv_kernel.py @@ -0,0 +1,72 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test HIP BF16 convolution kernel registration. + +This test keeps coverage focused on minimal convolution forward passes so the +suite validates kernel registration without pulling in unrelated BF16 operator +chains. +""" + +import unittest + +import numpy as np + +import paddle +from paddle.base import core + + +@unittest.skipIf(not core.is_compiled_with_rocm(), "HIP/ROCm is not available") +class TestHIPBF16Conv2dKernel(unittest.TestCase): + """Test that conv2d kernel is registered for BF16 on HIP.""" + + def test_conv2d_bf16_forward(self): + """Test conv2d BF16 forward pass on HIP.""" + paddle.set_device("gpu") + + # Create BF16 input tensor + input_np = np.random.randn(1, 3, 64, 64).astype(np.float32) + filter_np = np.random.randn(8, 3, 3, 3).astype(np.float32) + + input_tensor = paddle.to_tensor(input_np).astype("bfloat16") + filter_tensor = paddle.to_tensor(filter_np).astype("bfloat16") + + # This should not raise "kernel not registered" error + output = paddle.nn.functional.conv2d(input_tensor, filter_tensor) + + self.assertEqual(output.dtype, paddle.bfloat16) + self.assertEqual(output.shape, [1, 8, 62, 62]) + # Verify output is not NaN + self.assertFalse(paddle.isnan(output).any()) + + def test_conv2d_bf16_with_groups(self): + """Test conv2d BF16 with groups (depthwise-like) on HIP.""" + paddle.set_device("gpu") + + input_np = np.random.randn(1, 8, 16, 16).astype(np.float32) + filter_np = np.random.randn(8, 1, 3, 3).astype(np.float32) + + input_tensor = paddle.to_tensor(input_np).astype("bfloat16") + filter_tensor = paddle.to_tensor(filter_np).astype("bfloat16") + + output = paddle.nn.functional.conv2d( + input_tensor, filter_tensor, groups=8 + ) + + self.assertEqual(output.dtype, paddle.bfloat16) + self.assertEqual(output.shape, [1, 8, 14, 14]) + + +if __name__ == "__main__": + unittest.main()