Commit a0e3e78

[AMD][BACKEND] Enable bf16 dot2 in AMD backend pass (#6600)
This enables basic usage of the bf16 dot2 instruction on the CDNA4 architecture; the instruction computes a 32-bit sum of 16-bit multiplications (sketched below).
1 parent: 6fedb78

3 files changed: 31 additions & 3 deletions
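For context, a minimal Python sketch (not from the commit) of the arithmetic a dot2-style instruction performs: two 16-bit elements per operand are widened, multiplied pairwise, and accumulated into a 32-bit result. bf16 is emulated here by truncating the f32 bit pattern; hardware rounding details may differ.

import struct

def to_bf16(x):
    # Emulate bf16 by keeping only the top 16 bits of the f32
    # encoding (truncation; hardware rounding may differ).
    (bits,) = struct.unpack("<I", struct.pack("<f", x))
    (y,) = struct.unpack("<f", struct.pack("<I", bits & 0xFFFF0000))
    return y

def fdot2_bf16(a, b, c):
    # One dot2 step: two bf16 multiplies accumulated into a
    # 32-bit addend (Python floats stand in for f32 here).
    return to_bf16(a[0]) * to_bf16(b[0]) + to_bf16(a[1]) * to_bf16(b[1]) + c

print(fdot2_bf16((1.5, 2.0), (0.25, -1.0), 10.0))  # -> 8.375

Widening bf16 to f32 is exact (bf16 is a truncated f32), so the pairwise products are exact in f32 and rounding happens in the accumulation.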

python/test/unit/language/test_core.py

Lines changed: 21 additions & 1 deletion
@@ -3604,9 +3604,17 @@ def get_test_dot_double_rate_cases():
             (16, 16, 32, 4, False, False, 'None', 'ieee', 'bfloat16', 'float32', 1, None)]


+def get_test_dot_vdot2_cases():
+    if not is_hip_cdna():
+        return []
+    return [(4, 32, 32, 4, False, False, 'None', 'ieee', 'float16', 'float32', 1, None),
+            (4, 32, 32, 4, False, False, 'None', 'ieee', 'bfloat16', 'float32', 1, None)]
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize(
     "M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dtype, out_dtype, kpack, mma_nonk_size",
+    get_test_dot_vdot2_cases() + \
     get_test_dot_double_rate_cases() + \
     get_test_dot_base_cases() + \
     get_test_dot_mixed_sizes_cases() + \
@@ -3799,8 +3807,20 @@ def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, strid
     else:
         # added atol, to loose precision for float16xfloat16->float32 case
         np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3)
-    if not is_cuda():
+
+    if not (is_cuda() or is_hip_cdna()):
         return
+
+    if is_hip_cdna():
+        if M != 4:
+            return
+        amdgcn = pgm.asm['amdgcn']
+        if in_dtype == 'float16':
+            assert 'v_dot2c_f32_f16' in amdgcn
+        elif (in_dtype == 'bfloat16') and is_hip_cdna4():
+            assert 'v_dot2c_f32_bf16' in amdgcn
+        return
+
     # make sure ld/st are vectorized
     ptx = pgm.asm['ptx']
     if (K > 16 or N > 16 or M > 16) and (M * N // (num_warps * 32) >= 4):
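Outside the test suite, the same assembly check can be reproduced directly; a hedged sketch, assuming a ROCm build of Triton on a CDNA4 GPU (the kernel and names below are illustrative, with shapes mirroring the new vdot2 cases):

import torch
import triton
import triton.language as tl

@triton.jit
def small_dot(X, Y, Z, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    rm = tl.arange(0, M)
    rn = tl.arange(0, N)
    rk = tl.arange(0, K)
    x = tl.load(X + rm[:, None] * K + rk[None, :])
    y = tl.load(Y + rk[:, None] * N + rn[None, :])
    tl.store(Z + rm[:, None] * N + rn[None, :], tl.dot(x, y))

x = torch.randn(4, 32, dtype=torch.bfloat16, device='cuda')
y = torch.randn(32, 32, dtype=torch.bfloat16, device='cuda')
z = torch.empty(4, 32, dtype=torch.float32, device='cuda')
pgm = small_dot[(1, )](x, y, z, M=4, N=32, K=32)
# With this commit, the small bf16 dot should lower through v_dot2 on CDNA4:
assert 'v_dot2c_f32_bf16' in pgm.asm['amdgcn']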

third_party/amd/lib/TritonAMDGPUToLLVM/DotOpToLLVM/FMA.cpp

Lines changed: 3 additions & 2 deletions
@@ -36,10 +36,11 @@ class AMDFMAVectorMultiplier : public FMAVectorMultiplier {
     bool dotAvailable = AMD::supportsVDot(arch);
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     if (dotAvailable) {
-      if (aElemTy.isF16() && dElemTy.isF32()) {
+      if ((aElemTy.isF16() || aElemTy.isBF16()) && dElemTy.isF32()) {
         chosenOp.vectorSize = 2;
         chosenOp.outElemTy = f32_ty;
-        chosenOp.intrinsicName = "llvm.amdgcn.fdot2";
+        chosenOp.intrinsicName = aElemTy.isF16() ? "llvm.amdgcn.fdot2"
+                                                 : "llvm.amdgcn.fdot2.f32.bf16";
         chosenOp.additionalArgs = {b.false_val()};
         return chosenOp;
       }
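In Python pseudocode, the selection above reads as follows (a paraphrase for skimming, not Triton API; the trailing false appears to correspond to the intrinsics' final i1 clamp operand in LLVM's AMDGPU intrinsic definitions):

def choose_fdot2(a_elem_ty):
    # Paraphrase of the FMA.cpp branch: both element types share a
    # vector size of 2 and an f32 result; only the intrinsic differs.
    assert a_elem_ty in ('f16', 'bf16')
    return {
        'vectorSize': 2,
        'outElemTy': 'f32',
        'intrinsicName': ('llvm.amdgcn.fdot2' if a_elem_ty == 'f16'
                          else 'llvm.amdgcn.fdot2.f32.bf16'),
        'additionalArgs': [False],  # trailing clamp bit (assumption)
    }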

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 7 additions & 0 deletions
@@ -1103,6 +1103,13 @@ class AccelerateBlocked : public OpRewritePattern<DotOp> {
       return true;
     }

+    // CDNA4 has Bf16 v_dot2
+    if (AMD::deduceISAFamily(arch) == ISAFamily::CDNA4 &&
+        dotTypes.a.isBF16() && dotTypes.b.isBF16() && dotTypes.c.isF32() &&
+        dotTypes.d.isF32() && k % 2 == 0) {
+      return true;
+    }
+
     // TODO: enable this condition, when fp32 -> fp16 cast works correctly
     // Consider this case as non legal, despite this case is covered by fp16
     // FMA. Because v_dot expected to give both better performance and
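In words: the new clause legalizes the v_dot2 path only on CDNA4, only for bf16 x bf16 with an f32 accumulator and result, and only for even K, since each dot2 consumes K-dimension elements in pairs. A Python paraphrase of the predicate (illustrative; the authoritative check is the C++ above):

def bf16_vdot2_legal(isa_family, a_ty, b_ty, c_ty, d_ty, k):
    # Mirrors the CDNA4 bf16 v_dot2 legality check added above.
    return (isa_family == 'CDNA4'
            and a_ty == b_ty == 'bf16'
            and c_ty == d_ty == 'f32'
            and k % 2 == 0)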
