Skip to content

Commit 98cf949

Browse files
njriasan authored and facebook-github-bot committed
Support Bias in _kernel_matmul_fp8_row_non_persistent (#4167)
Summary: Pull Request resolved: #4167 X-link: facebookresearch/FBGEMM#1247 Matches Bias support with the persistent kernel. Reviewed By: karthik-man Differential Revision: D74914575 fbshipit-source-id: bff73c9d13ac193085b7aee927b9701e94cdd4e7
1 parent c845cc9 commit 98cf949

File tree

1 file changed

+23
-12
lines changed

1 file changed

+23
-12
lines changed

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ def _kernel_matmul_fp8_row(
296296
SPLIT_K (int): Number of SM's to launch per row.
297297
USE_BIAS (bool): Whether to use bias.
298298
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
299-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
299+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
300300
"""
301301
# Matrix multiplication.
302302
start_pid = tl.program_id(axis=0)
@@ -459,7 +459,7 @@ def _kernel_matmul_fp8_row_no_fast_acc(
459459
SPLIT_K (int): Number of SM's to launch per row.
460460
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
461461
USE_BIAS(bool): Whether to use bias.
462-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
462+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
463463
"""
464464
# Matrix multiplication.
465465

@@ -615,7 +615,7 @@ def _kernel_matmul_fp8_row_imprecise_acc(
615615
SPLIT_K (int): Number of SM's to launch per row.
616616
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
617617
USE_BIAS (bool): Whether to use bias.
618-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
618+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
619619
"""
620620
# Matrix multiplication.
621621
pid = tl.program_id(0)
@@ -810,7 +810,7 @@ def _kernel_matmul_fp8_row_tma_persistent(
810810
GROUP_M (int): Number of groups for M dimension swizzle.
811811
SPLIT_K (int): Number of SM's to launch per row.
812812
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
813-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
813+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
814814
"""
815815
# Matrix multiplication.
816816
start_pid = tl.program_id(axis=0)
@@ -1050,7 +1050,7 @@ def _kernel_matmul_fp8_row_tma_persistent_ws_cooperative(
10501050
GROUP_M (int): Number of groups for M dimension swizzle.
10511051
SPLIT_K (int): Number of SM's to launch per row.
10521052
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
1053-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
1053+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
10541054
"""
10551055
num_tiles = tl.cdiv(M, BLOCK_M) * tl.cdiv(N, BLOCK_N)
10561056
num_pid_m = tl.cdiv(M, BLOCK_M)
@@ -1206,8 +1206,6 @@ def persistent_grid(META):
12061206

12071207
if no_use_persistent:
12081208
logger.info("Using non-persistent kernel")
1209-
if bias is not None:
1210-
raise AssertionError("bias is not supported in non-persistent kernel")
12111209
# pyre-ignore
12121210
torch._library.capture_triton(_kernel_matmul_fp8_row_non_persistent)[grid](
12131211
a,
@@ -1221,7 +1219,7 @@ def persistent_grid(META):
12211219
k_key,
12221220
a_scale,
12231221
b_scale,
1224-
# bias,
1222+
bias,
12251223
a.stride(0),
12261224
a.stride(1),
12271225
b.stride(0),
@@ -1232,7 +1230,7 @@ def persistent_grid(META):
12321230
allow_tf32=allow_tf32,
12331231
fp8_fast_accum=fp8_fast_accum,
12341232
# GROUP_M=8,
1235-
# USE_BIAS=bias is not None,
1233+
USE_BIAS=bias is not None,
12361234
AB_DTYPE=False,
12371235
)
12381236
elif use_warp_specialization:
@@ -1679,7 +1677,7 @@ def _kernel_matmul_fp8_block_fastacc(
16791677
GROUP_M (int): Number of groups for M dimension swizzle.
16801678
SPLIT_K (int): Number of SM's to launch per row.
16811679
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
1682-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
1680+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
16831681
"""
16841682
assert BLOCK_M < scale_block_m
16851683
assert BLOCK_N < scale_block_n
@@ -1875,7 +1873,7 @@ def _kernel_matmul_fp8_block_slowacc(
18751873
GROUP_M (int): Number of groups for M dimension swizzle.
18761874
SPLIT_K (int): Number of SM's to launch per row.
18771875
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
1878-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
1876+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
18791877
"""
18801878
assert BLOCK_M < scale_block_m
18811879
assert BLOCK_N < scale_block_n
@@ -3172,6 +3170,7 @@ def prune_configs(configs, named_args, **kwargs):
31723170
K = named_args["K"]
31733171
elemBytes_a = named_args["A"].element_size()
31743172
elemBytes_b = named_args["B"].element_size()
3173+
use_bias = kwargs["USE_BIAS"]
31753174

31763175
if M < 32 or N < 32:
31773176
mfma = 16
@@ -3211,6 +3210,9 @@ def prune_configs(configs, named_args, **kwargs):
32113210
continue
32123211
if BLOCK_SIZE_N > N * 2 and BLOCK_SIZE_N != 16:
32133212
continue
3213+
# split_k cannot be used if there is a bias
3214+
if use_bias and SPLIT_K != 1:
3215+
continue
32143216
# skip large split_k when not necessary
32153217
if SPLIT_K != 1 and not need_split_k(M, N, K):
32163218
continue
@@ -3369,6 +3371,7 @@ def _kernel_matmul_fp8_row_non_persistent(
33693371
k_key,
33703372
A_scale,
33713373
B_scale,
3374+
Bias,
33723375
stride_am,
33733376
stride_ak,
33743377
stride_bn,
@@ -3384,6 +3387,7 @@ def _kernel_matmul_fp8_row_non_persistent(
33843387
GROUP_M: tl.constexpr,
33853388
SPLIT_K: tl.constexpr,
33863389
EVEN_K: tl.constexpr,
3390+
USE_BIAS: tl.constexpr,
33873391
AB_DTYPE: tl.constexpr,
33883392
) -> None:
33893393
"""Matmul kernel of [M, K] @ [N, K] with row-wise scales
@@ -3402,6 +3406,7 @@ def _kernel_matmul_fp8_row_non_persistent(
34023406
k_key (int): Autotuning key for K dimension of input tensor.
34033407
A_scale (TensorWrapper): [M] reciprocal scale tensor per row. A * A_scale = original A
34043408
B_scale (TensorWrapper): [N] reciprocal scale tensor per row. B * B_scale = original B
3409+
Bias (tensorWrapper): [N] Optional bias tensor.
34053410
stride_am (int): Stride of M dimension of A.
34063411
stride_ak (int): Stride of K dimension of A.
34073412
stride_bn (int): Stride of N dimension of B.
@@ -3417,7 +3422,8 @@ def _kernel_matmul_fp8_row_non_persistent(
34173422
GROUP_M (int): Number of groups for M dimension swizzle.
34183423
SPLIT_K (int): Number of SM's to launch per row.
34193424
EVEN_K (bool): Whether K is evenly divisible by BLOCK_K * SPLIT_K.
3420-
AB_DTYPE (bool): Wether to cast A and B to C.dtype before tensor core.
3425+
USE_BIAS (bool): Whether to use bias.
3426+
AB_DTYPE (bool): Whether to cast A and B to C.dtype before tensor core.
34213427
"""
34223428
tl.assume(M >= 0)
34233429
tl.assume(N >= 0)
@@ -3484,6 +3490,11 @@ def _kernel_matmul_fp8_row_non_persistent(
34843490
scale = a_scale[:, None] * b_scale[None, :]
34853491
acc *= scale
34863492

3493+
# Load and add bias if specified.
3494+
if USE_BIAS:
3495+
bias = tl.load(Bias + rn, mask=rn < N)
3496+
acc += bias[None, :]
3497+
34873498
acc = acc.to(C.dtype.element_ty)
34883499
C = C + (rm[:, None] * stride_cm + rn[None, :] * stride_cn)
34893500
mask = (rm < M)[:, None] & (rn < N)[None, :]

0 commit comments

Comments (0)