Skip to content

Commit d392203

Browse files
authored
[Blackwell] Support mixed precision in mxfp tutorial (#6204)
Now that TMA load for padded fp4 is supported, update the tutorial for mixed precision (`--format mixed`). ~Either cpasync or device TMA can be used for fp4. To use TMA, set `--mixed-fp4-tma`.~ (UPDATE: Now supports only TMA) Using TMA makes it significantly faster (up to 50%). If #6194 goes in first, I'll drop `_experimental_` here. --------- Co-authored-by: Masahiro Masuda <mmasuda@nvidia.com>
1 parent 8f5984c commit d392203

1 file changed

Lines changed: 107 additions & 44 deletions

File tree

python/tutorials/10-block-scaled-matmul.py

Lines changed: 107 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,12 @@ def _matmul_launch_metadata(grid, kernel, args):
8888
ret = {}
8989
M, N, K = args["M"], args["N"], args["K"]
9090
kernel_name = kernel.name
91-
if "ELEM_PER_BYTE" and "VEC_SIZE" in args:
92-
if args["ELEM_PER_BYTE"] == 1:
91+
if "ELEM_PER_BYTE_A" and "ELEM_PER_BYTE_B" and "VEC_SIZE" in args:
92+
if args["ELEM_PER_BYTE_A"] == 1 and args["ELEM_PER_BYTE_B"] == 1:
9393
kernel_name += "_mxfp8"
94-
elif args["ELEM_PER_BYTE"] == 2:
94+
elif args["ELEM_PER_BYTE_A"] == 1 and args["ELEM_PER_BYTE_B"] == 2:
95+
kernel_name += "_mixed"
96+
elif args["ELEM_PER_BYTE_A"] == 2 and args["ELEM_PER_BYTE_B"] == 2:
9597
if args["VEC_SIZE"] == 16:
9698
kernel_name += "_nvfp4"
9799
elif args["VEC_SIZE"] == 32:
@@ -104,23 +106,29 @@ def _matmul_launch_metadata(grid, kernel, args):
104106
@triton.jit(launch_metadata=_matmul_launch_metadata)
105107
def block_scaled_matmul_kernel( #
106108
a_desc, a_scale, #
107-
b_desc, b_scale, #
109+
b_desc_or_tensor, b_scale, #
108110
c_desc, #
109111
M: tl.constexpr, N: tl.constexpr, K: tl.constexpr, #
110112
stride_sk: tl.constexpr, stride_sb: tl.constexpr, stride_sc: tl.constexpr, stride_sd: tl.constexpr,
111113
output_type: tl.constexpr, #
112-
ELEM_PER_BYTE: tl.constexpr, #
114+
ELEM_PER_BYTE_A: tl.constexpr, #
115+
ELEM_PER_BYTE_B: tl.constexpr, #
113116
VEC_SIZE: tl.constexpr, #
114117
BLOCK_M: tl.constexpr, #
115118
BLOCK_N: tl.constexpr, #
116119
BLOCK_K: tl.constexpr, #
117120
NUM_STAGES: tl.constexpr, #
118121
USE_2D_SCALE_LOAD: tl.constexpr): #
119122

120-
if ELEM_PER_BYTE == 1:
121-
dtype = tl.float8e4nv
122-
elif ELEM_PER_BYTE == 2:
123-
dtype = tl.dtype("uint8")
123+
if ELEM_PER_BYTE_A == 1:
124+
dtype_a = tl.float8e4nv
125+
elif ELEM_PER_BYTE_A == 2:
126+
dtype_a = tl.dtype("uint8")
127+
128+
if ELEM_PER_BYTE_B == 1:
129+
dtype_b = tl.float8e4nv
130+
elif ELEM_PER_BYTE_B == 2:
131+
dtype_b = tl.dtype("uint8")
124132

125133
if output_type == 0:
126134
output_dtype = tl.float32
@@ -129,25 +137,38 @@ def block_scaled_matmul_kernel( #
129137
elif output_type == 2:
130138
output_dtype = tl.float8e4nv
131139

132-
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [a_desc], dtype=tl.int32, is_pure=False,
133-
pack=1)
134-
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [b_desc], dtype=tl.int32, is_pure=False,
135-
pack=1)
136-
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [c_desc], dtype=tl.int32, is_pure=False,
137-
pack=1)
138-
139140
pid = tl.program_id(axis=0)
140141
num_pid_m = tl.cdiv(M, BLOCK_M)
141142
pid_m = pid % num_pid_m
142143
pid_n = pid // num_pid_m
143144
offs_am = pid_m * BLOCK_M
144145
offs_bn = pid_n * BLOCK_N
145-
offs_k = 0
146+
offs_k_a = 0
147+
offs_k_b = 0
146148

147149
## block scale offsets
148150
offs_sm = (pid_m * (BLOCK_M // 128) + tl.arange(0, BLOCK_M // 128)) % M
149151
offs_sn = (pid_n * (BLOCK_N // 128) + tl.arange(0, BLOCK_N // 128)) % N
150152

153+
MIXED_PREC: tl.constexpr = ELEM_PER_BYTE_A == 1 and ELEM_PER_BYTE_B == 2
154+
155+
if MIXED_PREC:
156+
b_desc = tl.make_tensor_descriptor(
157+
b_desc_or_tensor,
158+
shape=[N, K // ELEM_PER_BYTE_B],
159+
strides=[K // ELEM_PER_BYTE_B, 1],
160+
block_shape=[BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B],
161+
)
162+
else:
163+
b_desc = b_desc_or_tensor
164+
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [b_desc], dtype=tl.int32,
165+
is_pure=False, pack=1)
166+
167+
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [a_desc], dtype=tl.int32, is_pure=False,
168+
pack=1)
169+
tl.inline_asm_elementwise("prefetch.tensormap [$1]; // dummy $0", "=r,l", [c_desc], dtype=tl.int32, is_pure=False,
170+
pack=1)
171+
151172
# For now it is recommended to use 2D scale loads for better performance.
152173
# In the future we will bring additional optimizations to either allow 5D loads,
153174
# the use of TMAs for scale factors, or both.
@@ -171,26 +192,39 @@ def block_scaled_matmul_kernel( #
171192

172193
accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
173194
for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
174-
a = tl._experimental_descriptor_load(a_desc, [offs_am, offs_k], [BLOCK_M, BLOCK_K // ELEM_PER_BYTE], dtype)
175-
b = tl._experimental_descriptor_load(b_desc, [offs_bn, offs_k], [BLOCK_N, BLOCK_K // ELEM_PER_BYTE], dtype)
195+
a = tl._experimental_descriptor_load(a_desc, [offs_am, offs_k_a], [BLOCK_M, BLOCK_K // ELEM_PER_BYTE_A],
196+
dtype_a)
197+
198+
if MIXED_PREC:
199+
b = b_desc.load([offs_bn, offs_k_b])
200+
else:
201+
b = tl._experimental_descriptor_load(b_desc, [offs_bn, offs_k_b], [BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B],
202+
dtype_b)
203+
176204
scale_a = tl.load(a_scale_ptr)
177205
scale_b = tl.load(b_scale_ptr)
178206
if USE_2D_SCALE_LOAD:
179207
scale_a = scale_a.reshape(BLOCK_M // 128, BLOCK_K // VEC_SIZE // 4, 32, 4, 4)
180208
scale_b = scale_b.reshape(BLOCK_N // 128, BLOCK_K // VEC_SIZE // 4, 32, 4, 4)
181209
scale_a = scale_a.trans(0, 3, 2, 1, 4).reshape(BLOCK_M, BLOCK_K // VEC_SIZE)
182210
scale_b = scale_b.trans(0, 3, 2, 1, 4).reshape(BLOCK_N, BLOCK_K // VEC_SIZE)
183-
if ELEM_PER_BYTE == 2:
211+
212+
if MIXED_PREC:
213+
accumulator = tl.dot_scaled(a, scale_a, "e4m3", b.T, scale_b, "e2m1", accumulator)
214+
elif ELEM_PER_BYTE_A == 2 and ELEM_PER_BYTE_B == 2:
184215
accumulator = tl.dot_scaled(a, scale_a, "e2m1", b.T, scale_b, "e2m1", accumulator)
185216
else:
186217
accumulator = tl.dot_scaled(a, scale_a, "e4m3", b.T, scale_b, "e4m3", accumulator)
187-
offs_k += BLOCK_K // ELEM_PER_BYTE
218+
219+
offs_k_a += BLOCK_K // ELEM_PER_BYTE_A
220+
offs_k_b += BLOCK_K // ELEM_PER_BYTE_B
188221
a_scale_ptr += (BLOCK_K // VEC_SIZE // 4) * stride_sb
189222
b_scale_ptr += (BLOCK_K // VEC_SIZE // 4) * stride_sb
223+
190224
tl._experimental_descriptor_store(c_desc, accumulator.to(output_dtype), [offs_am, offs_bn])
191225

192226

193-
def block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, dtype_dst, M, N, K, configs):
227+
def block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, dtype_dst, M, N, K, configs):
194228
output = torch.empty((M, N), dtype=dtype_dst, device="cuda")
195229
if dtype_dst == torch.float32:
196230
dtype_dst = 0
@@ -205,11 +239,11 @@ def block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, dtype_dst, M, N, K, co
205239
output.element_size())
206240

207241
grid = (triton.cdiv(M, configs["BLOCK_SIZE_M"]) * triton.cdiv(N, configs["BLOCK_SIZE_N"]), 1)
208-
block_scaled_matmul_kernel[grid](a_desc, a_scale, b_desc, b_scale, c_desc, M, N, K, a_scale.stride(0),
242+
block_scaled_matmul_kernel[grid](a_desc, a_scale, b_desc_or_tensor, b_scale, c_desc, M, N, K, a_scale.stride(0),
209243
a_scale.stride(1), a_scale.stride(2), a_scale.stride(3), dtype_dst,
210-
configs["ELEM_PER_BYTE"], configs["VEC_SIZE"], configs["BLOCK_SIZE_M"],
211-
configs["BLOCK_SIZE_N"], configs["BLOCK_SIZE_K"], configs["num_stages"],
212-
USE_2D_SCALE_LOAD=True)
244+
configs["ELEM_PER_BYTE_A"], configs["ELEM_PER_BYTE_B"], configs["VEC_SIZE"],
245+
configs["BLOCK_SIZE_M"], configs["BLOCK_SIZE_N"], configs["BLOCK_SIZE_K"],
246+
configs["num_stages"], USE_2D_SCALE_LOAD=True)
213247
return output
214248

215249

@@ -218,8 +252,9 @@ def initialize_block_scaled(M, N, K, block_scale_type="nvfp4", compute_reference
218252
BLOCK_N = 256
219253
BLOCK_K = 256 if "fp4" in block_scale_type else 128
220254
VEC_SIZE = 16 if block_scale_type == "nvfp4" else 32
221-
assert block_scale_type in ["nvfp4", "mxfp4", "mxfp8"], f"Invalid block scale type: {block_scale_type}"
222-
ELEM_PER_BYTE = 2 if "fp4" in block_scale_type else 1
255+
assert block_scale_type in ["nvfp4", "mxfp4", "mxfp8", "mixed"], f"Invalid block scale type: {block_scale_type}"
256+
ELEM_PER_BYTE_A = 2 if "fp4" in block_scale_type else 1
257+
ELEM_PER_BYTE_B = 1 if block_scale_type == "mxfp8" else 2
223258

224259
device = "cuda"
225260
a_ref = MXFP4Tensor(size=(M, K), device=device).random()
@@ -229,20 +264,32 @@ def initialize_block_scaled(M, N, K, block_scale_type="nvfp4", compute_reference
229264
# the data is generated in col-major layout, packed along K for fp4, and then
230265
# logically transposed. Note that if one operand is of fp8 precision, unlike Hopper,
231266
# Blackwell supports both row-major and col-major layouts for the RHS matrix.
267+
# For the mixed-precision case, the fp4 RHS can be either in row or col-major layout.
268+
# But for performance reasons, it is recommended to use col-major layout. If TMA is used
269+
# for the fp4 RHS operand load in mixed-precision dot, as in this tutorial, it must be
270+
# in col-major layout.
232271
b_ref = MXFP4Tensor(size=(N, K), device=device).random()
233-
if block_scale_type == "mxfp8":
272+
if block_scale_type in ["mxfp8", "mixed"]:
234273
a_ref = a_ref.to(torch.float32)
235-
b_ref = b_ref.to(torch.float32)
236274
a = a_ref.to(torch.float8_e4m3fn)
237-
b = b_ref.to(torch.float8_e4m3fn)
238275
else:
239276
# Pack two fp4 elements per byte along K
240277
a = a_ref.to_packed_tensor(dim=1)
278+
279+
if block_scale_type == "mxfp8":
280+
b_ref = b_ref.to(torch.float32)
281+
b = b_ref.to(torch.float8_e4m3fn)
282+
else:
241283
b = b_ref.to_packed_tensor(dim=1)
284+
242285
b_ref = b_ref.to(torch.float32).T
243286

244-
a_desc = TmaDescKernelParam(a.data_ptr(), a.shape, [BLOCK_M, BLOCK_K // ELEM_PER_BYTE], 1)
245-
b_desc = TmaDescKernelParam(b.data_ptr(), b.shape, [BLOCK_N, BLOCK_K // ELEM_PER_BYTE], 1)
287+
a_desc = TmaDescKernelParam(a.data_ptr(), a.shape, [BLOCK_M, BLOCK_K // ELEM_PER_BYTE_A], 1)
288+
289+
if block_scale_type == "mixed":
290+
b_desc_or_tensor = b
291+
else:
292+
b_desc_or_tensor = TmaDescKernelParam(b.data_ptr(), b.shape, [BLOCK_N, BLOCK_K // ELEM_PER_BYTE_B], 1)
246293

247294
epsilon = 1e-8
248295
a_scale = torch.rand((M // 128, K // VEC_SIZE // 4, 32, 4, 4), device=device) + epsilon
@@ -252,7 +299,7 @@ def initialize_block_scaled(M, N, K, block_scale_type="nvfp4", compute_reference
252299
b_scale = b_scale.to(torch.float8_e4m3fn)
253300
a_scale_ref = a_scale
254301
b_scale_ref = b_scale
255-
elif block_scale_type in ["mxfp4", "mxfp8"]:
302+
elif block_scale_type in ["mxfp4", "mxfp8", "mixed"]:
256303
a_scale_ref = MXScaleTensor(a_scale)
257304
b_scale_ref = MXScaleTensor(b_scale)
258305
a_scale = a_scale_ref.data
@@ -276,16 +323,26 @@ def unpack_scale(packed):
276323
"BLOCK_SIZE_N": BLOCK_N,
277324
"BLOCK_SIZE_K": BLOCK_K,
278325
"num_stages": 4,
279-
"ELEM_PER_BYTE": ELEM_PER_BYTE,
326+
"ELEM_PER_BYTE_A": ELEM_PER_BYTE_A,
327+
"ELEM_PER_BYTE_B": ELEM_PER_BYTE_B,
280328
"VEC_SIZE": VEC_SIZE,
281329
}
282-
return a_desc, a_scale, b_desc, b_scale, configs, reference
330+
return a_desc, a_scale, b_desc_or_tensor, b_scale, configs, reference
283331

284332

285333
def validate_block_scaled(M, N, K, block_scale_type="nvfp4"):
286-
a_desc, a_scale, b_desc, b_scale, configs, reference = initialize_block_scaled(M, N, K, block_scale_type,
287-
compute_reference=True)
288-
output = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
334+
335+
def alloc_fn(size: int, align: int, _):
336+
return torch.empty(size, dtype=torch.int8, device="cuda")
337+
338+
if block_scale_type == "mixed":
339+
# This is needed for TMA with the descriptor created on the device.
340+
# TMA load for mixed-precision fp4 is supported only by device TMA.
341+
triton.set_allocator(alloc_fn)
342+
343+
a_desc, a_scale, b_desc_or_tensor, b_scale, configs, reference = initialize_block_scaled(
344+
M, N, K, block_scale_type, compute_reference=True)
345+
output = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
289346
torch.testing.assert_close(reference, output.to(torch.float32), atol=1e-3, rtol=1e-3)
290347
print(f"✅ (pass {block_scale_type})")
291348

@@ -296,13 +353,19 @@ def bench_block_scaled(K, block_scale_type="nvfp4", reps=10):
296353
N = 8192
297354
print(f"Problem Shape = {M}x{N}x{K}")
298355

299-
a_desc, a_scale, b_desc, b_scale, configs, _ = initialize_block_scaled(M, N, K, block_scale_type,
300-
compute_reference=False)
301-
_ = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
356+
def alloc_fn(size: int, align: int, _):
357+
return torch.empty(size, dtype=torch.int8, device="cuda")
358+
359+
if block_scale_type == "mixed":
360+
triton.set_allocator(alloc_fn)
361+
362+
a_desc, a_scale, b_desc_or_tensor, b_scale, configs, _ = initialize_block_scaled(
363+
M, N, K, block_scale_type, compute_reference=False)
364+
_ = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
302365

303366
proton.activate(0)
304367
for _ in range(reps):
305-
_ = block_scaled_matmul(a_desc, a_scale, b_desc, b_scale, torch.float16, M, N, K, configs)
368+
_ = block_scaled_matmul(a_desc, a_scale, b_desc_or_tensor, b_scale, torch.float16, M, N, K, configs)
306369
proton.deactivate(0)
307370
print("Done benchmarking")
308371

@@ -321,7 +384,7 @@ def show_profile(profile_name):
321384
parser.add_argument("--K_range", type=int, nargs=2)
322385
parser.add_argument("--K_step", type=int, default=512)
323386
parser.add_argument("--bench", action="store_true")
324-
parser.add_argument("--format", type=str, choices=["mxfp4", "nvfp4", "mxfp8"], default="nvfp4")
387+
parser.add_argument("--format", type=str, choices=["mxfp4", "nvfp4", "mxfp8", "mixed"], default="nvfp4")
325388
args = parser.parse_args()
326389

327390
if not supports_block_scaling():

0 commit comments

Comments
 (0)