Commit 4621f10

Add a triton kernel for swizzling
stack-info: PR: #2168, branch: drisspg/stack/53
1 parent 44a878b commit 4621f10

File tree

3 files changed: +152, -2 lines changed


test/prototype/mx_formats/test_custom_cast.py

Lines changed: 14 additions & 0 deletions
@@ -44,6 +44,7 @@
     sem_vals_to_f32,
 )
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
@@ -465,3 +466,16 @@ def test_triton_mxfp8_dim1_randn(M, K):
     x_mx_t, x_s_t = triton_to_mxfp8_dim1(x, inner_block_size=32)
     torch.testing.assert_close(x_mx_t, x_mx_ref, rtol=0, atol=0)
     torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize(
+    "shape",
+    # [(63, 1023), (128, 4), (128, 8), (256, 8), (300, 9), (133, 512), (528, 512), (128, 1)],
+    [(128, 1)],
+)
+def test_rearrange(shape):
+    scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
+    eager = to_blocked(scales, False)
+    triton = to_blocked(scales, True)
+    torch.testing.assert_close(eager, triton, atol=0, rtol=0)
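
For context, the eager to_blocked path the test compares against reduces, for a single 128x4 tile of scales, to a view/transpose/reshape rearrangement. The sketch below is an illustration only, not code from this commit: the helper name is invented, and it ignores the padding and flattening the full function performs.

import torch

def blocked_single_tile(scales_tile: torch.Tensor) -> torch.Tensor:
    # Illustrative sketch: one (128, 4) tile of uint8 E8M0 scales is split into
    # four (32, 4) sub-tiles, which are interleaved column-wise into (32, 16).
    assert scales_tile.shape == (128, 4)
    return scales_tile.view(4, 32, 4).transpose(0, 1).reshape(32, 16)

scales = torch.randint(256, size=(128, 4), dtype=torch.uint8)
# Element (r, c) lands at output row r % 32, column (r // 32) * 4 + c.
blocked = blocked_single_tile(scales)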

torchao/prototype/mx_formats/custom_cast.py

Lines changed: 130 additions & 0 deletions
@@ -1383,6 +1383,133 @@ def triton_to_mxfp8_dim1_reference(
             scale_e8m0_dim1,
         )
 
+    @triton.jit
+    def scale_swizzle(
+        scale_ptr,
+        scale_rows,
+        scale_cols,
+        output_ptr,
+        input_row_stride,
+        output_block_stride,
+        BLOCK_ROWS: tl.constexpr,
+        BLOCK_COLS: tl.constexpr,
+    ):
+        """
+        Rearranges tensor data from row-major to block-scaled swizzle format.
+
+        Args:
+            scale_ptr: Pointer to the input scale tensor
+            scale_rows: Number of rows in the scale tensor
+            scale_cols: Number of columns in the scale tensor
+            output_ptr: Pointer to the output tensor
+            input_row_stride: Stride between rows in the input tensor
+            output_block_stride: Stride between blocks in the output tensor
+            BLOCK_ROWS: Number of rows in a tile (compile-time constant)
+            BLOCK_COLS: Number of columns in a tile (compile-time constant)
+        """
+        pid_row = tl.program_id(0)
+        pid_col = tl.program_id(1)
+
+        rows = tl.arange(0, BLOCK_ROWS)[:, None]
+        cols = tl.arange(0, BLOCK_COLS)[None, :]
+
+        # Calculate starting row and column for this tile
+        start_row = pid_row * BLOCK_ROWS
+        start_col = pid_col * BLOCK_COLS
+        global_rows = start_row + rows
+        global_cols = start_col + cols
+
+        mask = (global_rows < scale_rows) & (global_cols < scale_cols)
+
+        input_scales = tl.load(
+            scale_ptr + global_rows * input_row_stride + global_cols,
+            mask=mask,
+            other=0.0,
+        )
+
+        # Block rearrangement logic for the _to_blocked_single transformation:
+        # 1) Divide into 4×32 blocks
+        r_div_32 = rows // 32
+        r_mod_32 = rows % 32
+
+        # 2) Rearrange to (32, 4, 4) then to final (32, 16) coordinates
+        # row = r_mod_32, col = (r_div_32 * 4 + inner_col)
+        dest_indices = r_mod_32 * 16 + r_div_32 * 4 + cols
+
+        # Flatten indices for storage
+        dest_indices_flat = tl.reshape(
+            dest_indices, (BLOCK_ROWS * BLOCK_COLS), can_reorder=True
+        )
+
+        # Calculate block offset using provided output block stride
+        LOCAL_NUMEL = BLOCK_ROWS * BLOCK_COLS
+        block_offset = pid_col * LOCAL_NUMEL + (pid_row * output_block_stride)
+
+        # Store the rearranged values
+        tl.store(
+            output_ptr + block_offset + dest_indices_flat,
+            tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS), can_reorder=True),
+        )
+
+    def mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
+        """
+        Rearranges an E8M0 scale tensor from row-major format to block-scaled swizzle format.
+
+        This format is suitable for Tmem as described in NVIDIA documentation:
+        https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+
+        Args:
+            scale_tensor: Input tensor in row-major format with 8-bit elements
+
+        Returns:
+            Rearranged tensor in block-scaled swizzle format
+        """
+        assert scale_tensor.element_size() == 1, (
+            "Expected element size to be 1 byte (8 bits)"
+        )
+        assert scale_tensor.is_contiguous(), "Input tensor must be contiguous"
+
+        rows, cols = scale_tensor.shape
+
+        # Calculate blocks needed
+        n_row_blocks = triton.cdiv(rows, 128)
+        n_col_blocks = triton.cdiv(cols, 4)
+        padded_rows = n_row_blocks * 128
+        padded_cols = n_col_blocks * 4
+
+        out = scale_tensor.new_empty((padded_rows, padded_cols))
+
+        # Input stride (for row-major format)
+        input_row_stride = cols
+
+        # We probably want to handle multiple blocks per tile, but for now keep it simple
+        BLOCK_ROWS, BLOCK_COLS = 128, 4
+
+        # Output block stride for the rearranged format
+        output_block_stride = BLOCK_ROWS * BLOCK_COLS * (padded_cols // BLOCK_COLS)
+
+        # Calculate grid dimensions
+        grid = lambda META: (
+            triton.cdiv(padded_rows, BLOCK_ROWS),
+            triton.cdiv(padded_cols, BLOCK_COLS),
+        )
+
+        # Launch kernel with added stride parameters
+        # TODO fix before land
+        # wrap_triton(scale_swizzle)[grid](
+        scale_swizzle[grid](
+            scale_tensor.view(torch.uint8),
+            rows,
+            cols,
+            out.view(torch.uint8),
+            input_row_stride,
+            output_block_stride,
+            BLOCK_ROWS=BLOCK_ROWS,
+            BLOCK_COLS=BLOCK_COLS,
+        )
+
+        return out
+
 else:
 
     def triton_to_mxfp8_dim1(
@@ -1394,3 +1521,6 @@ def triton_to_mxfp8_dim1_reference(
         x_hp: torch.Tensor, block_size
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         raise AssertionError("needs torch version 2.8+ and triton")
+
+    def mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
+        raise AssertionError("needs torch version 2.8+ and triton")
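
A quick cross-check of the dest_indices arithmetic in scale_swizzle (not part of the commit; the view/transpose formulation below is used only as an independent reference): for one 128x4 tile, r_mod_32 * 16 + r_div_32 * 4 + col should agree with where a view(4, 32, 4) -> transpose(0, 1) -> reshape(32, 16) rearrangement sends each element.

import torch

r = torch.arange(128)[:, None]
c = torch.arange(4)[None, :]

# Kernel's formula: flat offset of element (r, c) inside the (32, 16) output tile
dest_kernel = (r % 32) * 16 + (r // 32) * 4 + c

# Independent reference: track where each source element lands after the
# view/transpose/reshape rearrangement, then invert that permutation
src = torch.arange(128 * 4).reshape(128, 4)
blocked = src.view(4, 32, 4).transpose(0, 1).reshape(32, 16)
dest_ref = torch.empty(128 * 4, dtype=torch.long)
dest_ref[blocked.reshape(-1)] = torch.arange(32 * 16)

assert torch.equal(dest_kernel.reshape(-1), dest_ref)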

torchao/prototype/mx_formats/utils.py

Lines changed: 8 additions & 2 deletions
@@ -6,14 +6,16 @@
 
 import torch
 
+from torchao.prototype.mx_formats.custom_cast import mx_block_rearrange
+
 Tensor = torch.Tensor
 
 
 def ceil_div(a, b):
     return (a + b - 1) // b
 
 
-def to_blocked(input_matrix) -> Tensor:
+def to_blocked(input_matrix, swizzle_kernel: bool = False) -> Tensor:
     """
     Rearrange a large matrix by breaking it into blocks and applying the rearrangement pattern.
 
@@ -26,6 +28,9 @@ def to_blocked(input_matrix) -> Tensor:
     Returns:
         Rearranged tensor of shape (32*ceil_div(H,128), 16*ceil_div(W,4))
     """
+    if swizzle_kernel:
+        return mx_block_rearrange(input_matrix).flatten()
+
     rows, cols = input_matrix.shape
     n_row_blocks = ceil_div(rows, 128)
     n_col_blocks = ceil_div(cols, 4)
@@ -36,7 +41,8 @@ def to_blocked(input_matrix) -> Tensor:
 
     padded = input_matrix
     # TODO This is to work around VLLM's usage of compile w/ dynamic shapes
-    if torch.compiler.is_compiling() or (rows, cols) != (padded_rows, padded_cols):
+    # if torch.compiler.is_compiling() or (rows, cols) != (padded_rows, padded_cols):
+    if (rows, cols) != (padded_rows, padded_cols):
         padded = torch.zeros(
             (padded_rows, padded_cols),
             device=input_matrix.device,
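
Minimal usage sketch of the new flag, mirroring the test above (assumes a CUDA device with a recent enough torch and triton; only the (128, 1) shape is exercised by the commit's test):

import torch
from torchao.prototype.mx_formats.utils import to_blocked

scales = torch.randint(256, size=(128, 1), device="cuda", dtype=torch.uint8)

eager = to_blocked(scales)                          # existing PyTorch rearrangement
swizzled = to_blocked(scales, swizzle_kernel=True)  # new Triton kernel path
torch.testing.assert_close(eager, swizzled, atol=0, rtol=0)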
