Commit aa72a53

fix

Signed-off-by: Yaoyao Ding <dingyaoyao.cs@gmail.com>
1 parent 33c4ca5 commit aa72a53

5 files changed: +103 −82 lines changed


python/tilus/backends/emitters/cuda/tcgen05/copy.py

Lines changed: 81 additions & 67 deletions
@@ -15,30 +15,32 @@
 
 
 from __future__ import annotations
-from typing import Optional
+
 from dataclasses import dataclass
 
 from hidet.ir.dtypes import uint64
 from hidet.ir.expr import Expr
-from hidet.ir.primitives.debug import printf
 
 from tilus.backends.codegen import BaseInstEmitter, register_emitter
-from tilus.backends.emitters.cuda.tcgen05.allocation import COLUMN_STRIDE, ROW_STRIDE
+from tilus.backends.emitters.cuda.tcgen05.allocation import COLUMN_STRIDE, LANE_STRIDE
 from tilus.extensions.hidet.ir.primitives.cuda.tcgen05 import (
     Tcgen05CopyMulticastKind,
     Tcgen05CopyShapeKind,
     Tcgen05CtaGroupKind,
     tcgen05_copy,
     tcgen05_encode_smem_descriptor,
 )
+from tilus.extensions.hidet.ir.utils.index_transform import index_deserialize
 from tilus.ir.instructions.cuda.tmem import Tcgen05CopyInst
-from tilus.ir.layout.cuda.tcgen05_smem import CanonicalSharedLayout, canonicalize_shared_layout, Tcgen05SwizzleMode
+from tilus.ir.layout.cuda.tcgen05_smem import CanonicalSharedLayout, Tcgen05SwizzleMode, canonicalize_shared_layout
 from tilus.ir.tensor import SharedTensor, TMemoryTensor
 from tilus.target import nvgpu_sm100
 
+
 class GenerationFailedError(Exception):
     pass
 
+
 @dataclass
 class SharedMatrixDescriptor:
     start_addr: Expr | int
@@ -78,19 +80,24 @@ class Tcgen05CopyInstMeta:
     tmem_offset: int
     shared_descriptor: SharedMatrixDescriptor
 
+    def __str__(self) -> str:
+        items = []
+        for key, value in self.__dict__.items():
+            items.append(f"{key}: {value}")
+        return "Tcgen05CopyInstMeta(" + ",\n ".join(items) + "\n)"
+
 
 @register_emitter(Tcgen05CopyInst, target=nvgpu_sm100)
 class Tcgen05CopyEmitter(BaseInstEmitter):
     def split_canonical_layout(
-        self,
-        canonical: CanonicalSharedLayout,
-        shape_kind: Tcgen05CopyShapeKind
-    ) -> Optional[list[tuple[int, SharedMatrixDescriptor]]]:
+        self, smem_addr: Expr, canonical: CanonicalSharedLayout, shape_kind: Tcgen05CopyShapeKind
+    ) -> list[Tcgen05CopyInstMeta]:
         """
         A shared memory tensor might be very large that we need to split it into multiple sub-tensors and
         each sub-tensor is copied by a tcgen05.copy instruction. The smem_addr in returned SharedMatrixDescriptor
         is the offset of the sub-tensor relative to the shared memory tensor in bytes.
 
+        Each tcgen05.copy instruction copies a sub-tensor with the following layout:
         +----------------+--------------------------+--------------------------------------+-------------------------------------+
         | Major-ness     | Swizzling mode           | Canonical Layout without swizzling   | Swizzling on the previous column    |
         +================+==========================+======================================+=====================================+
@@ -118,58 +125,101 @@ def split_canonical_layout(
         - k represents the number of repeating patterns across columns.
         (The table is is from: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-canonical-layouts.)
 
+        The definition of the canonical layout in Tilus is similar to above table, but it's different since we want to represent the layouts
+        in a more natural and extensible way for larger tensors. See the docstring of CanonicalSharedLayout for more details.
+
         Returns
        -------
         ret: Optional[list[tuple[int, SharedMatrixDescriptor]]]
             The list of instructions, each instruction contains the tmem_offset and shared matrix descriptor for each sub-tensor.
         """
         cute_layout = canonical.swizzled_cute_layout.layout
-        m, n = cute_layout.shape
+        m, n = cute_layout.flattened_shape
 
         if shape_kind.n % canonical.dtype_nbits != 0:
-            raise GenerationFailedError("The number of columns in the shape kind must be divisible by the number of bits in the data type")
+            raise GenerationFailedError(
+                "The number of columns in the shape kind must be divisible by the number of bits in the data type"
+            )
 
         inst_m, inst_n = shape_kind.m, shape_kind.n // canonical.dtype_nbits
 
         if m % inst_m != 0 or n % inst_n != 0:
-            raise GenerationFailedError("The number of rows or columns in the shape kind must be divisible by the number of rows or columns in the canonical layout")
+            raise GenerationFailedError(
+                "The number of rows or columns in the shape kind must be divisible by the number of rows or columns in the canonical layout"
+            )
+        if canonical.major_kind == "MN" and (inst_m % (canonical.T * canonical.S) != 0 or inst_n % 8 != 0):
+            raise GenerationFailedError(
+                "The number of rows or columns in the shape kind must be divisible by the number of rows or columns in the canonical layout"
+            )
+        if canonical.major_kind == "K" and (inst_m % 8 != 0 or inst_n % (canonical.T * 2) != 0):
+            raise GenerationFailedError(
+                "The number of rows or columns in the shape kind must be divisible by the number of rows or columns in the canonical layout"
+            )
 
         num_m, num_n = m // inst_m, n // inst_n
         nbytes = canonical.dtype_nbits // 8
 
+        instructions: list[Tcgen05CopyInstMeta] = []
         for i in range(num_m):
             for j in range(num_n):
-                tmem_offset = i * ROW_STRIDE + j * COLUMN_STRIDE
+                tmem_offset = i * inst_m * LANE_STRIDE + j * inst_n * COLUMN_STRIDE
                 if canonical.major_kind == "MN":
-                    assert inst_m % (canonical.T * canonical.S) == 0 and inst_n % 8 == 0
+                    if canonical.swizzle_mode == Tcgen05SwizzleMode.NO_SWIZZLE:
+                        smem_offset = (
+                            i * inst_m // (canonical.T * canonical.S) * canonical.SBO + j * inst_n // 8 * canonical.LBO
+                        ) * nbytes
+                    else:
+                        smem_offset = (
+                            i * inst_m // (canonical.T * canonical.S) * canonical.LBO + j * inst_n // 8 * canonical.SBO
+                        ) * nbytes
                     s_desc = SharedMatrixDescriptor(
-                        start_addr=(i * inst_m * canonical.SBO + j * inst_n * canonical.LBO) * nbytes,
+                        start_addr=smem_addr + smem_offset,
                         lbo=canonical.LBO * nbytes,
                         sbo=canonical.SBO * nbytes,
                         base_offset=0,
-                        stride_mode=0,
+                        stride_mode=0,
                         swizzle_mode=canonical.swizzle_mode.encode(),
                     )
                 elif canonical.major_kind == "K":
-                    assert inst_m % 8 == 0 and inst_n % (canonical.T * canonical.S) == 0
                     if canonical.swizzle_mode == Tcgen05SwizzleMode.NO_SWIZZLE:
-                        s_desc = SharedMatrixDescriptor(
-                            start_addr=(i * inst_m * canonical.SBO + j * inst_n * canonical.LBO) * nbytes,
-                            lbo=canonical.LBO * nbytes,
-                            sbo=canonical.SBO * nbytes,
-                            base_offset=0,
-                            stride_mode=0,
-                            swizzle_mode=canonical.swizzle_mode.encode(),
-                        )
+                        smem_offset = (
+                            i * inst_m // 8 * canonical.SBO + j * inst_n // (canonical.T * canonical.S) * canonical.LBO
+                        ) * nbytes
+                        lbo = canonical.LBO * nbytes
                     else:
-                        pass
+                        # j0, j1, j2 for shape (T, S, k)
+                        _, j1, j2 = index_deserialize(
+                            j * inst_n,
+                            (canonical.T, canonical.S, canonical.k // (canonical.T * canonical.S)),
+                            ranks=[2, 1, 0],
+                        )
+                        smem_offset = (i * inst_m // 8 * canonical.SBO + j1 * canonical.T + j2 * canonical.LBO) * nbytes
+                        lbo = 1 << 4  # assume lbo be 16 so that lbo >> 4 == 1, as required by the documentation
+                    s_desc = SharedMatrixDescriptor(
+                        start_addr=smem_addr + smem_offset,
+                        lbo=lbo,
+                        sbo=canonical.SBO * nbytes,
+                        base_offset=0,
+                        stride_mode=0,
+                        swizzle_mode=canonical.swizzle_mode.encode(),
+                    )
+
+                instructions.append(
+                    Tcgen05CopyInstMeta(
+                        shape_kind=shape_kind,
+                        multicast=Tcgen05CopyMulticastKind.NONE,
+                        cta_group=Tcgen05CtaGroupKind.CTA_1,
+                        tmem_offset=tmem_offset,
+                        shared_descriptor=s_desc,
+                    )
+                )
 
+        return instructions
 
     def generate_instructions(
         self, tmem_tensor: TMemoryTensor, shared_tensor: SharedTensor
     ) -> list[Tcgen05CopyInstMeta]:
         dtype = shared_tensor.dtype
-        shape = shared_tensor.shape
         canonical_layout: CanonicalSharedLayout | None = canonicalize_shared_layout(
             shared_tensor.layout, tmem_tensor.dtype
         )
@@ -180,52 +230,18 @@ def generate_instructions(
             f" shared_layout: {shared_tensor.layout}",
         ]
         raise ValueError("\n".join(msg))
-        print(f"canonical_layout: {canonical_layout}")
-        print(f"canonical_layout.swizzled_cute_layout: {canonical_layout.swizzled_cute_layout}")
-        print(f"canonical_layout.atom_shape: {canonical_layout.atom_shape}")
-        print(f"canonical_layout.atom_strides: {canonical_layout.atom_strides}")
         smem_addr = self.shared_tensor_shared_space_addr[shared_tensor]
-        ret = []
+
         for shape_kind in [
             Tcgen05CopyShapeKind.R128x256B,
             Tcgen05CopyShapeKind.R128x128B,
         ]:
-            column_bits = shape_kind.as_int_tuple()[1]
-            assert column_bits % dtype.nbits == 0
-            column_elements = column_bits // dtype.nbits
-            if shape[1] % column_elements != 0:
-                continue
-            if shape[0] != 128:
+            try:
+                return self.split_canonical_layout(smem_addr, canonical_layout, shape_kind)
+            except GenerationFailedError:
                 continue
-            num_inst_columns = shape[1] // column_elements
-            for inst_column in range(num_inst_columns):
-                tmem_offset = inst_column * (column_bits // 32 * COLUMN_STRIDE)
-                smem_offset = inst_column * (
-                    column_elements // canonical_layout.atom_shape[1] * canonical_layout.atom_strides[1] * dtype.nbytes
-                )
 
-                shared_descriptor = SharedMatrixDescriptor(
-                    start_addr=(smem_addr + smem_offset),
-                    lbo=(canonical_layout.LBO * dtype.nbytes),
-                    sbo=(canonical_layout.SBO * dtype.nbytes),
-                    base_offset=0,
-                    stride_mode=0,  # 0 for relative mode and 1 for absolute mode
-                    swizzle_mode=canonical_layout.swizzle_mode.encode(),
-                )
-                print(f"shared_descriptor: {shared_descriptor}")
-
-                inst_meta = Tcgen05CopyInstMeta(
-                    shape_kind=shape_kind,
-                    multicast=Tcgen05CopyMulticastKind.NONE,
-                    cta_group=Tcgen05CtaGroupKind.CTA_1,
-                    tmem_offset=tmem_offset,
-                    shared_descriptor=shared_descriptor,
-                )
-                ret.append(inst_meta)
-            break
-        else:
-            raise ValueError("No valid instructions generated")
-        return ret
+        raise ValueError("No valid instructions generated")
 
     def check_warp_group(self) -> None:
         begin = self.current_thread_group_begin
@@ -254,8 +270,6 @@ def emit(self, inst: Tcgen05CopyInst) -> None:
             s_desc = self.declare_var("s_desc", tp=uint64, init=inst_meta.shared_descriptor.encoded())
             t_addr = tmem_base_addr + inst_meta.tmem_offset
 
-            self.append(printf("taddr: %#08x, sdesc: %#016lx\n", t_addr, s_desc))
-
            self.append(
                 tcgen05_copy(
                     taddr=t_addr,

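To make the new split_canonical_layout flow easier to follow, here is a minimal standalone sketch of the sub-tensor enumeration it performs. The LANE_STRIDE and COLUMN_STRIDE values below are illustrative assumptions only, not the actual constants from tilus.backends.emitters.cuda.tcgen05.allocation; the instruction-shape arithmetic follows the inst_n = shape_kind.n // dtype_nbits relation shown in the diff.

# A minimal sketch of how the emitter tiles an (m, n) canonical layout into
# inst_m x inst_n sub-tensors and assigns each a tensor-memory offset.
# LANE_STRIDE and COLUMN_STRIDE are assumed values for illustration only.
LANE_STRIDE = 1 << 16   # assumed stride between tensor-memory lanes
COLUMN_STRIDE = 1       # assumed stride between tensor-memory columns

def enumerate_sub_tensors(m: int, n: int, inst_m: int, inst_n: int) -> list[tuple[int, int, int]]:
    assert m % inst_m == 0 and n % inst_n == 0
    tiles = []
    for i in range(m // inst_m):
        for j in range(n // inst_n):
            # same arithmetic as the new tmem_offset computation above
            tmem_offset = i * inst_m * LANE_STRIDE + j * inst_n * COLUMN_STRIDE
            tiles.append((i, j, tmem_offset))
    return tiles

# e.g. a 128x16 tile of 32-bit elements copied with the 128x128B shape kind
# (inst_m = 128, inst_n = 128 bits / 32 bits = 4 columns) needs 4 instructions
print(enumerate_sub_tensors(128, 16, 128, 4))
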
python/tilus/extensions/hidet/ir/primitives/cuda/tcgen05.py

Lines changed: 1 addition & 2 deletions
@@ -138,7 +138,7 @@ def as_int_tuple(self) -> tuple[int, int]:
             Tcgen05CopyShapeKind.R4x128B: (4, 128),
         }
         return table[self]
-
+
     @property
     def n(self) -> int:
         return self.as_int_tuple()[1]
@@ -148,7 +148,6 @@ def m(self) -> int:
         return self.as_int_tuple()[0]
 
 
-
 class Tcgen05CopyMulticastKind(Enum):
     NONE = ""
     WARP_X2_02_13 = ".warpx2_02_13"

python/tilus/ir/layout/cuda/tcgen05_smem.py

Lines changed: 10 additions & 8 deletions
@@ -10,7 +10,7 @@
 from hidet.utils.py import prod
 
 from tilus.ir.layout.shared_layout import SharedLayout
-from tilus.ir.layout.utils.cute import CuteLayout, CuteSwizzle, cute_layout, SwizzledCuteLayout, tuple_product
+from tilus.ir.layout.utils.cute import CuteLayout, CuteSwizzle, IntTuple, SwizzledCuteLayout, cute_layout, tuple_product
 from tilus.ir.utils.veceval import meshgrid, vectorized_evaluate
 from tilus.utils import floor_log2
 
@@ -31,19 +31,19 @@ def encode(self) -> int:
             Tcgen05SwizzleMode.B64_SWIZZLE: 4,
             Tcgen05SwizzleMode.B128_SWIZZLE: 2,
         }[self]
-
+
     @property
     def bbits(self) -> int:
         return self.value[0]
-
+
     @property
     def mbase(self) -> int:
         return self.value[1]
-
+
     @property
     def sshift(self) -> int:
         return self.value[2]
-
+
     def as_cute_swizzle(self) -> CuteSwizzle:
         bbits, mbase, sshift = self.value
         return CuteSwizzle(bbits=bbits, mbase=mbase, sshift=sshift)
@@ -94,19 +94,21 @@ def __post_init__(self):
         atom_size = 2**self.swizzle_mode.bbits * 8 * self.T
         if (self.m > 1 and self.SBO % atom_size != 0) or (self.k > 1 and self.LBO % atom_size != 0):
             raise ValueError(f"SBO {self.SBO} and LBO {self.LBO} must be divisible by atom size: {atom_size}")
-
+
     @property
     def S(self) -> int:
         return 2**self.swizzle_mode.bbits
-
+
     @property
     def dtype_nbits(self) -> int:
         return 128 // self.T
 
     @property
     def swizzled_cute_layout(self) -> SwizzledCuteLayout:
+        shape: IntTuple
+        strides: IntTuple
         if self.major_kind == "MN":
-            shape = ((self.T, S, self.m), (8, self.k))
+            shape = ((self.T, self.S, self.m), (8, self.k))
             if self.swizzle_mode == Tcgen05SwizzleMode.NO_SWIZZLE:
                 strides = ((1, self.T, self.SBO), (self.T, self.LBO))
             else:

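The shape/strides pair built in swizzled_cute_layout can be read as a nested affine layout: an offset is the dot product of nested coordinates with nested strides, which is how CuteLayout.__call__ evaluates coordinates (tuple_sum of tuple_multiply). A small standalone sketch with made-up T, S, m, k, SBO, LBO values:

# Sketch of the MN-major, non-swizzled canonical layout as a plain
# coordinate -> offset mapping. All numeric values are made-up examples.
T, S, m, k = 8, 1, 2, 4
SBO, LBO = 64, 8

shape = ((T, S, m), (8, k))        # nested row mode and column mode, shown for reference
strides = ((1, T, SBO), (T, LBO))  # as built for NO_SWIZZLE in the hunk above

def offset(coords, strides):
    # dot product over the nested (row, column) coordinate tuples
    return sum(c * s for c_mode, s_mode in zip(coords, strides) for c, s in zip(c_mode, s_mode))

# row coordinate (t=3, s=0, i=1), column coordinate (c=5, j=2)
print(offset(((3, 0, 1), (5, 2)), strides))  # 3*1 + 0*8 + 1*64 + 5*8 + 2*8 = 123
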
python/tilus/ir/layout/utils/cute.py

Lines changed: 11 additions & 3 deletions
@@ -67,6 +67,13 @@ def __call__(self, *coords: IntTuple) -> Int:
         ret = tuple_sum(tuple_multiply(coords, self.strides))
         return ret
 
+    @property
+    def flattened_shape(self) -> tuple[Int, ...]:
+        if not isinstance(self.shape, Sequence):
+            return (self.shape,)
+        else:
+            return tuple(tuple_product(item) for item in self.shape)
+
 
 class CuteSwizzle:
     def __init__(self, bbits: int, mbase: int, sshift: int):
@@ -86,14 +93,15 @@ def __call__(self, offset: Int) -> Int:
         y_mask = ((1 << self.bbits) - 1) << (self.mbase + self.sshift)
         return offset ^ ((offset & y_mask) >> self.sshift)
 
+
 class SwizzledCuteLayout:
     def __init__(self, layout: CuteLayout, swizzle: CuteSwizzle):
         self.layout: CuteLayout = layout
         self.swizzle: CuteSwizzle = swizzle
-
+
     def __str__(self) -> str:
-        return str(self.swizzle) + '' + str(self.layout)
-
+        return str(self.swizzle) + "" + str(self.layout)
+
     def __call__(self, *coords: IntTuple) -> Int:
         return self.swizzle(self.layout(*coords))
 
tests/instructions/test_tcgen05_copy.py
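Two small, self-contained examples of the behavior touched in cute.py: the new flattened_shape property collapses each top-level mode to the product of its sizes, and CuteSwizzle XORs selected offset bits. The numbers are illustrative only, and the sketch uses math.prod in place of tuple_product.

from math import prod

def flattened_shape(shape):
    # ((T, S, m), (8, k)) -> (T*S*m, 8*k); a bare int becomes a 1-tuple
    if not isinstance(shape, tuple):
        return (shape,)
    return tuple(prod(mode) if isinstance(mode, tuple) else mode for mode in shape)

def swizzle(offset, bbits, mbase, sshift):
    # same XOR formula as CuteSwizzle.__call__ in the hunk above
    y_mask = ((1 << bbits) - 1) << (mbase + sshift)
    return offset ^ ((offset & y_mask) >> sshift)

print(flattened_shape(((8, 2, 4), (8, 3))))           # (64, 24)
print(swizzle(0b1101000, bbits=2, mbase=2, sshift=3)) # 104 -> 100
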

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,6 @@ def __call__(self, m_size: int, n_size: int, x_ptr: ~int32, y_ptr: ~int32):
8080
def test_tcgen05_copy(major_kind, swizzle_mode):
8181
if major_kind == "MN":
8282
pytest.xfail("MN is not supported")
83-
if major_kind == "K" and swizzle_mode in [Tcgen05SwizzleMode.B64_SWIZZLE, Tcgen05SwizzleMode.B128_SWIZZLE]:
84-
pytest.xfail("K with swizzle mode B64 and B128 is not supported")
8583
m_size = 128
8684
n_size = 32
8785
x = torch.randint(0, 128, [m_size, n_size], dtype=torch.int32, device="cuda")
