
Commit 33b7e09

wip
wip wip wip wip wip
1 parent 818bd30 commit 33b7e09


37 files changed: +459 -217 lines


examples/blackwell_matmul/matmul_v0.py

Lines changed: 1 addition & 3 deletions
@@ -70,9 +70,7 @@ def __call__(
        phase ^= 1

        # load the result from tensor memory to register
-        r_acc = self.tcgen05.load(
-            t_acc, offsets=[0, 0], shape=[self.block_m, self.block_n]
-        )
+        r_acc = self.tcgen05.load(t_acc)

        g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
        self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])

examples/blackwell_matmul/matmul_v1.py

Lines changed: 1 addition & 3 deletions
@@ -79,9 +79,7 @@ def __call__(
        phase ^= 1

        # load the result from tensor memory to register
-        r_acc = self.tcgen05.load(
-            t_acc, offsets=[0, 0], shape=[self.block_m, self.block_n]
-        )
+        r_acc = self.tcgen05.load(t_acc)

        g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
        self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])

examples/blackwell_matmul/matmul_v2.py

Lines changed: 1 addition & 3 deletions
@@ -108,9 +108,7 @@ def __call__(
        self.sync()

        # load the result from tensor memory to register
-        r_acc = self.tcgen05.load(
-            t_acc, offsets=[0, 0], shape=[self.block_m, self.block_n]
-        )
+        r_acc = self.tcgen05.load(t_acc)

        g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
        self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])

examples/blackwell_matmul/matmul_v3.py

Lines changed: 1 addition & 3 deletions
@@ -109,9 +109,7 @@ def __call__(
        self.sync()

        # load the result from tensor memory to register
-        r_acc = self.tcgen05.load(
-            t_acc, offsets=[0, 0], shape=[self.block_m, self.block_n]
-        )
+        r_acc = self.tcgen05.load(t_acc)

        g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
        self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])

examples/blackwell_matmul/matmul_v4.py

Lines changed: 1 addition & 3 deletions
@@ -210,9 +210,7 @@ def __call__(
        self.sync()

        # load the result from tensor memory to register
-        r_acc = self.tcgen05.load(
-            mma_worker.t_acc, offsets=[0, 0], shape=[self.block_m, self.block_n]
-        )
+        r_acc = self.tcgen05.load(mma_worker.t_acc)

        g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
        self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])
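
All five example kernels end with the same epilogue; for reference, a minimal sketch of that epilogue as it now stands (names such as t_acc, c_ptr, m_size/n_size and offset_m/offset_n come from surrounding kernel code that is not shown in this diff):

    # read the whole accumulator tile from tensor memory into registers
    r_acc = self.tcgen05.load(t_acc)
    # view the output matrix in global memory and store the tile as float16
    g_c = self.global_view(c_ptr, dtype=float16, shape=[m_size, n_size])
    self.store_global(g_c, r_acc.to(float16), offsets=[offset_m, offset_n])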

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -92,7 +92,9 @@ ignore = [
convention = "numpy"

[tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["F401"]
+"__init__.py" = [
+    "F401" # checks for unused imports.
+]
"examples/**/*.py" = [
    "D400",
    "D205", # 1 blank line required between summary line and description

python/tilus/backends/emitters/cuda/tcgen05/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from . import allocation, copy, ldst, mma, sync
+from . import alloc, copy, ldst, mma, slice, sync

python/tilus/backends/emitters/cuda/tcgen05/allocation.py renamed to python/tilus/backends/emitters/cuda/tcgen05/alloc.py

Lines changed: 20 additions & 27 deletions
@@ -30,25 +30,27 @@
    Tcgen05AllocInst,
    Tcgen05DeallocInst,
    Tcgen05RelinquishAllocPermitInst,
-    Tcgen05SliceInst,
    Tcgen05ViewInst,
)
from tilus.ir.tensor import TMemoryTensor
from tilus.target import nvgpu_sm100
-
-# tmem addr: 0xAAAABBBB where AAAA is the lane index and BBBB is the column index
-# lane index: 0x0000 to 0x007F
-# column index: 0x0000 to 0x01FF
-LANE_STRIDE = 0x00010000
-COLUMN_STRIDE = 0x00000001
+from tilus.utils import prod, same_list


class Tcgen05AllocDeallocEmitter(BaseInstEmitter):
    def get_num_columns(self, tmem_tensor: TMemoryTensor) -> int:
-        assert tmem_tensor.shape[0] == 128
-        assert tmem_tensor.shape[1] * tmem_tensor.dtype.nbits % 32 == 0
-        num_columns = tmem_tensor.shape[1] * tmem_tensor.dtype.nbits // 32
-        assert num_columns % 32 == 0 and 32 <= num_columns <= 512, num_columns
+        shape = tmem_tensor.shape
+        if shape[-2] != 128:
+            raise NotImplementedError(f"The emitter currently only supports shape[-2] == 128, but got {shape[-2]}")
+        if shape[-1] * tmem_tensor.dtype.nbits % 32 != 0:
+            raise ValueError(
+                f"shape[-1] * dtype.nbits must be divisible by 32, but got {shape[-1]} * {tmem_tensor.dtype.nbits} = {shape[-1] * tmem_tensor.dtype.nbits}"
+            )
+        num_columns = prod(shape[:-2]) * shape[-1] * tmem_tensor.dtype.nbits // 32
+        if not (num_columns % 32 == 0 and 32 <= num_columns <= 512):
+            raise ValueError(
+                f"The number of 32-bit columns must be a multiple of 32 and in range [32, 512], but got {num_columns}"
+            )
        return num_columns


@@ -122,32 +124,23 @@ def emit(self, inst: Tcgen05RelinquishAllocPermitInst) -> None:
        self.append(tcgen05_relinquish_alloc_permit(Tcgen05CtaGroupKind.from_int(inst.cta_group)))


-@register_emitter(Tcgen05SliceInst, target=nvgpu_sm100)
-class TMemorySliceEmitter(BaseInstEmitter):
-    def emit(self, inst: Tcgen05SliceInst) -> None:
-        tmem_tensor = inst.inputs[0].as_tmemory_tensor()
-        output_tmem_tensor = inst.tmemory_output
-        tmem_addr = self.get_or_allocate_var(tmem_tensor)
-
-        sliced_addr = self.get_or_allocate_var(output_tmem_tensor, name="tmem_slice")
-        self.assign(
-            sliced_addr,
-            tmem_addr + inst.offsets[0] * LANE_STRIDE + inst.offsets[1] * COLUMN_STRIDE * tmem_tensor.dtype.nbits // 32,
-        )
-
-
@register_emitter(Tcgen05ViewInst, target=nvgpu_sm100)
class TMemoryViewEmitter(BaseInstEmitter):
    def emit(self, inst: Tcgen05ViewInst) -> None:
        tmem_tensor = inst.inputs[0].as_tmemory_tensor()
        output_tmem_tensor = inst.tmemory_output

        if (
-            tmem_tensor.dtype.nbits * tmem_tensor.shape[1]
-            != output_tmem_tensor.dtype.nbits * output_tmem_tensor.shape[1]
+            tmem_tensor.dtype.nbits * tmem_tensor.shape[-1]
+            != output_tmem_tensor.dtype.nbits * output_tmem_tensor.shape[-1]
        ):
            raise ValueError("The total number of bits must be the same as the original tensor.")

+        if not same_list(tmem_tensor.layout.column_strides[:-2], output_tmem_tensor.layout.column_strides[:-2]):
+            raise ValueError(
+                "The column strides of the leading dimensions (all dimensions except the last two ones) must be the same as the original tensor."
+            )
+
        tmem_addr = self.get_or_allocate_var(tmem_tensor)
        view_addr = self.get_or_allocate_var(output_tmem_tensor, name="tmem_view")
        self.assign(view_addr, tmem_addr)
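
For reference, the address arithmetic behind these emitters can be reproduced in a few lines of plain Python. The constants and formulas are taken from the code shown above; the helper names below are illustrative only and are not part of tilus:

    # tmem addresses encode lane/column as 0xAAAABBBB:
    #   AAAA = lane index (0x0000..0x007F), BBBB = column index (0x0000..0x01FF)
    LANE_STRIDE = 0x00010000
    COLUMN_STRIDE = 0x00000001

    def num_columns(shape: list[int], dtype_nbits: int) -> int:
        """Number of 32-bit tmem columns a tensor occupies, mirroring get_num_columns above."""
        assert shape[-2] == 128 and shape[-1] * dtype_nbits % 32 == 0
        cols = 1
        for dim in shape[:-2]:
            cols *= dim
        cols = cols * shape[-1] * dtype_nbits // 32
        assert cols % 32 == 0 and 32 <= cols <= 512
        return cols

    def slice_addr(base: int, lane_ofs: int, col_ofs: int, dtype_nbits: int) -> int:
        """Address of a [lane_ofs, col_ofs] slice, as computed by the removed TMemorySliceEmitter."""
        return base + lane_ofs * LANE_STRIDE + col_ofs * COLUMN_STRIDE * dtype_nbits // 32

    # e.g. a float32 tensor of shape [128, 64] occupies 64 columns, and the slice
    # starting at lane 64, element offset 32 along the last dimension sits at base + 0x00400020.
    assert num_columns([128, 64], 32) == 64
    assert slice_addr(0x0, 64, 32, 32) == 0x00400020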

python/tilus/backends/emitters/cuda/tcgen05/copy.py

Lines changed: 6 additions & 3 deletions
@@ -22,9 +22,10 @@
from hidet.ir.expr import Expr

from tilus.backends.emitter import BaseInstEmitter, register_emitter
-from tilus.backends.emitters.cuda.tcgen05.allocation import COLUMN_STRIDE, LANE_STRIDE
from tilus.backends.emitters.cuda.tcgen05.smem_desc import SharedMatrixDescriptor
from tilus.extensions.hidet.ir.primitives.cuda.tcgen05 import (
+    COLUMN_STRIDE,
+    LANE_STRIDE,
    Tcgen05CopyMulticastKind,
    Tcgen05CopyShapeKind,
    Tcgen05CtaGroupKind,
@@ -200,10 +201,12 @@ def emit(self, inst: Tcgen05CopyInst) -> None:
        self.check_warp_group()

        if len(shared_tensor.shape) != 2:
-            raise ValueError("The shared tensor must be a 2D tensor")
+            raise ValueError("The shared tensor must be a 2D tensor, got shape {}".format(shared_tensor.shape))
+        if len(tmem_tensor.shape) != 2:
+            raise ValueError("The tensor memory tensor must be a 2D tensor, got shape {}".format(tmem_tensor.shape))
        if shared_tensor.shape[0] != 128:
            raise NotImplementedError("The number of rows in the shared tensor must be 128")
-        if tmem_tensor.first_lane != 0:
+        if tmem_tensor.layout.lane_offset != 0:
            raise NotImplementedError("The first lane of the tmem tensor must be 0")

        tmem_base_addr = self.tensor2var[tmem_tensor]

python/tilus/backends/emitters/cuda/tcgen05/ldst.py

Lines changed: 7 additions & 32 deletions
@@ -14,13 +14,14 @@
# limitations under the License.

from dataclasses import dataclass
-from typing import Sequence

from hidet.ir.dtypes import int32, uint32
from hidet.ir.expr import Expr, cast

from tilus.backends.emitter import BaseInstEmitter, register_emitter
from tilus.extensions.hidet.ir.primitives.cuda.tcgen05 import (
+    COLUMN_STRIDE,
+    LANE_STRIDE,
    Tcgen05LoadStoreNumKind,
    Tcgen05LoadStorePackKind,
    Tcgen05LoadStoreShapeKind,
@@ -41,12 +42,6 @@
from tilus.target import nvgpu_sm100
from tilus.utils import gcd

-# tmem addr: 0xAAAABBBB where AAAA is the lane index and BBBB is the column index
-# lane index: 0x0000 to 0x007F
-# column index: 0x0000 to 0x01FF
-LANE_STRIDE = 0x00010000
-COLUMN_STRIDE = 0x00000001
-

@dataclass
class LoadStoreWarpInst:
@@ -58,24 +53,6 @@ class LoadStoreWarpInst:


class TMemoryLoadStoreBaseEmitter(BaseInstEmitter):
-    def slice_tmem_tensor(
-        self, tmem_tensor: TMemoryTensor, offsets: Sequence[int], shape: Sequence[int]
-    ) -> tuple[TMemoryTensor, Expr]:
-        if any(not isinstance(ofs, int) for ofs in offsets):
-            raise ValueError("All offsets must be integer constants")
-        if len(offsets) != 2:
-            raise ValueError("The length of offsets must be 2")
-        if len(shape) != 2:
-            raise ValueError("The length of shape must be 2")
-        tmem_addr = self.get_or_allocate_var(tmem_tensor)
-        sliced_tmem_tensor = TMemoryTensor.create(
-            dtype=tmem_tensor.dtype, shape=shape, first_lane=tmem_tensor.first_lane + offsets[0]
-        )
-        sliced_tmem_addr = (
-            tmem_addr + offsets[0] * LANE_STRIDE + offsets[1] * COLUMN_STRIDE * tmem_tensor.dtype.nbits // 32
-        )
-        return sliced_tmem_tensor, sliced_tmem_addr
-
    def emit_tcgen05_inst(self, inst: LoadStoreWarpInst) -> None:
        raise NotImplementedError("Subclasses must implement this method")

@@ -87,11 +64,11 @@ def emit_tcgen05_instructions(
    ) -> None:
        if self.current_num_threads % 32 != 0:
            raise ValueError("The number of threads in the current thread group must be divisible by 32")
-        if self.current_thread_group_begin % 128 != tmem_tensor.first_lane:
+        if self.current_thread_group_begin % 128 != tmem_tensor.layout.lane_offset:
            raise ValueError(
                "Lane mismatch: the first lane of the tmem tensor must be the same as the thread group begin"
            )
-        if self.current_num_threads != tmem_tensor.shape[0]:
+        if self.current_num_threads != tmem_tensor.shape[-2]:
            raise ValueError(
                "The number of threads in the current thread group must be the same as the number of lanes in the tmem tensor"
            )
@@ -174,8 +151,7 @@ class TMemoryLoadEmitter(TMemoryLoadStoreBaseEmitter):
    def emit(self, inst: Tcgen05LoadInst) -> None:
        regs_tensor = inst.register_output
        tmem_tensor = inst.inputs[0].as_tmemory_tensor()
-        sliced_tmem_tensor, sliced_tmem_addr = self.slice_tmem_tensor(tmem_tensor, inst.offsets, regs_tensor.shape)
-        self.emit_tcgen05_instructions(regs_tensor, sliced_tmem_tensor, sliced_tmem_addr)
+        self.emit_tcgen05_instructions(regs_tensor, tmem_tensor, self.tensor2var[tmem_tensor])

    def emit_tcgen05_inst(self, inst: LoadStoreWarpInst) -> None:
        self.append(
@@ -195,11 +171,10 @@ def emit(self, inst: Tcgen05StoreInst) -> None:
        regs_tensor = inst.inputs[1].as_register_tensor()
        tmem_tensor = inst.inputs[0].as_tmemory_tensor()

-        sliced_tmem_tensor, sliced_tmem_addr = self.slice_tmem_tensor(tmem_tensor, inst.offsets, regs_tensor.shape)
        self.emit_tcgen05_instructions(
            regs_tensor,
-            sliced_tmem_tensor,
-            sliced_tmem_addr,
+            tmem_tensor,
+            self.tensor2var[tmem_tensor],
        )

    def emit_tcgen05_inst(self, inst: LoadStoreWarpInst) -> None:
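
As a plain-Python illustration of the preconditions enforced in emit_tcgen05_instructions above (the checker below is a standalone sketch that only restates the three conditions visible in the diff; it is not part of the emitter):

    def check_load_store_preconditions(
        num_threads: int, thread_group_begin: int, lane_offset: int, tmem_shape: list[int]
    ) -> None:
        # the thread group must consist of whole warps
        if num_threads % 32 != 0:
            raise ValueError("number of threads must be divisible by 32")
        # the group's starting lane (mod 128) must match the tensor's first lane
        if thread_group_begin % 128 != lane_offset:
            raise ValueError("first lane of the tmem tensor must equal the thread group begin")
        # one thread per lane: thread count must equal the lane dimension of the tensor
        if num_threads != tmem_shape[-2]:
            raise ValueError("number of threads must equal the number of lanes in the tmem tensor")

    # e.g. a 128-thread group starting at thread 0 can load a [128, 64] tmem tensor with lane offset 0
    check_load_store_preconditions(128, 0, 0, [128, 64])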
