Skip to content

Commit 1e12940

Browse files
committed
fix tests
Signed-off-by: Yaoyao Ding <dingyaoyao.cs@gmail.com>
1 parent a7e8e60 commit 1e12940

File tree

5 files changed

+32
-178
lines changed

5 files changed

+32
-178
lines changed

examples/blackwell_matmul/matmul_v3.py

Lines changed: 0 additions & 176 deletions
This file was deleted.

python/tilus/backends/codegen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def launch_kernel(self, kernel_func: HidetFunction) -> None:
148148
if kernel_func.kind == "cuda_kernel":
149149
func_var = Var(hint=None, type=FuncType.from_func(kernel_func), name=kernel_func.name)
150150
dynamic_shared_bytes = kernel_func.get_attr("cuda.dynamic_smem_bytes", int32(0))
151-
assert isinstance(dynamic_shared_bytes, Expr)
151+
assert isinstance(dynamic_shared_bytes, Expr | int)
152152

153153
# set max dynamic shared memory bytes if needed
154154
with self.host_builder.if_then(dynamic_shared_bytes > 48 * 1024):
@@ -163,7 +163,7 @@ def launch_kernel(self, kernel_func: HidetFunction) -> None:
163163
grid_dim=normalize_dim3(kernel_func.get_attr("cuda.grid_dim")), # type: ignore
164164
cluster_dim=normalize_dim3(kernel_func.get_attr("cuda.cluster_dim", default=1)), # type: ignore
165165
block_dim=normalize_dim3(kernel_func.get_attr("cuda.block_dim")), # type: ignore
166-
shared_mem=dynamic_shared_bytes,
166+
shared_mem=int32(dynamic_shared_bytes),
167167
target="cuda",
168168
)
169169
)

python/tilus/backends/contexts/tcgen05_ctx.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from typing import Optional
1818

19+
from tilus.ir.tensor import TMemoryTensor
1920
from tilus.backends.context import BaseEmitContext
2021

2122

@@ -28,12 +29,23 @@ def __post_init__(self):
2829
# thus, we only ask the user to give cta_group in the tcgen05.alloc instruction and we track it here
2930
# for other tcgen05 instructions to use
3031
self.cta_group: Optional[int] = None
32+
self.allocated_tmemory_tensors: set[TMemoryTensor] = set()
3133

3234
@staticmethod
3335
def current() -> Tcgen05EmitContext:
3436
if Tcgen05EmitContext._current is None:
3537
raise RuntimeError("No active Tcgen05EmitContext found.")
3638
return Tcgen05EmitContext._current
39+
40+
def mark_tmemory_tensor_allocate(self, tensor: TMemoryTensor) -> None:
41+
if tensor in self.allocated_tmemory_tensors:
42+
raise ValueError(f"TMemory tensor {tensor} has already been allocated.")
43+
self.allocated_tmemory_tensors.add(tensor)
44+
45+
def mark_tmemory_tensor_deallocate(self, tensor: TMemoryTensor) -> None:
46+
if tensor not in self.allocated_tmemory_tensors:
47+
raise ValueError(f"TMemory tensor {tensor} has not been allocated yet.")
48+
self.allocated_tmemory_tensors.remove(tensor)
3749

3850
def set_cta_group(self, cta_group: int) -> None:
3951
assert cta_group in (1, 2)
@@ -52,3 +64,14 @@ def get_cta_group(self) -> int:
5264
"before any other tcgen05 instructions."
5365
)
5466
return self.cta_group
67+
68+
def finalize(self):
69+
# check all TMemory tensors are deallocated
70+
if len(self.allocated_tmemory_tensors) > 0:
71+
rows = []
72+
for tensor in self.allocated_tmemory_tensors:
73+
rows.append(f" - {tensor}")
74+
raise ValueError(
75+
"The following TMemory tensors are not deallocated before the end of the kernel:\n"
76+
+ "\n".join(rows)
77+
)

python/tilus/backends/emitters/cuda/tcgen05/allocation.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ def emit(self, inst: Tcgen05AllocInst) -> None:
8989
self.assign(tmem_var, cast(smem_ptr, ~int32)[0])
9090
self.sync()
9191

92+
# mark the tensor as allocated in the tcgen05 context to track allocations
93+
tcgen05_ctx.mark_tmemory_tensor_allocate(tmem_tensor)
94+
9295

9396
@register_emitter(Tcgen05DeallocInst, target=nvgpu_sm100)
9497
class Tcgen05DeallocEmitter(Tcgen05AllocDeallocEmitter):
@@ -108,6 +111,9 @@ def emit(self, inst: Tcgen05DeallocInst) -> None:
108111
cta_group=Tcgen05CtaGroupKind.from_int(tcgen05_ctx.get_cta_group()),
109112
)
110113
)
114+
115+
# mark the tensor as deallocated in the tcgen05 context to track allocations
116+
tcgen05_ctx.mark_tmemory_tensor_deallocate(tmem_tensor)
111117

112118

113119
@register_emitter(Tcgen05RelinquishAllocPermitInst, target=nvgpu_sm100)

tests/instructions/test_print_tensor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def __call__(self):
2424

2525
t_a = self.tcgen05.alloc(dtype=float32, shape=[128, 32])
2626
self.print_tensor("t_a: ", t_a)
27+
self.tcgen05.dealloc(t_a)
2728

2829

2930
@requires.nvgpu_sm100a

0 commit comments

Comments (0)