Skip to content

Commit 1e12940

Browse files
committed
fix tests
Signed-off-by: Yaoyao Ding <dingyaoyao.cs@gmail.com>
1 parent a7e8e60 commit 1e12940

File tree

5 files changed

+32
-178
lines changed

5 files changed

+32
-178
lines changed

examples/blackwell_matmul/matmul_v3.py

Lines changed: 0 additions & 176 deletions
This file was deleted.

python/tilus/backends/codegen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def launch_kernel(self, kernel_func: HidetFunction) -> None:
148148
if kernel_func.kind == "cuda_kernel":
149149
func_var = Var(hint=None, type=FuncType.from_func(kernel_func), name=kernel_func.name)
150150
dynamic_shared_bytes = kernel_func.get_attr("cuda.dynamic_smem_bytes", int32(0))
151-
assert isinstance(dynamic_shared_bytes, Expr)
151+
assert isinstance(dynamic_shared_bytes, Expr | int)
152152

153153
# set max dynamic shared memory bytes if needed
154154
with self.host_builder.if_then(dynamic_shared_bytes > 48 * 1024):
@@ -163,7 +163,7 @@ def launch_kernel(self, kernel_func: HidetFunction) -> None:
163163
grid_dim=normalize_dim3(kernel_func.get_attr("cuda.grid_dim")), # type: ignore
164164
cluster_dim=normalize_dim3(kernel_func.get_attr("cuda.cluster_dim", default=1)), # type: ignore
165165
block_dim=normalize_dim3(kernel_func.get_attr("cuda.block_dim")), # type: ignore
166-
shared_mem=dynamic_shared_bytes,
166+
shared_mem=int32(dynamic_shared_bytes),
167167
target="cuda",
168168
)
169169
)

python/tilus/backends/contexts/tcgen05_ctx.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from typing import Optional
1818

19+
from tilus.ir.tensor import TMemoryTensor
1920
from tilus.backends.context import BaseEmitContext
2021

2122

@@ -28,12 +29,23 @@ def __post_init__(self):
2829
# thus, we only ask the user to give cta_group in the tcgen05.alloc instruction and we track it here
2930
# for other tcgen05 instructions to use
3031
self.cta_group: Optional[int] = None
32+
self.allocated_tmemory_tensors: set[TMemoryTensor] = set()
3133

3234
@staticmethod
3335
def current() -> Tcgen05EmitContext:
3436
if Tcgen05EmitContext._current is None:
3537
raise RuntimeError("No active Tcgen05EmitContext found.")
3638
return Tcgen05EmitContext._current
39+
40+
def mark_tmemory_tensor_allocate(self, tensor: TMemoryTensor) -> None:
41+
if tensor in self.allocated_tmemory_tensors:
42+
raise ValueError(f"TMemory tensor {tensor} has already been allocated.")
43+
self.allocated_tmemory_tensors.add(tensor)
44+
45+
def mark_tmemory_tensor_deallocate(self, tensor: TMemoryTensor) -> None:
46+
if tensor not in self.allocated_tmemory_tensors:
47+
raise ValueError(f"TMemory tensor {tensor} has not been allocated yet.")
48+
self.allocated_tmemory_tensors.remove(tensor)
3749

3850
def set_cta_group(self, cta_group: int) -> None:
3951
assert cta_group in (1, 2)
@@ -52,3 +64,14 @@ def get_cta_group(self) -> int:
5264
"before any other tcgen05 instructions."
5365
)
5466
return self.cta_group
67+
68+
def finalize(self):
69+
# check all TMemory tensors are deallocated
70+
if len(self.allocated_tmemory_tensors) > 0:
71+
rows = []
72+
for tensor in self.allocated_tmemory_tensors:
73+
rows.append(f" - {tensor}")
74+
raise ValueError(
75+
"The following TMemory tensors are not deallocated before the end of the kernel:\n"
76+
+ "\n".join(rows)
77+
)

python/tilus/backends/emitters/cuda/tcgen05/allocation.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ def emit(self, inst: Tcgen05AllocInst) -> None:
8989
self.assign(tmem_var, cast(smem_ptr, ~int32)[0])
9090
self.sync()
9191

92+
# mark the tensor as allocated in the tcgen05 context to track allocations
93+
tcgen05_ctx.mark_tmemory_tensor_allocate(tmem_tensor)
94+
9295

9396
@register_emitter(Tcgen05DeallocInst, target=nvgpu_sm100)
9497
class Tcgen05DeallocEmitter(Tcgen05AllocDeallocEmitter):
@@ -108,6 +111,9 @@ def emit(self, inst: Tcgen05DeallocInst) -> None:
108111
cta_group=Tcgen05CtaGroupKind.from_int(tcgen05_ctx.get_cta_group()),
109112
)
110113
)
114+
115+
# mark the tensor as deallocated in the tcgen05 context to track allocations
116+
tcgen05_ctx.mark_tmemory_tensor_deallocate(tmem_tensor)
111117

112118

113119
@register_emitter(Tcgen05RelinquishAllocPermitInst, target=nvgpu_sm100)

tests/instructions/test_print_tensor.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def __call__(self):
2424

2525
t_a = self.tcgen05.alloc(dtype=float32, shape=[128, 32])
2626
self.print_tensor("t_a: ", t_a)
27+
self.tcgen05.dealloc(t_a)
2728

2829

2930
@requires.nvgpu_sm100a

0 commit comments

Comments (0)