@@ -25,6 +25,57 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
2525
2626// -----
2727
28+ #blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
29+ #slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
30+
31+ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
32+   // If there is a cross-CTA read dependency at kernel exit, we must end with a cluster barrier.
33+   // CHECK-LABEL: @end_cluster_barrier_after_cross_reduce
34+   // CHECK: "tt.reduce"{{.*}}axis = 1
35+   // CHECK: ttng.cluster_arrive {relaxed = false}
36+   // CHECK-NEXT: ttng.cluster_wait
37+   // CHECK-NEXT: tt.return
38+   tt.func @end_cluster_barrier_after_cross_reduce(%arg0: tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1> {
39+     %red = "tt.reduce"(%arg0) ({
40+     ^bb0(%lhs: f16, %rhs: f16):
41+       %add = arith.addf %lhs, %rhs : f16
42+       tt.reduce.return %add : f16
43+     }) {axis = 1 : i32} : (tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1>
44+     tt.return %red : tensor<256xf16, #slice1>
45+   }
46+ }
47+
48+ // -----
49+
50+ #sharedA = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[1, 0]]}>
51+ #sharedB = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[0, 1]]}>
52+ #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1, CTASplitM = 2, twoCTAs = true>
53+ #smem = #ttg.shared_memory
54+
55+ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32, "ttng.two-ctas" = true, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
56+   // Negative test: in 2CTA kernels with non-zero tensor memory size, TMEM
57+   // teardown sync at kernel exit means we should not add an extra cluster barrier.
58+   // CHECK-LABEL: @no_end_cluster_barrier_for_mma_with_tmem_teardown
59+   // CHECK: ttng.tmem_alloc
60+   // CHECK: ttng.tc_gen5_mma
61+   // CHECK-NOT: ttng.cluster_arrive {relaxed = false}
62+   // CHECK-NOT: ttng.cluster_wait
63+   // CHECK: tt.return
64+   tt.func @no_end_cluster_barrier_for_mma_with_tmem_teardown() {
65+     %true = arith.constant true
66+     %a = ttg.local_alloc : () -> !ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>
67+     %b = ttg.local_alloc : () -> !ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>
68+     %acc = ttng.tmem_alloc : () -> !ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
69+     ttng.tc_gen5_mma %a, %b, %acc, %true, %true {two_ctas} :
70+       !ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>,
71+       !ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>,
72+       !ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
73+     tt.return
74+   }
75+ }
76+
77+ // -----
78+
2879#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
2980#slice0 = #ttg.slice<{dim = 0, parent = #blocked}>
3081#slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
@@ -388,16 +439,20 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
388439#smem = #ttg.shared_memory
389440
390441module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
391- // Wait included to model the end of the async lifetime. No extra cluster
392- // barriers should appear after the wait when reusing the same alloc.
442+ // NB. Testing only. Note that in this program async_tma_copy_global
443+ // and local_store are racing!
444+ // Even though we have a wait_barrier, we should still emit a cluster
445+ // barrier at the end of the kernel: in that wait, only one CTA waits on
446+ // behalf of both CTAs, so CTA1 could otherwise exit the kernel before
447+ // CTA0.
393448 // CHECK-LABEL: @no_cluster_when_same_allocation
394449 // CHECK: ttng.init_barrier
395450 // CHECK-NEXT: ttng.fence_mbarrier_init_release_cluster
396451 // CHECK-NEXT: ttng.cluster_arrive {relaxed = true}
397452 // CHECK-NEXT: ttng.cluster_wait
398453 // CHECK: ttng.wait_barrier
399- // CHECK-NOT : ttng.cluster_arrive
400- // CHECK-NOT : ttng.cluster_wait
454+ // CHECK: ttng.cluster_arrive {relaxed = false}
455+ // CHECK-NEXT: ttng.cluster_wait
401456 // CHECK: tt.return
402457  tt.func @no_cluster_when_same_allocation(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma>>) -> tensor<64x128xf16, #blocked> {
403458 %c0 = arith.constant 0 : i32
0 commit comments