@@ -116,7 +116,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
116116 %lhs_vec_0 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c0 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
117117 %rhs_vec_0 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c0 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
118118
119- gpu.barrier
119+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
120120 rocdl.sched.barrier 0
121121 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
122122
@@ -128,7 +128,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
128128 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
129129
130130 rocdl.s.setprio 0
131- gpu.barrier
131+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
132132 rocdl.sched.barrier 0
133133
134134 // Global loads of rhs.
@@ -145,7 +145,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
145145 %lhs_vec_1 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c1 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
146146 %rhs_vec_1 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c1 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
147147
148- gpu.barrier
148+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
149149 rocdl.sched.barrier 0
150150 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
151151
@@ -157,7 +157,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
157157 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
158158
159159 rocdl.s.setprio 0
160- gpu.barrier
160+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
161161 rocdl.sched.barrier 0
162162
163163 %lhs_vec_2 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c2 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
@@ -166,7 +166,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
166166 %lhs_vec_3 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c3 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
167167 %rhs_vec_3 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c3 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
168168
169- gpu.barrier
169+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
170170 rocdl.sched.barrier 0
171171 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
172172
@@ -178,7 +178,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
178178 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
179179
180180 rocdl.s.setprio 0
181- gpu.barrier
181+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
182182 rocdl.sched.barrier 0
183183
184184 vector.transfer_write %lhs_vec_local_0 , %lhs_shared [%glb0 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
@@ -191,7 +191,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
191191 vector.transfer_write %rhs_vec_local_2 , %rhs_shared [%glb2 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
192192 vector.transfer_write %rhs_vec_local_3 , %rhs_shared [%glb3 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
193193
194- gpu.barrier
194+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
195195 rocdl.sched.barrier 0
196196 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
197197
@@ -203,7 +203,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
203203 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
204204
205205 rocdl.s.setprio 0
206- gpu.barrier
206+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
207207 rocdl.sched.barrier 0
208208
209209 scf.yield %dot3 : vector <8 x4 x1 x4 xf32 >
@@ -372,7 +372,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
372372 %lhs_thread_1 = tensor.extract_slice %lhs_block [0 , %glb1_lhs , %gko ] [1 , 1 , 8 ] [1 , 1 , 1 ] : !mexp_block_in_bf16 to tensor <1 x1 x8 xbf16 >
373373 %lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0 , %c0 , %c0 ], %cst {in_bounds = [true , true ]} : tensor <1 x1 x8 xbf16 >, vector <1 x8 xbf16 >
374374
375- gpu.barrier
375+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
376376 rocdl.sched.barrier 0
377377 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
378378
@@ -384,7 +384,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
384384 } : vector <4 x2 x1 x4 xbf16 >, vector <4 x2 x1 x4 xbf16 > into vector <4 x4 x1 x4 xf32 >
385385
386386 rocdl.s.setprio 0
387- gpu.barrier
387+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
388388 rocdl.sched.barrier 0
389389
390390 vector.transfer_write %rhs_vec_local_0 , %rhs_shared [%glb0 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !shared_bf16
@@ -395,7 +395,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
395395 vector.transfer_write %lhs_vec_local_0 , %lhs_shared [%glb0_lhs , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !mshared_bf16
396396 vector.transfer_write %lhs_vec_local_1 , %lhs_shared [%glb1_lhs , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !mshared_bf16
397397
398- gpu.barrier
398+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
399399 rocdl.sched.barrier 0
400400 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
401401
@@ -407,7 +407,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
407407 } : vector <4 x2 x1 x4 xbf16 >, vector <4 x2 x1 x4 xbf16 > into vector <4 x4 x1 x4 xf32 >
408408
409409 rocdl.s.setprio 0
410- gpu.barrier
410+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
411411 rocdl.sched.barrier 0
412412
413413 scf.yield %dot2 : vector <4 x4 x1 x4 xf32 >
@@ -540,7 +540,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
540540 %lhs_vec_0 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c0 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
541541 %rhs_vec_0 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c0 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
542542
543- gpu.barrier
543+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
544544 rocdl.sched.barrier 0
545545 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
546546
@@ -552,7 +552,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
552552 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
553553
554554 rocdl.s.setprio 0
555- gpu.barrier
555+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
556556 rocdl.sched.barrier 0
557557
558558 // Global loads of rhs.
@@ -569,7 +569,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
569569 %lhs_vec_1 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c1 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
570570 %rhs_vec_1 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c1 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
571571
572- gpu.barrier
572+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
573573 rocdl.sched.barrier 0
574574 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
575575
@@ -581,7 +581,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
581581 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
582582
583583 rocdl.s.setprio 0
584- gpu.barrier
584+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
585585 rocdl.sched.barrier 0
586586
587587 %lhs_vec_2 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c2 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
@@ -590,7 +590,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
590590 %lhs_vec_3 = vector.transfer_read %lhs_shared_expand [%m_outer_id , %ids#3 , %c3 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <8 x1 x1 x4 xbf16 >
591591 %rhs_vec_3 = vector.transfer_read %rhs_shared_expand [%n_outer_id , %ids#3 , %c3 , %inner_id ], %cst {in_bounds = [true , true , true , true ]} : !bf16_shared_exp , vector <4 x1 x1 x4 xbf16 >
592592
593- gpu.barrier
593+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
594594 rocdl.sched.barrier 0
595595 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
596596
@@ -602,7 +602,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
602602 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
603603
604604 rocdl.s.setprio 0
605- gpu.barrier
605+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
606606 rocdl.sched.barrier 0
607607
608608 vector.transfer_write %lhs_vec_local_0 , %lhs_shared [%glb0 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
@@ -615,7 +615,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
615615 vector.transfer_write %rhs_vec_local_2 , %rhs_shared [%glb2 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
616616 vector.transfer_write %rhs_vec_local_3 , %rhs_shared [%glb3 , %gko ] {in_bounds = [true , true ]} : vector <1 x8 xbf16 >, !bf16_shared
617617
618- gpu.barrier
618+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
619619 rocdl.sched.barrier 0
620620 rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
621621
@@ -627,7 +627,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
627627 } : vector <8 x1 x1 x4 xbf16 >, vector <4 x1 x1 x4 xbf16 > into vector <8 x4 x1 x4 xf32 >
628628
629629 rocdl.s.setprio 0
630- gpu.barrier
630+ gpu.barrier memfence [ #gpu.address_space < workgroup >]
631631 rocdl.sched.barrier 0
632632
633633 scf.yield %dot3 : vector <8 x4 x1 x4 xf32 >
0 commit comments