Skip to content

Commit db9d11e

Browse files
authored
[GPU] Become more honest about barrier semantics (and drop stale code) (#23335)
Recently, I added an upstream PR ( llvm/llvm-project#177425 ) that adds the ability for gpu barriers to declare what address spaces they fence on in addition to allowing the __syncthreads() semantics. This PR uses this support pervasively in IREE's GPU code generation, either by explicitly specifying the address spaces fenced by barrier ops we create or by inheriting it from address spaces on memrefs (which we know will be in shared memory). We were **already** using such barriers on AMD GPU targets, where we had a rewrite pattern to unconditionally rewrite `gpu.barrier` to `amdgpu.lds_barrier`, which always only fences workgroup memory and doesn't synchronize global memory. This commit removes that rewrite, which allows us to be more explicit about the semantics we're using **and** will allow us to synchronize global memory accesses within a workgroup if we ever need to do so. This PR also adds explanatory notes explaining the memory semantics we're expecting where needed. While I'm here, this PR removes an unmaintained and stale copy of the gpu barrier elimination logic from the IREE transform ops and delegates to the upstream version instead.
1 parent 16a2395 commit db9d11e

63 files changed

Lines changed: 430 additions & 1200 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f16.mlir

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
8888
vector.transfer_write %rhs_vec_local_t, %rhs_shared[%delin#0, %delin#1, %inner, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf16>, !shared_ty
8989
} {mapping = [#gpu.thread<linear_dim_0>]}
9090

91-
gpu.barrier
91+
gpu.barrier memfence [#gpu.address_space<workgroup>]
9292

9393
%0 = tensor.empty() : !acc_base_ty
9494
%1 = scf.forall (%id) in (512) shared_outs(%out = %0) -> !acc_base_ty {
@@ -129,7 +129,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
129129
%lhs_vec_0_t = vector.shape_cast %lhs_vec_0 : vector<1x8x1x4xf16> to vector<8x1x1x4xf16>
130130
%rhs_vec_0_t = vector.shape_cast %rhs_vec_0 : vector<1x4x1x4xf16> to vector<4x1x1x4xf16>
131131

132-
gpu.barrier
132+
gpu.barrier memfence [#gpu.address_space<workgroup>]
133133
rocdl.sched.barrier 0
134134
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
135135

@@ -141,7 +141,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
141141
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
142142

143143
rocdl.s.setprio 0
144-
gpu.barrier
144+
gpu.barrier memfence [#gpu.address_space<workgroup>]
145145
rocdl.sched.barrier 0
146146

147147
// Global loads of rhs.
@@ -164,7 +164,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
164164
%lhs_vec_1_t = vector.shape_cast %lhs_vec_1 : vector<1x8x1x4xf16> to vector<8x1x1x4xf16>
165165
%rhs_vec_1_t = vector.shape_cast %rhs_vec_1 : vector<1x4x1x4xf16> to vector<4x1x1x4xf16>
166166

167-
gpu.barrier
167+
gpu.barrier memfence [#gpu.address_space<workgroup>]
168168
rocdl.sched.barrier 0
169169
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
170170

@@ -176,7 +176,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
176176
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
177177

178178
rocdl.s.setprio 0
179-
gpu.barrier
179+
gpu.barrier memfence [#gpu.address_space<workgroup>]
180180
rocdl.sched.barrier 0
181181

182182
// Local loads of lhs and rhs.
@@ -190,7 +190,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
190190
%lhs_vec_3_t = vector.shape_cast %lhs_vec_3 : vector<1x8x1x4xf16> to vector<8x1x1x4xf16>
191191
%rhs_vec_3_t = vector.shape_cast %rhs_vec_3 : vector<1x4x1x4xf16> to vector<4x1x1x4xf16>
192192

193-
gpu.barrier
193+
gpu.barrier memfence [#gpu.address_space<workgroup>]
194194
rocdl.sched.barrier 0
195195
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
196196

@@ -202,7 +202,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
202202
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
203203

204204
rocdl.s.setprio 0
205-
gpu.barrier
205+
gpu.barrier memfence [#gpu.address_space<workgroup>]
206206
rocdl.sched.barrier 0
207207

208208
// Local stores of lhs and rhs.
@@ -216,7 +216,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
216216
vector.transfer_write %lhs_vec_local_2_t, %lhs_shared [%c2, %glb#0, %glb_inner, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf16>, !shared_ty
217217
vector.transfer_write %lhs_vec_local_3_t, %lhs_shared [%c3, %glb#0, %glb_inner, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x4xf16>, !shared_ty
218218

219-
gpu.barrier
219+
gpu.barrier memfence [#gpu.address_space<workgroup>]
220220
rocdl.sched.barrier 0
221221
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
222222

@@ -228,7 +228,7 @@ util.func @pingpong_dt_large_f16(%lhs_base: !lhs_base_ty, %rhs_base: !rhs_base_t
228228
} : vector<8x1x1x4xf16>, vector<4x1x1x4xf16> into vector<8x4x1x4xf32>
229229

230230
rocdl.s.setprio 0
231-
gpu.barrier
231+
gpu.barrier memfence [#gpu.address_space<workgroup>]
232232
rocdl.sched.barrier 0
233233

234234
scf.yield %dot3 : vector<8x4x1x4xf32>

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_dt_matmul_f8E4M3FNUZ.mlir

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
132132
%lhs_vec_0_t = vector.shape_cast %lhs_vec_0 : vector<1x8x1x8xf8E4M3FNUZ> to vector<8x1x1x8xf8E4M3FNUZ>
133133
%rhs_vec_0_t = vector.shape_cast %rhs_vec_0 : vector<1x4x1x8xf8E4M3FNUZ> to vector<4x1x1x8xf8E4M3FNUZ>
134134

135-
gpu.barrier
135+
gpu.barrier memfence [#gpu.address_space<workgroup>]
136136
rocdl.sched.barrier 0
137137
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
138138

@@ -144,7 +144,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
144144
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
145145

146146
rocdl.s.setprio 0
147-
gpu.barrier
147+
gpu.barrier memfence [#gpu.address_space<workgroup>]
148148
rocdl.sched.barrier 0
149149

150150
// Global loads of rhs.
@@ -167,7 +167,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
167167
%lhs_vec_1_t = vector.shape_cast %lhs_vec_1 : vector<1x8x1x8xf8E4M3FNUZ> to vector<8x1x1x8xf8E4M3FNUZ>
168168
%rhs_vec_1_t = vector.shape_cast %rhs_vec_1 : vector<1x4x1x8xf8E4M3FNUZ> to vector<4x1x1x8xf8E4M3FNUZ>
169169

170-
gpu.barrier
170+
gpu.barrier memfence [#gpu.address_space<workgroup>]
171171
rocdl.sched.barrier 0
172172
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
173173

@@ -179,7 +179,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
179179
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
180180

181181
rocdl.s.setprio 0
182-
gpu.barrier
182+
gpu.barrier memfence [#gpu.address_space<workgroup>]
183183
rocdl.sched.barrier 0
184184

185185
// Local loads.
@@ -194,7 +194,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
194194
%lhs_vec_3_t = vector.shape_cast %lhs_vec_3 : vector<1x8x1x8xf8E4M3FNUZ> to vector<8x1x1x8xf8E4M3FNUZ>
195195
%rhs_vec_3_t = vector.shape_cast %rhs_vec_3 : vector<1x4x1x8xf8E4M3FNUZ> to vector<4x1x1x8xf8E4M3FNUZ>
196196

197-
gpu.barrier
197+
gpu.barrier memfence [#gpu.address_space<workgroup>]
198198
rocdl.sched.barrier 0
199199
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
200200

@@ -206,7 +206,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
206206
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
207207

208208
rocdl.s.setprio 0
209-
gpu.barrier
209+
gpu.barrier memfence [#gpu.address_space<workgroup>]
210210
rocdl.sched.barrier 0
211211

212212
// Local stores of lhs and rhs.
@@ -220,7 +220,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
220220
vector.transfer_write %lhs_vec_local_2_t, %lhs_shared [%c2, %glb#0, %glb_inner, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x8xf8E4M3FNUZ>, !shared_ty
221221
vector.transfer_write %lhs_vec_local_3_t, %lhs_shared [%c3, %glb#0, %glb_inner, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x2x8xf8E4M3FNUZ>, !shared_ty
222222

223-
gpu.barrier
223+
gpu.barrier memfence [#gpu.address_space<workgroup>]
224224
rocdl.sched.barrier 0
225225
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
226226

@@ -232,7 +232,7 @@ util.func @pingpong_dt_large_f8E4M3FNUZ(%lhs_base: !lhs_base_ty, %rhs_base: !rhs
232232
} : vector<8x1x1x8xf8E4M3FNUZ>, vector<4x1x1x8xf8E4M3FNUZ> into vector<8x4x1x4xf32>
233233

234234
rocdl.s.setprio 0
235-
gpu.barrier
235+
gpu.barrier memfence [#gpu.address_space<workgroup>]
236236
rocdl.sched.barrier 0
237237

238238
scf.yield %dot3 : vector<8x4x1x4xf32>
@@ -414,7 +414,7 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
414414
%rhs_thread_3 = tensor.extract_slice %rhs [%i, %c1, %glb1_rhs, %ids#2, %c0] [1, 1, 1, 1, 16] [1, 1, 1, 1, 1] : !m_rhs_ty to tensor<1x1x1x16xf8E4M3FNUZ>
415415
%rhs_vec_local_3 = vector.transfer_read %rhs_thread_3 [%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x1x16xf8E4M3FNUZ>, vector<1x1x1x16xf8E4M3FNUZ>
416416

417-
gpu.barrier
417+
gpu.barrier memfence [#gpu.address_space<workgroup>]
418418
rocdl.sched.barrier 0
419419
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
420420

@@ -426,7 +426,7 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
426426
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
427427

428428
rocdl.s.setprio 0
429-
gpu.barrier
429+
gpu.barrier memfence [#gpu.address_space<workgroup>]
430430
rocdl.sched.barrier 0
431431

432432
// Local stores of lhs and rhs.
@@ -438,7 +438,7 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
438438
vector.transfer_write %lhs_vec_local_0, %lhs_shared[%c0, %ids#1, %ids#2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x16xf8E4M3FNUZ>, !m_lhs_shared_ty
439439
vector.transfer_write %lhs_vec_local_1, %lhs_shared[%c1, %ids#1, %ids#2, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x1x16xf8E4M3FNUZ>, !m_lhs_shared_ty
440440

441-
gpu.barrier
441+
gpu.barrier memfence [#gpu.address_space<workgroup>]
442442
rocdl.sched.barrier 0
443443
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
444444

@@ -450,7 +450,7 @@ util.func private @pingpong_dt_medium_f8E4M3FNUZ(%lhs_base: !m_lhs_base_ty, %rhs
450450
} : vector<8x2x1x8xf8E4M3FNUZ>, vector<2x2x1x8xf8E4M3FNUZ> into vector<8x2x1x4xf32>
451451

452452
rocdl.s.setprio 0
453-
gpu.barrier
453+
gpu.barrier memfence [#gpu.address_space<workgroup>]
454454
rocdl.sched.barrier 0
455455

456456
scf.yield %dot2 : vector<8x2x1x4xf32>

compiler/plugins/target/ROCM/builtins/mlir_ukernel/iree_uk_amdgpu_matmul_bf16.mlir

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
116116
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
117117
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
118118

119-
gpu.barrier
119+
gpu.barrier memfence [#gpu.address_space<workgroup>]
120120
rocdl.sched.barrier 0
121121
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
122122

@@ -128,7 +128,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
128128
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
129129

130130
rocdl.s.setprio 0
131-
gpu.barrier
131+
gpu.barrier memfence [#gpu.address_space<workgroup>]
132132
rocdl.sched.barrier 0
133133

134134
// Global loads of rhs.
@@ -145,7 +145,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
145145
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
146146
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
147147

148-
gpu.barrier
148+
gpu.barrier memfence [#gpu.address_space<workgroup>]
149149
rocdl.sched.barrier 0
150150
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
151151

@@ -157,7 +157,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
157157
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
158158

159159
rocdl.s.setprio 0
160-
gpu.barrier
160+
gpu.barrier memfence [#gpu.address_space<workgroup>]
161161
rocdl.sched.barrier 0
162162

163163
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
@@ -166,7 +166,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
166166
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
167167
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
168168

169-
gpu.barrier
169+
gpu.barrier memfence [#gpu.address_space<workgroup>]
170170
rocdl.sched.barrier 0
171171
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
172172

@@ -178,7 +178,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
178178
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
179179

180180
rocdl.s.setprio 0
181-
gpu.barrier
181+
gpu.barrier memfence [#gpu.address_space<workgroup>]
182182
rocdl.sched.barrier 0
183183

184184
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
@@ -191,7 +191,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
191191
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
192192
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
193193

194-
gpu.barrier
194+
gpu.barrier memfence [#gpu.address_space<workgroup>]
195195
rocdl.sched.barrier 0
196196
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
197197

@@ -203,7 +203,7 @@ util.func private @pingpong_large_bf16(%lhs_base: !bf16_in_ty, %rhs_base: !bf16_
203203
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
204204

205205
rocdl.s.setprio 0
206-
gpu.barrier
206+
gpu.barrier memfence [#gpu.address_space<workgroup>]
207207
rocdl.sched.barrier 0
208208

209209
scf.yield %dot3 : vector<8x4x1x4xf32>
@@ -372,7 +372,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
372372
%lhs_thread_1 = tensor.extract_slice %lhs_block [0, %glb1_lhs, %gko] [1, 1, 8] [1, 1, 1] : !mexp_block_in_bf16 to tensor<1x1x8xbf16>
373373
%lhs_vec_local_1 = vector.transfer_read %lhs_thread_1 [%c0, %c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x1x8xbf16>, vector<1x8xbf16>
374374

375-
gpu.barrier
375+
gpu.barrier memfence [#gpu.address_space<workgroup>]
376376
rocdl.sched.barrier 0
377377
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
378378

@@ -384,7 +384,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
384384
} : vector<4x2x1x4xbf16>, vector<4x2x1x4xbf16> into vector<4x4x1x4xf32>
385385

386386
rocdl.s.setprio 0
387-
gpu.barrier
387+
gpu.barrier memfence [#gpu.address_space<workgroup>]
388388
rocdl.sched.barrier 0
389389

390390
vector.transfer_write %rhs_vec_local_0, %rhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !shared_bf16
@@ -395,7 +395,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
395395
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0_lhs, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !mshared_bf16
396396
vector.transfer_write %lhs_vec_local_1, %lhs_shared [%glb1_lhs, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !mshared_bf16
397397

398-
gpu.barrier
398+
gpu.barrier memfence [#gpu.address_space<workgroup>]
399399
rocdl.sched.barrier 0
400400
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
401401

@@ -407,7 +407,7 @@ util.func private @pingpong_medium_bf16_expanded(%lhs_base: !mexp_in_ty_bf16, %r
407407
} : vector<4x2x1x4xbf16>, vector<4x2x1x4xbf16> into vector<4x4x1x4xf32>
408408

409409
rocdl.s.setprio 0
410-
gpu.barrier
410+
gpu.barrier memfence [#gpu.address_space<workgroup>]
411411
rocdl.sched.barrier 0
412412

413413
scf.yield %dot2 : vector<4x4x1x4xf32>
@@ -540,7 +540,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
540540
%lhs_vec_0 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
541541
%rhs_vec_0 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c0, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
542542

543-
gpu.barrier
543+
gpu.barrier memfence [#gpu.address_space<workgroup>]
544544
rocdl.sched.barrier 0
545545
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
546546

@@ -552,7 +552,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
552552
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
553553

554554
rocdl.s.setprio 0
555-
gpu.barrier
555+
gpu.barrier memfence [#gpu.address_space<workgroup>]
556556
rocdl.sched.barrier 0
557557

558558
// Global loads of rhs.
@@ -569,7 +569,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
569569
%lhs_vec_1 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
570570
%rhs_vec_1 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c1, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
571571

572-
gpu.barrier
572+
gpu.barrier memfence [#gpu.address_space<workgroup>]
573573
rocdl.sched.barrier 0
574574
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
575575

@@ -581,7 +581,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
581581
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
582582

583583
rocdl.s.setprio 0
584-
gpu.barrier
584+
gpu.barrier memfence [#gpu.address_space<workgroup>]
585585
rocdl.sched.barrier 0
586586

587587
%lhs_vec_2 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c2, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
@@ -590,7 +590,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
590590
%lhs_vec_3 = vector.transfer_read %lhs_shared_expand[%m_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<8x1x1x4xbf16>
591591
%rhs_vec_3 = vector.transfer_read %rhs_shared_expand[%n_outer_id, %ids#3, %c3, %inner_id], %cst {in_bounds = [true, true, true, true]} : !bf16_shared_exp, vector<4x1x1x4xbf16>
592592

593-
gpu.barrier
593+
gpu.barrier memfence [#gpu.address_space<workgroup>]
594594
rocdl.sched.barrier 0
595595
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
596596

@@ -602,7 +602,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
602602
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
603603

604604
rocdl.s.setprio 0
605-
gpu.barrier
605+
gpu.barrier memfence [#gpu.address_space<workgroup>]
606606
rocdl.sched.barrier 0
607607

608608
vector.transfer_write %lhs_vec_local_0, %lhs_shared [%glb0, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
@@ -615,7 +615,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
615615
vector.transfer_write %rhs_vec_local_2, %rhs_shared [%glb2, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
616616
vector.transfer_write %rhs_vec_local_3, %rhs_shared [%glb3, %gko] {in_bounds = [true, true]} : vector<1x8xbf16>, !bf16_shared
617617

618-
gpu.barrier
618+
gpu.barrier memfence [#gpu.address_space<workgroup>]
619619
rocdl.sched.barrier 0
620620
rocdl.s.setprio 1 { iree_gpu.swap_mfma = 1 }
621621

@@ -627,7 +627,7 @@ util.func private @pingpong_large_bf16_expanded(%lhs_base: !bf16_exp_in_ty, %rhs
627627
} : vector<8x1x1x4xbf16>, vector<4x1x1x4xbf16> into vector<8x4x1x4xf32>
628628

629629
rocdl.s.setprio 0
630-
gpu.barrier
630+
gpu.barrier memfence [#gpu.address_space<workgroup>]
631631
rocdl.sched.barrier 0
632632

633633
scf.yield %dot3 : vector<8x4x1x4xf32>

0 commit comments

Comments
 (0)