Skip to content

Commit 4fba2bc

Browse files
erwei-xilinx and claude committed
[multi-gpu] Phase 2: spell System syncscope explicitly on atomics
Change the producer's release-atomicrmw and consumer's acquire-atomic-load in air_sym_handwritten.mlir from default (no syncscope qualifier) to `syncscope("")`. The empty string is LLVM IR's canonical spelling of the System scope; this makes the cross-device intent self-documenting at the MLIR level rather than relying on a default-omitted contract.

Behavior is unchanged: `syncscope("")` lowers to LLVM IR identical to the unqualified form (LLVM textual IR omits the `syncscope(...)` token when scope == System), survives `convert-gpu-to-rocdl`, and runs e2e on 2x MI325X (verified on rad-mi325x-1).

Update sym_atomic_syncscope.mlir FileCheck contract test accordingly: assert `syncscope("")` is preserved through the pipeline instead of asserting absence of any syncscope keyword.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 977767d commit 4fba2bc

2 files changed

Lines changed: 15 additions & 17 deletions

File tree

mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,20 @@
77
//
88
// The symmetric-heap producer/consumer test relies on a contract that
99
// `llvm.atomicrmw release` and `llvm.load atomic acquire` ops emitted with
10-
// NO syncscope qualifier survive the GPU compilation pipeline as LLVM
11-
// "system" syncscope (= cross-device on AMDGPU). Without that, the
10+
// `syncscope("")` (= LLVM IR's System scope = cross-device on AMDGPU)
11+
// survive the GPU compilation pipeline unchanged. Without that, the
1212
// producer's release-store on rank 0's GPU is not seen by the consumer's
1313
// acquire-load on rank 1's GPU, and the consumer hangs forever (test
1414
// times out — appears as "no crash, no signal, just dead").
1515
//
16-
// AMDGPU's LLVM backend rejects an explicit `syncscope("system")` keyword
17-
// (it recognizes "agent", "workgroup", "wavefront", "one-as", etc., but
18-
// not "system" by name). Default = LLVM IR's System scope, which AMDGPU
19-
// LangRef defines as cross-device:
16+
// The empty-string syncscope is LLVM IR's canonical spelling of System
17+
// scope (LLVM's textual IR omits the `syncscope(...)` token entirely when
18+
// scope == System; MLIR's LLVM dialect round-trips it as `syncscope("")`).
19+
// LLVM's AMDGPU usage guide defines System as cross-device:
2020
// https://llvm.org/docs/AMDGPUUsage.html#memory-model
2121
//
2222
// This test asserts that after `convert-gpu-to-rocdl` the atomic ops
23-
// retain their ordering and continue to have NO syncscope qualifier.
23+
// retain their ordering and the explicit `syncscope("")` qualifier.
2424
//
2525
//===-----------------------------------------------------------------------===//
2626

@@ -29,14 +29,12 @@
2929

3030
// CHECK-LABEL: gpu.module @kernels
3131
// CHECK-LABEL: llvm.func @atomic_kernel
32-
// CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} release : !llvm.ptr, i32
33-
// CHECK-NOT: syncscope
34-
// CHECK: llvm.load %{{.*}} atomic acquire {{.*}} : !llvm.ptr -> i32
35-
// CHECK-NOT: syncscope
32+
// CHECK: llvm.atomicrmw xchg %{{.*}}, %{{.*}} syncscope("") release : !llvm.ptr, i32
33+
// CHECK: llvm.load %{{.*}} atomic syncscope("") acquire {{.*}} : !llvm.ptr -> i32
3634
gpu.module @kernels {
3735
gpu.func @atomic_kernel(%ptr : !llvm.ptr, %v : i32) kernel {
38-
%old = llvm.atomicrmw xchg %ptr, %v release : !llvm.ptr, i32
39-
%loaded = llvm.load %ptr atomic acquire {alignment = 4 : i64} : !llvm.ptr -> i32
36+
%old = llvm.atomicrmw xchg %ptr, %v syncscope("") release : !llvm.ptr, i32
37+
%loaded = llvm.load %ptr atomic syncscope("") acquire {alignment = 4 : i64} : !llvm.ptr -> i32
4038
gpu.return
4139
}
4240
}

test/gpu/symmetric_heap_dma/air_sym_handwritten.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ module attributes {gpu.container_module} {
9393

9494
%is_lane0 = arith.cmpi eq, %lane, %c0 : index
9595
scf.if %is_lane0 {
96-
// Default syncscope = LLVM System = cross-device on AMDGPU.
96+
// syncscope("") = LLVM System scope = cross-device on AMDGPU.
9797
// See sym_atomic_syncscope.mlir for the contract test.
9898
%slot_ptr = func.call @flag_slot_ptr(%peer_flags, %wid)
9999
: (memref<4xi32>, index) -> !llvm.ptr
100-
%old = llvm.atomicrmw xchg %slot_ptr, %c1_i32 release
100+
%old = llvm.atomicrmw xchg %slot_ptr, %c1_i32 syncscope("") release
101101
: !llvm.ptr, i32
102102
}
103103
gpu.return
@@ -124,8 +124,8 @@ module attributes {gpu.container_module} {
124124
: (memref<4xi32>, index) -> !llvm.ptr
125125
// Spin: flag == 0.
126126
scf.while : () -> () {
127-
%v = llvm.load %slot_ptr atomic acquire {alignment = 4 : i64}
128-
: !llvm.ptr -> i32
127+
%v = llvm.load %slot_ptr atomic syncscope("") acquire
128+
{alignment = 4 : i64} : !llvm.ptr -> i32
129129
%not_ready = arith.cmpi eq, %v, %c0_i32 : i32
130130
scf.condition(%not_ready)
131131
} do {

0 commit comments

Comments
 (0)