[multi-gpu] Phase 2: spell System syncscope explicitly on atomics

erwei-xilinx · claude · erwei-xilinx · commit 4fba2bc186e1 · 2026-05-12T15:26:00.000Z
Change the producer's release-atomicrmw and consumer's acquire-atomic-load
in air_sym_handwritten.mlir from default (no syncscope qualifier) to
`syncscope("")`. The empty string is LLVM IR's canonical spelling of the
System scope; this makes the cross-device intent self-documenting at the
MLIR level rather than relying on a default-omitted contract.

Behavior is unchanged: `syncscope("")` lowers to LLVM IR identical to the
unqualified form (LLVM textual IR omits the `syncscope(...)` token when
scope == System). The qualifier survives `convert-gpu-to-rocdl`, and the
test runs end-to-end on 2x MI325X (verified on rad-mi325x-1).

Update sym_atomic_syncscope.mlir FileCheck contract test accordingly:
assert `syncscope("")` is preserved through the pipeline instead of
asserting absence of any syncscope keyword.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir b/mlir/test/Conversion/AIRToROCDL/sym_atomic_syncscope.mlir
@@ -7,20 +7,20 @@
 //
 // The symmetric-heap producer/consumer test relies on a contract that
 // `llvm.atomicrmw release` and `llvm.load atomic acquire` ops emitted with
-// NO syncscope qualifier survive the GPU compilation pipeline as LLVM
-// "system" syncscope (= cross-device on AMDGPU). Without that, the
+// `syncscope("")` (= LLVM IR's System scope = cross-device on AMDGPU)
+// survive the GPU compilation pipeline unchanged. Without that, the
 // producer's release-store on rank 0's GPU is not seen by the consumer's
 // acquire-load on rank 1's GPU, and the consumer hangs forever (test
 // times out — appears as "no crash, no signal, just dead").
 //
-// AMDGPU's LLVM backend rejects an explicit `syncscope("system")` keyword
-// (it recognizes "agent", "workgroup", "wavefront", "one-as", etc., but
-// not "system" by name). Default = LLVM IR's System scope, which AMDGPU
-// LangRef defines as cross-device:
+// The empty-string syncscope is LLVM IR's canonical spelling of System
+// scope (LLVM's textual IR omits the `syncscope(...)` token entirely when
+// scope == System; MLIR's LLVM dialect round-trips it as `syncscope("")`).
+// The AMDGPU backend user guide defines System as cross-device:
 //   https://llvm.org/docs/AMDGPUUsage.html#memory-model
 //
 // This test asserts that after `convert-gpu-to-rocdl` the atomic ops
-// retain their ordering and continue to have NO syncscope qualifier.
+// retain their ordering and the explicit `syncscope("")` qualifier.
 //
 //===-----------------------------------------------------------------------===//
 
@@ -29,14 +29,12 @@
 
 // CHECK-LABEL: gpu.module @kernels
 // CHECK-LABEL: llvm.func @atomic_kernel
-// CHECK:       llvm.atomicrmw xchg %{{.*}}, %{{.*}} release : !llvm.ptr, i32
-// CHECK-NOT:   syncscope
-// CHECK:       llvm.load %{{.*}} atomic acquire {{.*}} : !llvm.ptr -> i32
-// CHECK-NOT:   syncscope
+// CHECK:       llvm.atomicrmw xchg %{{.*}}, %{{.*}} syncscope("") release : !llvm.ptr, i32
+// CHECK:       llvm.load %{{.*}} atomic syncscope("") acquire {{.*}} : !llvm.ptr -> i32
 gpu.module @kernels {
   gpu.func @atomic_kernel(%ptr : !llvm.ptr, %v : i32) kernel {
-    %old = llvm.atomicrmw xchg %ptr, %v release : !llvm.ptr, i32
-    %loaded = llvm.load %ptr atomic acquire {alignment = 4 : i64} : !llvm.ptr -> i32
+    %old = llvm.atomicrmw xchg %ptr, %v syncscope("") release : !llvm.ptr, i32
+    %loaded = llvm.load %ptr atomic syncscope("") acquire {alignment = 4 : i64} : !llvm.ptr -> i32
     gpu.return
   }
 }
diff --git a/test/gpu/symmetric_heap_dma/air_sym_handwritten.mlir b/test/gpu/symmetric_heap_dma/air_sym_handwritten.mlir
@@ -93,11 +93,11 @@ module attributes {gpu.container_module} {
 
       %is_lane0 = arith.cmpi eq, %lane, %c0 : index
       scf.if %is_lane0 {
-        // Default syncscope = LLVM System = cross-device on AMDGPU.
+        // syncscope("") = LLVM System scope = cross-device on AMDGPU.
         // See sym_atomic_syncscope.mlir for the contract test.
         %slot_ptr = func.call @flag_slot_ptr(%peer_flags, %wid)
             : (memref<4xi32>, index) -> !llvm.ptr
-        %old = llvm.atomicrmw xchg %slot_ptr, %c1_i32 release
+        %old = llvm.atomicrmw xchg %slot_ptr, %c1_i32 syncscope("") release
             : !llvm.ptr, i32
       }
       gpu.return
@@ -124,8 +124,8 @@ module attributes {gpu.container_module} {
             : (memref<4xi32>, index) -> !llvm.ptr
         // Spin: flag == 0.
         scf.while : () -> () {
-          %v = llvm.load %slot_ptr atomic acquire {alignment = 4 : i64}
-              : !llvm.ptr -> i32
+          %v = llvm.load %slot_ptr atomic syncscope("") acquire
+              {alignment = 4 : i64} : !llvm.ptr -> i32
           %not_ready = arith.cmpi eq, %v, %c0_i32 : i32
           scf.condition(%not_ready)
         } do {