google-bazel-bot
diff --git a/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 12 additions & 0 deletions b/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp‎
Lines changed: 2 additions & 4 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp‎
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp‎
Lines changed: 28 additions & 7 deletions b/‎llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp‎
Lines changed: 28 additions & 7 deletions
@@ -7160,6 +7160,18 @@ treated as non-atomic.
 A memory synchronization scope wider than work-group is not meaningful for the
 group (LDS) address space and is treated as work-group.
 
+When a work-group's maximum flat work-group size does not exceed the wavefront
+size, the work-group fits within a single wavefront. In this case, LLVM
+``workgroup`` synchronization scope is equivalent to ``wavefront`` scope.
+
+If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``),
+the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to
+``wavefront``-scoped machine instructions.
+
+It applies to atomic ``load``, ``store``, ``atomicrmw``, and ``cmpxchg``
+instructions, and to ``fence`` instructions, when they use synchronizing memory
+orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
+
 The memory model does not support the region address space which is treated as
 non-atomic.
 
 
@@ -94,10 +94,8 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
   bool IsSingleWaveWG = false;
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
-    IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
-  }
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
+    IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction());
 
   IRBuilder<> B(&I);
 
 
@@ -173,6 +173,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
   return Requested;
 }
 
+bool AMDGPUSubtarget::isSingleWavefrontWorkgroup(const Function &F) const {
+  return getFlatWorkGroupSizes(F).second <= getWavefrontSize();
+}
+
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
     std::pair<unsigned, unsigned> RequestedWavesPerEU,
     std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
 
@@ -80,6 +80,10 @@ class AMDGPUSubtarget {
   /// be converted to integer, or violate subtarget's specifications.
   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
 
+  /// \returns true if the maximum flat work-group size for \p F is at most the
+  /// wavefront size, so a work-group may fit in a single wavefront.
+  bool isSingleWavefrontWorkgroup(const Function &F) const;
+
   /// \returns The required size of workgroups that will be used to execute \p F
   /// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
   /// metadata. Otherwise, returns std::nullopt.
 
@@ -159,7 +159,8 @@ class SIMemOpInfo final {
       bool IsCrossAddressSpaceOrdering = true,
       AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
       bool IsVolatile = false, bool IsNonTemporal = false,
-      bool IsLastUse = false, bool IsCooperative = false)
+      bool IsLastUse = false, bool IsCooperative = false,
+      bool CanDemoteWorkgroupToWavefront = false)
       : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
         OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
         IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -207,6 +208,17 @@ class SIMemOpInfo final {
     // AGENT scope as a conservatively correct alternative.
     if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
       this->Scope = SIAtomicScope::AGENT;
+
+    // When max flat work-group size is at most the wavefront size, the
+    // work-group fits in a single wave, so LLVM workgroup scope matches
+    // wavefront scope. Demote workgroup → wavefront here for fences and for
+    // atomics with ordering stronger than monotonic.
+    if (CanDemoteWorkgroupToWavefront &&
+        this->Scope == SIAtomicScope::WORKGROUP &&
+        (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
+         llvm::isStrongerThan(this->FailureOrdering,
+                              AtomicOrdering::Monotonic)))
+      this->Scope = SIAtomicScope::WAVEFRONT;
   }
 
 public:
@@ -277,6 +289,7 @@ class SIMemOpAccess final {
 private:
   const AMDGPUMachineModuleInfo *MMI = nullptr;
   const GCNSubtarget &ST;
+  const bool CanDemoteWorkgroupToWavefront;
 
   /// Reports unsupported message \p Msg for \p MI to LLVM context.
   void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -300,7 +313,8 @@ class SIMemOpAccess final {
 public:
   /// Construct class to support accessing the machine memory operands
   /// of instructions in the machine function \p MF.
-  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
+  SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
+                const Function &F);
 
   /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
   std::optional<SIMemOpInfo>
@@ -779,9 +793,13 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
   return SIAtomicAddrSpace::OTHER;
 }
 
+// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an
+// IR pass (and extending it to other scoped operations), so middle-end
+// optimizations see wavefront scope earlier.
 SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
-                             const GCNSubtarget &ST)
-    : MMI(&MMI_), ST(ST) {}
+                             const GCNSubtarget &ST, const Function &F)
+    : MMI(&MMI_), ST(ST),
+      CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
 
 std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
     const MachineBasicBlock::iterator &MI) const {
@@ -851,7 +869,8 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
   }
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
-                     IsNonTemporal, IsLastUse, IsCooperative);
+                     IsNonTemporal, IsLastUse, IsCooperative,
+                     CanDemoteWorkgroupToWavefront);
 }
 
 std::optional<SIMemOpInfo>
@@ -920,7 +939,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
 
   return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
                      SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
-                     AtomicOrdering::NotAtomic);
+                     AtomicOrdering::NotAtomic, false, false, false, false,
+                     CanDemoteWorkgroupToWavefront);
 }
 
 std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2533,7 +2553,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
   bool Changed = false;
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
+  SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
+                    MF.getFunction());
   CC = SICacheControl::create(ST);
 
   for (auto &MBB : MF) {