Skip to content

Commit ebc5607

Browse files
barbara-amd and Barbara Mitic authored
[AMDGPU] Use wavefront scope for single-wave workgroup synchronization (llvm#187673)
Workgroup-scoped fences and non-relaxed workgroup atomics were previously legalized with synchronization strong enough for multi-wave workgroups. When the kernel's maximum flat work-group size does not exceed the wavefront size, the workgroup contains only a single wavefront, so workgroup-scoped synchronization is equivalent to wavefront scope and the stronger legalization is unnecessary.

SIMemoryLegalizer now demotes workgroup scope to wavefront scope in this case for workgroup-scoped fences and for non-relaxed atomic load, store, atomicrmw, and cmpxchg operations. This allows subsequent legalization to operate at wavefront scope. The decision is based on AMDGPUSubtarget::isSingleWavefrontWorkgroup.

---------

Co-authored-by: Barbara Mitic <Barbara.Mitic@amd.com>
1 parent 9cf8152 commit ebc5607

File tree

8 files changed

+3065
-611
lines changed

8 files changed

+3065
-611
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7160,6 +7160,18 @@ treated as non-atomic.
71607160
A memory synchronization scope wider than work-group is not meaningful for the
71617161
group (LDS) address space and is treated as work-group.
71627162

7163+
When a kernel's maximum flat work-group size does not exceed the wavefront
7164+
size, the work-group fits within a single wavefront. In this case, LLVM
7165+
``workgroup`` synchronization scope is equivalent to ``wavefront`` scope.
7166+
7167+
If the compiler can determine this bound (e.g., via ``amdgpu-flat-work-group-size``),
7168+
the AMDGPU backend optimizes ``workgroup`` scope operations by lowering them to
7169+
``wavefront``-scoped machine instructions.
7170+
7171+
This optimization applies to atomic ``load``, ``store``, ``atomicrmw``, and ``cmpxchg``
7172+
instructions, and to ``fence`` instructions, when they use synchronizing memory
7173+
orderings (``acquire``, ``release``, ``acq_rel``, or ``seq_cst``).
7174+
71637175
The memory model does not support the region address space which is treated as
71647176
non-atomic.
71657177

llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,8 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
9494
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
9595
bool IsSingleWaveWG = false;
9696

97-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
98-
unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
99-
IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
100-
}
97+
if (TM.getOptLevel() > CodeGenOptLevel::None)
98+
IsSingleWaveWG = ST.isSingleWavefrontWorkgroup(*I.getFunction());
10199

102100
IRBuilder<> B(&I);
103101

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
173173
return Requested;
174174
}
175175

176+
bool AMDGPUSubtarget::isSingleWavefrontWorkgroup(const Function &F) const {
177+
return getFlatWorkGroupSizes(F).second <= getWavefrontSize();
178+
}
179+
176180
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
177181
std::pair<unsigned, unsigned> RequestedWavesPerEU,
178182
std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ class AMDGPUSubtarget {
8080
/// be converted to integer, or violate subtarget's specifications.
8181
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
8282

83+
/// \returns true if the maximum flat work-group size for \p F is at most the
84+
/// wavefront size, so every work-group of \p F fits in a single wavefront.
85+
bool isSingleWavefrontWorkgroup(const Function &F) const;
86+
8387
/// \returns The required size of workgroups that will be used to execute \p F
8488
/// in the \p Dim dimension, if it is known (from `!reqd_work_group_size`
8589
/// metadata). Otherwise, returns std::nullopt.

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ class SIMemOpInfo final {
159159
bool IsCrossAddressSpaceOrdering = true,
160160
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
161161
bool IsVolatile = false, bool IsNonTemporal = false,
162-
bool IsLastUse = false, bool IsCooperative = false)
162+
bool IsLastUse = false, bool IsCooperative = false,
163+
bool CanDemoteWorkgroupToWavefront = false)
163164
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
164165
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
165166
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
@@ -207,6 +208,17 @@ class SIMemOpInfo final {
207208
// AGENT scope as a conservatively correct alternative.
208209
if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
209210
this->Scope = SIAtomicScope::AGENT;
211+
212+
// When max flat work-group size is at most the wavefront size, the
213+
// work-group fits in a single wave, so LLVM workgroup scope matches
214+
// wavefront scope. Demote workgroup to wavefront here for fences and for
215+
// atomics with ordering stronger than monotonic.
216+
if (CanDemoteWorkgroupToWavefront &&
217+
this->Scope == SIAtomicScope::WORKGROUP &&
218+
(llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
219+
llvm::isStrongerThan(this->FailureOrdering,
220+
AtomicOrdering::Monotonic)))
221+
this->Scope = SIAtomicScope::WAVEFRONT;
210222
}
211223

212224
public:
@@ -277,6 +289,7 @@ class SIMemOpAccess final {
277289
private:
278290
const AMDGPUMachineModuleInfo *MMI = nullptr;
279291
const GCNSubtarget &ST;
292+
const bool CanDemoteWorkgroupToWavefront;
280293

281294
/// Reports unsupported message \p Msg for \p MI to LLVM context.
282295
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -300,7 +313,8 @@ class SIMemOpAccess final {
300313
public:
301314
/// Construct class to support accessing the machine memory operands
302315
/// of instructions in the machine function \p MF.
303-
SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
316+
SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
317+
const Function &F);
304318

305319
/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
306320
std::optional<SIMemOpInfo>
@@ -779,9 +793,13 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
779793
return SIAtomicAddrSpace::OTHER;
780794
}
781795

796+
// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an
797+
// IR pass (and extending it to other scoped operations), so middle-end
798+
// optimizations see wavefront scope earlier.
782799
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
783-
const GCNSubtarget &ST)
784-
: MMI(&MMI_), ST(ST) {}
800+
const GCNSubtarget &ST, const Function &F)
801+
: MMI(&MMI_), ST(ST),
802+
CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
785803

786804
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
787805
const MachineBasicBlock::iterator &MI) const {
@@ -851,7 +869,8 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
851869
}
852870
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
853871
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
854-
IsNonTemporal, IsLastUse, IsCooperative);
872+
IsNonTemporal, IsLastUse, IsCooperative,
873+
CanDemoteWorkgroupToWavefront);
855874
}
856875

857876
std::optional<SIMemOpInfo>
@@ -920,7 +939,8 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
920939

921940
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
922941
SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
923-
AtomicOrdering::NotAtomic);
942+
AtomicOrdering::NotAtomic, false, false, false, false,
943+
CanDemoteWorkgroupToWavefront);
924944
}
925945

926946
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -2533,7 +2553,8 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
25332553
bool Changed = false;
25342554

25352555
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2536-
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2556+
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
2557+
MF.getFunction());
25372558
CC = SICacheControl::create(ST);
25382559

25392560
for (auto &MBB : MF) {

0 commit comments

Comments
 (0)