Commit 2e3fa4b

[AMDGPU] Insert before and after instructions that always use GDS (llvm#131338)
It is an architectural requirement that there must be no outstanding GDS instructions when an "always GDS" instruction is issued, and also that an always GDS instruction must be allowed to complete. Insert waits on DScnt/LGKMcnt prior to (if necessary) and subsequent to (unconditionally) any always GDS instruction, and an additional S_NOP if the subsequent wait is followed by S_ENDPGM. Always GDS instructions are the GWS instructions, DS_ORDERED_COUNT, DS_ADD_GS_REG_RTN, and DS_SUB_GS_REG_RTN (the latter two are considered always GDS as of this patch).
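For illustration, a minimal sketch of the resulting codegen shape, drawn from the tests updated below (register operands and exact counter encodings are illustrative, not prescriptive):

    ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds  ; "always GDS" instruction
    s_waitcnt lgkmcnt(0)                        ; unconditional wait inserted after it
    s_nop 0                                     ; added only because the next instruction is s_endpgm
    s_endpgm

When an earlier GDS access is still outstanding, a DScnt/LGKMcnt wait is also inserted before the always GDS instruction, as the new MIR test below checks.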
1 parent 4e69258 commit 2e3fa4b

7 files changed: +125 −10 lines

llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp

+3-1
@@ -328,7 +328,9 @@ bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
 
 // taken from SIInstrInfo::isAlwaysGDS()
 bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
 }
 
 } // namespace llvm::mca

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+84-7
@@ -349,6 +349,16 @@ class WaitcntBrackets {
     LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
   }
 
+  bool hasPendingGDS() const {
+    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+  }
+
+  unsigned getPendingGDSWait() const {
+    return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
+  }
+
+  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
   // Return true if there might be pending writes to the vgpr-interval by VMEM
   // instructions with types different from V.
   bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
   unsigned PendingEvents = 0;
   // Remember the last flat memory operation.
   unsigned LastFlat[NUM_INST_CNTS] = {0};
+  // Remember the last GDS operation.
+  unsigned LastGDS = 0;
   // wait_cnt scores for every vgpr.
   // Keep track of the VgprUB and SgprUB to make merge at join efficient.
   int VgprUB = -1;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
                                MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
+  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
+                    MachineBasicBlock *Block) const;
+  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
+                             WaitcntBrackets &ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
 };
@@ -1682,6 +1698,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
     }
   }
 
+  // Wait for any pending GDS instruction to complete before any
+  // "Always GDS" instruction.
+  if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS())
+    addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
+
   if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
     // The function is going to insert a wait on everything in its prolog.
     // This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1986,6 +2007,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
          Opc == AMDGPU::GLOBAL_WBINV;
 }
 
+// Return true if the next instruction is S_ENDPGM, following fallthrough
+// blocks if necessary.
+bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
+                                    MachineBasicBlock *Block) const {
+  auto BlockEnd = Block->getParent()->end();
+  auto BlockIter = Block->getIterator();
+
+  while (true) {
+    if (It.isEnd()) {
+      if (++BlockIter != BlockEnd) {
+        It = BlockIter->instr_begin();
+        continue;
+      }
+
+      return false;
+    }
+
+    if (!It->isMetaInstruction())
+      break;
+
+    It++;
+  }
+
+  assert(!It.isEnd());
+
+  return It->getOpcode() == AMDGPU::S_ENDPGM;
+}
+
+// Add a wait after an instruction if architecture requirements mandate one.
+bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
+                                             MachineBasicBlock &Block,
+                                             WaitcntBrackets &ScoreBrackets) {
+  AMDGPU::Waitcnt Wait;
+  bool NeedsEndPGMCheck = false;
+
+  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
+    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
+                                  !SIInstrInfo::isAtomicRet(Inst));
+
+  if (TII->isAlwaysGDS(Inst.getOpcode())) {
+    Wait.DsCnt = 0;
+    NeedsEndPGMCheck = true;
+  }
+
+  ScoreBrackets.simplifyWaitcnt(Wait);
+
+  auto SuccessorIt = std::next(Inst.getIterator());
+  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
+                                /*OldWaitcntInstr=*/nullptr);
+
+  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
+    BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
+        .addImm(0);
+  }
+
+  return Result;
+}
+
 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                                WaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
@@ -1998,6 +2077,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
         TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
       ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+      ScoreBrackets->setPendingGDS();
     } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
     }
@@ -2128,6 +2208,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
 
     StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
 
+    if (T == DS_CNT)
+      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
+
     for (int J = 0; J <= VgprUB; J++)
       StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
 
@@ -2253,13 +2336,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
 
     updateEventWaitcntAfter(Inst, &ScoreBrackets);
 
-    if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
-      AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
-          Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
-      ScoreBrackets.simplifyWaitcnt(Wait);
-      Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
-                                  ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
-    }
+    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
 
     LLVM_DEBUG({
       Inst.print(dbgs());

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+3-1
@@ -4239,7 +4239,9 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
 }
 
 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
-  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
+  return Opcode == AMDGPU::DS_ORDERED_COUNT ||
+         Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
+         Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
 }
 
 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
New MIR test file

+26
@@ -0,0 +1,26 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+# GCN-LABEL: name: test_ordered_count
+# GCN: bb.0
+# GCN: DS_ADD_U32
+# GCN: DS_SUB_U32
+# GCN-NEXT: S_WAITCNT 64535
+# GCN-NEXT: $vgpr3 = DS_ORDERED_COUNT
+# GCN-NEXT: S_WAITCNT 64519
+# GCN-NEXT: $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN
+# GCN-NEXT: S_WAITCNT 64519
+# GCN-NEXT: S_NOP 0
+
+name: test_ordered_count
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    DS_ADD_U32 $vgpr1, $vgpr2, 12, -1, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    DS_SUB_U32 $vgpr1, $vgpr2, 12, 0, implicit $m0, implicit $exec :: (load store (s32), addrspace 2)
+    $vgpr3 = DS_ORDERED_COUNT $vgpr0, 772, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    $vgpr4_vgpr5 = DS_ADD_GS_REG_RTN $vgpr0, 32, implicit $m0, implicit $exec :: (load store (s32), addrspace 3)
+    S_ENDPGM 0
+
+...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll

+4
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_add_32(i32 %arg) {
 ; CHECK-LABEL: test_add_32:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_add_64(i32 %arg) {
 ; CHECK-LABEL: test_add_64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: ds_add_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll

+1-1
@@ -26,11 +26,11 @@ define amdgpu_cs float @ds_ordered_swap(ptr addrspace(2) inreg %gds, i32 %value)
 ; GCN: s_mov_b32 m0, s0
 ; VIGFX9-NEXT: s_nop 0
 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[VALUE]] offset:4868 gds
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: [[BB]]:
 ; // Wait for expcnt(0) before modifying EXEC
 ; GCN-NEXT: s_waitcnt expcnt(0)
 ; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 define amdgpu_cs float @ds_ordered_swap_conditional(ptr addrspace(2) inreg %gds, i32 %value) {
 entry:
   %c = icmp ne i32 %value, 0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll

+4
@@ -9,6 +9,8 @@ define amdgpu_gs void @test_sub_32(i32 %arg) {
 ; CHECK-LABEL: test_sub_32:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:16 gds
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: s_endpgm
   %unused = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16)
   ret void
@@ -30,6 +32,8 @@ define amdgpu_gs void @test_sub_64(i32 %arg) {
 ; CHECK-LABEL: test_sub_64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: ds_sub_gs_reg_rtn v[0:1], v0 offset:32 gds
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: s_endpgm
   %unused = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32)
   ret void
