@@ -349,6 +349,16 @@ class WaitcntBrackets {
349
349
LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
350
350
}
351
351
352
+ bool hasPendingGDS () const {
353
+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
354
+ }
355
+
356
+ unsigned getPendingGDSWait () const {
357
+ return std::min (getScoreUB (DS_CNT) - LastGDS, getWaitCountMax (DS_CNT) - 1 );
358
+ }
359
+
360
+ void setPendingGDS () { LastGDS = ScoreUBs[DS_CNT]; }
361
+
352
362
// Return true if there might be pending writes to the vgpr-interval by VMEM
353
363
// instructions with types different from V.
354
364
bool hasOtherPendingVmemTypes (RegInterval Interval, VmemType V) const {
@@ -427,6 +437,8 @@ class WaitcntBrackets {
427
437
unsigned PendingEvents = 0 ;
428
438
// Remember the last flat memory operation.
429
439
unsigned LastFlat[NUM_INST_CNTS] = {0 };
440
+ // Remember the last GDS operation.
441
+ unsigned LastGDS = 0 ;
430
442
// wait_cnt scores for every vgpr.
431
443
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
432
444
int VgprUB = -1 ;
@@ -729,6 +741,10 @@ class SIInsertWaitcnts : public MachineFunctionPass {
729
741
MachineInstr *OldWaitcntInstr);
730
742
void updateEventWaitcntAfter (MachineInstr &Inst,
731
743
WaitcntBrackets *ScoreBrackets);
744
+ bool isNextENDPGM (MachineBasicBlock::instr_iterator It,
745
+ MachineBasicBlock *Block) const ;
746
+ bool insertForcedWaitAfter (MachineInstr &Inst, MachineBasicBlock &Block,
747
+ WaitcntBrackets &ScoreBrackets);
732
748
bool insertWaitcntInBlock (MachineFunction &MF, MachineBasicBlock &Block,
733
749
WaitcntBrackets &ScoreBrackets);
734
750
};
@@ -1682,6 +1698,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1682
1698
}
1683
1699
}
1684
1700
1701
+ // Wait for any pending GDS instruction to complete before any
1702
+ // "Always GDS" instruction.
1703
+ if (TII->isAlwaysGDS (MI.getOpcode ()) && ScoreBrackets.hasPendingGDS ())
1704
+ addWait (Wait, DS_CNT, ScoreBrackets.getPendingGDSWait ());
1705
+
1685
1706
if (MI.isCall () && callWaitsOnFunctionEntry (MI)) {
1686
1707
// The function is going to insert a wait on everything in its prolog.
1687
1708
// This still needs to be careful if the call target is a load (e.g. a GOT
@@ -1986,6 +2007,64 @@ static bool isCacheInvOrWBInst(MachineInstr &Inst) {
1986
2007
Opc == AMDGPU::GLOBAL_WBINV;
1987
2008
}
1988
2009
2010
+ // Return true if the next instruction is S_ENDPGM, following fallthrough
2011
+ // blocks if necessary.
2012
+ bool SIInsertWaitcnts::isNextENDPGM (MachineBasicBlock::instr_iterator It,
2013
+ MachineBasicBlock *Block) const {
2014
+ auto BlockEnd = Block->getParent ()->end ();
2015
+ auto BlockIter = Block->getIterator ();
2016
+
2017
+ while (true ) {
2018
+ if (It.isEnd ()) {
2019
+ if (++BlockIter != BlockEnd) {
2020
+ It = BlockIter->instr_begin ();
2021
+ continue ;
2022
+ }
2023
+
2024
+ return false ;
2025
+ }
2026
+
2027
+ if (!It->isMetaInstruction ())
2028
+ break ;
2029
+
2030
+ It++;
2031
+ }
2032
+
2033
+ assert (!It.isEnd ());
2034
+
2035
+ return It->getOpcode () == AMDGPU::S_ENDPGM;
2036
+ }
2037
+
2038
+ // Add a wait after an instruction if architecture requirements mandate one.
2039
+ bool SIInsertWaitcnts::insertForcedWaitAfter (MachineInstr &Inst,
2040
+ MachineBasicBlock &Block,
2041
+ WaitcntBrackets &ScoreBrackets) {
2042
+ AMDGPU::Waitcnt Wait;
2043
+ bool NeedsEndPGMCheck = false ;
2044
+
2045
+ if (ST->isPreciseMemoryEnabled () && Inst.mayLoadOrStore ())
2046
+ Wait = WCG->getAllZeroWaitcnt (Inst.mayStore () &&
2047
+ !SIInstrInfo::isAtomicRet (Inst));
2048
+
2049
+ if (TII->isAlwaysGDS (Inst.getOpcode ())) {
2050
+ Wait.DsCnt = 0 ;
2051
+ NeedsEndPGMCheck = true ;
2052
+ }
2053
+
2054
+ ScoreBrackets.simplifyWaitcnt (Wait);
2055
+
2056
+ auto SuccessorIt = std::next (Inst.getIterator ());
2057
+ bool Result = generateWaitcnt (Wait, SuccessorIt, Block, ScoreBrackets,
2058
+ /* OldWaitcntInstr=*/ nullptr );
2059
+
2060
+ if (Result && NeedsEndPGMCheck && isNextENDPGM (SuccessorIt, &Block)) {
2061
+ BuildMI (Block, SuccessorIt, Inst.getDebugLoc (), TII->get (AMDGPU::S_NOP))
2062
+ .addImm (0 );
2063
+ }
2064
+
2065
+ return Result;
2066
+ }
2067
+
1989
2068
void SIInsertWaitcnts::updateEventWaitcntAfter (MachineInstr &Inst,
1990
2069
WaitcntBrackets *ScoreBrackets) {
1991
2070
// Now look at the instruction opcode. If it is a memory access
@@ -1998,6 +2077,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1998
2077
TII->hasModifiersSet (Inst, AMDGPU::OpName::gds)) {
1999
2078
ScoreBrackets->updateByEvent (TII, TRI, MRI, GDS_ACCESS, Inst);
2000
2079
ScoreBrackets->updateByEvent (TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2080
+ ScoreBrackets->setPendingGDS ();
2001
2081
} else {
2002
2082
ScoreBrackets->updateByEvent (TII, TRI, MRI, LDS_ACCESS, Inst);
2003
2083
}
@@ -2128,6 +2208,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2128
2208
2129
2209
StrictDom |= mergeScore (M, LastFlat[T], Other.LastFlat [T]);
2130
2210
2211
+ if (T == DS_CNT)
2212
+ StrictDom |= mergeScore (M, LastGDS, Other.LastGDS );
2213
+
2131
2214
for (int J = 0 ; J <= VgprUB; J++)
2132
2215
StrictDom |= mergeScore (M, VgprScores[T][J], Other.VgprScores [T][J]);
2133
2216
@@ -2253,13 +2336,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2253
2336
2254
2337
updateEventWaitcntAfter (Inst, &ScoreBrackets);
2255
2338
2256
- if (ST->isPreciseMemoryEnabled () && Inst.mayLoadOrStore ()) {
2257
- AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt (
2258
- Inst.mayStore () && !SIInstrInfo::isAtomicRet (Inst));
2259
- ScoreBrackets.simplifyWaitcnt (Wait);
2260
- Modified |= generateWaitcnt (Wait, std::next (Inst.getIterator ()), Block,
2261
- ScoreBrackets, /* OldWaitcntInstr=*/ nullptr );
2262
- }
2339
+ Modified |= insertForcedWaitAfter (Inst, Block, ScoreBrackets);
2263
2340
2264
2341
LLVM_DEBUG ({
2265
2342
Inst.print (dbgs ());
0 commit comments