49#define DEBUG_TYPE "si-insert-waitcnts"
53 cl::desc(
"Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc(
"Force all waitcnt load counters to wait until 0"),
63 "amdgpu-expert-scheduling-mode",
64 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69template <
typename EmitWaitcntFn>
70static void EmitExpandedWaitcnt(
unsigned Outstanding,
unsigned Target,
71 EmitWaitcntFn &&EmitWaitcnt) {
73 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0u; --
I)
93 TRACKINGID_RANGE_LEN = (1 << 16),
98 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
103 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
104 LDSDMA_BEGIN = REGUNITS_END,
105 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
109static constexpr VMEMID toVMEMID(MCRegUnit RU) {
110 return static_cast<unsigned>(RU);
122 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
123 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
124 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
125 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
126 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
131 switch (
MI.getOpcode()) {
132 case AMDGPU::ASYNCMARK:
133 case AMDGPU::WAIT_ASYNCMARK:
136 return MI.isMetaInstruction();
151class WaitcntBrackets;
159class WaitcntGenerator {
161 const GCNSubtarget &ST;
162 const SIInstrInfo &TII;
163 AMDGPU::IsaVersion IV;
166 bool ExpandWaitcntProfiling =
false;
167 const AMDGPU::HardwareLimits &Limits;
170 WaitcntGenerator() =
delete;
171 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
172 WaitcntGenerator(
const MachineFunction &MF,
174 const AMDGPU::HardwareLimits &Limits)
175 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
179 ExpandWaitcntProfiling(
180 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
/// Accessor: returns the current value of the OptNone member unchanged.
185 bool isOptNone()
const {
return OptNone; }
201 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
202 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
206 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
211 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
213 AMDGPU::Waitcnt
Wait,
214 const WaitcntBrackets &ScoreBrackets) = 0;
221 assert(
E.size() == 1 &&
"Cannot handle a mask of events!");
223 if (getWaitEvents(
T) &
E)
234 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
236 virtual ~WaitcntGenerator() =
default;
239class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
242 HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
243 HWEvents::VMEM_BVH_READ_ACCESS,
244 HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
245 HWEvents::SQ_MESSAGE,
246 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
247 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
248 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
249 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
260 using WaitcntGenerator::WaitcntGenerator;
262 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
263 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
266 bool createNewWaitcnt(MachineBasicBlock &
Block,
268 AMDGPU::Waitcnt
Wait,
269 const WaitcntBrackets &ScoreBrackets)
override;
272 HWEvents EVs = WaitEventMaskForInstPreGFX12[
T];
278 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
281class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
286 HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
287 HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
288 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
289 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
290 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
292 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
293 HWEvents::VMEM_SAMPLER_READ_ACCESS,
294 HWEvents::VMEM_BVH_READ_ACCESS,
296 HWEvents::SMEM_ACCESS | HWEvents::SQ_MESSAGE | HWEvents::SCC_WRITE,
297 HWEvents::VMEM_GROUP | HWEvents::SMEM_GROUP,
298 HWEvents::ASYNC_ACCESS,
299 HWEvents::TENSOR_ACCESS,
300 HWEvents::VGPR_CSMACC_WRITE | HWEvents::VGPR_DPMACC_WRITE |
301 HWEvents::VGPR_TRANS_WRITE | HWEvents::VGPR_XDL_WRITE,
302 HWEvents::VGPR_LDS_READ | HWEvents::VGPR_FLAT_READ |
303 HWEvents::VGPR_VMEM_READ};
306 WaitcntGeneratorGFX12Plus() =
delete;
307 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
309 const AMDGPU::HardwareLimits &Limits,
311 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
314 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
315 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
318 bool createNewWaitcnt(MachineBasicBlock &
Block,
320 AMDGPU::Waitcnt
Wait,
321 const WaitcntBrackets &ScoreBrackets)
override;
324 return WaitEventMaskForInstGFX12Plus[
T];
327 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
331struct PreheaderFlushFlags {
332 bool FlushVmCnt =
false;
333 bool FlushDsCnt =
false;
336class SIInsertWaitcnts {
337 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
338 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
339 MachineLoopInfo &MLI;
340 MachinePostDominatorTree &PDT;
345 std::unique_ptr<WaitcntBrackets> Incoming;
347 BlockInfo() =
default;
348 BlockInfo(BlockInfo &&) =
default;
349 BlockInfo &operator=(BlockInfo &&) =
default;
353 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
357 std::unique_ptr<WaitcntGenerator> WCG;
360 DenseSet<MachineInstr *> CallInsts;
361 DenseSet<MachineInstr *> ReturnInsts;
366 DenseMap<MachineInstr *, bool> EndPgmInsts;
368 AMDGPU::HardwareLimits Limits;
371 const GCNSubtarget &ST;
372 const SIInstrInfo &TII;
373 const SIRegisterInfo &TRI;
374 const MachineRegisterInfo &MRI;
377 bool IsExpertMode =
false;
379 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
381 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
382 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
383 MRI(MF.getRegInfo()) {}
/// Returns a read-only reference to the Limits member
/// (an AMDGPU::HardwareLimits). No copy is made.
385 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
387 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
388 const WaitcntBrackets &Brackets);
389 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
390 const WaitcntBrackets &ScoreBrackets);
391 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
392 bool isDSRead(
const MachineInstr &
MI)
const;
393 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
396 bool isAsync(
const MachineInstr &
MI)
const {
401 const MachineOperand *
Async =
402 TII.getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
406 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
410 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
414 bool shouldUpdateAsyncMark(
const MachineInstr &
MI,
418 if (!isAsyncLdsDmaWrite(
MI))
425 bool isVmemAccess(
const MachineInstr &
MI)
const;
426 bool generateWaitcntInstBefore(MachineInstr &
MI,
427 WaitcntBrackets &ScoreBrackets,
428 MachineInstr *OldWaitcntInstr,
429 PreheaderFlushFlags FlushFlags);
430 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
432 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
433 MachineInstr *OldWaitcntInstr);
434 void updateEventWaitcntAfter(MachineInstr &Inst,
435 WaitcntBrackets *ScoreBrackets);
437 MachineBasicBlock *
Block)
const;
438 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
439 WaitcntBrackets &ScoreBrackets);
440 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
441 WaitcntBrackets &ScoreBrackets);
444 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
446 bool ExpertMode)
const;
448 return WCG->getWaitEvents(
T);
451 return WCG->getCounterFromEvent(
E);
463class WaitcntBrackets {
465 WaitcntBrackets(
const SIInsertWaitcnts *Context) : Context(Context) {
466 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
471 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
472 for (
auto &[
ID, Val] : VMem) {
476 for (
auto &[
ID, Val] : SGPRs) {
481 if (NumUnusedVmem || NumUnusedSGPRs) {
482 errs() <<
"WaitcntBracket had unused entries at destruction time: "
483 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
484 <<
" SGPR unused entries\n";
495 return ScoreUBs[
T] - ScoreLBs[
T];
499 return getVMemScore(
ID,
T) > getScoreLB(
T);
517 return getScoreUB(
T) - getScoreLB(
T);
521 auto It = SGPRs.find(RU);
522 return It != SGPRs.end() ? It->second.get(
T) : 0;
526 auto It = VMem.find(TID);
527 return It != VMem.end() ? It->second.Scores[
T] : 0;
534 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
537 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
538 AMDGPU::Waitcnt &UpdateWait)
const;
541 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
542 AMDGPU::Waitcnt &UpdateWait)
const;
543 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
544 AMDGPU::Waitcnt &UpdateWait)
const;
547 AMDGPU::Waitcnt &
Wait,
548 const MachineInstr &
MI)
const;
549 MCPhysReg determineVGPR16Dependency(
const MachineInstr &
MI,
553 AMDGPU::Waitcnt &
Wait)
const;
554 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
555 void tryClearSCCWriteEvent(MachineInstr *Inst);
557 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
561 void recordAsyncMark(MachineInstr &
MI);
/// Returns a copy of the PendingEvents member.
563 HWEvents getPendingEvents()
const {
return PendingEvents; }
/// Returns PendingEvents.any().
/// NOTE(review): presumably "at least one event is pending"; confirm against
/// the HWEvents definition, which is not visible here.
564 bool hasPendingEvent()
const {
return PendingEvents.
any(); }
/// Returns PendingEvents.contains(E).
/// NOTE(review): how contains() treats a mask with more than one event
/// (all-of vs. any-of) depends on HWEvents, which is not visible here.
565 bool hasPendingEvent(
HWEvents E)
const {
return PendingEvents.contains(
E); }
567 bool HasPending = (PendingEvents & Context->getWaitEvents(
T)).any();
569 "Expected pending events iff scoreboard is not empty");
574 HWEvents Events = PendingEvents & Context->getWaitEvents(
T);
576 return Events.
size() > 1;
579 bool hasPendingFlat()
const {
586 void setPendingFlat() {
591 bool hasPendingGDS()
const {
596 unsigned getPendingGDSWait()
const {
606 for (MCRegUnit RU : regunits(
Reg)) {
607 auto It = VMem.find(toVMEMID(RU));
608 if (It != VMem.end() && (It->second.VGPRPendingEvents & ~
E).any())
615 for (MCRegUnit RU : regunits(
Reg)) {
616 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
618 if (It->second.empty())
624 void setStateOnFunctionEntryOrReturn() {
630 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
634 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
635 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
638 void print(raw_ostream &)
const;
643 void purgeEmptyTrackingData();
647 return Context->getLimits().get(
T);
657 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
660 AMDGPU::Waitcnt &
Wait)
const;
662 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
663 unsigned OtherScore);
668 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
669 if (!Context->TRI.isInAllocatableClass(
Reg))
671 return Context->TRI.regunits(
Reg);
692 const SIRegisterInfo &
TRI = Context->TRI;
693 if (
Reg == AMDGPU::SCC) {
695 }
else if (
TRI.isVectorRegister(Context->MRI,
Reg)) {
696 for (MCRegUnit RU : regunits(
Reg))
697 VMem[toVMEMID(RU)].Scores[
T] = Val;
698 }
else if (
TRI.isSGPRReg(Context->MRI,
Reg)) {
699 for (MCRegUnit RU : regunits(
Reg))
700 SGPRs[RU].get(
T) = Val;
707 VMem[TID].Scores[
T] = Val;
710 void setScoreByOperand(
const MachineOperand &
Op,
713 const SIInsertWaitcnts *Context;
719 unsigned LastFlatDsCnt = 0;
720 unsigned LastFlatLoadCnt = 0;
722 unsigned LastGDS = 0;
739 CounterValueArray Scores{};
753 unsigned ScoreDsKmCnt = 0;
754 unsigned ScoreXCnt = 0;
/// True when both ScoreDsKmCnt and ScoreXCnt are zero, i.e. this entry
/// holds no non-zero score.
/// NOTE(review): presumably the predicate used when purging unused SGPR
/// tracking entries; confirm against the enclosing struct.
770 bool empty()
const {
return !ScoreDsKmCnt && !ScoreXCnt; }
773 DenseMap<VMEMID, VMEMInfo> VMem;
774 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
777 unsigned SCCScore = 0;
779 const MachineInstr *PendingSCCWrite =
nullptr;
783 SmallVector<const MachineInstr *> LDSDMAStores;
792 static constexpr unsigned MaxAsyncMarks = 16;
796 CounterValueArray AsyncScore{};
// Out-of-line defaulted destructor for BlockInfo.
// NOTE(review): presumably defined here, after the WaitcntBrackets class
// body, so that WaitcntBrackets is a complete type when the
// std::unique_ptr<WaitcntBrackets> member is destroyed -- confirm.
799SIInsertWaitcnts::BlockInfo::~BlockInfo() =
default;
/// Default constructor: forwards this class's static ID to the
/// MachineFunctionPass base constructor and does nothing else.
804 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
806 bool runOnMachineFunction(MachineFunction &MF)
override;
808 StringRef getPassName()
const override {
809 return "SI insert wait instructions";
812 void getAnalysisUsage(AnalysisUsage &AU)
const override {
815 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
827 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
835bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
840 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
850bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
852 if (!hasPointSampleAccel(
MI))
855 return hasDifferentVGPRPendingEvents(
Reg, HWEvents::VMEM_READ_ACCESS);
858void WaitcntBrackets::updateByEvent(
HWEvents E, MachineInstr &Inst) {
859 assert(
E.size() == 1 &&
"Expected singular event!");
863 unsigned UB = getScoreUB(
T);
866 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
878 setScoreUB(
T, CurrScore);
881 const MachineRegisterInfo &MRI =
Context->MRI;
890 if (
const auto *AddrOp =
TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
894 if (
const auto *Data0 =
895 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
897 if (
const auto *Data1 =
898 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
902 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
903 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
904 for (
const MachineOperand &
Op : Inst.
all_uses()) {
905 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
909 }
else if (
TII.isFLAT(Inst)) {
911 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
914 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
917 }
else if (
TII.isMIMG(Inst)) {
921 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
924 }
else if (
TII.isMTBUF(Inst)) {
927 }
else if (
TII.isMUBUF(Inst)) {
931 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
934 }
else if (
TII.isLDSDIR(Inst)) {
936 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
939 if (
TII.isEXP(Inst)) {
944 for (MachineOperand &DefMO : Inst.
all_defs()) {
945 if (
TRI.isVGPR(MRI, DefMO.getReg())) {
950 for (
const MachineOperand &
Op : Inst.
all_uses()) {
951 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
957 E == HWEvents::SMEM_GROUP ? HWEvents::VMEM_GROUP : HWEvents::SMEM_GROUP;
958 if (PendingEvents.
contains(OtherEvent)) {
963 setScoreLB(
T, getScoreUB(
T) - 1);
964 PendingEvents -= OtherEvent;
966 for (
const MachineOperand &
Op : Inst.
all_uses())
967 setScoreByOperand(
Op,
T, CurrScore);
971 for (
const MachineOperand &
Op : Inst.
operands()) {
976 setScoreByOperand(
Op,
T, CurrScore);
988 for (
const MachineOperand &
Op : Inst.
defs()) {
991 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
993 if (updateVMCntOnly(Inst)) {
1002 if (hasPointSampleAccel(Inst))
1003 VGPRContext |= HWEvents::VMEM_READ_ACCESS;
1004 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1005 VMem[toVMEMID(RU)].VGPRPendingEvents |= VGPRContext;
1008 setScoreByOperand(
Op,
T, CurrScore);
1011 (
TII.isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1020 if (!MemOp->isStore() ||
1025 auto AAI = MemOp->getAAInfo();
1031 if (!AAI || !AAI.Scope)
1033 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1034 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1035 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1050 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1051 if (Slot && Slot < NUM_LDSDMA)
1052 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1055 if (
Context->shouldUpdateAsyncMark(Inst,
T)) {
1056 AsyncScore[
T] = CurrScore;
1060 setRegScore(AMDGPU::SCC,
T, CurrScore);
1061 PendingSCCWrite = &Inst;
1066void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1072 AsyncMarks.push_back(AsyncScore);
1075 dbgs() <<
"recordAsyncMark:\n" << Inst;
1076 for (
const auto &Mark : AsyncMarks) {
1083void WaitcntBrackets::print(raw_ostream &OS)
const {
1087 unsigned SR = getScoreRange(
T);
1090 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1094 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1098 OS <<
" EXP_CNT(" << SR <<
"):";
1101 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1105 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1108 OS <<
" BVH_CNT(" << SR <<
"):";
1111 OS <<
" KM_CNT(" << SR <<
"):";
1114 OS <<
" X_CNT(" << SR <<
"):";
1117 OS <<
" ASYNC_CNT(" << SR <<
"):";
1120 OS <<
" VA_VDST(" << SR <<
"): ";
1123 OS <<
" VM_VSRC(" << SR <<
"): ";
1126 OS <<
" UNKNOWN(" << SR <<
"):";
1132 unsigned LB = getScoreLB(
T);
1135 sort(SortedVMEMIDs);
1137 for (
auto ID : SortedVMEMIDs) {
1138 unsigned RegScore = VMem.at(
ID).Scores[
T];
1141 unsigned RelScore = RegScore - LB - 1;
1142 if (
ID < REGUNITS_END) {
1143 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1145 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1146 "Unhandled/unexpected ID value!");
1147 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1152 if (isSmemCounter(
T)) {
1154 sort(SortedSMEMIDs);
1155 for (
auto ID : SortedSMEMIDs) {
1156 unsigned RegScore = SGPRs.at(
ID).get(
T);
1159 unsigned RelScore = RegScore - LB - 1;
1160 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1165 OS <<
' ' << SCCScore <<
":scc";
1170 OS <<
"Pending Events: ";
1171 if (hasPendingEvent()) {
1172 OS << getPendingEvents();
1178 OS <<
"Async score: ";
1179 if (AsyncScore.empty())
1185 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1187 for (
const auto &Mark : AsyncMarks) {
1189 unsigned MarkedScore = Mark[
T];
1192 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1193 <<
"_CNT: " << MarkedScore;
1196 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1197 <<
"_CNT: " << MarkedScore;
1200 OS <<
" EXP_CNT: " << MarkedScore;
1203 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS")
1204 <<
"_CNT: " << MarkedScore;
1207 OS <<
" SAMPLE_CNT: " << MarkedScore;
1210 OS <<
" BVH_CNT: " << MarkedScore;
1213 OS <<
" KM_CNT: " << MarkedScore;
1216 OS <<
" X_CNT: " << MarkedScore;
1219 OS <<
" ASYNC_CNT: " << MarkedScore;
1222 OS <<
" UNKNOWN: " << MarkedScore;
1233void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1234 AMDGPU::Waitcnt &UpdateWait)
const {
1242 simplifyXcnt(CheckWait, UpdateWait);
1244 simplifyVmVsrc(CheckWait, UpdateWait);
1249 unsigned &
Count)
const {
1253 if (
Count >= getScoreRange(
T))
1257void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &
Wait,
1259 unsigned Cnt =
Wait.get(
T);
1260 simplifyWaitcnt(
T, Cnt);
1264void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1265 AMDGPU::Waitcnt &UpdateWait)
const {
1275 hasPendingEvent(HWEvents::SMEM_GROUP))
1281 hasPendingEvent(HWEvents::VMEM_GROUP) &&
1288void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1289 AMDGPU::Waitcnt &UpdateWait)
const {
1294 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1295 CheckWait.get(AMDGPU::STORE_CNT),
1296 CheckWait.get(AMDGPU::SAMPLE_CNT),
1297 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1302void WaitcntBrackets::purgeEmptyTrackingData() {
1303 VMem.remove_if([](
const auto &
P) {
return P.second.empty(); });
1304 SGPRs.remove_if([](
const auto &
P) {
return P.second.empty(); });
1308 unsigned ScoreToWait,
1309 AMDGPU::Waitcnt &
Wait)
const {
1310 const unsigned LB = getScoreLB(
T);
1311 const unsigned UB = getScoreUB(
T);
1314 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1316 !
Context->ST.hasFlatLgkmVMemCountInOrder()) {
1321 }
else if (counterOutOfOrder(
T)) {
1329 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(
T) - 1);
1330 Wait.add(
T, NeededWait);
1335AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1337 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1339 for (
const auto &Mark : AsyncMarks) {
1345 if (AsyncMarks.size() == MaxAsyncMarks) {
1350 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1351 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1354 AMDGPU::Waitcnt
Wait;
1355 if (AsyncMarks.size() <=
N) {
1360 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1361 const auto &RequiredMark = AsyncMarks[MarkIndex];
1363 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1369 dbgs() <<
"Removing " << (MarkIndex + 1)
1370 <<
" async marks after determining wait\n";
1372 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1385MCPhysReg WaitcntBrackets::determineVGPR16Dependency(
const MachineInstr &
MI,
1388 const TargetRegisterClass *RC =
Context->TRI.getPhysRegBaseClass(
Reg);
1389 unsigned Size =
Context->TRI.getRegSizeInBits(*RC);
1391 if (
Size != 16 || !
Context->ST.hasD16Writes32BitVgpr())
1401 AMDGPU::Waitcnt
Wait;
1402 for (MCRegUnit RU : regunits(OtherHalf))
1403 determineWaitForScore(
T, getVMemScore(toVMEMID(RU),
T),
Wait);
1406 if (!
Wait.hasWait())
1416 HWEvents Events = MIEvents & OtherHalfEvents;
1417 if (Events.
size() > 1)
1424 AMDGPU::Waitcnt &
Wait,
1425 const MachineInstr &
MI)
const {
1426 if (
Reg == AMDGPU::SCC) {
1427 determineWaitForScore(
T, SCCScore,
Wait);
1431 Reg = determineVGPR16Dependency(
MI,
T,
Reg);
1432 for (MCRegUnit RU : regunits(
Reg))
1433 determineWaitForScore(
1434 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1441 AMDGPU::Waitcnt &
Wait)
const {
1442 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1443 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1446void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1449 if (PendingSCCWrite &&
1450 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1452 HWEvents SCC_WRITE_PendingEvent = HWEvents::SCC_WRITE;
1455 SCC_WRITE_PendingEvent) {
1459 PendingEvents -= SCC_WRITE_PendingEvent;
1460 PendingSCCWrite =
nullptr;
1464void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1466 applyWaitcnt(
Wait,
T);
1470 const unsigned UB = getScoreUB(
T);
1474 if (counterOutOfOrder(
T))
1476 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1479 PendingEvents -=
Context->getWaitEvents(
T);
1483 hasPendingEvent(HWEvents::SMEM_GROUP)) {
1487 PendingEvents -= HWEvents::SMEM_GROUP;
1493 else if (
Count == 0)
1494 PendingEvents -= HWEvents::VMEM_GROUP;
1498void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait,
1500 unsigned Cnt =
Wait.get(
T);
1501 applyWaitcnt(
T, Cnt);
1508 if ((
T ==
Context->SmemAccessCounter &&
1509 hasPendingEvent(HWEvents::SMEM_ACCESS)) ||
1526 static constexpr HWEvents ExtendedImageEvents =
1527 HWEvents::VMEM_SAMPLER_READ_ACCESS | HWEvents::VMEM_BVH_READ_ACCESS;
1528 if (!
Context->ST.hasExtendedWaitCounts() &&
1529 (Events & ExtendedImageEvents).any()) {
1530 Events -= ExtendedImageEvents;
1533 Events |= HWEvents::VMEM_READ_ACCESS;
1539 Events -= HWEvents::GLOBAL_INV_ACCESS;
1543 return Events.
size() > 1;
1546 return hasMixedPendingEvents(
T);
// Definition of the static ID member that the SIInsertWaitcntsLegacy
// constructor passes to MachineFunctionPass. Initialized to 0.
// NOTE(review): in the legacy pass manager the address of this object,
// not its value, is what identifies the pass -- confirm.
1556char SIInsertWaitcntsLegacy::
ID = 0;
1561 return new SIInsertWaitcntsLegacy();
1566 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1571 if (NewEnc == MO.
getImm())
1578bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1592bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1593 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1595 assert(isNormalMode(MaxCounter));
1598 MachineInstr *WaitcntInstr =
nullptr;
1599 MachineInstr *WaitcntVsCntInstr =
nullptr;
1602 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1604 dbgs() <<
"end of block\n";
1612 if (isNonWaitcntMetaInst(
II)) {
1618 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1622 if (Opcode == AMDGPU::S_WAITCNT) {
1623 unsigned IEnc =
II.getOperand(0).getImm();
1626 ScoreBrackets.simplifyWaitcnt(OldWait);
1630 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1631 II.eraseFromParent();
1635 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1638 <<
"Before: " <<
Wait <<
'\n';);
1649 II.eraseFromParent();
1650 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1651 unsigned N =
II.getOperand(0).getImm();
1653 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1656 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1657 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1660 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1666 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1667 II.eraseFromParent();
1670 WaitcntVsCntInstr = &
II;
1677 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1686 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1687 <<
"New Instr at block end: "
1688 << *WaitcntInstr <<
'\n'
1689 :
dbgs() <<
"applied pre-existing waitcnt\n"
1690 <<
"Old Instr: " << *It
1691 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1694 if (WaitcntVsCntInstr) {
1698 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1704 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1705 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1707 :
dbgs() <<
"applied pre-existing waitcnt\n"
1708 <<
"Old Instr: " << *It
1709 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1717bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1719 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1720 assert(isNormalMode(MaxCounter));
1727 if (
Wait.hasWaitExceptStoreCnt()) {
1729 if (ExpandWaitcntProfiling) {
1733 bool AnyOutOfOrder =
false;
1735 unsigned WaitCnt =
Wait.get(CT);
1736 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1737 AnyOutOfOrder =
true;
1742 if (AnyOutOfOrder) {
1750 unsigned WaitCnt =
Wait.get(CT);
1754 unsigned Outstanding =
1755 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1756 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1768 [[maybe_unused]]
auto SWaitInst =
1773 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1774 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1778 if (
Wait.hasWaitStoreCnt()) {
1784 unsigned Outstanding =
1787 EmitExpandedWaitcnt(
1789 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1790 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1795 [[maybe_unused]]
auto SWaitInst =
1797 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1802 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1803 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1811WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1812 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1816WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1817 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1818 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1820 ~0u , ExpertVal, ExpertVal);
1827bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1828 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1830 assert(!isNormalMode(MaxCounter));
1833 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1834 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1835 MachineInstr *WaitcntDepctrInstr =
nullptr;
1839 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
1841 dbgs() <<
"end of block\n";
1847 AMDGPU::Waitcnt RequiredWait;
1852 if (isNonWaitcntMetaInst(
II)) {
1861 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1865 if (Opcode == AMDGPU::S_WAITCNT)
1868 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1870 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1875 RequiredWait = RequiredWait.combined(OldWait);
1877 if (CombinedLoadDsCntInstr ==
nullptr) {
1878 CombinedLoadDsCntInstr = &
II;
1880 II.eraseFromParent();
1883 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1885 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1890 RequiredWait = RequiredWait.combined(OldWait);
1892 if (CombinedStoreDsCntInstr ==
nullptr) {
1893 CombinedStoreDsCntInstr = &
II;
1895 II.eraseFromParent();
1898 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1900 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1901 AMDGPU::Waitcnt OldWait;
1905 ScoreBrackets.simplifyWaitcnt(OldWait);
1907 if (WaitcntDepctrInstr ==
nullptr) {
1908 WaitcntDepctrInstr = &
II;
1917 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1925 II.eraseFromParent();
1929 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1932 II.eraseFromParent();
1934 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1937 unsigned N =
II.getOperand(0).getImm();
1938 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1941 std::optional<AMDGPU::InstCounterType> CT =
1945 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1947 Wait.add(CT.value(), OldCnt);
1949 RequiredWait.add(CT.value(), OldCnt);
1951 if (WaitInstrs[CT.value()] ==
nullptr) {
1952 WaitInstrs[CT.value()] = &
II;
1954 II.eraseFromParent();
1960 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
1961 Wait =
Wait.combined(RequiredWait);
1963 if (CombinedLoadDsCntInstr) {
1979 AMDGPU::OpName::simm16, NewEnc);
1980 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1986 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1987 <<
"New Instr at block end: "
1988 << *CombinedLoadDsCntInstr <<
'\n'
1989 :
dbgs() <<
"applied pre-existing waitcnt\n"
1990 <<
"Old Instr: " << *It <<
"New Instr: "
1991 << *CombinedLoadDsCntInstr <<
'\n');
1998 if (CombinedStoreDsCntInstr) {
2003 AMDGPU::OpName::simm16, NewEnc);
2004 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2010 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2011 <<
"New Instr at block end: "
2012 << *CombinedStoreDsCntInstr <<
'\n'
2013 :
dbgs() <<
"applied pre-existing waitcnt\n"
2014 <<
"Old Instr: " << *It <<
"New Instr: "
2015 << *CombinedStoreDsCntInstr <<
'\n');
2045 for (MachineInstr **WI : WaitsToErase) {
2049 (*WI)->eraseFromParent();
2056 if (!WaitInstrs[CT])
2059 unsigned NewCnt =
Wait.get(CT);
2060 if (NewCnt != ~0u) {
2062 AMDGPU::OpName::simm16, NewCnt);
2063 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2065 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2069 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2070 <<
"New Instr at block end: " << *WaitInstrs[CT]
2072 :
dbgs() <<
"applied pre-existing waitcnt\n"
2073 <<
"Old Instr: " << *It
2074 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2081 if (WaitcntDepctrInstr) {
2085 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2100 AMDGPU::OpName::simm16, Enc);
2102 <<
"New Instr at block end: "
2103 << *WaitcntDepctrInstr <<
'\n'
2104 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2105 <<
"Old Instr: " << *It <<
"New Instr: "
2106 << *WaitcntDepctrInstr <<
'\n');
2117bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2119 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2120 assert(!isNormalMode(MaxCounter));
2127 if (ExpandWaitcntProfiling) {
2134 if (ScoreBrackets.counterOutOfOrder(CT)) {
2141 unsigned Outstanding =
2142 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2143 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2155 MachineInstr *SWaitInst =
nullptr;
2179 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2180 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2192 [[maybe_unused]]
auto SWaitInst =
2199 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2200 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2203 if (
Wait.hasWaitDepctr()) {
2209 [[maybe_unused]]
auto SWaitInst =
2215 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2216 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2235bool SIInsertWaitcnts::generateWaitcntInstBefore(
2236 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2237 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2242 AMDGPU::Waitcnt
Wait;
2243 const unsigned Opc =
MI.getOpcode();
2246 case AMDGPU::BUFFER_WBINVL1:
2247 case AMDGPU::BUFFER_WBINVL1_SC:
2248 case AMDGPU::BUFFER_WBINVL1_VOL:
2249 case AMDGPU::BUFFER_GL0_INV:
2250 case AMDGPU::BUFFER_GL1_INV: {
2258 case AMDGPU::SI_RETURN_TO_EPILOG:
2259 case AMDGPU::SI_RETURN:
2260 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2261 case AMDGPU::S_SETPC_B64_return: {
2266 AMDGPU::Waitcnt AllZeroWait =
2267 WCG->getAllZeroWaitcnt(
false);
2272 if (
ST.hasExtendedWaitCounts() &&
2273 !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
2278 case AMDGPU::S_ENDPGM:
2279 case AMDGPU::S_ENDPGM_SAVED: {
2290 !ScoreBrackets.hasPendingEvent(HWEvents::SCRATCH_WRITE_ACCESS);
2293 case AMDGPU::S_SENDMSG:
2294 case AMDGPU::S_SENDMSGHALT: {
2295 if (
ST.hasLegacyGeometry() &&
2310 if (
MI.modifiesRegister(AMDGPU::EXEC, &
TRI)) {
2313 if (ScoreBrackets.hasPendingEvent(HWEvents::EXP_GPR_LOCK) ||
2314 ScoreBrackets.hasPendingEvent(HWEvents::EXP_PARAM_ACCESS) ||
2315 ScoreBrackets.hasPendingEvent(HWEvents::EXP_POS_ACCESS) ||
2316 ScoreBrackets.hasPendingEvent(HWEvents::GDS_GPR_LOCK)) {
2323 if (
TII.isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2331 Wait = AMDGPU::Waitcnt();
2333 const MachineOperand &CallAddrOp =
TII.getCalleeOperand(
MI);
2334 if (CallAddrOp.
isReg()) {
2335 ScoreBrackets.determineWaitForPhysReg(
2338 if (
const auto *RtnAddrOp =
2339 TII.getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2340 ScoreBrackets.determineWaitForPhysReg(
2341 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait,
MI);
2344 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2345 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2361 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2362 const Value *Ptr = Memop->getValue();
2363 if (Memop->isStore()) {
2364 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2365 Wait.add(SmemAccessCounter, 0);
2367 SLoadAddresses.
erase(It);
2370 unsigned AS = Memop->getAddrSpace();
2374 if (
TII.mayWriteLDSThroughDMA(
MI))
2378 unsigned TID = LDSDMA_BEGIN;
2379 if (Ptr && Memop->getAAInfo()) {
2380 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2381 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2382 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2383 if ((
I + 1) >= NUM_LDSDMA) {
2398 if (Memop->isStore()) {
2404 for (
const MachineOperand &
Op :
MI.operands()) {
2409 if (
Op.isTied() &&
Op.isUse() &&
TII.doesNotReadTiedSource(
MI))
2414 const bool IsVGPR =
TRI.isVectorRegister(MRI,
Op.getReg());
2421 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2434 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2435 ScoreBrackets.hasDifferentVGPRPendingEvents(
2437 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2438 !
ST.hasVmemWriteVgprInOrder()) {
2445 ScoreBrackets.clearVGPRPendingEvents(
Reg);
2449 ScoreBrackets.hasPendingEvent(HWEvents::EXP_LDS_ACCESS)) {
2454 }
else if (
Op.getReg() == AMDGPU::SCC) {
2457 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait,
2461 if (
ST.hasWaitXcnt() &&
Op.isDef())
2480 if (
Opc == AMDGPU::S_BARRIER && !
ST.hasAutoWaitcntBeforeBarrier() &&
2481 !
ST.hasBackOffBarrier()) {
2482 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2489 ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS)) {
2494 ScoreBrackets.simplifyWaitcnt(
Wait);
2514 Wait = WCG->getAllZeroWaitcnt(
false);
2518 if (!ForceEmitWaitcnt[
T])
2523 if (FlushFlags.FlushVmCnt) {
2529 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
2535 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2539bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2541 MachineBasicBlock &
Block,
2542 WaitcntBrackets &ScoreBrackets,
2543 MachineInstr *OldWaitcntInstr) {
2546 if (OldWaitcntInstr)
2550 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2555 MachineOperand *WaitExp =
TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2565 <<
"Update Instr: " << *It);
2568 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2573 ScoreBrackets.applyWaitcnt(
Wait);
2578bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2579 return (
TII.isFLAT(
MI) &&
TII.mayAccessVMEMThroughFlat(
MI)) ||
2586 MachineBasicBlock *
Block)
const {
2587 auto BlockEnd =
Block->getParent()->end();
2588 auto BlockIter =
Block->getIterator();
2592 if (++BlockIter != BlockEnd) {
2593 It = BlockIter->instr_begin();
2600 if (!It->isMetaInstruction())
2608 return It->getOpcode() == AMDGPU::S_ENDPGM;
2612bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2613 MachineBasicBlock &
Block,
2614 WaitcntBrackets &ScoreBrackets) {
2615 AMDGPU::Waitcnt
Wait;
2616 bool NeedsEndPGMCheck =
false;
2624 NeedsEndPGMCheck =
true;
2627 ScoreBrackets.simplifyWaitcnt(
Wait);
2630 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2633 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2641void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2642 WaitcntBrackets *ScoreBrackets) {
2646 ScoreBrackets->updateByEvent(
E, Inst);
2648 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2650 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2651 ScoreBrackets->setPendingGDS();
2653 }
else if (
TII.isFLAT(Inst)) {
2661 ScoreBrackets->setPendingFlat();
2663 }
else if (Inst.
isCall()) {
2666 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2667 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2668 }
else if (
TII.isVINTERP(Inst)) {
2669 int64_t
Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
/// Merge one score from another bracket state into \p Score.
///
/// Both scores are first rebased using the merge parameters in \p M: a score
/// at or below its own side's lower bound (M.OldLB for \p Score, M.OtherLB for
/// \p OtherScore) becomes 0, and any other score has its side's shift
/// (M.MyShift or M.OtherShift) added. \p Score is then overwritten with the
/// larger of the two rebased values.
///
/// \returns true if the rebased \p OtherScore is strictly greater than the
/// rebased \p Score, i.e. the value stored in \p Score came from the other
/// side.
2679bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2680 unsigned OtherScore) {
// Rebase this side's score: 0 if it does not exceed OldLB, else add MyShift.
2681 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
// Rebase the other side's score the same way, using OtherLB and OtherShift.
2682 unsigned OtherShifted =
2683 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
// Keep the larger rebased score in place.
2684 Score = std::max(MyShifted, OtherShifted);
// Report whether the other side's score was the strictly larger one.
2685 return OtherShifted > MyShifted;
2690 bool StrictDom =
false;
2694 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2701 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2702 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2705 if (AsyncMarks.size() > MaxSize)
2706 AsyncMarks.erase(AsyncMarks.begin(),
2707 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2713 constexpr CounterValueArray ZeroMark{};
2714 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2717 dbgs() <<
"Before merge:\n";
2718 for (
const auto &Mark : AsyncMarks) {
2722 dbgs() <<
"Other marks:\n";
2723 for (
const auto &Mark : OtherMarks) {
2732 unsigned OtherSize = OtherMarks.size();
2733 unsigned OurSize = AsyncMarks.size();
2734 unsigned MergeCount = std::min(OtherSize, OurSize);
2738 if (MergeCount == 0)
2742 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2743 OtherMarks[OtherSize - Idx][
T]);
2748 dbgs() <<
"After merge:\n";
2749 for (
const auto &Mark : AsyncMarks) {
2763bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
2764 bool StrictDom =
false;
2768 for (
auto K :
Other.VMem.keys())
2769 VMem.try_emplace(K);
2770 for (
auto K :
Other.SGPRs.keys())
2771 SGPRs.try_emplace(K);
2779 const HWEvents OldEvents = PendingEvents & EventsForT;
2780 const HWEvents OtherEvents =
Other.PendingEvents & EventsForT;
2781 if (!OldEvents.
contains(OtherEvents))
2783 PendingEvents |= OtherEvents;
2786 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2787 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2788 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
2789 if (NewUB < ScoreLBs[
T])
2792 MergeInfo &
M = MergeInfos[
T];
2793 M.OldLB = ScoreLBs[
T];
2794 M.OtherLB =
Other.ScoreLBs[
T];
2795 M.MyShift = NewUB - ScoreUBs[
T];
2796 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2798 ScoreUBs[
T] = NewUB;
2801 StrictDom |= mergeScore(M, LastFlatLoadCnt,
Other.LastFlatLoadCnt);
2804 StrictDom |= mergeScore(M, LastFlatDsCnt,
Other.LastFlatDsCnt);
2805 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
2809 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
2810 if (
Other.hasPendingEvent(HWEvents::SCC_WRITE)) {
2811 if (!(OldEvents & HWEvents::SCC_WRITE)) {
2812 PendingSCCWrite =
Other.PendingSCCWrite;
2813 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
2814 PendingSCCWrite =
nullptr;
2819 for (
auto &[RegID, Info] : VMem)
2820 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
2822 if (isSmemCounter(
T)) {
2823 for (
auto &[RegID, Info] : SGPRs) {
2824 auto It =
Other.SGPRs.find(RegID);
2825 unsigned OtherScore = (It !=
Other.SGPRs.end()) ? It->second.get(
T) : 0;
2826 StrictDom |= mergeScore(M,
Info.get(
T), OtherScore);
2831 for (
auto &[TID, Info] : VMem) {
2832 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
2834 Info.VGPRPendingEvents | It->second.VGPRPendingEvents;
2835 StrictDom |= NewVGPRContext !=
Info.VGPRPendingEvents;
2836 Info.VGPRPendingEvents = NewVGPRContext;
2840 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
2842 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
2844 purgeEmptyTrackingData();
2850 return Opcode == AMDGPU::S_WAITCNT ||
2853 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2854 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2855 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2856 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2860void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
2862 bool ExpertMode)
const {
2866 .
addImm(ExpertMode ? 2 : 0)
2884class VCCZWorkaround {
2885 const WaitcntBrackets &ScoreBrackets;
2886 const GCNSubtarget &
ST;
2887 const SIInstrInfo &
TII;
2888 const SIRegisterInfo &
TRI;
2889 bool VCCZCorruptionBug =
false;
2890 bool VCCZNotUpdatedByPartialWrites =
false;
2893 bool MustRecomputeVCCZ =
true;
2896 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
2897 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
2899 VCCZCorruptionBug =
ST.hasReadVCCZBug();
2900 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
2907 bool tryRecomputeVCCZ(MachineInstr &
MI) {
2909 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
2919 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
2925 std::optional<bool> PartiallyWritesToVCCOpt;
2926 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
2927 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
2928 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
2930 if (VCCZNotUpdatedByPartialWrites) {
2931 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
2934 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
2940 if (!ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS) ||
2941 !VCCZCorruptionBug) {
2943 if (!PartiallyWritesToVCCOpt)
2944 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
2945 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2946 MI.definesRegister(AMDGPU::VCC,
nullptr);
2949 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2950 *PartiallyWritesToVCCOpt);
2952 MustRecomputeVCCZ =
false;
2962 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2965 MustRecomputeVCCZ =
false;
2975bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2976 MachineBasicBlock &
Block,
2977 WaitcntBrackets &ScoreBrackets) {
2981 dbgs() <<
"*** Begin Block: ";
2983 ScoreBrackets.dump();
2985 VCCZWorkaround VCCZW(ScoreBrackets, ST,
TII,
TRI);
2988 MachineInstr *OldWaitcntInstr =
nullptr;
2993 Iter !=
E; ++Iter) {
2994 MachineInstr &Inst = *Iter;
2995 if (isNonWaitcntMetaInst(Inst))
3000 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3001 if (!OldWaitcntInstr)
3002 OldWaitcntInstr = &Inst;
3006 PreheaderFlushFlags FlushFlags;
3007 if (
Block.getFirstTerminator() == Inst)
3008 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3011 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3013 OldWaitcntInstr =
nullptr;
3015 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3019 ScoreBrackets.recordAsyncMark(Inst);
3023 if (
TII.isSMRD(Inst)) {
3024 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3027 if (!Memop->isInvariant()) {
3028 const Value *Ptr = Memop->getValue();
3034 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3038 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3042 ScoreBrackets.dump();
3047 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3052 AMDGPU::Waitcnt
Wait;
3053 if (
Block.getFirstTerminator() ==
Block.end()) {
3054 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3055 if (FlushFlags.FlushVmCnt) {
3063 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
3072 dbgs() <<
"*** End Block: ";
3074 ScoreBrackets.dump();
3080bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3081 if (
Block.size() <= 1)
3089 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3095 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3096 LastAtomicWithSoftXcnt =
nullptr;
3100 MachineInstr &PrevMI = *
MI.getPrevNode();
3102 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3105 if (LastAtomicWithSoftXcnt) {
3109 LastAtomicWithSoftXcnt = &
MI;
3117SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3118 const WaitcntBrackets &ScoreBrackets) {
3119 auto [Iterator, IsInserted] =
3122 return Iterator->second;
3126 return PreheaderFlushFlags();
3130 return PreheaderFlushFlags();
3133 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3134 return Iterator->second;
3137 return PreheaderFlushFlags();
3140bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3142 return TII.mayAccessVMEMThroughFlat(
MI);
3146bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3152bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3181SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3182 const WaitcntBrackets &Brackets) {
3183 PreheaderFlushFlags
Flags;
3184 bool HasVMemLoad =
false;
3185 bool HasVMemStore =
false;
3186 bool UsesVgprVMEMLoadedOutside =
false;
3187 bool UsesVgprDSReadOutside =
false;
3188 bool VMemInvalidated =
false;
3192 bool TrackSimpleDSOpt =
ST.hasExtendedWaitCounts();
3193 DenseSet<MCRegUnit> VgprUse;
3194 DenseSet<MCRegUnit> VgprDefVMEM;
3195 DenseSet<MCRegUnit> VgprDefDS;
3201 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3202 unsigned DSReadPosition = 0;
3203 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3204 bool TrackDSFlushPoint =
ST.hasExtendedWaitCounts() && IsSingleBlock;
3205 unsigned LastDSFlushPosition = 0;
3207 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3208 for (MachineInstr &
MI : *
MBB) {
3209 if (isVMEMOrFlatVMEM(
MI)) {
3210 HasVMemLoad |=
MI.mayLoad();
3211 HasVMemStore |=
MI.mayStore();
3215 if (mayStoreIncrementingDSCNT(
MI)) {
3218 if (VMemInvalidated)
3220 TrackSimpleDSOpt =
false;
3221 TrackDSFlushPoint =
false;
3223 bool IsDSRead = isDSRead(
MI);
3228 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3229 if (!TrackDSFlushPoint)
3231 if (
auto It = LastDSReadPositionMap.
find(RU);
3232 It != LastDSReadPositionMap.
end()) {
3236 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3240 for (
const MachineOperand &
Op :
MI.all_uses()) {
3241 if (
Op.isDebug() || !
TRI.isVectorRegister(MRI,
Op.getReg()))
3244 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3248 VMemInvalidated =
true;
3252 TrackSimpleDSOpt =
false;
3255 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3259 updateDSReadFlushTracking(RU);
3264 VMEMID
ID = toVMEMID(RU);
3268 UsesVgprVMEMLoadedOutside =
true;
3273 UsesVgprDSReadOutside =
true;
3278 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3279 for (
const MachineOperand &
Op :
MI.all_defs()) {
3280 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3284 VMemInvalidated =
true;
3289 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3300 if (IsDSRead || TrackDSFlushPoint) {
3301 for (
const MachineOperand &
Op :
MI.all_defs()) {
3302 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
3304 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3307 updateDSReadFlushTracking(RU);
3310 if (TrackDSFlushPoint)
3311 LastDSReadPositionMap[RU] = DSReadPosition;
3320 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3321 ((!
ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3322 (HasVMemLoad &&
ST.hasVmemWriteVgprInOrder())))
3323 Flags.FlushVmCnt =
true;
3329 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3332 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3333 bool DSFlushPointPrefetch =
3334 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3336 if (SimpleDSOpt || DSFlushPointPrefetch)
3337 Flags.FlushDsCnt =
true;
3342bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3343 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3345 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3347 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3348 AA = &AAR->getAAResults();
3350 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3362 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3367 .preserve<AAManager>();
3370bool SIInsertWaitcnts::run() {
3378 if (ST.hasExtendedWaitCounts()) {
3379 IsExpertMode = ST.hasExpertSchedulingMode() &&
3388 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3393 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3397 SmemAccessCounter = getCounterFromEvent(HWEvents::SMEM_ACCESS);
3401 MachineBasicBlock &EntryBB = MF.
front();
3412 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3415 if (
ST.hasExtendedWaitCounts()) {
3424 if (!
ST.hasImageInsts() &&
3430 TII.get(instrsForExtendedCounterTypes[CT]))
3443 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3444 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3445 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3452 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3455 std::unique_ptr<WaitcntBrackets> Brackets;
3460 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3462 MachineBasicBlock *
MBB = BII->first;
3463 BlockInfo &BI = BII->second;
3469 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3471 *Brackets = *BI.Incoming;
3474 Brackets = std::make_unique<WaitcntBrackets>(
this);
3479 Brackets->~WaitcntBrackets();
3480 new (Brackets.get()) WaitcntBrackets(
this);
3484 if (
ST.hasWaitXcnt())
3486 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3489 if (Brackets->hasPendingEvent()) {
3490 BlockInfo *MoveBracketsToSucc =
nullptr;
3492 auto *SuccBII = BlockInfos.
find(Succ);
3493 BlockInfo &SuccBI = SuccBII->second;
3494 if (!SuccBI.Incoming) {
3495 SuccBI.Dirty =
true;
3496 if (SuccBII <= BII) {
3500 if (!MoveBracketsToSucc) {
3501 MoveBracketsToSucc = &SuccBI;
3503 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3507 dbgs() <<
"Try to merge ";
3513 if (SuccBI.Incoming->merge(*Brackets)) {
3514 SuccBI.Dirty =
true;
3515 if (SuccBII <= BII) {
3522 if (MoveBracketsToSucc)
3523 MoveBracketsToSucc->Incoming = std::move(Brackets);
3528 if (
ST.hasScalarStores()) {
3529 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3530 bool HaveScalarStores =
false;
3532 for (MachineBasicBlock &
MBB : MF) {
3533 for (MachineInstr &
MI :
MBB) {
3534 if (!HaveScalarStores &&
TII.isScalarStore(
MI))
3535 HaveScalarStores =
true;
3537 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3538 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3543 if (HaveScalarStores) {
3552 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3553 bool SeenDCacheWB =
false;
3557 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3558 SeenDCacheWB =
true;
3559 else if (
TII.isScalarStore(*
I))
3560 SeenDCacheWB =
false;
3563 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3564 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3580 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3582 setSchedulingMode(EntryBB,
I,
true);
3584 for (MachineInstr *
MI : CallInsts) {
3585 MachineBasicBlock &
MBB = *
MI->getParent();
3586 setSchedulingMode(
MBB,
MI,
false);
3587 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3590 for (MachineInstr *
MI : ReturnInsts)
3591 setSchedulingMode(*
MI->getParent(),
MI,
false);
3602 for (
auto [
MI,
_] : EndPgmInsts) {
3604 TII.get(AMDGPU::S_ALLOC_VGPR))
3608 }
else if (!WCG->isOptNone() &&
3609 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3610 (MF.getFrameInfo().hasCalls() ||
3611 ST.getOccupancyWithNumVGPRs(
3612 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3615 for (
auto [
MI, Flag] : EndPgmInsts) {
3617 if (
ST.requiresNopBeforeDeallocVGPRs()) {
3619 TII.get(AMDGPU::S_NOP))
3623 TII.get(AMDGPU::S_SENDMSG))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
AMDGPU::HWEvents HWEvents
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
Bit mask of hardware events.
constexpr unsigned size() const
constexpr bool contains(HWEvents Other) const
constexpr bool any() const
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
HWEvents getSimplifiedVMEMEventsFor(const MachineInstr &Inst, const SIInstrInfo &TII)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
HWEvents getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
constexpr bool isMaybeAtomic(const T &...O)
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.