49#define DEBUG_TYPE "si-insert-waitcnts"
53 cl::desc(
"Force all waitcnt instrs to be emitted as "
54 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
58 "amdgpu-waitcnt-load-forcezero",
59 cl::desc(
"Force all waitcnt load counters to wait until 0"),
63 "amdgpu-expert-scheduling-mode",
64 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
69template <
typename EmitWaitcntFn>
70static void EmitExpandedWaitcnt(
unsigned Outstanding,
unsigned Target,
71 EmitWaitcntFn &&EmitWaitcnt) {
73 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0u; --
I)
93 TRACKINGID_RANGE_LEN = (1 << 16),
98 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
103 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
104 LDSDMA_BEGIN = REGUNITS_END,
105 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
109static constexpr VMEMID toVMEMID(MCRegUnit RU) {
110 return static_cast<unsigned>(RU);
122 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
123 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
124 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
125 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
126 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
131 switch (
MI.getOpcode()) {
132 case AMDGPU::ASYNCMARK:
133 case AMDGPU::WAIT_ASYNCMARK:
136 return MI.isMetaInstruction();
151class WaitcntBrackets;
159class WaitcntGenerator {
161 const GCNSubtarget &ST;
162 const SIInstrInfo &TII;
163 AMDGPU::IsaVersion IV;
166 bool ExpandWaitcntProfiling =
false;
167 const AMDGPU::HardwareLimits &Limits;
170 WaitcntGenerator() =
delete;
171 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
172 WaitcntGenerator(
const MachineFunction &MF,
174 const AMDGPU::HardwareLimits &Limits)
175 : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
179 ExpandWaitcntProfiling(
180 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
// Returns the generator's OptNone flag. Within this file the same flag gates
// the "TrySimplify" decision made while applying pre-existing waitcnts.
185 bool isOptNone()
const {
return OptNone; }
201 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
202 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
206 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
211 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
213 AMDGPU::Waitcnt
Wait,
214 const WaitcntBrackets &ScoreBrackets) = 0;
221 assert(
E.size() == 1 &&
"Cannot handle a mask of events!");
223 if (getWaitEvents(
T) &
E)
234 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
236 virtual ~WaitcntGenerator() =
default;
239class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
242 HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
243 HWEvents::VMEM_BVH_READ_ACCESS,
244 HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
245 HWEvents::SQ_MESSAGE,
246 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
247 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
248 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
249 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
260 using WaitcntGenerator::WaitcntGenerator;
262 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
263 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
266 bool createNewWaitcnt(MachineBasicBlock &
Block,
268 AMDGPU::Waitcnt
Wait,
269 const WaitcntBrackets &ScoreBrackets)
override;
272 HWEvents EVs = WaitEventMaskForInstPreGFX12[
T];
278 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
281class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
286 HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
287 HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
288 HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
289 HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
290 HWEvents::EXP_POS_ACCESS | HWEvents::EXP_LDS_ACCESS,
292 HWEvents::VMEM_WRITE_ACCESS | HWEvents::SCRATCH_WRITE_ACCESS,
293 HWEvents::VMEM_SAMPLER_READ_ACCESS,
294 HWEvents::VMEM_BVH_READ_ACCESS,
296 HWEvents::SMEM_ACCESS | HWEvents::SQ_MESSAGE | HWEvents::SCC_WRITE,
297 HWEvents::VMEM_GROUP | HWEvents::SMEM_GROUP,
298 HWEvents::ASYNC_ACCESS,
299 HWEvents::TENSOR_ACCESS,
300 HWEvents::VGPR_CSMACC_WRITE | HWEvents::VGPR_DPMACC_WRITE |
301 HWEvents::VGPR_TRANS_WRITE | HWEvents::VGPR_XDL_WRITE,
302 HWEvents::VGPR_LDS_READ | HWEvents::VGPR_FLAT_READ |
303 HWEvents::VGPR_VMEM_READ};
306 WaitcntGeneratorGFX12Plus() =
delete;
307 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
309 const AMDGPU::HardwareLimits &Limits,
311 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
314 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
315 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
318 bool createNewWaitcnt(MachineBasicBlock &
Block,
320 AMDGPU::Waitcnt
Wait,
321 const WaitcntBrackets &ScoreBrackets)
override;
324 return WaitEventMaskForInstGFX12Plus[
T];
327 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
331struct PreheaderFlushFlags {
332 bool FlushVmCnt =
false;
333 bool FlushDsCnt =
false;
336class SIInsertWaitcnts {
337 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
338 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
339 MachineLoopInfo &MLI;
340 MachinePostDominatorTree &PDT;
345 std::unique_ptr<WaitcntBrackets> Incoming;
347 BlockInfo() =
default;
348 BlockInfo(BlockInfo &&) =
default;
349 BlockInfo &operator=(BlockInfo &&) =
default;
353 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
357 std::unique_ptr<WaitcntGenerator> WCG;
360 DenseSet<MachineInstr *> CallInsts;
361 DenseSet<MachineInstr *> ReturnInsts;
366 DenseMap<MachineInstr *, bool> EndPgmInsts;
368 AMDGPU::HardwareLimits Limits;
371 const GCNSubtarget &ST;
372 const SIInstrInfo &TII;
373 const SIRegisterInfo &TRI;
374 const MachineRegisterInfo &MRI;
377 bool IsExpertMode =
false;
379 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
381 : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
382 TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
383 MRI(MF.getRegInfo()) {}
// Read-only access to the Limits member (AMDGPU::HardwareLimits).
// WaitcntBrackets queries per-counter limits through this accessor.
385 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
387 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
388 const WaitcntBrackets &Brackets);
389 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
390 const WaitcntBrackets &ScoreBrackets);
391 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
392 bool isDSRead(
const MachineInstr &
MI)
const;
393 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
396 bool isAsync(
const MachineInstr &
MI)
const {
401 const MachineOperand *
Async =
402 TII.getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
406 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
410 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
414 bool shouldUpdateAsyncMark(
const MachineInstr &
MI,
418 if (!isAsyncLdsDmaWrite(
MI))
425 bool isVmemAccess(
const MachineInstr &
MI)
const;
426 bool generateWaitcntInstBefore(MachineInstr &
MI,
427 WaitcntBrackets &ScoreBrackets,
428 MachineInstr *OldWaitcntInstr,
429 PreheaderFlushFlags FlushFlags);
430 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
432 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
433 MachineInstr *OldWaitcntInstr);
434 void updateEventWaitcntAfter(MachineInstr &Inst,
435 WaitcntBrackets *ScoreBrackets);
437 MachineBasicBlock *
Block)
const;
438 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
439 WaitcntBrackets &ScoreBrackets);
440 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
441 WaitcntBrackets &ScoreBrackets);
444 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
446 bool ExpertMode)
const;
448 return WCG->getWaitEvents(
T);
451 return WCG->getCounterFromEvent(
E);
463class WaitcntBrackets {
465 WaitcntBrackets(
const SIInsertWaitcnts *Context) : Context(Context) {
466 assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
471 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
472 for (
auto &[
ID, Val] : VMem) {
476 for (
auto &[
ID, Val] : SGPRs) {
481 if (NumUnusedVmem || NumUnusedSGPRs) {
482 errs() <<
"WaitcntBracket had unused entries at destruction time: "
483 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
484 <<
" SGPR unused entries\n";
495 return ScoreUBs[
T] - ScoreLBs[
T];
499 return getVMemScore(
ID,
T) > getScoreLB(
T);
517 return getScoreUB(
T) - getScoreLB(
T);
521 auto It = SGPRs.find(RU);
522 return It != SGPRs.end() ? It->second.get(
T) : 0;
526 auto It = VMem.find(TID);
527 return It != VMem.end() ? It->second.Scores[
T] : 0;
534 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
537 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
538 AMDGPU::Waitcnt &UpdateWait)
const;
541 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
542 AMDGPU::Waitcnt &UpdateWait)
const;
543 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
544 AMDGPU::Waitcnt &UpdateWait)
const;
547 AMDGPU::Waitcnt &
Wait,
548 const MachineInstr &
MI)
const;
549 MCPhysReg determineVGPR16Dependency(
const MachineInstr &
MI,
553 AMDGPU::Waitcnt &
Wait)
const;
554 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
555 void tryClearSCCWriteEvent(MachineInstr *Inst);
557 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
561 void recordAsyncMark(MachineInstr &
MI);
// Returns a copy of the current PendingEvents set.
563 HWEvents getPendingEvents()
const {
return PendingEvents; }
// True when PendingEvents reports any() event, regardless of kind.
564 bool hasPendingEvent()
const {
return PendingEvents.
any(); }
// Overload taking an event (or event mask) E: forwards to
// PendingEvents.contains(E).
// NOTE(review): whether contains() means "all of E" or "any of E" for a
// multi-event mask is defined by HWEvents, which is not visible here.
565 bool hasPendingEvent(
HWEvents E)
const {
return PendingEvents.contains(
E); }
567 bool HasPending = (PendingEvents & Context->getWaitEvents(
T)).any();
569 "Expected pending events iff scoreboard is not empty");
574 HWEvents Events = PendingEvents & Context->getWaitEvents(
T);
576 return Events.
size() > 1;
579 bool hasPendingFlat()
const {
586 void setPendingFlat() {
591 bool hasPendingGDS()
const {
596 unsigned getPendingGDSWait()
const {
606 for (MCRegUnit RU : regunits(
Reg)) {
607 auto It = VMem.find(toVMEMID(RU));
608 if (It != VMem.end() && (It->second.VGPRPendingEvents & ~
E).any())
615 for (MCRegUnit RU : regunits(
Reg)) {
616 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
618 if (It->second.empty())
624 void setStateOnFunctionEntryOrReturn() {
630 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
634 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
635 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
638 void print(raw_ostream &)
const;
643 void purgeEmptyTrackingData();
647 return Context->getLimits().get(
T);
657 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
660 AMDGPU::Waitcnt &
Wait)
const;
662 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
663 unsigned OtherScore);
668 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
669 if (!Context->TRI.isInAllocatableClass(
Reg))
671 return Context->TRI.regunits(
Reg);
692 const SIRegisterInfo &
TRI = Context->TRI;
693 if (
Reg == AMDGPU::SCC) {
695 }
else if (
TRI.isVectorRegister(Context->MRI,
Reg)) {
696 for (MCRegUnit RU : regunits(
Reg))
697 VMem[toVMEMID(RU)].Scores[
T] = Val;
698 }
else if (
TRI.isSGPRReg(Context->MRI,
Reg)) {
699 for (MCRegUnit RU : regunits(
Reg))
700 SGPRs[RU].get(
T) = Val;
707 VMem[TID].Scores[
T] = Val;
710 void setScoreByOperand(
const MachineOperand &
Op,
713 const SIInsertWaitcnts *Context;
719 unsigned LastFlatDsCnt = 0;
720 unsigned LastFlatLoadCnt = 0;
722 unsigned LastGDS = 0;
739 CounterValueArray Scores{};
753 unsigned ScoreDsKmCnt = 0;
754 unsigned ScoreXCnt = 0;
// True when both ScoreDsKmCnt and ScoreXCnt are zero, i.e. this entry holds
// no recorded score. purgeEmptyTrackingData() drops map entries whose
// empty() returns true.
770 bool empty()
const {
return !ScoreDsKmCnt && !ScoreXCnt; }
773 DenseMap<VMEMID, VMEMInfo> VMem;
774 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
777 unsigned SCCScore = 0;
779 const MachineInstr *PendingSCCWrite =
nullptr;
783 SmallVector<const MachineInstr *> LDSDMAStores;
792 static constexpr unsigned MaxAsyncMarks = 16;
796 CounterValueArray AsyncScore{};
// Out-of-line defaulted destructor for BlockInfo.
// NOTE(review): presumably defined here, after the WaitcntBrackets class
// body, so that a std::unique_ptr<WaitcntBrackets> member can be destroyed
// with a complete type -- confirm against the BlockInfo declaration.
799SIInsertWaitcnts::BlockInfo::~BlockInfo() =
default;
// Default constructor: forwards the pass's static ID member to the
// MachineFunctionPass base class; no other initialization is performed.
804 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
806 bool runOnMachineFunction(MachineFunction &MF)
override;
808 StringRef getPassName()
const override {
809 return "SI insert wait instructions";
812 void getAnalysisUsage(AnalysisUsage &AU)
const override {
815 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
827 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
835bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
840 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
850bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
852 if (!hasPointSampleAccel(
MI))
855 return hasDifferentVGPRPendingEvents(
Reg, HWEvents::VMEM_READ_ACCESS);
858void WaitcntBrackets::updateByEvent(
HWEvents E, MachineInstr &Inst) {
859 assert(
E.size() == 1 &&
"Expected singular event!");
863 unsigned UB = getScoreUB(
T);
866 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
878 setScoreUB(
T, CurrScore);
881 const MachineRegisterInfo &MRI =
Context->MRI;
890 if (
const auto *AddrOp =
TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
894 if (
const auto *Data0 =
895 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
897 if (
const auto *Data1 =
898 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
902 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
903 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
904 for (
const MachineOperand &
Op : Inst.
all_uses()) {
905 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
909 }
else if (
TII.isFLAT(Inst)) {
911 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
914 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
917 }
else if (
TII.isMIMG(Inst)) {
921 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
924 }
else if (
TII.isMTBUF(Inst)) {
927 }
else if (
TII.isMUBUF(Inst)) {
931 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
934 }
else if (
TII.isLDSDIR(Inst)) {
936 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
939 if (
TII.isEXP(Inst)) {
944 for (MachineOperand &DefMO : Inst.
all_defs()) {
945 if (
TRI.isVGPR(MRI, DefMO.getReg())) {
950 for (
const MachineOperand &
Op : Inst.
all_uses()) {
951 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
957 E == HWEvents::SMEM_GROUP ? HWEvents::VMEM_GROUP : HWEvents::SMEM_GROUP;
958 if (PendingEvents.
contains(OtherEvent)) {
963 setScoreLB(
T, getScoreUB(
T) - 1);
964 PendingEvents -= OtherEvent;
966 for (
const MachineOperand &
Op : Inst.
all_uses())
967 setScoreByOperand(
Op,
T, CurrScore);
971 for (
const MachineOperand &
Op : Inst.
operands()) {
976 setScoreByOperand(
Op,
T, CurrScore);
988 for (
const MachineOperand &
Op : Inst.
defs()) {
991 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
993 if (updateVMCntOnly(Inst)) {
1002 if (hasPointSampleAccel(Inst))
1003 VGPRContext |= HWEvents::VMEM_READ_ACCESS;
1004 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1005 VMem[toVMEMID(RU)].VGPRPendingEvents |= VGPRContext;
1008 setScoreByOperand(
Op,
T, CurrScore);
1011 (
TII.isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1020 if (!MemOp->isStore() ||
1025 auto AAI = MemOp->getAAInfo();
1031 if (!AAI || !AAI.Scope)
1033 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1034 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1035 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1050 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1051 if (Slot && Slot < NUM_LDSDMA)
1052 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1055 if (
Context->shouldUpdateAsyncMark(Inst,
T)) {
1056 AsyncScore[
T] = CurrScore;
1060 setRegScore(AMDGPU::SCC,
T, CurrScore);
1061 PendingSCCWrite = &Inst;
1066void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1072 AsyncMarks.push_back(AsyncScore);
1075 dbgs() <<
"recordAsyncMark:\n" << Inst;
1076 for (
const auto &Mark : AsyncMarks) {
1083void WaitcntBrackets::print(raw_ostream &OS)
const {
1087 unsigned SR = getScoreRange(
T);
1090 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1094 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1098 OS <<
" EXP_CNT(" << SR <<
"):";
1101 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1105 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1108 OS <<
" BVH_CNT(" << SR <<
"):";
1111 OS <<
" KM_CNT(" << SR <<
"):";
1114 OS <<
" X_CNT(" << SR <<
"):";
1117 OS <<
" ASYNC_CNT(" << SR <<
"):";
1120 OS <<
" VA_VDST(" << SR <<
"): ";
1123 OS <<
" VM_VSRC(" << SR <<
"): ";
1126 OS <<
" UNKNOWN(" << SR <<
"):";
1132 unsigned LB = getScoreLB(
T);
1135 sort(SortedVMEMIDs);
1137 for (
auto ID : SortedVMEMIDs) {
1138 unsigned RegScore = VMem.at(
ID).Scores[
T];
1141 unsigned RelScore = RegScore - LB - 1;
1142 if (
ID < REGUNITS_END) {
1143 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1145 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1146 "Unhandled/unexpected ID value!");
1147 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1152 if (isSmemCounter(
T)) {
1154 sort(SortedSMEMIDs);
1155 for (
auto ID : SortedSMEMIDs) {
1156 unsigned RegScore = SGPRs.at(
ID).get(
T);
1159 unsigned RelScore = RegScore - LB - 1;
1160 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1165 OS <<
' ' << SCCScore <<
":scc";
1170 OS <<
"Pending Events: ";
1171 if (hasPendingEvent()) {
1172 OS << getPendingEvents();
1178 OS <<
"Async score: ";
1179 if (AsyncScore.empty())
1185 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1187 for (
const auto &Mark : AsyncMarks) {
1189 unsigned MarkedScore = Mark[
T];
1192 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1193 <<
"_CNT: " << MarkedScore;
1196 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1197 <<
"_CNT: " << MarkedScore;
1200 OS <<
" EXP_CNT: " << MarkedScore;
1203 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS")
1204 <<
"_CNT: " << MarkedScore;
1207 OS <<
" SAMPLE_CNT: " << MarkedScore;
1210 OS <<
" BVH_CNT: " << MarkedScore;
1213 OS <<
" KM_CNT: " << MarkedScore;
1216 OS <<
" X_CNT: " << MarkedScore;
1219 OS <<
" ASYNC_CNT: " << MarkedScore;
1222 OS <<
" UNKNOWN: " << MarkedScore;
1233void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1234 AMDGPU::Waitcnt &UpdateWait)
const {
1242 simplifyXcnt(CheckWait, UpdateWait);
1244 simplifyVmVsrc(CheckWait, UpdateWait);
1249 unsigned &
Count)
const {
1253 if (
Count >= getScoreRange(
T))
1257void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &
Wait,
1259 unsigned Cnt =
Wait.get(
T);
1260 simplifyWaitcnt(
T, Cnt);
1264void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1265 AMDGPU::Waitcnt &UpdateWait)
const {
1275 hasPendingEvent(HWEvents::SMEM_GROUP))
1281 hasPendingEvent(HWEvents::VMEM_GROUP) &&
1288void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1289 AMDGPU::Waitcnt &UpdateWait)
const {
1294 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1295 CheckWait.get(AMDGPU::STORE_CNT),
1296 CheckWait.get(AMDGPU::SAMPLE_CNT),
1297 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1302void WaitcntBrackets::purgeEmptyTrackingData() {
1303 VMem.remove_if([](
const auto &
P) {
return P.second.empty(); });
1304 SGPRs.remove_if([](
const auto &
P) {
return P.second.empty(); });
1308 unsigned ScoreToWait,
1309 AMDGPU::Waitcnt &
Wait)
const {
1310 const unsigned LB = getScoreLB(
T);
1311 const unsigned UB = getScoreUB(
T);
1314 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1316 !
Context->ST.hasFlatLgkmVMemCountInOrder()) {
1321 }
else if (counterOutOfOrder(
T)) {
1329 unsigned NeededWait = std::min(UB - ScoreToWait, getLimit(
T) - 1);
1330 Wait.add(
T, NeededWait);
1335AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1337 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1339 for (
const auto &Mark : AsyncMarks) {
1345 if (AsyncMarks.size() == MaxAsyncMarks) {
1350 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1351 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1354 AMDGPU::Waitcnt
Wait;
1355 if (AsyncMarks.size() <=
N) {
1360 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1361 const auto &RequiredMark = AsyncMarks[MarkIndex];
1363 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1369 dbgs() <<
"Removing " << (MarkIndex + 1)
1370 <<
" async marks after determining wait\n";
1372 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1385MCPhysReg WaitcntBrackets::determineVGPR16Dependency(
const MachineInstr &
MI,
1389 unsigned Size =
Context->TRI.getRegSizeInBits(*RC);
1391 if (
Size != 16 || !
Context->ST.hasD16Writes32BitVgpr())
1401 AMDGPU::Waitcnt
Wait;
1402 for (MCRegUnit RU : regunits(OtherHalf))
1403 determineWaitForScore(
T, getVMemScore(toVMEMID(RU),
T),
Wait);
1406 if (!
Wait.hasWait())
1416 HWEvents Events = MIEvents & OtherHalfEvents;
1417 if (Events.
size() > 1)
1424 AMDGPU::Waitcnt &
Wait,
1425 const MachineInstr &
MI)
const {
1426 if (
Reg == AMDGPU::SCC) {
1427 determineWaitForScore(
T, SCCScore,
Wait);
1431 Reg = determineVGPR16Dependency(
MI,
T,
Reg);
1432 for (MCRegUnit RU : regunits(
Reg))
1433 determineWaitForScore(
1434 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1441 AMDGPU::Waitcnt &
Wait)
const {
1442 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1443 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1446void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1449 if (PendingSCCWrite &&
1450 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1452 HWEvents SCC_WRITE_PendingEvent = HWEvents::SCC_WRITE;
1455 SCC_WRITE_PendingEvent) {
1459 PendingEvents -= SCC_WRITE_PendingEvent;
1460 PendingSCCWrite =
nullptr;
1464void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1466 applyWaitcnt(
Wait,
T);
1470 const unsigned UB = getScoreUB(
T);
1474 if (counterOutOfOrder(
T))
1476 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1479 PendingEvents -=
Context->getWaitEvents(
T);
1483 hasPendingEvent(HWEvents::SMEM_GROUP)) {
1487 PendingEvents -= HWEvents::SMEM_GROUP;
1493 else if (
Count == 0)
1494 PendingEvents -= HWEvents::VMEM_GROUP;
1498void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait,
1500 unsigned Cnt =
Wait.get(
T);
1501 applyWaitcnt(
T, Cnt);
1508 if ((
T ==
Context->SmemAccessCounter &&
1509 hasPendingEvent(HWEvents::SMEM_ACCESS)) ||
1526 static constexpr HWEvents ExtendedImageEvents =
1527 HWEvents::VMEM_SAMPLER_READ_ACCESS | HWEvents::VMEM_BVH_READ_ACCESS;
1528 if (!
Context->ST.hasExtendedWaitCounts() &&
1529 (Events & ExtendedImageEvents).any()) {
1530 Events -= ExtendedImageEvents;
1531 Events |= HWEvents::VMEM_READ_ACCESS;
1537 Events -= HWEvents::GLOBAL_INV_ACCESS;
1541 return Events.
size() > 1;
1544 return hasMixedPendingEvents(
T);
1554char SIInsertWaitcntsLegacy::
ID = 0;
1559 return new SIInsertWaitcntsLegacy();
1564 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1569 if (NewEnc == MO.
getImm())
1576bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1590bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1591 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1593 assert(isNormalMode(MaxCounter));
1596 MachineInstr *WaitcntInstr =
nullptr;
1597 MachineInstr *WaitcntVsCntInstr =
nullptr;
1600 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1602 dbgs() <<
"end of block\n";
1610 if (isNonWaitcntMetaInst(
II)) {
1616 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1620 if (Opcode == AMDGPU::S_WAITCNT) {
1621 unsigned IEnc =
II.getOperand(0).getImm();
1624 ScoreBrackets.simplifyWaitcnt(OldWait);
1628 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1629 II.eraseFromParent();
1633 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1636 <<
"Before: " <<
Wait <<
'\n';);
1647 II.eraseFromParent();
1648 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1649 unsigned N =
II.getOperand(0).getImm();
1651 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1654 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1655 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1658 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1664 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1665 II.eraseFromParent();
1668 WaitcntVsCntInstr = &
II;
1675 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1684 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1685 <<
"New Instr at block end: "
1686 << *WaitcntInstr <<
'\n'
1687 :
dbgs() <<
"applied pre-existing waitcnt\n"
1688 <<
"Old Instr: " << *It
1689 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1692 if (WaitcntVsCntInstr) {
1696 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1702 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1703 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1705 :
dbgs() <<
"applied pre-existing waitcnt\n"
1706 <<
"Old Instr: " << *It
1707 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1715bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1717 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1718 assert(isNormalMode(MaxCounter));
1725 if (
Wait.hasWaitExceptStoreCnt()) {
1727 if (ExpandWaitcntProfiling) {
1731 bool AnyOutOfOrder =
false;
1733 unsigned WaitCnt =
Wait.get(CT);
1734 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1735 AnyOutOfOrder =
true;
1740 if (AnyOutOfOrder) {
1748 unsigned WaitCnt =
Wait.get(CT);
1752 unsigned Outstanding =
1753 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
1754 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1766 [[maybe_unused]]
auto SWaitInst =
1771 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1772 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1776 if (
Wait.hasWaitStoreCnt()) {
1782 unsigned Outstanding =
1785 EmitExpandedWaitcnt(
1787 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1788 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1793 [[maybe_unused]]
auto SWaitInst =
1795 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1800 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1801 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1809WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1810 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1814WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1815 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1816 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1818 ~0u , ExpertVal, ExpertVal);
1825bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1826 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1828 assert(!isNormalMode(MaxCounter));
1831 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1832 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1833 MachineInstr *WaitcntDepctrInstr =
nullptr;
1837 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
1839 dbgs() <<
"end of block\n";
1845 AMDGPU::Waitcnt RequiredWait;
1850 if (isNonWaitcntMetaInst(
II)) {
1859 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1863 if (Opcode == AMDGPU::S_WAITCNT)
1866 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1868 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1873 RequiredWait = RequiredWait.combined(OldWait);
1875 if (CombinedLoadDsCntInstr ==
nullptr) {
1876 CombinedLoadDsCntInstr = &
II;
1878 II.eraseFromParent();
1881 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1883 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1888 RequiredWait = RequiredWait.combined(OldWait);
1890 if (CombinedStoreDsCntInstr ==
nullptr) {
1891 CombinedStoreDsCntInstr = &
II;
1893 II.eraseFromParent();
1896 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
1898 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1899 AMDGPU::Waitcnt OldWait;
1903 ScoreBrackets.simplifyWaitcnt(OldWait);
1905 if (WaitcntDepctrInstr ==
nullptr) {
1906 WaitcntDepctrInstr = &
II;
1915 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1923 II.eraseFromParent();
1927 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1930 II.eraseFromParent();
1932 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1935 unsigned N =
II.getOperand(0).getImm();
1936 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1939 std::optional<AMDGPU::InstCounterType> CT =
1943 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1945 Wait.add(CT.value(), OldCnt);
1947 RequiredWait.add(CT.value(), OldCnt);
1949 if (WaitInstrs[CT.value()] ==
nullptr) {
1950 WaitInstrs[CT.value()] = &
II;
1952 II.eraseFromParent();
1958 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
1959 Wait =
Wait.combined(RequiredWait);
1961 if (CombinedLoadDsCntInstr) {
1977 AMDGPU::OpName::simm16, NewEnc);
1978 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1984 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1985 <<
"New Instr at block end: "
1986 << *CombinedLoadDsCntInstr <<
'\n'
1987 :
dbgs() <<
"applied pre-existing waitcnt\n"
1988 <<
"Old Instr: " << *It <<
"New Instr: "
1989 << *CombinedLoadDsCntInstr <<
'\n');
1996 if (CombinedStoreDsCntInstr) {
2001 AMDGPU::OpName::simm16, NewEnc);
2002 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2008 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2009 <<
"New Instr at block end: "
2010 << *CombinedStoreDsCntInstr <<
'\n'
2011 :
dbgs() <<
"applied pre-existing waitcnt\n"
2012 <<
"Old Instr: " << *It <<
"New Instr: "
2013 << *CombinedStoreDsCntInstr <<
'\n');
2043 for (MachineInstr **WI : WaitsToErase) {
2047 (*WI)->eraseFromParent();
2054 if (!WaitInstrs[CT])
2057 unsigned NewCnt =
Wait.get(CT);
2058 if (NewCnt != ~0u) {
2060 AMDGPU::OpName::simm16, NewCnt);
2061 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2063 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2067 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2068 <<
"New Instr at block end: " << *WaitInstrs[CT]
2070 :
dbgs() <<
"applied pre-existing waitcnt\n"
2071 <<
"Old Instr: " << *It
2072 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2079 if (WaitcntDepctrInstr) {
2083 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2098 AMDGPU::OpName::simm16, Enc);
2100 <<
"New Instr at block end: "
2101 << *WaitcntDepctrInstr <<
'\n'
2102 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2103 <<
"Old Instr: " << *It <<
"New Instr: "
2104 << *WaitcntDepctrInstr <<
'\n');
2115bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2117 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2118 assert(!isNormalMode(MaxCounter));
2125 if (ExpandWaitcntProfiling) {
2132 if (ScoreBrackets.counterOutOfOrder(CT)) {
2139 unsigned Outstanding =
2140 std::min(ScoreBrackets.getOutstanding(CT), getLimit(CT) - 1);
2141 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2153 MachineInstr *SWaitInst =
nullptr;
2177 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2178 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2190 [[maybe_unused]]
auto SWaitInst =
2197 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2198 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2201 if (
Wait.hasWaitDepctr()) {
2207 [[maybe_unused]]
auto SWaitInst =
2213 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2214 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2233bool SIInsertWaitcnts::generateWaitcntInstBefore(
2234 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2235 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2240 AMDGPU::Waitcnt
Wait;
2241 const unsigned Opc =
MI.getOpcode();
2244 case AMDGPU::BUFFER_WBINVL1:
2245 case AMDGPU::BUFFER_WBINVL1_SC:
2246 case AMDGPU::BUFFER_WBINVL1_VOL:
2247 case AMDGPU::BUFFER_GL0_INV:
2248 case AMDGPU::BUFFER_GL1_INV: {
2256 case AMDGPU::SI_RETURN_TO_EPILOG:
2257 case AMDGPU::SI_RETURN:
2258 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2259 case AMDGPU::S_SETPC_B64_return: {
2264 AMDGPU::Waitcnt AllZeroWait =
2265 WCG->getAllZeroWaitcnt(
false);
2270 if (
ST.hasExtendedWaitCounts() &&
2271 !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
2276 case AMDGPU::S_ENDPGM:
2277 case AMDGPU::S_ENDPGM_SAVED: {
2288 !ScoreBrackets.hasPendingEvent(HWEvents::SCRATCH_WRITE_ACCESS);
2291 case AMDGPU::S_SENDMSG:
2292 case AMDGPU::S_SENDMSGHALT: {
2293 if (
ST.hasLegacyGeometry() &&
2308 if (
MI.modifiesRegister(AMDGPU::EXEC, &
TRI)) {
2311 if (ScoreBrackets.hasPendingEvent(HWEvents::EXP_GPR_LOCK) ||
2312 ScoreBrackets.hasPendingEvent(HWEvents::EXP_PARAM_ACCESS) ||
2313 ScoreBrackets.hasPendingEvent(HWEvents::EXP_POS_ACCESS) ||
2314 ScoreBrackets.hasPendingEvent(HWEvents::GDS_GPR_LOCK)) {
2321 if (
TII.isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2329 Wait = AMDGPU::Waitcnt();
2331 const MachineOperand &CallAddrOp =
TII.getCalleeOperand(
MI);
2332 if (CallAddrOp.
isReg()) {
2333 ScoreBrackets.determineWaitForPhysReg(
2336 if (
const auto *RtnAddrOp =
2337 TII.getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2338 ScoreBrackets.determineWaitForPhysReg(
2339 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait,
MI);
2342 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2343 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2359 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2360 const Value *Ptr = Memop->getValue();
2361 if (Memop->isStore()) {
2362 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2363 Wait.add(SmemAccessCounter, 0);
2365 SLoadAddresses.
erase(It);
2368 unsigned AS = Memop->getAddrSpace();
2372 if (
TII.mayWriteLDSThroughDMA(
MI))
2376 unsigned TID = LDSDMA_BEGIN;
2377 if (Ptr && Memop->getAAInfo()) {
2378 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2379 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2380 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2381 if ((
I + 1) >= NUM_LDSDMA) {
2396 if (Memop->isStore()) {
2402 for (
const MachineOperand &
Op :
MI.operands()) {
2407 if (
Op.isTied() &&
Op.isUse() &&
TII.doesNotReadTiedSource(
MI))
2412 const bool IsVGPR =
TRI.isVectorRegister(MRI,
Op.getReg());
2419 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2432 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2433 ScoreBrackets.hasDifferentVGPRPendingEvents(
2435 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2436 !
ST.hasVmemWriteVgprInOrder()) {
2443 ScoreBrackets.clearVGPRPendingEvents(
Reg);
2447 ScoreBrackets.hasPendingEvent(HWEvents::EXP_LDS_ACCESS)) {
2452 }
else if (
Op.getReg() == AMDGPU::SCC) {
2455 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait,
2459 if (
ST.hasWaitXcnt() &&
Op.isDef())
2478 if (
Opc == AMDGPU::S_BARRIER && !
ST.hasAutoWaitcntBeforeBarrier() &&
2479 !
ST.hasBackOffBarrier()) {
2480 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2487 ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS)) {
2492 ScoreBrackets.simplifyWaitcnt(
Wait);
2512 Wait = WCG->getAllZeroWaitcnt(
false);
2516 if (!ForceEmitWaitcnt[
T])
2521 if (FlushFlags.FlushVmCnt) {
2527 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
2533 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2537bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2539 MachineBasicBlock &
Block,
2540 WaitcntBrackets &ScoreBrackets,
2541 MachineInstr *OldWaitcntInstr) {
2544 if (OldWaitcntInstr)
2548 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2553 MachineOperand *WaitExp =
TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2563 <<
"Update Instr: " << *It);
2566 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2571 ScoreBrackets.applyWaitcnt(
Wait);
2576bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2577 return (
TII.isFLAT(
MI) &&
TII.mayAccessVMEMThroughFlat(
MI)) ||
2584 MachineBasicBlock *
Block)
const {
2585 auto BlockEnd =
Block->getParent()->end();
2586 auto BlockIter =
Block->getIterator();
2590 if (++BlockIter != BlockEnd) {
2591 It = BlockIter->instr_begin();
2598 if (!It->isMetaInstruction())
2606 return It->getOpcode() == AMDGPU::S_ENDPGM;
2610bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2611 MachineBasicBlock &
Block,
2612 WaitcntBrackets &ScoreBrackets) {
2613 AMDGPU::Waitcnt
Wait;
2614 bool NeedsEndPGMCheck =
false;
2622 NeedsEndPGMCheck =
true;
2625 ScoreBrackets.simplifyWaitcnt(
Wait);
2628 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2631 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
// Updates *ScoreBrackets to account for Inst having been issued.
// NOTE(review): this listing omits several original lines of the function
// (e.g. the declaration of E, parts of some conditions, and everything after
// the VINTERP branch), so only the visible statements are described.
2639void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2640 WaitcntBrackets *ScoreBrackets) {
// Record event E for Inst in the brackets (E is declared on a line not shown).
2644 ScoreBrackets->updateByEvent(
E, Inst);
// DS instructions that use LGKM_CNT: when the gds modifier is set (the rest of
// that condition is not shown), remember that a GDS operation is pending.
2646 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2648 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2649 ScoreBrackets->setPendingGDS();
// FLAT instructions: setPendingFlat() is called on a path whose guard is not
// visible here.
2651 }
else if (
TII.isFLAT(Inst)) {
2659 ScoreBrackets->setPendingFlat();
// Calls: apply an all-zero waitcnt to the brackets, then switch them to the
// function entry/return state.
2661 }
else if (Inst.
isCall()) {
2664 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2665 ScoreBrackets->setStateOnFunctionEntryOrReturn();
// VINTERP: read the immediate of the waitexp operand; how Imm is used is on
// lines not shown.
2666 }
else if (
TII.isVINTERP(Inst)) {
2667 int64_t
Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
// Merges OtherScore into Score using the bounds and shifts stored in M.
// A score at or below its own lower bound (M.OldLB for Score, M.OtherLB for
// OtherScore) is mapped to 0; otherwise the matching shift (M.MyShift or
// M.OtherShift) is added to it. Score is then overwritten with the larger of
// the two shifted values.
// Returns true when the shifted OtherScore is strictly greater than the
// shifted Score, i.e. when the incoming value raised the stored score.
2677bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2678 unsigned OtherScore) {
2679 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2680 unsigned OtherShifted =
2681 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2682 Score = std::max(MyShifted, OtherShifted);
2683 return OtherShifted > MyShifted;
2688 bool StrictDom =
false;
2692 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2699 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2700 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2703 if (AsyncMarks.size() > MaxSize)
2704 AsyncMarks.erase(AsyncMarks.begin(),
2705 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2711 constexpr CounterValueArray ZeroMark{};
2712 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2715 dbgs() <<
"Before merge:\n";
2716 for (
const auto &Mark : AsyncMarks) {
2720 dbgs() <<
"Other marks:\n";
2721 for (
const auto &Mark : OtherMarks) {
2730 unsigned OtherSize = OtherMarks.size();
2731 unsigned OurSize = AsyncMarks.size();
2732 unsigned MergeCount = std::min(OtherSize, OurSize);
2736 if (MergeCount == 0)
2740 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2741 OtherMarks[OtherSize - Idx][
T]);
2746 dbgs() <<
"After merge:\n";
2747 for (
const auto &Mark : AsyncMarks) {
// Folds the tracking state of Other into this WaitcntBrackets.
// NOTE(review): this listing omits several original lines of the function
// (T and EventsForT are declared on lines not shown, as are some guards and
// the final return), so the comments below describe only visible statements.
2761bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
// Accumulates, via |=, the result of every merge step below.
2762 bool StrictDom =
false;
// Make sure every VMem / SGPR key tracked by Other has an entry here too, so
// the per-entry merges below visit it.
2766 for (
auto K :
Other.VMem.keys())
2767 VMem.try_emplace(K);
2768 for (
auto K :
Other.SGPRs.keys())
2769 SGPRs.try_emplace(K);
// Restrict both pending-event sets to EventsForT, test whether Other brings
// events that were not already pending here (the guarded statement is not
// shown), then add Other's events to PendingEvents.
2777 const HWEvents OldEvents = PendingEvents & EventsForT;
2778 const HWEvents OtherEvents =
Other.PendingEvents & EventsForT;
2779 if (!OldEvents.
contains(OtherEvents))
2781 PendingEvents |= OtherEvents;
// Outstanding range on each side is upper bound minus lower bound. The merged
// upper bound keeps this side's lower bound and leaves room for the larger of
// the two ranges.
2784 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
2785 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
2786 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
// NewUB < ScoreLBs[T] can only hold if the unsigned addition above wrapped;
// the handling of that case is on a line not shown.
2787 if (NewUB < ScoreLBs[
T])
// Record what mergeScore needs for this counter: both old lower bounds, and
// the shifts that move each side's old upper bound onto NewUB.
2790 MergeInfo &
M = MergeInfos[
T];
2791 M.OldLB = ScoreLBs[
T];
2792 M.OtherLB =
Other.ScoreLBs[
T];
2793 M.MyShift = NewUB - ScoreUBs[
T];
2794 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
2796 ScoreUBs[
T] = NewUB;
// Merge the individually tracked scores with the same MergeInfo (the
// conditions selecting which counter each belongs to are not shown).
2799 StrictDom |= mergeScore(M, LastFlatLoadCnt,
Other.LastFlatLoadCnt);
2802 StrictDom |= mergeScore(M, LastFlatDsCnt,
Other.LastFlatDsCnt);
2803 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
2807 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
// If Other has a pending SCC write: adopt Other's PendingSCCWrite when this
// side had no SCC_WRITE pending before the merge; when both had one and they
// differ, reset it to nullptr.
2808 if (
Other.hasPendingEvent(HWEvents::SCC_WRITE)) {
2809 if (!(OldEvents & HWEvents::SCC_WRITE)) {
2810 PendingSCCWrite =
Other.PendingSCCWrite;
2811 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
2812 PendingSCCWrite =
nullptr;
// Merge the score for counter T of every VMem entry with Other's score for
// the same ID.
2817 for (
auto &[RegID, Info] : VMem)
2818 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
// For SMEM counters, merge every SGPR entry as well; an SGPR that Other does
// not track contributes a score of 0.
2820 if (isSmemCounter(
T)) {
2821 for (
auto &[RegID, Info] : SGPRs) {
2822 auto It =
Other.SGPRs.find(RegID);
2823 unsigned OtherScore = (It !=
Other.SGPRs.end()) ? It->second.get(
T) : 0;
2824 StrictDom |= mergeScore(M,
Info.get(
T), OtherScore);
// For VMem entries present on both sides, OR the pending-event sets together
// (NewVGPRContext is declared on a line not shown); any change to this side's
// set sets StrictDom.
2829 for (
auto &[TID, Info] : VMem) {
2830 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
2832 Info.VGPRPendingEvents | It->second.VGPRPendingEvents;
2833 StrictDom |= NewVGPRContext !=
Info.VGPRPendingEvents;
2834 Info.VGPRPendingEvents = NewVGPRContext;
// Merge the async marks, then the per-counter async scores.
2838 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
2840 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
// NOTE(review): presumably drops entries that ended up empty (such as those
// created by try_emplace above) - confirm against the helper's definition.
2842 purgeEmptyTrackingData();
2848 return Opcode == AMDGPU::S_WAITCNT ||
2851 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2852 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2853 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2854 Opcode == AMDGPU::WAIT_ASYNCMARK ||
2858void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
2860 bool ExpertMode)
const {
2864 .
addImm(ExpertMode ? 2 : 0)
// Helper that decides when VCCZ has to be recomputed. It holds references to
// the score brackets, subtarget, instruction info and register info, plus the
// three flags below.
// NOTE(review): this listing omits several original lines of the class (access
// specifier, constructor initialiser list, early returns, the instruction
// build around L2960, closing braces); comments cover visible statements only.
2882class VCCZWorkaround {
2883 const WaitcntBrackets &ScoreBrackets;
2884 const GCNSubtarget &
ST;
2885 const SIInstrInfo &
TII;
2886 const SIRegisterInfo &
TRI;
// Set in the constructor from ST.hasReadVCCZBug().
2887 bool VCCZCorruptionBug =
false;
// Set in the constructor to the negation of ST.partialVCCWritesUpdateVCCZ().
2888 bool VCCZNotUpdatedByPartialWrites =
false;
// Starts out true; cleared further down once a recompute is no longer needed.
2891 bool MustRecomputeVCCZ =
true;
2894 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
2895 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
2897 VCCZCorruptionBug =
ST.hasReadVCCZBug();
2898 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
// Inspects MI and updates MustRecomputeVCCZ; returns bool (the return
// statements are on lines not shown).
2905 bool tryRecomputeVCCZ(MachineInstr &
MI) {
// Neither workaround applies to this subtarget (the guarded statement is not
// shown).
2907 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
// With the corruption bug, an SMRD instruction marks VCCZ as needing a
// recompute.
2917 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
// Cache for the lambda result so it is evaluated at most once per call.
2923 std::optional<bool> PartiallyWritesToVCCOpt;
// True when MI defines VCC_LO or VCC_HI.
2924 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
2925 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
2926 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
// When partial writes do not update VCCZ, a partial write of VCC also marks
// VCCZ as needing a recompute.
2928 if (VCCZNotUpdatedByPartialWrites) {
2929 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
2932 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
// Only when no SMEM access is pending, or the corruption bug is absent, is it
// checked whether MI itself updates VCCZ.
2938 if (!ScoreBrackets.hasPendingEvent(HWEvents::SMEM_ACCESS) ||
2939 !VCCZCorruptionBug) {
2941 if (!PartiallyWritesToVCCOpt)
2942 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
// A full write is a definition of VCC that is not a partial (LO/HI) write.
2943 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
2944 MI.definesRegister(AMDGPU::VCC,
nullptr);
// VCCZ is updated by a full write, or by a partial write on subtargets where
// partial writes do update VCCZ.
2947 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
2948 *PartiallyWritesToVCCOpt);
2950 MustRecomputeVCCZ =
false;
// Operand of an instruction build whose surrounding call is not shown:
// selects S_MOV_B32 on wave32 and S_MOV_B64 otherwise.
2960 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2963 MustRecomputeVCCZ =
false;
2973bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2974 MachineBasicBlock &
Block,
2975 WaitcntBrackets &ScoreBrackets) {
2979 dbgs() <<
"*** Begin Block: ";
2981 ScoreBrackets.dump();
2983 VCCZWorkaround VCCZW(ScoreBrackets, ST,
TII,
TRI);
2986 MachineInstr *OldWaitcntInstr =
nullptr;
2991 Iter !=
E; ++Iter) {
2992 MachineInstr &Inst = *Iter;
2993 if (isNonWaitcntMetaInst(Inst))
2998 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
2999 if (!OldWaitcntInstr)
3000 OldWaitcntInstr = &Inst;
3004 PreheaderFlushFlags FlushFlags;
3005 if (
Block.getFirstTerminator() == Inst)
3006 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3009 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3011 OldWaitcntInstr =
nullptr;
3013 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3017 ScoreBrackets.recordAsyncMark(Inst);
3021 if (
TII.isSMRD(Inst)) {
3022 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3025 if (!Memop->isInvariant()) {
3026 const Value *Ptr = Memop->getValue();
3032 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3036 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3040 ScoreBrackets.dump();
3045 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3050 AMDGPU::Waitcnt
Wait;
3051 if (
Block.getFirstTerminator() ==
Block.end()) {
3052 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3053 if (FlushFlags.FlushVmCnt) {
3061 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
3070 dbgs() <<
"*** End Block: ";
3072 ScoreBrackets.dump();
3078bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3079 if (
Block.size() <= 1)
3087 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3093 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3094 LastAtomicWithSoftXcnt =
nullptr;
3098 MachineInstr &PrevMI = *
MI.getPrevNode();
3100 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3103 if (LastAtomicWithSoftXcnt) {
3107 LastAtomicWithSoftXcnt = &
MI;
3115SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3116 const WaitcntBrackets &ScoreBrackets) {
3117 auto [Iterator, IsInserted] =
3120 return Iterator->second;
3124 return PreheaderFlushFlags();
3128 return PreheaderFlushFlags();
3131 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3132 return Iterator->second;
3135 return PreheaderFlushFlags();
3138bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3140 return TII.mayAccessVMEMThroughFlat(
MI);
3144bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3150bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
// Walks every instruction of every block of loop ML and sets
// Flags.FlushVmCnt / Flags.FlushDsCnt from what it finds.
// NOTE(review): this listing omits the return type, the return statement and
// many guards inside the scan loop; comments describe visible statements only.
3179SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3180 const WaitcntBrackets &Brackets) {
3181 PreheaderFlushFlags
Flags;
// Facts collected during the scan; all start false.
3182 bool HasVMemLoad =
false;
3183 bool HasVMemStore =
false;
3184 bool UsesVgprVMEMLoadedOutside =
false;
3185 bool UsesVgprDSReadOutside =
false;
3186 bool VMemInvalidated =
false;
// The DS tracking below is only enabled on subtargets with extended wait
// counts.
3190 bool TrackSimpleDSOpt =
ST.hasExtendedWaitCounts();
// Register-unit sets; they are filled on lines not shown in this listing.
3191 DenseSet<MCRegUnit> VgprUse;
3192 DenseSet<MCRegUnit> VgprDefVMEM;
3193 DenseSet<MCRegUnit> VgprDefDS;
// Position bookkeeping for DS reads: per register unit, the DSReadPosition
// value stored when that unit was last defined while flush-point tracking was
// active.
3199 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3200 unsigned DSReadPosition = 0;
// Flush-point tracking additionally requires a single-block loop.
3201 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3202 bool TrackDSFlushPoint =
ST.hasExtendedWaitCounts() && IsSingleBlock;
3203 unsigned LastDSFlushPosition = 0;
3205 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3206 for (MachineInstr &
MI : *
MBB) {
// Note whether the loop contains VMEM (or flat-VMEM) loads and stores.
3207 if (isVMEMOrFlatVMEM(
MI)) {
3208 HasVMemLoad |=
MI.mayLoad();
3209 HasVMemStore |=
MI.mayStore();
// A store that increments DSCNT disables both DS tracking modes.
3213 if (mayStoreIncrementingDSCNT(
MI)) {
3216 if (VMemInvalidated)
3218 TrackSimpleDSOpt =
false;
3219 TrackDSFlushPoint =
false;
3221 bool IsDSRead = isDSRead(
MI);
// When flush-point tracking is active and RU has a recorded DS-read position,
// raise LastDSFlushPosition to at least that position.
3226 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3227 if (!TrackDSFlushPoint)
3229 if (
auto It = LastDSReadPositionMap.
find(RU);
3230 It != LastDSReadPositionMap.
end()) {
3234 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
// Uses: debug operands and non-vector registers are filtered out (the guarded
// statement is not shown); the remaining operands are handled per register
// unit.
3238 for (
const MachineOperand &
Op :
MI.all_uses()) {
3239 if (
Op.isDebug() || !
TRI.isVectorRegister(MRI,
Op.getReg()))
3242 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3246 VMemInvalidated =
true;
3250 TrackSimpleDSOpt =
false;
// Tests whether all three analyses have been given up (the guarded statement
// is not shown).
3253 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3257 updateDSReadFlushTracking(RU);
3262 VMEMID
ID = toVMEMID(RU);
3266 UsesVgprVMEMLoadedOutside =
true;
3271 UsesVgprDSReadOutside =
true;
// Defs of VMEM loads, handled per register unit.
3276 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3277 for (
const MachineOperand &
Op :
MI.all_defs()) {
3278 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3282 VMemInvalidated =
true;
3287 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
// Defs of DS reads (or of any instruction while flush-point tracking is on):
// update the flush position, then record the current DS-read position for the
// defined unit.
3298 if (IsDSRead || TrackDSFlushPoint) {
3299 for (
const MachineOperand &
Op :
MI.all_defs()) {
3300 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
3302 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3305 updateDSReadFlushTracking(RU);
3308 if (TrackDSFlushPoint)
3309 LastDSReadPositionMap[RU] = DSReadPosition;
// FlushVmCnt is set when the VMEM analysis was not invalidated, the
// UsesVgprVMEMLoadedOutside flag was set, and either (no vscnt, VMEM stores
// but no VMEM loads in the loop) or (VMEM loads in the loop and the subtarget
// reports in-order VMEM VGPR writes).
3318 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3319 ((!
ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3320 (HasVMemLoad &&
ST.hasVmemWriteVgprInOrder())))
3321 Flags.FlushVmCnt =
true;
// FlushDsCnt is set when the UsesVgprDSReadOutside flag was set and either
// the simple DS tracking survived the scan, or flush-point tracking survived
// and DSReadPosition ended above LastDSFlushPosition.
3327 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3330 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3331 bool DSFlushPointPrefetch =
3332 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3334 if (SimpleDSOpt || DSFlushPointPrefetch)
3335 Flags.FlushDsCnt =
true;
3340bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3341 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3343 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3345 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3346 AA = &AAR->getAAResults();
3348 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3360 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3365 .preserve<AAManager>();
3368bool SIInsertWaitcnts::run() {
3376 if (ST.hasExtendedWaitCounts()) {
3377 IsExpertMode = ST.hasExpertSchedulingMode() &&
3386 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3391 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3395 SmemAccessCounter = getCounterFromEvent(HWEvents::SMEM_ACCESS);
3399 MachineBasicBlock &EntryBB = MF.
front();
3410 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3413 if (
ST.hasExtendedWaitCounts()) {
3422 if (!
ST.hasImageInsts() &&
3428 TII.get(instrsForExtendedCounterTypes[CT]))
3441 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3442 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3443 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3450 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3453 std::unique_ptr<WaitcntBrackets> Brackets;
3458 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3460 MachineBasicBlock *
MBB = BII->first;
3461 BlockInfo &BI = BII->second;
3467 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3469 *Brackets = *BI.Incoming;
3472 Brackets = std::make_unique<WaitcntBrackets>(
this);
3477 Brackets->~WaitcntBrackets();
3478 new (Brackets.get()) WaitcntBrackets(
this);
3482 if (
ST.hasWaitXcnt())
3484 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3487 if (Brackets->hasPendingEvent()) {
3488 BlockInfo *MoveBracketsToSucc =
nullptr;
3490 auto *SuccBII = BlockInfos.
find(Succ);
3491 BlockInfo &SuccBI = SuccBII->second;
3492 if (!SuccBI.Incoming) {
3493 SuccBI.Dirty =
true;
3494 if (SuccBII <= BII) {
3498 if (!MoveBracketsToSucc) {
3499 MoveBracketsToSucc = &SuccBI;
3501 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3505 dbgs() <<
"Try to merge ";
3511 if (SuccBI.Incoming->merge(*Brackets)) {
3512 SuccBI.Dirty =
true;
3513 if (SuccBII <= BII) {
3520 if (MoveBracketsToSucc)
3521 MoveBracketsToSucc->Incoming = std::move(Brackets);
3526 if (
ST.hasScalarStores()) {
3527 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3528 bool HaveScalarStores =
false;
3530 for (MachineBasicBlock &
MBB : MF) {
3531 for (MachineInstr &
MI :
MBB) {
3532 if (!HaveScalarStores &&
TII.isScalarStore(
MI))
3533 HaveScalarStores =
true;
3535 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3536 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3541 if (HaveScalarStores) {
3550 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3551 bool SeenDCacheWB =
false;
3555 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3556 SeenDCacheWB =
true;
3557 else if (
TII.isScalarStore(*
I))
3558 SeenDCacheWB =
false;
3561 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3562 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3578 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3580 setSchedulingMode(EntryBB,
I,
true);
3582 for (MachineInstr *
MI : CallInsts) {
3583 MachineBasicBlock &
MBB = *
MI->getParent();
3584 setSchedulingMode(
MBB,
MI,
false);
3585 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3588 for (MachineInstr *
MI : ReturnInsts)
3589 setSchedulingMode(*
MI->getParent(),
MI,
false);
3600 for (
auto [
MI,
_] : EndPgmInsts) {
3602 TII.get(AMDGPU::S_ALLOC_VGPR))
3606 }
else if (!WCG->isOptNone() &&
3607 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3608 (MF.getFrameInfo().hasCalls() ||
3609 ST.getOccupancyWithNumVGPRs(
3610 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3613 for (
auto [
MI, Flag] : EndPgmInsts) {
3615 if (
ST.requiresNopBeforeDeallocVGPRs()) {
3617 TII.get(AMDGPU::S_NOP))
3621 TII.get(AMDGPU::S_SENDMSG))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
AMDGPU::HWEvents HWEvents
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
Bit mask of hardware events.
constexpr unsigned size() const
constexpr bool contains(HWEvents Other) const
constexpr bool any() const
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
HWEvents getSimplifiedVMEMEventsFor(const MachineInstr &Inst, const SIInstrInfo &TII)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
HWEvents getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST, bool IsExpertMode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
constexpr bool isMaybeAtomic(const T &...O)
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
@ Increment
Incrementally increasing token ID.
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
MCRegisterClass TargetRegisterClass
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.