47#define DEBUG_TYPE "si-insert-waitcnts"
50 "Force emit s_waitcnt expcnt(0) instrs");
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
54 "Force emit s_waitcnt vmcnt(0) instrs");
58 cl::desc(
"Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc(
"Force all waitcnt load counters to wait until 0"),
68 "amdgpu-expert-scheduling-mode",
69 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
117 TRACKINGID_RANGE_LEN = (1 << 16),
122 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
127 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
128 LDSDMA_BEGIN = REGUNITS_END,
129 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
139 DECL(VMEM_SAMPLER_READ_ACCESS) \
140 DECL(VMEM_BVH_READ_ACCESS) \
141 DECL(GLOBAL_INV_ACCESS) \
142 DECL(VMEM_WRITE_ACCESS) \
143 DECL(SCRATCH_WRITE_ACCESS) \
153 DECL(EXP_POS_ACCESS) \
154 DECL(EXP_PARAM_ACCESS) \
156 DECL(EXP_LDS_ACCESS) \
157 DECL(VGPR_CSMACC_WRITE) \
158 DECL(VGPR_DPMACC_WRITE) \
159 DECL(VGPR_TRANS_WRITE) \
160 DECL(VGPR_XDL_WRITE) \
161 DECL(VGPR_LDS_READ) \
162 DECL(VGPR_FLAT_READ) \
166#define AMDGPU_EVENT_ENUM(Name) Name,
171#undef AMDGPU_EVENT_ENUM
185auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186 return enum_seq(VMEM_ACCESS, MaxEvent);
189#define AMDGPU_EVENT_NAME(Name) #Name,
193#undef AMDGPU_EVENT_NAME
194static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195 return WaitEventTypeName[
Event];
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
234 assert(updateVMCntOnly(Inst));
236 return VMEM_NOSAMPLER;
250 return VMEM_NOSAMPLER;
264 WaitEventSet() =
default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <=
sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (
auto &
E : Events) {
275 void insert(
const WaitEventType &Event) { Mask |= 1 <<
Event; }
276 void remove(
const WaitEventType &Event) { Mask &= ~(1 <<
Event); }
277 void remove(
const WaitEventSet &
Other) { Mask &= ~Other.Mask; }
278 bool contains(
const WaitEventType &Event)
const {
279 return Mask & (1 <<
Event);
283 return (~Mask &
Other.Mask) == 0;
308 return Mask ==
Other.Mask;
311 bool empty()
const {
return Mask == 0; }
313 bool twoOrMore()
const {
return Mask & (Mask - 1); }
314 operator bool()
const {
return !
empty(); }
315 void print(raw_ostream &OS)
const {
316 ListSeparator
LS(
", ");
317 for (WaitEventType Event : wait_events()) {
318 OS <<
LS << getWaitEventTypeName(Event);
324void WaitEventSet::dump()
const {
329class WaitcntBrackets;
337class WaitcntGenerator {
339 const GCNSubtarget &ST;
340 const SIInstrInfo &
TII;
341 AMDGPU::IsaVersion
IV;
344 bool ExpandWaitcntProfiling =
false;
345 const AMDGPU::HardwareLimits *Limits =
nullptr;
348 WaitcntGenerator() =
delete;
349 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
350 WaitcntGenerator(
const MachineFunction &MF,
InstCounterType MaxCounter,
351 const AMDGPU::HardwareLimits *Limits)
352 :
ST(MF.getSubtarget<GCNSubtarget>()),
TII(*
ST.getInstrInfo()),
356 ExpandWaitcntProfiling(
357 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
362 bool isOptNone()
const {
return OptNone; }
364 const AMDGPU::HardwareLimits &getLimits()
const {
return *Limits; }
378 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
379 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
383 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
388 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
390 AMDGPU::Waitcnt
Wait,
391 const WaitcntBrackets &ScoreBrackets) = 0;
407 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
409 virtual ~WaitcntGenerator() =
default;
412class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
413 static constexpr const WaitEventSet
416 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
417 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
418 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
419 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
420 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
429 using WaitcntGenerator::WaitcntGenerator;
431 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
432 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
435 bool createNewWaitcnt(MachineBasicBlock &
Block,
437 AMDGPU::Waitcnt
Wait,
438 const WaitcntBrackets &ScoreBrackets)
override;
441 return WaitEventMaskForInstPreGFX12[
T];
444 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
447class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
450 static constexpr const WaitEventSet
452 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
453 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
454 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
455 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
456 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
457 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
458 WaitEventSet({VMEM_BVH_READ_ACCESS}),
459 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
460 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
461 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
463 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
466 WaitcntGeneratorGFX12Plus() =
delete;
467 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
469 const AMDGPU::HardwareLimits *Limits,
471 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
474 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
475 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
478 bool createNewWaitcnt(MachineBasicBlock &
Block,
480 AMDGPU::Waitcnt
Wait,
481 const WaitcntBrackets &ScoreBrackets)
override;
484 return WaitEventMaskForInstGFX12Plus[
T];
487 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
491struct PreheaderFlushFlags {
492 bool FlushVmCnt =
false;
493 bool FlushDsCnt =
false;
496class SIInsertWaitcnts {
498 const GCNSubtarget *
ST;
499 const SIInstrInfo *
TII =
nullptr;
500 const SIRegisterInfo *
TRI =
nullptr;
501 const MachineRegisterInfo *
MRI =
nullptr;
504 bool IsExpertMode =
false;
507 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
508 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
509 MachineLoopInfo *MLI;
510 MachinePostDominatorTree *PDT;
514 std::unique_ptr<WaitcntBrackets> Incoming;
518 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
522 std::unique_ptr<WaitcntGenerator> WCG;
525 DenseSet<MachineInstr *> CallInsts;
526 DenseSet<MachineInstr *> ReturnInsts;
531 DenseMap<MachineInstr *, bool> EndPgmInsts;
533 AMDGPU::HardwareLimits Limits;
536 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
538 : MLI(MLI), PDT(PDT), AA(AA) {
539 (void)ForceExpCounter;
540 (void)ForceLgkmCounter;
541 (void)ForceVMCounter;
544 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
546 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
547 const WaitcntBrackets &Brackets);
548 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
549 const WaitcntBrackets &ScoreBrackets);
550 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
551 bool isDSRead(
const MachineInstr &
MI)
const;
552 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
553 bool run(MachineFunction &MF);
555 void setForceEmitWaitcnt() {
561 ForceEmitWaitcnt[
EXP_CNT] =
true;
563 ForceEmitWaitcnt[
EXP_CNT] =
false;
568 ForceEmitWaitcnt[
DS_CNT] =
true;
569 ForceEmitWaitcnt[
KM_CNT] =
true;
571 ForceEmitWaitcnt[
DS_CNT] =
false;
572 ForceEmitWaitcnt[
KM_CNT] =
false;
579 ForceEmitWaitcnt[
BVH_CNT] =
true;
583 ForceEmitWaitcnt[
BVH_CNT] =
false;
586 ForceEmitWaitcnt[
VA_VDST] =
false;
587 ForceEmitWaitcnt[
VM_VSRC] =
false;
593 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
596 case AMDGPU::GLOBAL_INV:
597 return GLOBAL_INV_ACCESS;
599 case AMDGPU::GLOBAL_WB:
600 case AMDGPU::GLOBAL_WBINV:
601 return VMEM_WRITE_ACCESS;
607 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
608 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
617 if (
TII->mayAccessScratch(Inst))
618 return SCRATCH_WRITE_ACCESS;
619 return VMEM_WRITE_ACCESS;
623 return VmemReadMapping[getVmemType(Inst)];
626 std::optional<WaitEventType>
627 getExpertSchedulingEventType(
const MachineInstr &Inst)
const;
629 bool isAsync(
const MachineInstr &
MI)
const {
634 const MachineOperand *
Async =
635 TII->getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
639 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
643 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
647 bool isVmemAccess(
const MachineInstr &
MI)
const;
648 bool generateWaitcntInstBefore(MachineInstr &
MI,
649 WaitcntBrackets &ScoreBrackets,
650 MachineInstr *OldWaitcntInstr,
651 PreheaderFlushFlags FlushFlags);
652 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
654 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
655 MachineInstr *OldWaitcntInstr);
657 WaitEventSet getEventsFor(
const MachineInstr &Inst)
const;
658 void updateEventWaitcntAfter(MachineInstr &Inst,
659 WaitcntBrackets *ScoreBrackets);
661 MachineBasicBlock *
Block)
const;
662 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
663 WaitcntBrackets &ScoreBrackets);
664 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
665 WaitcntBrackets &ScoreBrackets);
668 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
670 bool ExpertMode)
const;
672 return WCG->getWaitEvents(
T);
675 return WCG->getCounterFromEvent(
E);
687class WaitcntBrackets {
695 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
696 for (
auto &[
ID, Val] : VMem) {
700 for (
auto &[
ID, Val] : SGPRs) {
705 if (NumUnusedVmem || NumUnusedSGPRs) {
706 errs() <<
"WaitcntBracket had unused entries at destruction time: "
707 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
708 <<
" SGPR unused entries\n";
719 assert(isSmemCounter(
T) &&
"Invalid SMEM counter");
720 return T ==
X_CNT ? 1 : 0;
724 return ScoreUBs[
T] - ScoreLBs[
T];
728 return getVMemScore(
ID,
T) > getScoreLB(
T);
746 return getScoreUB(
T) - getScoreLB(
T);
750 auto It = SGPRs.find(RU);
751 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(
T)] : 0;
755 auto It = VMem.find(TID);
756 return It != VMem.end() ? It->second.Scores[
T] : 0;
763 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
766 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
767 AMDGPU::Waitcnt &UpdateWait)
const;
770 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
771 AMDGPU::Waitcnt &UpdateWait)
const;
772 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
773 AMDGPU::Waitcnt &UpdateWait)
const;
776 AMDGPU::Waitcnt &
Wait)
const;
778 AMDGPU::Waitcnt &
Wait)
const;
779 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
780 void tryClearSCCWriteEvent(MachineInstr *Inst);
782 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
785 void updateByEvent(WaitEventType
E, MachineInstr &
MI);
786 void recordAsyncMark(MachineInstr &
MI);
788 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
789 bool hasPendingEvent(WaitEventType
E)
const {
790 return PendingEvents.contains(
E);
793 bool HasPending = PendingEvents &
Context->getWaitEvents(
T);
795 "Expected pending events iff scoreboard is not empty");
800 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
802 return Events.twoOrMore();
805 bool hasPendingFlat()
const {
812 void setPendingFlat() {
817 bool hasPendingGDS()
const {
818 return LastGDS > ScoreLBs[
DS_CNT] && LastGDS <= ScoreUBs[
DS_CNT];
821 unsigned getPendingGDSWait()
const {
822 return std::min(getScoreUB(
DS_CNT) - LastGDS,
826 void setPendingGDS() { LastGDS = ScoreUBs[
DS_CNT]; }
830 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
831 for (MCRegUnit RU : regunits(
Reg)) {
832 auto It = VMem.find(toVMEMID(RU));
833 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
840 for (MCRegUnit RU : regunits(
Reg)) {
841 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
842 It->second.VMEMTypes = 0;
843 if (It->second.empty())
849 void setStateOnFunctionEntryOrReturn() {
855 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
859 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
860 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
863 void print(raw_ostream &)
const;
868 void purgeEmptyTrackingData();
878 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
881 AMDGPU::Waitcnt &
Wait)
const;
883 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
884 unsigned OtherScore);
889 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
892 const TargetRegisterClass *RC =
Context->TRI->getPhysRegBaseClass(
Reg);
893 unsigned Size =
Context->TRI->getRegSizeInBits(*RC);
894 if (
Size == 16 &&
Context->ST->hasD16Writes32BitVgpr())
918 if (
Reg == AMDGPU::SCC) {
921 for (MCRegUnit RU : regunits(
Reg))
922 VMem[toVMEMID(RU)].Scores[
T] = Val;
924 auto STy = getSgprScoresIdx(
T);
925 for (MCRegUnit RU : regunits(
Reg))
926 SGPRs[RU].Scores[STy] = Val;
933 VMem[TID].Scores[
T] = Val;
939 const SIInsertWaitcnts *
Context;
943 WaitEventSet PendingEvents;
947 unsigned LastGDS = 0;
964 CounterValueArray Scores{};
966 unsigned VMEMTypes = 0;
976 std::array<unsigned, 2> Scores = {0};
978 bool empty()
const {
return !Scores[0] && !Scores[1]; }
981 DenseMap<VMEMID, VMEMInfo> VMem;
982 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
985 unsigned SCCScore = 0;
987 const MachineInstr *PendingSCCWrite =
nullptr;
991 SmallVector<const MachineInstr *> LDSDMAStores;
1000 static constexpr unsigned MaxAsyncMarks = 16;
1004 CounterValueArray AsyncScore{};
1007class SIInsertWaitcntsLegacy :
public MachineFunctionPass {
1010 SIInsertWaitcntsLegacy() : MachineFunctionPass(
ID) {}
1012 bool runOnMachineFunction(MachineFunction &MF)
override;
1014 StringRef getPassName()
const override {
1015 return "SI insert wait instructions";
1018 void getAnalysisUsage(AnalysisUsage &AU)
const override {
1021 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
1030void WaitcntBrackets::setScoreByOperand(
const MachineOperand &
Op,
1032 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
1040bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
1045 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1055bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
1057 if (!hasPointSampleAccel(
MI))
1060 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
1063void WaitcntBrackets::updateByEvent(WaitEventType
E, MachineInstr &Inst) {
1067 unsigned UB = getScoreUB(
T);
1068 unsigned CurrScore = UB + 1;
1074 PendingEvents.insert(
E);
1075 setScoreUB(
T, CurrScore);
1078 const MachineRegisterInfo *
MRI =
Context->MRI;
1087 if (
const auto *AddrOp =
TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
1088 setScoreByOperand(*AddrOp,
EXP_CNT, CurrScore);
1091 if (
const auto *Data0 =
1092 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
1093 setScoreByOperand(*Data0,
EXP_CNT, CurrScore);
1094 if (
const auto *Data1 =
1095 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
1096 setScoreByOperand(*Data1,
EXP_CNT, CurrScore);
1098 Inst.
getOpcode() != AMDGPU::DS_APPEND &&
1099 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
1100 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1101 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1102 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
1103 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1106 }
else if (
TII->isFLAT(Inst)) {
1108 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1111 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1114 }
else if (
TII->isMIMG(Inst)) {
1118 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1121 }
else if (
TII->isMTBUF(Inst)) {
1124 }
else if (
TII->isMUBUF(Inst)) {
1128 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::data),
1131 }
else if (
TII->isLDSDIR(Inst)) {
1133 setScoreByOperand(*
TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1136 if (
TII->isEXP(Inst)) {
1141 for (MachineOperand &DefMO : Inst.
all_defs()) {
1142 if (
TRI->isVGPR(*
MRI, DefMO.getReg())) {
1143 setScoreByOperand(DefMO,
EXP_CNT, CurrScore);
1147 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1148 if (
TRI->isVectorRegister(*
MRI,
Op.getReg()))
1149 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1153 WaitEventType OtherEvent =
E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1154 if (PendingEvents.contains(OtherEvent)) {
1159 setScoreLB(
T, getScoreUB(
T) - 1);
1160 PendingEvents.remove(OtherEvent);
1162 for (
const MachineOperand &
Op : Inst.
all_uses())
1163 setScoreByOperand(
Op,
T, CurrScore);
1167 for (
const MachineOperand &
Op : Inst.
operands()) {
1172 setScoreByOperand(
Op,
T, CurrScore);
1184 for (
const MachineOperand &
Op : Inst.
defs()) {
1186 if (!
TRI->isVectorRegister(*
MRI,
Op.getReg()))
1188 if (updateVMCntOnly(Inst)) {
1193 VmemType
V = getVmemType(Inst);
1194 unsigned char TypesMask = 1 <<
V;
1197 if (hasPointSampleAccel(Inst))
1198 TypesMask |= 1 << VMEM_NOSAMPLER;
1199 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1200 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1203 setScoreByOperand(
Op,
T, CurrScore);
1206 (
TII->isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1215 if (!MemOp->isStore() ||
1220 auto AAI = MemOp->getAAInfo();
1226 if (!AAI || !AAI.Scope)
1228 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1229 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1230 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1245 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1246 if (Slot && Slot < NUM_LDSDMA)
1247 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1255 "unexpected GFX1250 instruction");
1256 AsyncScore[
T] = CurrScore;
1260 setRegScore(AMDGPU::SCC,
T, CurrScore);
1261 PendingSCCWrite = &Inst;
1266void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1272 AsyncMarks.push_back(AsyncScore);
1275 dbgs() <<
"recordAsyncMark:\n" << Inst;
1276 for (
const auto &Mark : AsyncMarks) {
1283void WaitcntBrackets::print(raw_ostream &OS)
const {
1287 unsigned SR = getScoreRange(
T);
1290 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1294 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1298 OS <<
" EXP_CNT(" << SR <<
"):";
1301 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1305 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1308 OS <<
" BVH_CNT(" << SR <<
"):";
1311 OS <<
" KM_CNT(" << SR <<
"):";
1314 OS <<
" X_CNT(" << SR <<
"):";
1317 OS <<
" VA_VDST(" << SR <<
"): ";
1320 OS <<
" VM_VSRC(" << SR <<
"): ";
1323 OS <<
" UNKNOWN(" << SR <<
"):";
1329 unsigned LB = getScoreLB(
T);
1332 sort(SortedVMEMIDs);
1334 for (
auto ID : SortedVMEMIDs) {
1335 unsigned RegScore = VMem.at(
ID).Scores[
T];
1338 unsigned RelScore = RegScore - LB - 1;
1339 if (
ID < REGUNITS_END) {
1340 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1342 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1343 "Unhandled/unexpected ID value!");
1344 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1349 if (isSmemCounter(
T)) {
1351 sort(SortedSMEMIDs);
1352 for (
auto ID : SortedSMEMIDs) {
1353 unsigned RegScore = SGPRs.at(
ID).Scores[getSgprScoresIdx(
T)];
1356 unsigned RelScore = RegScore - LB - 1;
1357 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1361 if (
T ==
KM_CNT && SCCScore > 0)
1362 OS <<
' ' << SCCScore <<
":scc";
1367 OS <<
"Pending Events: ";
1368 if (hasPendingEvent()) {
1370 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1371 if (hasPendingEvent((WaitEventType)
I)) {
1372 OS <<
LS << WaitEventTypeName[
I];
1380 OS <<
"Async score: ";
1381 if (AsyncScore.empty())
1387 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1389 for (
const auto &Mark : AsyncMarks) {
1391 unsigned MarkedScore = Mark[
T];
1394 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1395 <<
"_CNT: " << MarkedScore;
1398 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1399 <<
"_CNT: " << MarkedScore;
1402 OS <<
" EXP_CNT: " << MarkedScore;
1405 OS <<
" " << (
ST->hasExtendedWaitCounts() ?
"STORE" :
"VS")
1406 <<
"_CNT: " << MarkedScore;
1409 OS <<
" SAMPLE_CNT: " << MarkedScore;
1412 OS <<
" BVH_CNT: " << MarkedScore;
1415 OS <<
" KM_CNT: " << MarkedScore;
1418 OS <<
" X_CNT: " << MarkedScore;
1421 OS <<
" UNKNOWN: " << MarkedScore;
1432void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1433 AMDGPU::Waitcnt &UpdateWait)
const {
1434 simplifyWaitcnt(UpdateWait,
LOAD_CNT);
1435 simplifyWaitcnt(UpdateWait,
EXP_CNT);
1436 simplifyWaitcnt(UpdateWait,
DS_CNT);
1439 simplifyWaitcnt(UpdateWait,
BVH_CNT);
1440 simplifyWaitcnt(UpdateWait,
KM_CNT);
1441 simplifyXcnt(CheckWait, UpdateWait);
1442 simplifyWaitcnt(UpdateWait,
VA_VDST);
1443 simplifyVmVsrc(CheckWait, UpdateWait);
1447 unsigned &
Count)
const {
1451 if (
Count >= getScoreRange(
T))
1456 unsigned Cnt =
Wait.get(
T);
1457 simplifyWaitcnt(
T, Cnt);
1461void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1462 AMDGPU::Waitcnt &UpdateWait)
const {
1471 if (CheckWait.
get(
KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1476 if (CheckWait.
get(
LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1480 simplifyWaitcnt(UpdateWait,
X_CNT);
1483void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1484 AMDGPU::Waitcnt &UpdateWait)
const {
1489 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1490 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1491 CheckWait.get(DS_CNT)}))
1493 simplifyWaitcnt(UpdateWait,
VM_VSRC);
1496void WaitcntBrackets::purgeEmptyTrackingData() {
1508 unsigned ScoreToWait,
1509 AMDGPU::Waitcnt &
Wait)
const {
1510 const unsigned LB = getScoreLB(
T);
1511 const unsigned UB = getScoreUB(
T);
1514 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1516 !
Context->ST->hasFlatLgkmVMemCountInOrder()) {
1520 addWait(
Wait,
T, 0);
1521 }
else if (counterOutOfOrder(
T)) {
1525 addWait(
Wait,
T, 0);
1529 unsigned NeededWait = std::min(
1530 UB - ScoreToWait, getWaitCountMax(
Context->getLimits(),
T) - 1);
1531 addWait(
Wait,
T, NeededWait);
1536AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1538 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1540 for (
const auto &Mark : AsyncMarks) {
1546 if (AsyncMarks.size() == MaxAsyncMarks) {
1551 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1552 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1555 AMDGPU::Waitcnt
Wait;
1556 if (AsyncMarks.size() <=
N) {
1561 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1562 const auto &RequiredMark = AsyncMarks[MarkIndex];
1564 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1570 dbgs() <<
"Removing " << (MarkIndex + 1)
1571 <<
" async marks after determining wait\n";
1573 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1580 AMDGPU::Waitcnt &
Wait)
const {
1581 if (
Reg == AMDGPU::SCC) {
1582 determineWaitForScore(
T, SCCScore,
Wait);
1585 for (MCRegUnit RU : regunits(
Reg))
1586 determineWaitForScore(
1587 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1593 AMDGPU::Waitcnt &
Wait)
const {
1594 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1595 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1598void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1601 if (PendingSCCWrite &&
1602 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1604 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1607 SCC_WRITE_PendingEvent) {
1611 PendingEvents.remove(SCC_WRITE_PendingEvent);
1612 PendingSCCWrite =
nullptr;
1616void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1630 const unsigned UB = getScoreUB(
T);
1634 if (counterOutOfOrder(
T))
1636 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1639 PendingEvents.remove(
Context->getWaitEvents(
T));
1642 if (
T ==
KM_CNT &&
Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1643 if (!hasMixedPendingEvents(
X_CNT))
1644 applyWaitcnt(
X_CNT, 0);
1646 PendingEvents.remove(SMEM_GROUP);
1648 if (
T ==
LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1650 if (!hasMixedPendingEvents(
X_CNT))
1652 else if (
Count == 0)
1653 PendingEvents.remove(VMEM_GROUP);
1658 unsigned Cnt =
Wait.get(
T);
1659 applyWaitcnt(
T, Cnt);
1666 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1667 (
T ==
X_CNT && hasPendingEvent(SMEM_GROUP)))
1674 unsigned Events = hasPendingEvent(
T);
1677 Events &= ~(1 << GLOBAL_INV_ACCESS);
1680 return Events & (Events - 1);
1683 return hasMixedPendingEvents(
T);
1693char SIInsertWaitcntsLegacy::
ID = 0;
1698 return new SIInsertWaitcntsLegacy();
1703 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1708 if (NewEnc == MO.
getImm())
1719 case AMDGPU::S_WAIT_LOADCNT:
1721 case AMDGPU::S_WAIT_EXPCNT:
1723 case AMDGPU::S_WAIT_STORECNT:
1725 case AMDGPU::S_WAIT_SAMPLECNT:
1727 case AMDGPU::S_WAIT_BVHCNT:
1729 case AMDGPU::S_WAIT_DSCNT:
1731 case AMDGPU::S_WAIT_KMCNT:
1733 case AMDGPU::S_WAIT_XCNT:
1740bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1754bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1755 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1757 assert(isNormalMode(MaxCounter));
1760 MachineInstr *WaitcntInstr =
nullptr;
1761 MachineInstr *WaitcntVsCntInstr =
nullptr;
1764 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1766 dbgs() <<
"end of block\n";
1774 if (
II.isMetaInstruction()) {
1780 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1784 if (Opcode == AMDGPU::S_WAITCNT) {
1785 unsigned IEnc =
II.getOperand(0).getImm();
1788 ScoreBrackets.simplifyWaitcnt(OldWait);
1792 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1793 II.eraseFromParent();
1797 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1800 <<
"Before: " <<
Wait <<
'\n';);
1801 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, LDSDMA_BEGIN,
Wait);
1810 II.eraseFromParent();
1811 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1812 unsigned N =
II.getOperand(0).getImm();
1814 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1817 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1818 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1821 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1823 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1826 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1827 II.eraseFromParent();
1830 WaitcntVsCntInstr = &
II;
1837 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1846 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1847 <<
"New Instr at block end: "
1848 << *WaitcntInstr <<
'\n'
1849 :
dbgs() <<
"applied pre-existing waitcnt\n"
1850 <<
"Old Instr: " << *It
1851 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1854 if (WaitcntVsCntInstr) {
1856 *WaitcntVsCntInstr, AMDGPU::OpName::simm16,
Wait.get(
STORE_CNT));
1857 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1863 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1864 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1866 :
dbgs() <<
"applied pre-existing waitcnt\n"
1867 <<
"Old Instr: " << *It
1868 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1876bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1878 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1879 assert(isNormalMode(MaxCounter));
1887 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
1890 EmitWaitcnt(--Outstanding);
1891 }
while (Outstanding > Target);
1897 if (
Wait.hasWaitExceptStoreCnt()) {
1899 if (ExpandWaitcntProfiling) {
1903 bool AnyOutOfOrder =
false;
1905 unsigned WaitCnt =
Wait.get(CT);
1906 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1907 AnyOutOfOrder =
true;
1912 if (AnyOutOfOrder) {
1920 unsigned WaitCnt =
Wait.get(CT);
1924 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1925 getWaitCountMax(getLimits(), CT) - 1);
1926 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1937 [[maybe_unused]]
auto SWaitInst =
1942 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1943 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1947 if (
Wait.hasWaitStoreCnt()) {
1951 !ScoreBrackets.counterOutOfOrder(
STORE_CNT)) {
1953 unsigned Outstanding =
1954 std::min(ScoreBrackets.getOutstanding(
STORE_CNT),
1955 getWaitCountMax(getLimits(),
STORE_CNT) - 1);
1956 EmitExpandedWaitcnt(
1958 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1959 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1963 [[maybe_unused]]
auto SWaitInst =
1965 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1970 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1971 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1979WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1980 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1984WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1985 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1986 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1987 ~0u , ExpertVal, ExpertVal);
1994bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1995 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1997 assert(!isNormalMode(MaxCounter));
2000 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
2001 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
2002 MachineInstr *WaitcntDepctrInstr =
nullptr;
2006 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
2008 dbgs() <<
"end of block\n";
2014 AMDGPU::Waitcnt RequiredWait;
2019 if (
II.isMetaInstruction()) {
2024 MachineInstr **UpdatableInstr;
2030 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
2034 if (Opcode == AMDGPU::S_WAITCNT)
2037 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2039 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2044 RequiredWait = RequiredWait.combined(OldWait);
2045 UpdatableInstr = &CombinedLoadDsCntInstr;
2046 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2048 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2053 RequiredWait = RequiredWait.combined(OldWait);
2054 UpdatableInstr = &CombinedStoreDsCntInstr;
2055 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2057 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2058 AMDGPU::Waitcnt OldWait;
2062 ScoreBrackets.simplifyWaitcnt(OldWait);
2064 UpdatableInstr = &WaitcntDepctrInstr;
2065 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2068 II.eraseFromParent();
2070 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2076 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2078 addWait(
Wait, CT.value(), OldCnt);
2080 addWait(RequiredWait, CT.value(), OldCnt);
2081 UpdatableInstr = &WaitInstrs[CT.value()];
2085 if (!*UpdatableInstr) {
2086 *UpdatableInstr = &
II;
2087 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2094 unsigned Enc =
TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2102 II.eraseFromParent();
2106 II.eraseFromParent();
2111 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
2112 Wait =
Wait.combined(RequiredWait);
2114 if (CombinedLoadDsCntInstr) {
2130 AMDGPU::OpName::simm16, NewEnc);
2131 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2137 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2138 <<
"New Instr at block end: "
2139 << *CombinedLoadDsCntInstr <<
'\n'
2140 :
dbgs() <<
"applied pre-existing waitcnt\n"
2141 <<
"Old Instr: " << *It <<
"New Instr: "
2142 << *CombinedLoadDsCntInstr <<
'\n');
2149 if (CombinedStoreDsCntInstr) {
2154 AMDGPU::OpName::simm16, NewEnc);
2155 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2161 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2162 <<
"New Instr at block end: "
2163 << *CombinedStoreDsCntInstr <<
'\n'
2164 :
dbgs() <<
"applied pre-existing waitcnt\n"
2165 <<
"Old Instr: " << *It <<
"New Instr: "
2166 << *CombinedStoreDsCntInstr <<
'\n');
2196 for (MachineInstr **WI : WaitsToErase) {
2200 (*WI)->eraseFromParent();
2207 if (!WaitInstrs[CT])
2210 unsigned NewCnt =
Wait.get(CT);
2211 if (NewCnt != ~0u) {
2213 AMDGPU::OpName::simm16, NewCnt);
2214 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2216 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2217 setNoWait(
Wait, CT);
2220 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2221 <<
"New Instr at block end: " << *WaitInstrs[CT]
2223 :
dbgs() <<
"applied pre-existing waitcnt\n"
2224 <<
"Old Instr: " << *It
2225 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2232 if (WaitcntDepctrInstr) {
2236 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2251 AMDGPU::OpName::simm16, Enc);
2253 <<
"New Instr at block end: "
2254 << *WaitcntDepctrInstr <<
'\n'
2255 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2256 <<
"Old Instr: " << *It <<
"New Instr: "
2257 << *WaitcntDepctrInstr <<
'\n');
2268bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2270 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2271 assert(!isNormalMode(MaxCounter));
2277 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
2279 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0
u; --
I)
2281 EmitWaitcnt(Target);
2287 if (ExpandWaitcntProfiling) {
2294 if (ScoreBrackets.counterOutOfOrder(CT)) {
2301 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2302 getWaitCountMax(getLimits(), CT) - 1);
2303 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2314 MachineInstr *SWaitInst =
nullptr;
2338 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2339 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2351 [[maybe_unused]]
auto SWaitInst =
2358 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2359 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2362 if (
Wait.hasWaitDepctr()) {
2367 [[maybe_unused]]
auto SWaitInst =
2373 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2374 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2393bool SIInsertWaitcnts::generateWaitcntInstBefore(
2394 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2395 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2397 setForceEmitWaitcnt();
2401 AMDGPU::Waitcnt
Wait;
2402 const unsigned Opc =
MI.getOpcode();
2405 case AMDGPU::BUFFER_WBINVL1:
2406 case AMDGPU::BUFFER_WBINVL1_SC:
2407 case AMDGPU::BUFFER_WBINVL1_VOL:
2408 case AMDGPU::BUFFER_GL0_INV:
2409 case AMDGPU::BUFFER_GL1_INV: {
2417 case AMDGPU::SI_RETURN_TO_EPILOG:
2418 case AMDGPU::SI_RETURN:
2419 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2420 case AMDGPU::S_SETPC_B64_return: {
2425 AMDGPU::Waitcnt AllZeroWait =
2426 WCG->getAllZeroWaitcnt(
false);
2431 if (
ST->hasExtendedWaitCounts() &&
2432 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2437 case AMDGPU::S_ENDPGM:
2438 case AMDGPU::S_ENDPGM_SAVED: {
2447 EndPgmInsts[&
MI] = !ScoreBrackets.empty(
STORE_CNT) &&
2448 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2451 case AMDGPU::S_SENDMSG:
2452 case AMDGPU::S_SENDMSGHALT: {
2453 if (
ST->hasLegacyGeometry() &&
2468 if (
MI.modifiesRegister(AMDGPU::EXEC,
TRI)) {
2471 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2472 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2473 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2474 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2481 if (
TII->isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2482 addWait(
Wait,
DS_CNT, ScoreBrackets.getPendingGDSWait());
2489 Wait = AMDGPU::Waitcnt();
2491 const MachineOperand &CallAddrOp =
TII->getCalleeOperand(
MI);
2492 if (CallAddrOp.
isReg()) {
2493 ScoreBrackets.determineWaitForPhysReg(
2496 if (
const auto *RtnAddrOp =
2497 TII->getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2498 ScoreBrackets.determineWaitForPhysReg(
2499 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait);
2502 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2503 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2519 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2520 const Value *Ptr = Memop->getValue();
2521 if (Memop->isStore()) {
2522 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2523 addWait(
Wait, SmemAccessCounter, 0);
2525 SLoadAddresses.
erase(It);
2528 unsigned AS = Memop->getAddrSpace();
2532 if (
TII->mayWriteLDSThroughDMA(
MI))
2536 unsigned TID = LDSDMA_BEGIN;
2537 if (Ptr && Memop->getAAInfo()) {
2538 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2539 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2540 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2541 if ((
I + 1) >= NUM_LDSDMA) {
2544 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2548 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID +
I + 1,
Wait);
2552 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2554 if (Memop->isStore()) {
2555 ScoreBrackets.determineWaitForLDSDMA(
EXP_CNT, TID,
Wait);
2560 for (
const MachineOperand &
Op :
MI.operands()) {
2565 if (
Op.isTied() &&
Op.isUse() &&
TII->doesNotReadTiedSource(
MI))
2570 const bool IsVGPR =
TRI->isVectorRegister(*
MRI,
Op.getReg());
2577 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2589 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2590 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2591 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2592 !
ST->hasVmemWriteVgprInOrder()) {
2596 ScoreBrackets.clearVgprVmemTypes(
Reg);
2599 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2603 }
else if (
Op.getReg() == AMDGPU::SCC) {
2606 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait);
2609 if (
ST->hasWaitXcnt() &&
Op.isDef())
2610 ScoreBrackets.determineWaitForPhysReg(
X_CNT,
Reg,
Wait);
2628 if (
Opc == AMDGPU::S_BARRIER && !
ST->hasAutoWaitcntBeforeBarrier() &&
2629 !
ST->hasBackOffBarrier()) {
2630 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2637 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2642 ScoreBrackets.simplifyWaitcnt(
Wait);
2648 if (
TII->isVALU(
MI))
2655 ScoreBrackets.applyWaitcnt(
Wait,
X_CNT);
2662 Wait = WCG->getAllZeroWaitcnt(
false);
2666 if (!ForceEmitWaitcnt[
T])
2671 if (FlushFlags.FlushVmCnt) {
2676 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
2682 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2686bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2688 MachineBasicBlock &
Block,
2689 WaitcntBrackets &ScoreBrackets,
2690 MachineInstr *OldWaitcntInstr) {
2693 if (OldWaitcntInstr)
2697 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2702 MachineOperand *WaitExp =
2703 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2713 <<
"Update Instr: " << *It);
2716 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2721 ScoreBrackets.applyWaitcnt(
Wait);
2726std::optional<WaitEventType>
2727SIInsertWaitcnts::getExpertSchedulingEventType(
const MachineInstr &Inst)
const {
2728 if (
TII->isVALU(Inst)) {
2733 if (
TII->isXDL(Inst))
2734 return VGPR_XDL_WRITE;
2736 if (
TII->isTRANS(Inst))
2737 return VGPR_TRANS_WRITE;
2740 return VGPR_DPMACC_WRITE;
2742 return VGPR_CSMACC_WRITE;
2749 if (
TII->isFLAT(Inst))
2750 return VGPR_FLAT_READ;
2752 if (
TII->isDS(Inst))
2753 return VGPR_LDS_READ;
2755 if (
TII->isVMEM(Inst) ||
TII->isVIMAGE(Inst) ||
TII->isVSAMPLE(Inst))
2756 return VGPR_VMEM_READ;
2763bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2764 return (
TII->isFLAT(
MI) &&
TII->mayAccessVMEMThroughFlat(
MI)) ||
2771 MachineBasicBlock *
Block)
const {
2772 auto BlockEnd =
Block->getParent()->end();
2773 auto BlockIter =
Block->getIterator();
2777 if (++BlockIter != BlockEnd) {
2778 It = BlockIter->instr_begin();
2785 if (!It->isMetaInstruction())
2793 return It->getOpcode() == AMDGPU::S_ENDPGM;
2797bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2798 MachineBasicBlock &
Block,
2799 WaitcntBrackets &ScoreBrackets) {
2800 AMDGPU::Waitcnt
Wait;
2801 bool NeedsEndPGMCheck =
false;
2809 NeedsEndPGMCheck =
true;
2812 ScoreBrackets.simplifyWaitcnt(
Wait);
2815 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2818 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2826WaitEventSet SIInsertWaitcnts::getEventsFor(
const MachineInstr &Inst)
const {
2827 WaitEventSet Events;
2829 if (
const auto ET = getExpertSchedulingEventType(Inst))
2833 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2835 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2836 Events.insert(GDS_ACCESS);
2837 Events.insert(GDS_GPR_LOCK);
2839 Events.insert(LDS_ACCESS);
2841 }
else if (
TII->isFLAT(Inst)) {
2843 Events.insert(getVmemWaitEventType(Inst));
2846 if (
TII->mayAccessVMEMThroughFlat(Inst)) {
2847 if (
ST->hasWaitXcnt())
2848 Events.insert(VMEM_GROUP);
2849 Events.insert(getVmemWaitEventType(Inst));
2851 if (
TII->mayAccessLDSThroughFlat(Inst))
2852 Events.insert(LDS_ACCESS);
2856 Inst.
getOpcode() == AMDGPU::BUFFER_WBL2)) {
2860 if (
ST->hasWaitXcnt())
2861 Events.insert(VMEM_GROUP);
2862 Events.insert(getVmemWaitEventType(Inst));
2863 if (
ST->vmemWriteNeedsExpWaitcnt() &&
2865 Events.insert(VMW_GPR_LOCK);
2867 }
else if (
TII->isSMRD(Inst)) {
2868 if (
ST->hasWaitXcnt())
2869 Events.insert(SMEM_GROUP);
2870 Events.insert(SMEM_ACCESS);
2872 Events.insert(EXP_LDS_ACCESS);
2874 unsigned Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2876 Events.insert(EXP_PARAM_ACCESS);
2878 Events.insert(EXP_POS_ACCESS);
2880 Events.insert(EXP_GPR_LOCK);
2882 Events.insert(SCC_WRITE);
2885 case AMDGPU::S_SENDMSG:
2886 case AMDGPU::S_SENDMSG_RTN_B32:
2887 case AMDGPU::S_SENDMSG_RTN_B64:
2888 case AMDGPU::S_SENDMSGHALT:
2889 Events.insert(SQ_MESSAGE);
2891 case AMDGPU::S_MEMTIME:
2892 case AMDGPU::S_MEMREALTIME:
2893 case AMDGPU::S_GET_BARRIER_STATE_M0:
2894 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2895 Events.insert(SMEM_ACCESS);
2902void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2903 WaitcntBrackets *ScoreBrackets) {
2905 WaitEventSet InstEvents = getEventsFor(Inst);
2906 for (WaitEventType
E : wait_events()) {
2907 if (InstEvents.contains(
E))
2908 ScoreBrackets->updateByEvent(
E, Inst);
2911 if (
TII->isDS(Inst) &&
TII->usesLGKM_CNT(Inst)) {
2913 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2914 ScoreBrackets->setPendingGDS();
2916 }
else if (
TII->isFLAT(Inst)) {
2924 ScoreBrackets->setPendingFlat();
2925 }
else if (Inst.
isCall()) {
2927 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2928 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2929 }
else if (
TII->isVINTERP(Inst)) {
2930 int64_t
Imm =
TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2935bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2936 unsigned OtherScore) {
2937 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2938 unsigned OtherShifted =
2939 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2940 Score = std::max(MyShifted, OtherShifted);
2941 return OtherShifted > MyShifted;
2946 bool StrictDom =
false;
2950 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2957 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2958 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2961 if (AsyncMarks.size() > MaxSize)
2962 AsyncMarks.erase(AsyncMarks.begin(),
2963 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2969 constexpr CounterValueArray ZeroMark{};
2970 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2973 dbgs() <<
"Before merge:\n";
2974 for (
const auto &Mark : AsyncMarks) {
2978 dbgs() <<
"Other marks:\n";
2979 for (
const auto &Mark : OtherMarks) {
2988 unsigned OtherSize = OtherMarks.size();
2989 unsigned OurSize = AsyncMarks.size();
2990 unsigned MergeCount = std::min(OtherSize, OurSize);
2993 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2994 OtherMarks[OtherSize - Idx][
T]);
2999 dbgs() <<
"After merge:\n";
3000 for (
const auto &Mark : AsyncMarks) {
3014bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
3015 bool StrictDom =
false;
3019 for (
auto K :
Other.VMem.keys())
3020 VMem.try_emplace(K);
3021 for (
auto K :
Other.SGPRs.keys())
3022 SGPRs.try_emplace(K);
3029 const WaitEventSet &EventsForT =
Context->getWaitEvents(
T);
3030 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3031 const WaitEventSet OtherEvents =
Other.PendingEvents & EventsForT;
3032 if (!OldEvents.contains(OtherEvents))
3034 PendingEvents |= OtherEvents;
3037 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
3038 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
3039 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
3040 if (NewUB < ScoreLBs[
T])
3043 MergeInfo &
M = MergeInfos[
T];
3044 M.OldLB = ScoreLBs[
T];
3045 M.OtherLB =
Other.ScoreLBs[
T];
3046 M.MyShift = NewUB - ScoreUBs[
T];
3047 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
3049 ScoreUBs[
T] = NewUB;
3051 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
3054 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
3057 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
3058 if (
Other.hasPendingEvent(SCC_WRITE)) {
3059 if (!OldEvents.contains(SCC_WRITE)) {
3060 PendingSCCWrite =
Other.PendingSCCWrite;
3061 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
3062 PendingSCCWrite =
nullptr;
3067 for (
auto &[RegID, Info] : VMem)
3068 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
3070 if (isSmemCounter(
T)) {
3071 unsigned Idx = getSgprScoresIdx(
T);
3072 for (
auto &[RegID, Info] : SGPRs) {
3073 auto It =
Other.SGPRs.find(RegID);
3074 unsigned OtherScore =
3075 (It !=
Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
3076 StrictDom |= mergeScore(M,
Info.Scores[Idx], OtherScore);
3081 for (
auto &[TID, Info] : VMem) {
3082 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
3083 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
3084 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
3085 Info.VMEMTypes = NewVmemTypes;
3089 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
3091 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
3093 purgeEmptyTrackingData();
3099 return Opcode == AMDGPU::S_WAITCNT ||
3102 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3103 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3104 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3105 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3109void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
3111 bool ExpertMode)
const {
3115 .
addImm(ExpertMode ? 2 : 0)
3120bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3121 MachineBasicBlock &
Block,
3122 WaitcntBrackets &ScoreBrackets) {
3126 dbgs() <<
"*** Begin Block: ";
3128 ScoreBrackets.dump();
3134 bool VCCZCorrect =
true;
3135 if (
ST->hasReadVCCZBug()) {
3138 VCCZCorrect =
false;
3139 }
else if (!
ST->partialVCCWritesUpdateVCCZ()) {
3142 VCCZCorrect =
false;
3146 MachineInstr *OldWaitcntInstr =
nullptr;
3151 Iter !=
E; ++Iter) {
3152 MachineInstr &Inst = *Iter;
3158 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3159 if (!OldWaitcntInstr)
3160 OldWaitcntInstr = &Inst;
3164 PreheaderFlushFlags FlushFlags;
3165 if (
Block.getFirstTerminator() == Inst)
3166 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3168 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3170 assert(
ST->getGeneration() < AMDGPUSubtarget::GFX12);
3171 ScoreBrackets.recordAsyncMark(Inst);
3176 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3178 OldWaitcntInstr =
nullptr;
3184 if (
ST->hasReadVCCZBug() || !
ST->partialVCCWritesUpdateVCCZ()) {
3188 if (!
ST->partialVCCWritesUpdateVCCZ())
3189 VCCZCorrect =
false;
3198 if (
ST->hasReadVCCZBug() &&
3199 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
3202 VCCZCorrect =
false;
3210 if (
TII->isSMRD(Inst)) {
3211 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3214 if (!Memop->isInvariant()) {
3215 const Value *Ptr = Memop->getValue();
3219 if (
ST->hasReadVCCZBug()) {
3221 VCCZCorrect =
false;
3225 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3229 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3233 ScoreBrackets.dump();
3243 TII->get(
ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3253 AMDGPU::Waitcnt
Wait;
3254 if (
Block.getFirstTerminator() ==
Block.end()) {
3255 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3256 if (FlushFlags.FlushVmCnt) {
3257 if (ScoreBrackets.hasPendingEvent(
LOAD_CNT))
3259 if (ScoreBrackets.hasPendingEvent(
SAMPLE_CNT))
3261 if (ScoreBrackets.hasPendingEvent(
BVH_CNT))
3264 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
3273 dbgs() <<
"*** End Block: ";
3275 ScoreBrackets.dump();
3281bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3282 if (
Block.size() <= 1)
3290 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3295 TII->isDS(
MI) || (
TII->isFLAT(
MI) &&
TII->mayAccessLDSThroughFlat(
MI));
3296 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3297 LastAtomicWithSoftXcnt =
nullptr;
3300 MI.mayLoad() &&
MI.mayStore();
3301 MachineInstr &PrevMI = *
MI.getPrevNode();
3303 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3306 if (LastAtomicWithSoftXcnt) {
3310 LastAtomicWithSoftXcnt = &
MI;
3318SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3319 const WaitcntBrackets &ScoreBrackets) {
3320 auto [Iterator, IsInserted] =
3323 return Iterator->second;
3327 return PreheaderFlushFlags();
3331 return PreheaderFlushFlags();
3334 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3335 return Iterator->second;
3338 return PreheaderFlushFlags();
3341bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3343 return TII->mayAccessVMEMThroughFlat(
MI);
3347bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3353bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3379SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3380 const WaitcntBrackets &Brackets) {
3381 PreheaderFlushFlags
Flags;
3382 bool HasVMemLoad =
false;
3383 bool HasVMemStore =
false;
3384 bool UsesVgprLoadedOutsideVMEM =
false;
3385 bool UsesVgprLoadedOutsideDS =
false;
3386 bool VMemInvalidated =
false;
3388 bool DSInvalidated = !
ST->hasExtendedWaitCounts();
3389 DenseSet<MCRegUnit> VgprUse;
3390 DenseSet<MCRegUnit> VgprDefVMEM;
3391 DenseSet<MCRegUnit> VgprDefDS;
3393 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3394 for (MachineInstr &
MI : *
MBB) {
3395 if (isVMEMOrFlatVMEM(
MI)) {
3396 HasVMemLoad |=
MI.mayLoad();
3397 HasVMemStore |=
MI.mayStore();
3401 if (mayStoreIncrementingDSCNT(
MI)) {
3404 if (VMemInvalidated)
3406 DSInvalidated =
true;
3408 for (
const MachineOperand &
Op :
MI.all_uses()) {
3409 if (
Op.isDebug() || !
TRI->isVectorRegister(*
MRI,
Op.getReg()))
3412 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3416 VMemInvalidated =
true;
3420 DSInvalidated =
true;
3423 if (VMemInvalidated && DSInvalidated)
3429 VMEMID
ID = toVMEMID(RU);
3433 UsesVgprLoadedOutsideVMEM =
true;
3437 else if (Brackets.hasPendingVMEM(
ID,
DS_CNT))
3438 UsesVgprLoadedOutsideDS =
true;
3443 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3444 for (
const MachineOperand &
Op :
MI.all_defs()) {
3445 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3449 VMemInvalidated =
true;
3454 if (VMemInvalidated && DSInvalidated)
3466 for (
const MachineOperand &
Op :
MI.all_defs()) {
3467 for (MCRegUnit RU :
TRI->regunits(
Op.getReg().asMCReg())) {
3476 if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
3477 ((!
ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3478 (HasVMemLoad &&
ST->hasVmemWriteVgprInOrder())))
3479 Flags.FlushVmCnt =
true;
3486 if (!DSInvalidated && UsesVgprLoadedOutsideDS)
3487 Flags.FlushDsCnt =
true;
3492bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3493 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3495 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3497 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3498 AA = &AAR->getAAResults();
3500 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3512 if (!SIInsertWaitcnts(MLI, PDT,
AA).
run(MF))
3517 .preserve<AAManager>();
3522 TII = ST->getInstrInfo();
3523 TRI = &
TII->getRegisterInfo();
3532 if (ST->hasExtendedWaitCounts()) {
3533 IsExpertMode = ST->hasExpertSchedulingMode() &&
3541 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
3551 ForceEmitWaitcnt[
T] =
false;
3553 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3558 MachineBasicBlock &EntryBB = MF.
front();
3568 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3571 if (
ST->hasExtendedWaitCounts()) {
3578 if (!
ST->hasImageInsts() &&
3583 TII->get(instrsForExtendedCounterTypes[CT]))
3596 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3597 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3598 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3605 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3608 std::unique_ptr<WaitcntBrackets> Brackets;
3613 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3615 MachineBasicBlock *
MBB = BII->first;
3616 BlockInfo &BI = BII->second;
3622 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3624 *Brackets = *BI.Incoming;
3627 Brackets = std::make_unique<WaitcntBrackets>(
this);
3632 Brackets->~WaitcntBrackets();
3633 new (Brackets.get()) WaitcntBrackets(
this);
3637 if (
ST->hasWaitXcnt())
3639 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3642 if (Brackets->hasPendingEvent()) {
3643 BlockInfo *MoveBracketsToSucc =
nullptr;
3645 auto *SuccBII = BlockInfos.
find(Succ);
3646 BlockInfo &SuccBI = SuccBII->second;
3647 if (!SuccBI.Incoming) {
3648 SuccBI.Dirty =
true;
3649 if (SuccBII <= BII) {
3653 if (!MoveBracketsToSucc) {
3654 MoveBracketsToSucc = &SuccBI;
3656 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3660 dbgs() <<
"Try to merge ";
3666 if (SuccBI.Incoming->merge(*Brackets)) {
3667 SuccBI.Dirty =
true;
3668 if (SuccBII <= BII) {
3675 if (MoveBracketsToSucc)
3676 MoveBracketsToSucc->Incoming = std::move(Brackets);
3681 if (
ST->hasScalarStores()) {
3682 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3683 bool HaveScalarStores =
false;
3685 for (MachineBasicBlock &
MBB : MF) {
3686 for (MachineInstr &
MI :
MBB) {
3687 if (!HaveScalarStores &&
TII->isScalarStore(
MI))
3688 HaveScalarStores =
true;
3690 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3691 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3696 if (HaveScalarStores) {
3705 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3706 bool SeenDCacheWB =
false;
3710 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3711 SeenDCacheWB =
true;
3712 else if (
TII->isScalarStore(*
I))
3713 SeenDCacheWB =
false;
3716 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3717 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3733 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3735 setSchedulingMode(EntryBB,
I,
true);
3737 for (MachineInstr *
MI : CallInsts) {
3738 MachineBasicBlock &
MBB = *
MI->getParent();
3739 setSchedulingMode(
MBB,
MI,
false);
3740 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3743 for (MachineInstr *
MI : ReturnInsts)
3744 setSchedulingMode(*
MI->getParent(),
MI,
false);
3755 for (
auto [
MI,
_] : EndPgmInsts) {
3757 TII->get(AMDGPU::S_ALLOC_VGPR))
3761 }
else if (!WCG->isOptNone() &&
3762 ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
3763 (MF.getFrameInfo().hasCalls() ||
3764 ST->getOccupancyWithNumVGPRs(
3765 TRI->getNumUsedPhysRegs(*
MRI, AMDGPU::VGPR_32RegClass),
3768 for (
auto [
MI, Flag] : EndPgmInsts) {
3770 if (
ST->requiresNopBeforeDeallocVGPRs()) {
3772 TII->get(AMDGPU::S_NOP))
3776 TII->get(AMDGPU::S_SENDMSG))
3784 ReturnInsts.
clear();
3785 EndPgmInsts.clear();
3786 PreheadersToFlush.
clear();
3787 SLoadAddresses.
clear();
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
bool isDPMACCInstruction(unsigned Opc)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
FunctionAddr VTableAddr Count
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector&lt;T, 0&gt;).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.
static constexpr bool is_iterable