47#define DEBUG_TYPE "si-insert-waitcnts"
50 "Force emit s_waitcnt expcnt(0) instrs");
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
54 "Force emit s_waitcnt vmcnt(0) instrs");
58 cl::desc(
"Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc(
"Force all waitcnt load counters to wait until 0"),
68 "amdgpu-expert-scheduling-mode",
69 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
102template <
typename EmitWaitcntFn>
103static void EmitExpandedWaitcnt(
unsigned Outstanding,
unsigned Target,
104 EmitWaitcntFn &&EmitWaitcnt) {
106 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0
u; --
I)
126 TRACKINGID_RANGE_LEN = (1 << 16),
131 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
136 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
137 LDSDMA_BEGIN = REGUNITS_END,
138 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
142static constexpr VMEMID toVMEMID(MCRegUnit RU) {
143 return static_cast<unsigned>(RU);
146#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
148 DECL(VMEM_SAMPLER_READ_ACCESS) \
149 DECL(VMEM_BVH_READ_ACCESS) \
150 DECL(GLOBAL_INV_ACCESS) \
151 DECL(VMEM_WRITE_ACCESS) \
152 DECL(SCRATCH_WRITE_ACCESS) \
162 DECL(EXP_POS_ACCESS) \
163 DECL(EXP_PARAM_ACCESS) \
165 DECL(EXP_LDS_ACCESS) \
166 DECL(VGPR_CSMACC_WRITE) \
167 DECL(VGPR_DPMACC_WRITE) \
168 DECL(VGPR_TRANS_WRITE) \
169 DECL(VGPR_XDL_WRITE) \
170 DECL(VGPR_LDS_READ) \
171 DECL(VGPR_FLAT_READ) \
172 DECL(VGPR_VMEM_READ) \
177#define AMDGPU_EVENT_ENUM(Name) Name,
182#undef AMDGPU_EVENT_ENUM
196auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
197 return enum_seq(VMEM_ACCESS, MaxEvent);
200#define AMDGPU_EVENT_NAME(Name) #Name,
204#undef AMDGPU_EVENT_NAME
205static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
206 return WaitEventTypeName[
Event];
230 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT,
231 AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
232 AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
233 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
234 AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
239 switch (
MI.getOpcode()) {
240 case AMDGPU::ASYNCMARK:
241 case AMDGPU::WAIT_ASYNCMARK:
244 return MI.isMetaInstruction();
260 assert(updateVMCntOnly(Inst));
262 return VMEM_NOSAMPLER;
276 return VMEM_NOSAMPLER;
292 WaitEventSet() =
default;
293 explicit constexpr WaitEventSet(WaitEventType Event) {
294 static_assert(NUM_WAIT_EVENTS <=
sizeof(Mask) * 8,
295 "Not enough bits in Mask for all the events");
298 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
299 for (
auto &
E : Events) {
303 void insert(
const WaitEventType &Event) { Mask |= 1 <<
Event; }
304 void remove(
const WaitEventType &Event) { Mask &= ~(1 <<
Event); }
305 void remove(
const WaitEventSet &
Other) { Mask &= ~Other.Mask; }
306 bool contains(
const WaitEventType &Event)
const {
307 return Mask & (1 <<
Event);
311 return (~Mask &
Other.Mask) == 0;
336 return Mask ==
Other.Mask;
339 bool empty()
const {
return Mask == 0; }
341 bool twoOrMore()
const {
return Mask & (Mask - 1); }
342 operator bool()
const {
return !
empty(); }
343 void print(raw_ostream &OS)
const {
344 ListSeparator
LS(
", ");
345 for (WaitEventType Event : wait_events()) {
347 OS <<
LS << getWaitEventTypeName(Event);
353void WaitEventSet::dump()
const {
358class WaitcntBrackets;
366class WaitcntGenerator {
368 const GCNSubtarget &ST;
369 const SIInstrInfo &
TII;
370 AMDGPU::IsaVersion
IV;
373 bool ExpandWaitcntProfiling =
false;
374 const AMDGPU::HardwareLimits &Limits;
377 WaitcntGenerator() =
delete;
378 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
379 WaitcntGenerator(
const MachineFunction &MF,
381 const AMDGPU::HardwareLimits &Limits)
382 :
ST(MF.getSubtarget<GCNSubtarget>()),
TII(*
ST.getInstrInfo()),
386 ExpandWaitcntProfiling(
387 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
392 bool isOptNone()
const {
return OptNone; }
394 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
408 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
409 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
413 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
418 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
420 AMDGPU::Waitcnt
Wait,
421 const WaitcntBrackets &ScoreBrackets) = 0;
424 virtual const WaitEventSet &
441 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
443 virtual ~WaitcntGenerator() =
default;
446class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
447 static constexpr const WaitEventSet
450 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
451 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
452 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
453 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
454 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
465 using WaitcntGenerator::WaitcntGenerator;
467 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
468 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
471 bool createNewWaitcnt(MachineBasicBlock &
Block,
473 AMDGPU::Waitcnt
Wait,
474 const WaitcntBrackets &ScoreBrackets)
override;
477 return WaitEventMaskForInstPreGFX12[
T];
480 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
483class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
486 static constexpr const WaitEventSet
488 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
489 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
490 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
491 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
492 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
493 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
494 WaitEventSet({VMEM_BVH_READ_ACCESS}),
495 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
496 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
497 WaitEventSet({ASYNC_ACCESS}),
498 WaitEventSet({TENSOR_ACCESS}),
499 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
501 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
504 WaitcntGeneratorGFX12Plus() =
delete;
505 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
507 const AMDGPU::HardwareLimits &Limits,
509 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
512 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
513 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
516 bool createNewWaitcnt(MachineBasicBlock &
Block,
518 AMDGPU::Waitcnt
Wait,
519 const WaitcntBrackets &ScoreBrackets)
override;
522 return WaitEventMaskForInstGFX12Plus[
T];
525 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
529struct PreheaderFlushFlags {
530 bool FlushVmCnt =
false;
531 bool FlushDsCnt =
false;
534class SIInsertWaitcnts {
535 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
536 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
537 MachineLoopInfo &MLI;
538 MachinePostDominatorTree &PDT;
543 std::unique_ptr<WaitcntBrackets> Incoming;
545 BlockInfo() =
default;
546 BlockInfo(BlockInfo &&) =
default;
547 BlockInfo &operator=(BlockInfo &&) =
default;
551 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
555 std::unique_ptr<WaitcntGenerator> WCG;
558 DenseSet<MachineInstr *> CallInsts;
559 DenseSet<MachineInstr *> ReturnInsts;
564 DenseMap<MachineInstr *, bool> EndPgmInsts;
566 AMDGPU::HardwareLimits Limits;
569 const GCNSubtarget &
ST;
570 const SIInstrInfo &
TII;
571 const SIRegisterInfo &
TRI;
572 const MachineRegisterInfo &MRI;
575 bool IsExpertMode =
false;
577 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
579 : MLI(MLI), PDT(PDT), AA(AA), MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()),
580 TII(*
ST.getInstrInfo()),
TRI(
TII.getRegisterInfo()),
581 MRI(MF.getRegInfo()) {
582 (void)ForceExpCounter;
583 (void)ForceLgkmCounter;
584 (void)ForceVMCounter;
587 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
589 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
590 const WaitcntBrackets &Brackets);
591 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
592 const WaitcntBrackets &ScoreBrackets);
593 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
594 bool isDSRead(
const MachineInstr &
MI)
const;
595 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
598 void setForceEmitWaitcnt() {
636 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
639 case AMDGPU::GLOBAL_INV:
640 return GLOBAL_INV_ACCESS;
642 case AMDGPU::GLOBAL_WB:
643 case AMDGPU::GLOBAL_WBINV:
644 return VMEM_WRITE_ACCESS;
650 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
651 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
660 if (
TII.mayAccessScratch(Inst))
661 return SCRATCH_WRITE_ACCESS;
662 return VMEM_WRITE_ACCESS;
666 return VmemReadMapping[getVmemType(Inst)];
669 std::optional<WaitEventType>
670 getExpertSchedulingEventType(
const MachineInstr &Inst)
const;
672 bool isAsync(
const MachineInstr &
MI)
const {
677 const MachineOperand *
Async =
678 TII.getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
682 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
686 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
690 bool shouldUpdateAsyncMark(
const MachineInstr &
MI,
694 if (!isAsyncLdsDmaWrite(
MI))
701 bool isVmemAccess(
const MachineInstr &
MI)
const;
702 bool generateWaitcntInstBefore(MachineInstr &
MI,
703 WaitcntBrackets &ScoreBrackets,
704 MachineInstr *OldWaitcntInstr,
705 PreheaderFlushFlags FlushFlags);
706 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
708 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
709 MachineInstr *OldWaitcntInstr);
711 WaitEventSet getEventsFor(
const MachineInstr &Inst)
const;
712 void updateEventWaitcntAfter(MachineInstr &Inst,
713 WaitcntBrackets *ScoreBrackets);
715 MachineBasicBlock *
Block)
const;
716 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
717 WaitcntBrackets &ScoreBrackets);
718 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
719 WaitcntBrackets &ScoreBrackets);
722 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
724 bool ExpertMode)
const;
726 return WCG->getWaitEvents(
T);
729 return WCG->getCounterFromEvent(
E);
741class WaitcntBrackets {
749 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
750 for (
auto &[
ID, Val] : VMem) {
754 for (
auto &[
ID, Val] : SGPRs) {
759 if (NumUnusedVmem || NumUnusedSGPRs) {
760 errs() <<
"WaitcntBracket had unused entries at destruction time: "
761 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
762 <<
" SGPR unused entries\n";
773 return ScoreUBs[
T] - ScoreLBs[
T];
777 return getVMemScore(
ID,
T) > getScoreLB(
T);
795 return getScoreUB(
T) - getScoreLB(
T);
799 auto It = SGPRs.find(RU);
800 return It != SGPRs.end() ? It->second.get(
T) : 0;
804 auto It = VMem.find(TID);
805 return It != VMem.end() ? It->second.Scores[
T] : 0;
812 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
815 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
816 AMDGPU::Waitcnt &UpdateWait)
const;
819 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
820 AMDGPU::Waitcnt &UpdateWait)
const;
821 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
822 AMDGPU::Waitcnt &UpdateWait)
const;
825 AMDGPU::Waitcnt &
Wait,
826 const MachineInstr &
MI)
const;
827 MCPhysReg determineVGPR16Dependency(
const MachineInstr &
MI,
831 AMDGPU::Waitcnt &
Wait)
const;
832 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
833 void tryClearSCCWriteEvent(MachineInstr *Inst);
835 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
838 void updateByEvent(WaitEventType
E, MachineInstr &
MI);
839 void recordAsyncMark(MachineInstr &
MI);
841 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
842 bool hasPendingEvent(WaitEventType
E)
const {
843 return PendingEvents.contains(
E);
846 bool HasPending = PendingEvents &
Context->getWaitEvents(
T);
848 "Expected pending events iff scoreboard is not empty");
853 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
855 return Events.twoOrMore();
858 bool hasPendingFlat()
const {
865 void setPendingFlat() {
870 bool hasPendingGDS()
const {
875 unsigned getPendingGDSWait()
const {
884 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
885 for (MCRegUnit RU : regunits(
Reg)) {
886 auto It = VMem.find(toVMEMID(RU));
887 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
894 for (MCRegUnit RU : regunits(
Reg)) {
895 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
896 It->second.VMEMTypes = 0;
897 if (It->second.empty())
903 void setStateOnFunctionEntryOrReturn() {
910 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
914 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
915 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
918 void print(raw_ostream &)
const;
923 void purgeEmptyTrackingData();
933 using CounterValueArray = std::array<unsigned, AMDGPU::NUM_INST_CNTS>;
936 AMDGPU::Waitcnt &
Wait)
const;
938 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
939 unsigned OtherScore);
944 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
971 if (
Reg == AMDGPU::SCC) {
974 for (MCRegUnit RU : regunits(
Reg))
975 VMem[toVMEMID(RU)].Scores[
T] = Val;
977 for (MCRegUnit RU : regunits(
Reg))
978 SGPRs[RU].get(
T) = Val;
985 VMem[TID].Scores[
T] = Val;
988 void setScoreByOperand(
const MachineOperand &
Op,
991 const SIInsertWaitcnts *
Context;
995 WaitEventSet PendingEvents;
997 unsigned LastFlatDsCnt = 0;
998 unsigned LastFlatLoadCnt = 0;
1000 unsigned LastGDS = 0;
1017 CounterValueArray Scores{};
1019 unsigned VMEMTypes = 0;
1028 unsigned ScoreDsKmCnt = 0;
1029 unsigned ScoreXCnt = 0;
1045 bool empty()
const {
return !ScoreDsKmCnt && !ScoreXCnt; }
1048 DenseMap<VMEMID, VMEMInfo> VMem;
1049 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
1052 unsigned SCCScore = 0;
1054 const MachineInstr *PendingSCCWrite =
nullptr;
1058 SmallVector<const MachineInstr *> LDSDMAStores;
1067 static constexpr unsigned MaxAsyncMarks = 16;
1071 CounterValueArray AsyncScore{};
1074SIInsertWaitcnts::BlockInfo::~BlockInfo() =
default;
1076class SIInsertWaitcntsLegacy :
public MachineFunctionPass {
1079 SIInsertWaitcntsLegacy() : MachineFunctionPass(
ID) {}
1081 bool runOnMachineFunction(MachineFunction &MF)
override;
1083 StringRef getPassName()
const override {
1084 return "SI insert wait instructions";
1087 void getAnalysisUsage(AnalysisUsage &AU)
const override {
1090 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
1099void WaitcntBrackets::setScoreByOperand(
const MachineOperand &
Op,
1102 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
1110bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
1115 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1125bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
1127 if (!hasPointSampleAccel(
MI))
1130 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
1133void WaitcntBrackets::updateByEvent(WaitEventType
E, MachineInstr &Inst) {
1137 unsigned UB = getScoreUB(
T);
1140 Context->ST.hasVOP3PX2IncrementsVaVdstTwice()) {
1151 PendingEvents.insert(
E);
1152 setScoreUB(
T, CurrScore);
1155 const MachineRegisterInfo &MRI =
Context->MRI;
1164 if (
const auto *AddrOp =
TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1168 if (
const auto *Data0 =
1169 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1171 if (
const auto *Data1 =
1172 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1175 Inst.
getOpcode() != AMDGPU::DS_APPEND &&
1176 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
1177 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1178 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1179 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
1183 }
else if (
TII.isFLAT(Inst)) {
1185 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1188 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1191 }
else if (
TII.isMIMG(Inst)) {
1195 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1198 }
else if (
TII.isMTBUF(Inst)) {
1201 }
else if (
TII.isMUBUF(Inst)) {
1205 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1208 }
else if (
TII.isLDSDIR(Inst)) {
1210 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1213 if (
TII.isEXP(Inst)) {
1218 for (MachineOperand &DefMO : Inst.
all_defs()) {
1219 if (
TRI.isVGPR(MRI, DefMO.getReg())) {
1224 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1225 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
1230 WaitEventType OtherEvent =
E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1231 if (PendingEvents.contains(OtherEvent)) {
1236 setScoreLB(
T, getScoreUB(
T) - 1);
1237 PendingEvents.remove(OtherEvent);
1239 for (
const MachineOperand &
Op : Inst.
all_uses())
1240 setScoreByOperand(
Op,
T, CurrScore);
1244 for (
const MachineOperand &
Op : Inst.
operands()) {
1249 setScoreByOperand(
Op,
T, CurrScore);
1261 for (
const MachineOperand &
Op : Inst.
defs()) {
1264 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
1266 if (updateVMCntOnly(Inst)) {
1271 VmemType
V = getVmemType(Inst);
1272 unsigned char TypesMask = 1 <<
V;
1275 if (hasPointSampleAccel(Inst))
1276 TypesMask |= 1 << VMEM_NOSAMPLER;
1277 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1278 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1281 setScoreByOperand(
Op,
T, CurrScore);
1284 (
TII.isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1293 if (!MemOp->isStore() ||
1298 auto AAI = MemOp->getAAInfo();
1304 if (!AAI || !AAI.Scope)
1306 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1307 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1308 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1323 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1324 if (Slot && Slot < NUM_LDSDMA)
1325 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1328 if (
Context->shouldUpdateAsyncMark(Inst,
T)) {
1329 AsyncScore[
T] = CurrScore;
1333 setRegScore(AMDGPU::SCC,
T, CurrScore);
1334 PendingSCCWrite = &Inst;
1339void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1345 AsyncMarks.push_back(AsyncScore);
1348 dbgs() <<
"recordAsyncMark:\n" << Inst;
1349 for (
const auto &Mark : AsyncMarks) {
1356void WaitcntBrackets::print(raw_ostream &OS)
const {
1360 unsigned SR = getScoreRange(
T);
1363 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1367 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1371 OS <<
" EXP_CNT(" << SR <<
"):";
1374 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1378 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1381 OS <<
" BVH_CNT(" << SR <<
"):";
1384 OS <<
" KM_CNT(" << SR <<
"):";
1387 OS <<
" X_CNT(" << SR <<
"):";
1390 OS <<
" ASYNC_CNT(" << SR <<
"):";
1393 OS <<
" VA_VDST(" << SR <<
"): ";
1396 OS <<
" VM_VSRC(" << SR <<
"): ";
1399 OS <<
" UNKNOWN(" << SR <<
"):";
1405 unsigned LB = getScoreLB(
T);
1408 sort(SortedVMEMIDs);
1410 for (
auto ID : SortedVMEMIDs) {
1411 unsigned RegScore = VMem.at(
ID).Scores[
T];
1414 unsigned RelScore = RegScore - LB - 1;
1415 if (
ID < REGUNITS_END) {
1416 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1418 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1419 "Unhandled/unexpected ID value!");
1420 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1425 if (isSmemCounter(
T)) {
1427 sort(SortedSMEMIDs);
1428 for (
auto ID : SortedSMEMIDs) {
1429 unsigned RegScore = SGPRs.at(
ID).get(
T);
1432 unsigned RelScore = RegScore - LB - 1;
1433 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1438 OS <<
' ' << SCCScore <<
":scc";
1443 OS <<
"Pending Events: ";
1444 if (hasPendingEvent()) {
1446 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1447 if (hasPendingEvent((WaitEventType)
I)) {
1448 OS <<
LS << WaitEventTypeName[
I];
1456 OS <<
"Async score: ";
1457 if (AsyncScore.empty())
1463 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1465 for (
const auto &Mark : AsyncMarks) {
1467 unsigned MarkedScore = Mark[
T];
1470 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1471 <<
"_CNT: " << MarkedScore;
1474 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1475 <<
"_CNT: " << MarkedScore;
1478 OS <<
" EXP_CNT: " << MarkedScore;
1481 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS")
1482 <<
"_CNT: " << MarkedScore;
1485 OS <<
" SAMPLE_CNT: " << MarkedScore;
1488 OS <<
" BVH_CNT: " << MarkedScore;
1491 OS <<
" KM_CNT: " << MarkedScore;
1494 OS <<
" X_CNT: " << MarkedScore;
1497 OS <<
" ASYNC_CNT: " << MarkedScore;
1500 OS <<
" UNKNOWN: " << MarkedScore;
1511void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1512 AMDGPU::Waitcnt &UpdateWait)
const {
1520 simplifyXcnt(CheckWait, UpdateWait);
1522 simplifyVmVsrc(CheckWait, UpdateWait);
1527 unsigned &
Count)
const {
1531 if (
Count >= getScoreRange(
T))
1535void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &
Wait,
1537 unsigned Cnt =
Wait.get(
T);
1538 simplifyWaitcnt(
T, Cnt);
1542void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1543 AMDGPU::Waitcnt &UpdateWait)
const {
1564void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1565 AMDGPU::Waitcnt &UpdateWait)
const {
1570 std::min({CheckWait.get(AMDGPU::LOAD_CNT),
1571 CheckWait.get(AMDGPU::STORE_CNT),
1572 CheckWait.get(AMDGPU::SAMPLE_CNT),
1573 CheckWait.get(AMDGPU::BVH_CNT), CheckWait.get(AMDGPU::DS_CNT)}))
1578void WaitcntBrackets::purgeEmptyTrackingData() {
1579 VMem.remove_if([](
const auto &
P) {
return P.second.empty(); });
1580 SGPRs.remove_if([](
const auto &
P) {
return P.second.empty(); });
1584 unsigned ScoreToWait,
1585 AMDGPU::Waitcnt &
Wait)
const {
1586 const unsigned LB = getScoreLB(
T);
1587 const unsigned UB = getScoreUB(
T);
1590 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1592 !
Context->ST.hasFlatLgkmVMemCountInOrder()) {
1596 addWait(
Wait,
T, 0);
1597 }
else if (counterOutOfOrder(
T)) {
1601 addWait(
Wait,
T, 0);
1605 unsigned NeededWait = std::min(
1606 UB - ScoreToWait, getWaitCountMax(
Context->getLimits(),
T) - 1);
1607 addWait(
Wait,
T, NeededWait);
1612AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1614 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1616 for (
const auto &Mark : AsyncMarks) {
1622 if (AsyncMarks.size() == MaxAsyncMarks) {
1627 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1628 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1631 AMDGPU::Waitcnt
Wait;
1632 if (AsyncMarks.size() <=
N) {
1637 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1638 const auto &RequiredMark = AsyncMarks[MarkIndex];
1640 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1646 dbgs() <<
"Removing " << (MarkIndex + 1)
1647 <<
" async marks after determining wait\n";
1649 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1662MCPhysReg WaitcntBrackets::determineVGPR16Dependency(
const MachineInstr &
MI,
1665 const TargetRegisterClass *RC =
Context->TRI.getPhysRegBaseClass(
Reg);
1666 unsigned Size =
Context->TRI.getRegSizeInBits(*RC);
1668 if (
Size != 16 || !
Context->ST.hasD16Writes32BitVgpr())
1678 AMDGPU::Waitcnt
Wait;
1679 for (MCRegUnit RU : regunits(OtherHalf))
1680 determineWaitForScore(
T, getVMemScore(toVMEMID(RU),
T),
Wait);
1683 if (!
Wait.hasWait())
1690 WaitEventSet MIEvents =
Context->getEventsFor(
MI);
1691 WaitEventSet OtherHalfEvents =
Context->getWaitEvents(
T);
1692 WaitEventSet Events = MIEvents & OtherHalfEvents;
1693 if (Events.twoOrMore())
1700 AMDGPU::Waitcnt &
Wait,
1701 const MachineInstr &
MI)
const {
1702 if (
Reg == AMDGPU::SCC) {
1703 determineWaitForScore(
T, SCCScore,
Wait);
1707 Reg = determineVGPR16Dependency(
MI,
T,
Reg);
1708 for (MCRegUnit RU : regunits(
Reg))
1709 determineWaitForScore(
1710 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1717 AMDGPU::Waitcnt &
Wait)
const {
1718 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1719 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1722void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1725 if (PendingSCCWrite &&
1726 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1728 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1731 SCC_WRITE_PendingEvent) {
1735 PendingEvents.remove(SCC_WRITE_PendingEvent);
1736 PendingSCCWrite =
nullptr;
1740void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1742 applyWaitcnt(
Wait,
T);
1746 const unsigned UB = getScoreUB(
T);
1750 if (counterOutOfOrder(
T))
1752 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1755 PendingEvents.remove(
Context->getWaitEvents(
T));
1762 PendingEvents.remove(SMEM_GROUP);
1768 else if (
Count == 0)
1769 PendingEvents.remove(VMEM_GROUP);
1773void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait,
1775 unsigned Cnt =
Wait.get(
T);
1776 applyWaitcnt(
T, Cnt);
1783 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1791 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
1794 Events.remove(GLOBAL_INV_ACCESS);
1797 return Events.twoOrMore();
1800 return hasMixedPendingEvents(
T);
1810char SIInsertWaitcntsLegacy::
ID = 0;
1815 return new SIInsertWaitcntsLegacy();
1820 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1825 if (NewEnc == MO.
getImm())
1834static std::optional<AMDGPU::InstCounterType>
1837 case AMDGPU::S_WAIT_LOADCNT:
1839 case AMDGPU::S_WAIT_EXPCNT:
1841 case AMDGPU::S_WAIT_STORECNT:
1843 case AMDGPU::S_WAIT_SAMPLECNT:
1845 case AMDGPU::S_WAIT_BVHCNT:
1847 case AMDGPU::S_WAIT_DSCNT:
1849 case AMDGPU::S_WAIT_KMCNT:
1851 case AMDGPU::S_WAIT_XCNT:
1853 case AMDGPU::S_WAIT_ASYNCCNT:
1855 case AMDGPU::S_WAIT_TENSORCNT:
1862bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1876bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1877 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1879 assert(isNormalMode(MaxCounter));
1882 MachineInstr *WaitcntInstr =
nullptr;
1883 MachineInstr *WaitcntVsCntInstr =
nullptr;
1886 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1888 dbgs() <<
"end of block\n";
1896 if (isNonWaitcntMetaInst(
II)) {
1902 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1906 if (Opcode == AMDGPU::S_WAITCNT) {
1907 unsigned IEnc =
II.getOperand(0).getImm();
1910 ScoreBrackets.simplifyWaitcnt(OldWait);
1914 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1915 II.eraseFromParent();
1919 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1922 <<
"Before: " <<
Wait <<
'\n';);
1933 II.eraseFromParent();
1934 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1935 unsigned N =
II.getOperand(0).getImm();
1937 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1940 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1941 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1944 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1950 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1951 II.eraseFromParent();
1954 WaitcntVsCntInstr = &
II;
1961 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1970 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1971 <<
"New Instr at block end: "
1972 << *WaitcntInstr <<
'\n'
1973 :
dbgs() <<
"applied pre-existing waitcnt\n"
1974 <<
"Old Instr: " << *It
1975 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1978 if (WaitcntVsCntInstr) {
1982 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1988 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1989 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1991 :
dbgs() <<
"applied pre-existing waitcnt\n"
1992 <<
"Old Instr: " << *It
1993 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
2001bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
2003 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2004 assert(isNormalMode(MaxCounter));
2011 if (
Wait.hasWaitExceptStoreCnt()) {
2013 if (ExpandWaitcntProfiling) {
2017 bool AnyOutOfOrder =
false;
2019 unsigned WaitCnt =
Wait.get(CT);
2020 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
2021 AnyOutOfOrder =
true;
2026 if (AnyOutOfOrder) {
2034 unsigned WaitCnt =
Wait.get(CT);
2038 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2039 getWaitCountMax(getLimits(), CT) - 1);
2040 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
2052 [[maybe_unused]]
auto SWaitInst =
2057 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2058 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2062 if (
Wait.hasWaitStoreCnt()) {
2068 unsigned Outstanding =
2071 EmitExpandedWaitcnt(
2073 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
2074 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2079 [[maybe_unused]]
auto SWaitInst =
2081 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2086 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2087 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2095WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
2096 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
2100WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
2101 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
2102 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
2104 ~0u , ExpertVal, ExpertVal);
2111bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
2112 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
2114 assert(!isNormalMode(MaxCounter));
2117 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
2118 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
2119 MachineInstr *WaitcntDepctrInstr =
nullptr;
2123 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
2125 dbgs() <<
"end of block\n";
2131 AMDGPU::Waitcnt RequiredWait;
2136 if (isNonWaitcntMetaInst(
II)) {
2145 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
2149 if (Opcode == AMDGPU::S_WAITCNT)
2152 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2154 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2159 RequiredWait = RequiredWait.combined(OldWait);
2161 if (CombinedLoadDsCntInstr ==
nullptr) {
2162 CombinedLoadDsCntInstr = &
II;
2164 II.eraseFromParent();
2167 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2169 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2174 RequiredWait = RequiredWait.combined(OldWait);
2176 if (CombinedStoreDsCntInstr ==
nullptr) {
2177 CombinedStoreDsCntInstr = &
II;
2179 II.eraseFromParent();
2182 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2184 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2185 AMDGPU::Waitcnt OldWait;
2189 ScoreBrackets.simplifyWaitcnt(OldWait);
2191 if (WaitcntDepctrInstr ==
nullptr) {
2192 WaitcntDepctrInstr = &
II;
2201 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2209 II.eraseFromParent();
2213 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2216 II.eraseFromParent();
2218 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2221 unsigned N =
II.getOperand(0).getImm();
2222 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
2228 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2230 addWait(
Wait, CT.value(), OldCnt);
2232 addWait(RequiredWait, CT.value(), OldCnt);
2234 if (WaitInstrs[CT.value()] ==
nullptr) {
2235 WaitInstrs[CT.value()] = &
II;
2237 II.eraseFromParent();
2243 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
2244 Wait =
Wait.combined(RequiredWait);
2246 if (CombinedLoadDsCntInstr) {
2262 AMDGPU::OpName::simm16, NewEnc);
2263 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2269 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2270 <<
"New Instr at block end: "
2271 << *CombinedLoadDsCntInstr <<
'\n'
2272 :
dbgs() <<
"applied pre-existing waitcnt\n"
2273 <<
"Old Instr: " << *It <<
"New Instr: "
2274 << *CombinedLoadDsCntInstr <<
'\n');
2281 if (CombinedStoreDsCntInstr) {
2286 AMDGPU::OpName::simm16, NewEnc);
2287 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2293 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2294 <<
"New Instr at block end: "
2295 << *CombinedStoreDsCntInstr <<
'\n'
2296 :
dbgs() <<
"applied pre-existing waitcnt\n"
2297 <<
"Old Instr: " << *It <<
"New Instr: "
2298 << *CombinedStoreDsCntInstr <<
'\n');
2328 for (MachineInstr **WI : WaitsToErase) {
2332 (*WI)->eraseFromParent();
2339 if (!WaitInstrs[CT])
2342 unsigned NewCnt =
Wait.get(CT);
2343 if (NewCnt != ~0u) {
2345 AMDGPU::OpName::simm16, NewCnt);
2346 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2348 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2349 setNoWait(
Wait, CT);
2352 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2353 <<
"New Instr at block end: " << *WaitInstrs[CT]
2355 :
dbgs() <<
"applied pre-existing waitcnt\n"
2356 <<
"Old Instr: " << *It
2357 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2364 if (WaitcntDepctrInstr) {
2368 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2383 AMDGPU::OpName::simm16, Enc);
2385 <<
"New Instr at block end: "
2386 << *WaitcntDepctrInstr <<
'\n'
2387 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2388 <<
"Old Instr: " << *It <<
"New Instr: "
2389 << *WaitcntDepctrInstr <<
'\n');
2400bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2402 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2403 assert(!isNormalMode(MaxCounter));
2410 if (ExpandWaitcntProfiling) {
2417 if (ScoreBrackets.counterOutOfOrder(CT)) {
2424 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2425 getWaitCountMax(getLimits(), CT) - 1);
2426 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2438 MachineInstr *SWaitInst =
nullptr;
2462 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2463 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2475 [[maybe_unused]]
auto SWaitInst =
2482 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2483 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2486 if (
Wait.hasWaitDepctr()) {
2492 [[maybe_unused]]
auto SWaitInst =
2498 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2499 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2518bool SIInsertWaitcnts::generateWaitcntInstBefore(
2519 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2520 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2522 setForceEmitWaitcnt();
2526 AMDGPU::Waitcnt
Wait;
2527 const unsigned Opc =
MI.getOpcode();
2530 case AMDGPU::BUFFER_WBINVL1:
2531 case AMDGPU::BUFFER_WBINVL1_SC:
2532 case AMDGPU::BUFFER_WBINVL1_VOL:
2533 case AMDGPU::BUFFER_GL0_INV:
2534 case AMDGPU::BUFFER_GL1_INV: {
2542 case AMDGPU::SI_RETURN_TO_EPILOG:
2543 case AMDGPU::SI_RETURN:
2544 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2545 case AMDGPU::S_SETPC_B64_return: {
2550 AMDGPU::Waitcnt AllZeroWait =
2551 WCG->getAllZeroWaitcnt(
false);
2556 if (
ST.hasExtendedWaitCounts() &&
2557 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2562 case AMDGPU::S_ENDPGM:
2563 case AMDGPU::S_ENDPGM_SAVED: {
2573 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2576 case AMDGPU::S_SENDMSG:
2577 case AMDGPU::S_SENDMSGHALT: {
2578 if (
ST.hasLegacyGeometry() &&
2593 if (
MI.modifiesRegister(AMDGPU::EXEC, &
TRI)) {
2596 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2597 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2598 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2599 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2606 if (
TII.isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2614 Wait = AMDGPU::Waitcnt();
2616 const MachineOperand &CallAddrOp =
TII.getCalleeOperand(
MI);
2617 if (CallAddrOp.
isReg()) {
2618 ScoreBrackets.determineWaitForPhysReg(
2621 if (
const auto *RtnAddrOp =
2622 TII.getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2623 ScoreBrackets.determineWaitForPhysReg(
2624 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait,
MI);
2627 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2628 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2644 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2645 const Value *Ptr = Memop->getValue();
2646 if (Memop->isStore()) {
2647 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2648 addWait(
Wait, SmemAccessCounter, 0);
2650 SLoadAddresses.
erase(It);
2653 unsigned AS = Memop->getAddrSpace();
2657 if (
TII.mayWriteLDSThroughDMA(
MI))
2661 unsigned TID = LDSDMA_BEGIN;
2662 if (Ptr && Memop->getAAInfo()) {
2663 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2664 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2665 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2666 if ((
I + 1) >= NUM_LDSDMA) {
2681 if (Memop->isStore()) {
2687 for (
const MachineOperand &
Op :
MI.operands()) {
2692 if (
Op.isTied() &&
Op.isUse() &&
TII.doesNotReadTiedSource(
MI))
2697 const bool IsVGPR =
TRI.isVectorRegister(MRI,
Op.getReg());
2704 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2717 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2718 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2719 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2720 !
ST.hasVmemWriteVgprInOrder()) {
2727 ScoreBrackets.clearVgprVmemTypes(
Reg);
2730 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2735 }
else if (
Op.getReg() == AMDGPU::SCC) {
2738 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait,
2742 if (
ST.hasWaitXcnt() &&
Op.isDef())
2761 if (
Opc == AMDGPU::S_BARRIER && !
ST.hasAutoWaitcntBeforeBarrier() &&
2762 !
ST.hasBackOffBarrier()) {
2763 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2770 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2775 ScoreBrackets.simplifyWaitcnt(
Wait);
2795 Wait = WCG->getAllZeroWaitcnt(
false);
2799 if (!ForceEmitWaitcnt[
T])
2804 if (FlushFlags.FlushVmCnt) {
2810 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
2816 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2820bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2822 MachineBasicBlock &
Block,
2823 WaitcntBrackets &ScoreBrackets,
2824 MachineInstr *OldWaitcntInstr) {
2827 if (OldWaitcntInstr)
2831 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2836 MachineOperand *WaitExp =
TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2846 <<
"Update Instr: " << *It);
2849 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2854 ScoreBrackets.applyWaitcnt(
Wait);
/// Classify \p Inst into the expert-scheduling wait event it corresponds to,
/// returned as an optional WaitEventType.
///
/// Mappings that are visible in this listing:
///   - TII.isXDL(Inst)                          -> VGPR_XDL_WRITE
///   - TII.isTRANS(Inst)                        -> VGPR_TRANS_WRITE
///   - TII.isFLAT(Inst)                         -> VGPR_FLAT_READ
///   - TII.isVMEM / isVIMAGE / isVSAMPLE(Inst)  -> VGPR_VMEM_READ
///
/// NOTE(review): the conditions guarding the VGPR_DPMACC_WRITE,
/// VGPR_CSMACC_WRITE and VGPR_LDS_READ returns, and the return used when no
/// event applies, are not visible in this listing -- confirm against the full
/// source before relying on this summary.
2859std::optional<WaitEventType>
2860SIInsertWaitcnts::getExpertSchedulingEventType(
const MachineInstr &Inst)
const {
// VALU instructions are handled first.
2861 if (
TII.isVALU(Inst)) {
// XDL instructions report an XDL VGPR write.
2866 if (
TII.isXDL(Inst))
2867 return VGPR_XDL_WRITE;
// TRANS instructions report a TRANS VGPR write.
2869 if (
TII.isTRANS(Inst))
2870 return VGPR_TRANS_WRITE;
// NOTE(review): the guards for the next two returns are missing here.
2873 return VGPR_DPMACC_WRITE;
2875 return VGPR_CSMACC_WRITE;
// FLAT instructions report a FLAT VGPR read.
2882 if (
TII.isFLAT(Inst))
2883 return VGPR_FLAT_READ;
// NOTE(review): the guard for this return is missing here.
2886 return VGPR_LDS_READ;
// Any of the VMEM, VIMAGE or VSAMPLE classes reports a VMEM VGPR read.
2888 if (
TII.isVMEM(Inst) ||
TII.isVIMAGE(Inst) ||
TII.isVSAMPLE(Inst))
2889 return VGPR_VMEM_READ;
2896bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2897 return (
TII.isFLAT(
MI) &&
TII.mayAccessVMEMThroughFlat(
MI)) ||
2904 MachineBasicBlock *
Block)
const {
2905 auto BlockEnd =
Block->getParent()->end();
2906 auto BlockIter =
Block->getIterator();
2910 if (++BlockIter != BlockEnd) {
2911 It = BlockIter->instr_begin();
2918 if (!It->isMetaInstruction())
2926 return It->getOpcode() == AMDGPU::S_ENDPGM;
2930bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2931 MachineBasicBlock &
Block,
2932 WaitcntBrackets &ScoreBrackets) {
2933 AMDGPU::Waitcnt
Wait;
2934 bool NeedsEndPGMCheck =
false;
2942 NeedsEndPGMCheck =
true;
2945 ScoreBrackets.simplifyWaitcnt(
Wait);
2948 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2951 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2959WaitEventSet SIInsertWaitcnts::getEventsFor(
const MachineInstr &Inst)
const {
2960 WaitEventSet Events;
2962 if (
const auto ET = getExpertSchedulingEventType(Inst))
2966 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2968 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2969 Events.insert(GDS_ACCESS);
2970 Events.insert(GDS_GPR_LOCK);
2972 Events.insert(LDS_ACCESS);
2974 }
else if (
TII.isFLAT(Inst)) {
2976 Events.insert(getVmemWaitEventType(Inst));
2979 if (
TII.mayAccessVMEMThroughFlat(Inst)) {
2980 if (
ST.hasWaitXcnt())
2981 Events.insert(VMEM_GROUP);
2982 Events.insert(getVmemWaitEventType(Inst));
2984 if (
TII.mayAccessLDSThroughFlat(Inst))
2985 Events.insert(LDS_ACCESS);
2989 Inst.
getOpcode() == AMDGPU::BUFFER_WBL2)) {
2993 if (
ST.hasWaitXcnt())
2994 Events.insert(VMEM_GROUP);
2995 Events.insert(getVmemWaitEventType(Inst));
2996 if (
ST.vmemWriteNeedsExpWaitcnt() &&
2998 Events.insert(VMW_GPR_LOCK);
3000 }
else if (
TII.isSMRD(Inst)) {
3001 if (
ST.hasWaitXcnt())
3002 Events.insert(SMEM_GROUP);
3003 Events.insert(SMEM_ACCESS);
3005 Events.insert(EXP_LDS_ACCESS);
3007 unsigned Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
3009 Events.insert(EXP_PARAM_ACCESS);
3011 Events.insert(EXP_POS_ACCESS);
3013 Events.insert(EXP_GPR_LOCK);
3015 Events.insert(SCC_WRITE);
3018 case AMDGPU::S_SENDMSG:
3019 case AMDGPU::S_SENDMSG_RTN_B32:
3020 case AMDGPU::S_SENDMSG_RTN_B64:
3021 case AMDGPU::S_SENDMSGHALT:
3022 Events.insert(SQ_MESSAGE);
3024 case AMDGPU::S_MEMTIME:
3025 case AMDGPU::S_MEMREALTIME:
3026 case AMDGPU::S_GET_BARRIER_STATE_M0:
3027 case AMDGPU::S_GET_BARRIER_STATE_IMM:
3028 Events.insert(SMEM_ACCESS);
3035void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
3036 WaitcntBrackets *ScoreBrackets) {
3038 WaitEventSet InstEvents = getEventsFor(Inst);
3039 for (WaitEventType
E : wait_events()) {
3040 if (InstEvents.contains(
E))
3041 ScoreBrackets->updateByEvent(
E, Inst);
3044 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
3046 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
3047 ScoreBrackets->setPendingGDS();
3049 }
else if (
TII.isFLAT(Inst)) {
3057 ScoreBrackets->setPendingFlat();
3060 ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
3063 ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst);
3064 }
else if (Inst.
isCall()) {
3067 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
3068 ScoreBrackets->setStateOnFunctionEntryOrReturn();
3069 }
else if (
TII.isVINTERP(Inst)) {
3070 int64_t
Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
/// Merge a single score coming from another bracket state into \p Score.
///
/// Both scores are first rebased using \p M: a score that is at or below its
/// own lower bound (M.OldLB for \p Score, M.OtherLB for \p OtherScore) becomes
/// 0, otherwise the matching shift (M.MyShift / M.OtherShift) is added to it.
/// \p Score is then overwritten with the larger of the two rebased values.
///
/// \returns true when the rebased other score is strictly greater than the
/// rebased own score.
3080bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
3081 unsigned OtherScore) {
// Rebase our score: at or below the old lower bound collapses to 0.
3082 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
// Rebase the incoming score the same way, using the other side's bounds.
3083 unsigned OtherShifted =
3084 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
// Keep whichever rebased score is larger.
3085 Score = std::max(MyShifted, OtherShifted);
// Report whether the other side was strictly ahead of us.
3086 return OtherShifted > MyShifted;
3091 bool StrictDom =
false;
3095 if (AsyncMarks.empty() && OtherMarks.
empty()) {
3102 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
3103 MaxSize = std::min(MaxSize, MaxAsyncMarks);
3106 if (AsyncMarks.size() > MaxSize)
3107 AsyncMarks.erase(AsyncMarks.begin(),
3108 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
3114 constexpr CounterValueArray ZeroMark{};
3115 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
3118 dbgs() <<
"Before merge:\n";
3119 for (
const auto &Mark : AsyncMarks) {
3123 dbgs() <<
"Other marks:\n";
3124 for (
const auto &Mark : OtherMarks) {
3133 unsigned OtherSize = OtherMarks.size();
3134 unsigned OurSize = AsyncMarks.size();
3135 unsigned MergeCount = std::min(OtherSize, OurSize);
3139 if (MergeCount == 0)
3143 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
3144 OtherMarks[OtherSize - Idx][
T]);
3149 dbgs() <<
"After merge:\n";
3150 for (
const auto &Mark : AsyncMarks) {
3164bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
3165 bool StrictDom =
false;
3169 for (
auto K :
Other.VMem.keys())
3170 VMem.try_emplace(K);
3171 for (
auto K :
Other.SGPRs.keys())
3172 SGPRs.try_emplace(K);
3179 const WaitEventSet &EventsForT =
Context->getWaitEvents(
T);
3180 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3181 const WaitEventSet OtherEvents =
Other.PendingEvents & EventsForT;
3182 if (!OldEvents.contains(OtherEvents))
3184 PendingEvents |= OtherEvents;
3187 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
3188 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
3189 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
3190 if (NewUB < ScoreLBs[
T])
3193 MergeInfo &
M = MergeInfos[
T];
3194 M.OldLB = ScoreLBs[
T];
3195 M.OtherLB =
Other.ScoreLBs[
T];
3196 M.MyShift = NewUB - ScoreUBs[
T];
3197 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
3199 ScoreUBs[
T] = NewUB;
3202 StrictDom |= mergeScore(M, LastFlatLoadCnt,
Other.LastFlatLoadCnt);
3205 StrictDom |= mergeScore(M, LastFlatDsCnt,
Other.LastFlatDsCnt);
3206 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
3210 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
3211 if (
Other.hasPendingEvent(SCC_WRITE)) {
3212 if (!OldEvents.contains(SCC_WRITE)) {
3213 PendingSCCWrite =
Other.PendingSCCWrite;
3214 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
3215 PendingSCCWrite =
nullptr;
3220 for (
auto &[RegID, Info] : VMem)
3221 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
3223 if (isSmemCounter(
T)) {
3224 for (
auto &[RegID, Info] : SGPRs) {
3225 auto It =
Other.SGPRs.find(RegID);
3226 unsigned OtherScore = (It !=
Other.SGPRs.end()) ? It->second.get(
T) : 0;
3227 StrictDom |= mergeScore(M,
Info.get(
T), OtherScore);
3232 for (
auto &[TID, Info] : VMem) {
3233 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
3234 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
3235 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
3236 Info.VMEMTypes = NewVmemTypes;
3240 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
3242 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
3244 purgeEmptyTrackingData();
3250 return Opcode == AMDGPU::S_WAITCNT ||
3253 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3254 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3255 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3256 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3260void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
3262 bool ExpertMode)
const {
3266 .
addImm(ExpertMode ? 2 : 0)
3284class VCCZWorkaround {
3285 const WaitcntBrackets &ScoreBrackets;
3286 const GCNSubtarget &
ST;
3287 const SIInstrInfo &
TII;
3288 const SIRegisterInfo &
TRI;
3289 bool VCCZCorruptionBug =
false;
3290 bool VCCZNotUpdatedByPartialWrites =
false;
3293 bool MustRecomputeVCCZ =
true;
3296 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
3297 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
3299 VCCZCorruptionBug =
ST.hasReadVCCZBug();
3300 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
3307 bool tryRecomputeVCCZ(MachineInstr &
MI) {
3309 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3319 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
3325 std::optional<bool> PartiallyWritesToVCCOpt;
3326 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
3327 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
3328 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
3330 if (VCCZNotUpdatedByPartialWrites) {
3331 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3334 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3340 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3342 if (!PartiallyWritesToVCCOpt)
3343 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3344 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3345 MI.definesRegister(AMDGPU::VCC,
nullptr);
3348 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3349 *PartiallyWritesToVCCOpt);
3351 MustRecomputeVCCZ =
false;
3361 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3364 MustRecomputeVCCZ =
false;
3374bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3375 MachineBasicBlock &
Block,
3376 WaitcntBrackets &ScoreBrackets) {
3380 dbgs() <<
"*** Begin Block: ";
3382 ScoreBrackets.dump();
3384 VCCZWorkaround VCCZW(ScoreBrackets, ST,
TII,
TRI);
3387 MachineInstr *OldWaitcntInstr =
nullptr;
3392 Iter !=
E; ++Iter) {
3393 MachineInstr &Inst = *Iter;
3394 if (isNonWaitcntMetaInst(Inst))
3399 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3400 if (!OldWaitcntInstr)
3401 OldWaitcntInstr = &Inst;
3405 PreheaderFlushFlags FlushFlags;
3406 if (
Block.getFirstTerminator() == Inst)
3407 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3410 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3412 OldWaitcntInstr =
nullptr;
3414 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3418 ScoreBrackets.recordAsyncMark(Inst);
3422 if (
TII.isSMRD(Inst)) {
3423 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3426 if (!Memop->isInvariant()) {
3427 const Value *Ptr = Memop->getValue();
3433 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3437 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3441 ScoreBrackets.dump();
3446 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3451 AMDGPU::Waitcnt
Wait;
3452 if (
Block.getFirstTerminator() ==
Block.end()) {
3453 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3454 if (FlushFlags.FlushVmCnt) {
3462 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
AMDGPU::DS_CNT))
3471 dbgs() <<
"*** End Block: ";
3473 ScoreBrackets.dump();
3479bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3480 if (
Block.size() <= 1)
3488 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3494 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3495 LastAtomicWithSoftXcnt =
nullptr;
3498 MI.mayLoad() &&
MI.mayStore();
3499 MachineInstr &PrevMI = *
MI.getPrevNode();
3501 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3504 if (LastAtomicWithSoftXcnt) {
3508 LastAtomicWithSoftXcnt = &
MI;
3516SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3517 const WaitcntBrackets &ScoreBrackets) {
3518 auto [Iterator, IsInserted] =
3521 return Iterator->second;
3525 return PreheaderFlushFlags();
3529 return PreheaderFlushFlags();
3532 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3533 return Iterator->second;
3536 return PreheaderFlushFlags();
3539bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3541 return TII.mayAccessVMEMThroughFlat(
MI);
3545bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3551bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3580SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3581 const WaitcntBrackets &Brackets) {
3582 PreheaderFlushFlags
Flags;
3583 bool HasVMemLoad =
false;
3584 bool HasVMemStore =
false;
3585 bool UsesVgprVMEMLoadedOutside =
false;
3586 bool UsesVgprDSReadOutside =
false;
3587 bool VMemInvalidated =
false;
3591 bool TrackSimpleDSOpt =
ST.hasExtendedWaitCounts();
3592 DenseSet<MCRegUnit> VgprUse;
3593 DenseSet<MCRegUnit> VgprDefVMEM;
3594 DenseSet<MCRegUnit> VgprDefDS;
3600 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3601 unsigned DSReadPosition = 0;
3602 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3603 bool TrackDSFlushPoint =
ST.hasExtendedWaitCounts() && IsSingleBlock;
3604 unsigned LastDSFlushPosition = 0;
3606 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3607 for (MachineInstr &
MI : *
MBB) {
3608 if (isVMEMOrFlatVMEM(
MI)) {
3609 HasVMemLoad |=
MI.mayLoad();
3610 HasVMemStore |=
MI.mayStore();
3614 if (mayStoreIncrementingDSCNT(
MI)) {
3617 if (VMemInvalidated)
3619 TrackSimpleDSOpt =
false;
3620 TrackDSFlushPoint =
false;
3622 bool IsDSRead = isDSRead(
MI);
3627 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3628 if (!TrackDSFlushPoint)
3630 if (
auto It = LastDSReadPositionMap.
find(RU);
3631 It != LastDSReadPositionMap.
end()) {
3635 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3639 for (
const MachineOperand &
Op :
MI.all_uses()) {
3640 if (
Op.isDebug() || !
TRI.isVectorRegister(MRI,
Op.getReg()))
3643 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3647 VMemInvalidated =
true;
3651 TrackSimpleDSOpt =
false;
3654 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3658 updateDSReadFlushTracking(RU);
3663 VMEMID
ID = toVMEMID(RU);
3667 UsesVgprVMEMLoadedOutside =
true;
3672 UsesVgprDSReadOutside =
true;
3677 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3678 for (
const MachineOperand &
Op :
MI.all_defs()) {
3679 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3683 VMemInvalidated =
true;
3688 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3699 if (IsDSRead || TrackDSFlushPoint) {
3700 for (
const MachineOperand &
Op :
MI.all_defs()) {
3701 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
3703 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3706 updateDSReadFlushTracking(RU);
3709 if (TrackDSFlushPoint)
3710 LastDSReadPositionMap[RU] = DSReadPosition;
3719 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3720 ((!
ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3721 (HasVMemLoad &&
ST.hasVmemWriteVgprInOrder())))
3722 Flags.FlushVmCnt =
true;
3728 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3731 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3732 bool DSFlushPointPrefetch =
3733 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3735 if (SimpleDSOpt || DSFlushPointPrefetch)
3736 Flags.FlushDsCnt =
true;
/// Legacy pass-manager entry point.
///
/// Fetches the machine loop info and the machine post-dominator tree from the
/// legacy analysis wrappers, picks up alias-analysis results when that
/// analysis is available, and then constructs an SIInsertWaitcnts instance
/// for \p MF and returns the result of its run().
3741bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3742 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
// NOTE(review): the declaration receiving this post-dominator tree (used as
// PDT below) is not visible in this listing.
3744 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
// Alias analysis is optional: AA is only assigned when the wrapper pass is
// available.
3746 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3747 AA = &AAR->getAAResults();
3749 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3761 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3766 .preserve<AAManager>();
3769bool SIInsertWaitcnts::run() {
3777 if (ST.hasExtendedWaitCounts()) {
3778 IsExpertMode = ST.hasExpertSchedulingMode() &&
3787 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3792 WCG = std::make_unique<WaitcntGeneratorPreGFX12>(
3796 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3800 MachineBasicBlock &EntryBB = MF.
front();
3811 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3814 if (
ST.hasExtendedWaitCounts()) {
3823 if (!
ST.hasImageInsts() &&
3829 TII.get(instrsForExtendedCounterTypes[CT]))
3842 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3843 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3844 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3851 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3854 std::unique_ptr<WaitcntBrackets> Brackets;
3859 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3861 MachineBasicBlock *
MBB = BII->first;
3862 BlockInfo &BI = BII->second;
3868 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3870 *Brackets = *BI.Incoming;
3873 Brackets = std::make_unique<WaitcntBrackets>(
this);
3878 Brackets->~WaitcntBrackets();
3879 new (Brackets.get()) WaitcntBrackets(
this);
3883 if (
ST.hasWaitXcnt())
3885 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3888 if (Brackets->hasPendingEvent()) {
3889 BlockInfo *MoveBracketsToSucc =
nullptr;
3891 auto *SuccBII = BlockInfos.
find(Succ);
3892 BlockInfo &SuccBI = SuccBII->second;
3893 if (!SuccBI.Incoming) {
3894 SuccBI.Dirty =
true;
3895 if (SuccBII <= BII) {
3899 if (!MoveBracketsToSucc) {
3900 MoveBracketsToSucc = &SuccBI;
3902 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3906 dbgs() <<
"Try to merge ";
3912 if (SuccBI.Incoming->merge(*Brackets)) {
3913 SuccBI.Dirty =
true;
3914 if (SuccBII <= BII) {
3921 if (MoveBracketsToSucc)
3922 MoveBracketsToSucc->Incoming = std::move(Brackets);
3927 if (
ST.hasScalarStores()) {
3928 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3929 bool HaveScalarStores =
false;
3931 for (MachineBasicBlock &
MBB : MF) {
3932 for (MachineInstr &
MI :
MBB) {
3933 if (!HaveScalarStores &&
TII.isScalarStore(
MI))
3934 HaveScalarStores =
true;
3936 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3937 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3942 if (HaveScalarStores) {
3951 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3952 bool SeenDCacheWB =
false;
3956 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3957 SeenDCacheWB =
true;
3958 else if (
TII.isScalarStore(*
I))
3959 SeenDCacheWB =
false;
3962 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3963 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3979 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3981 setSchedulingMode(EntryBB,
I,
true);
3983 for (MachineInstr *
MI : CallInsts) {
3984 MachineBasicBlock &
MBB = *
MI->getParent();
3985 setSchedulingMode(
MBB,
MI,
false);
3986 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3989 for (MachineInstr *
MI : ReturnInsts)
3990 setSchedulingMode(*
MI->getParent(),
MI,
false);
4001 for (
auto [
MI,
_] : EndPgmInsts) {
4003 TII.get(AMDGPU::S_ALLOC_VGPR))
4007 }
else if (!WCG->isOptNone() &&
4008 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
4009 (MF.getFrameInfo().hasCalls() ||
4010 ST.getOccupancyWithNumVGPRs(
4011 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
4014 for (
auto [
MI, Flag] : EndPgmInsts) {
4016 if (
ST.requiresNopBeforeDeallocVGPRs()) {
4018 TII.get(AMDGPU::S_NOP))
4022 TII.get(AMDGPU::S_SENDMSG))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static std::optional< AMDGPU::InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
static bool isWaitInstr(MachineInstr &Inst)
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isXcntDrain(const MachineInstr &MI)
True if MI implicitly drains XCNT.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool usesTENSOR_CNT(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Target - Wrapper for Target specific information.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isDPMACCInstruction(unsigned Opc)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getHasMatrixScale(unsigned Opc)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like all_of.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
FunctionAddr VTableAddr Count
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
@ Increment
Incrementally increasing token ID.
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.
static constexpr bool is_iterable