47#define DEBUG_TYPE "si-insert-waitcnts"
50 "Force emit s_waitcnt expcnt(0) instrs");
52 "Force emit s_waitcnt lgkmcnt(0) instrs");
54 "Force emit s_waitcnt vmcnt(0) instrs");
58 cl::desc(
"Force all waitcnt instrs to be emitted as "
59 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
63 "amdgpu-waitcnt-load-forcezero",
64 cl::desc(
"Force all waitcnt load counters to wait until 0"),
68 "amdgpu-expert-scheduling-mode",
69 cl::desc(
"Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
// Layout of the VMEM tracking-ID space: a 2^16-entry range for register
// units followed by a 2^16-entry range for LDS-DMA slots (presumably one
// slot per distinguishable LDS-DMA store — confirm against full file).
117 TRACKINGID_RANGE_LEN = (1 << 16),
122 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
// LDS-DMA IDs are allocated immediately after the register-unit range.
127 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
128 LDSDMA_BEGIN = REGUNITS_END,
129 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
// Map a register unit to its VMEM tracking ID. The mapping is an identity
// cast: per the tracking-ID layout above, register units occupy the first
// TRACKINGID_RANGE_LEN values of the VMEMID space.
133static constexpr VMEMID toVMEMID(MCRegUnit RU) {
134 return static_cast<unsigned>(RU);
137#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
139 DECL(VMEM_SAMPLER_READ_ACCESS) \
140 DECL(VMEM_BVH_READ_ACCESS) \
141 DECL(GLOBAL_INV_ACCESS) \
142 DECL(VMEM_WRITE_ACCESS) \
143 DECL(SCRATCH_WRITE_ACCESS) \
153 DECL(EXP_POS_ACCESS) \
154 DECL(EXP_PARAM_ACCESS) \
156 DECL(EXP_LDS_ACCESS) \
157 DECL(VGPR_CSMACC_WRITE) \
158 DECL(VGPR_DPMACC_WRITE) \
159 DECL(VGPR_TRANS_WRITE) \
160 DECL(VGPR_XDL_WRITE) \
161 DECL(VGPR_LDS_READ) \
162 DECL(VGPR_FLAT_READ) \
// Expand the wait-event list into enum values (used with
// AMDGPU_DECLARE_WAIT_EVENTS above).
166#define AMDGPU_EVENT_ENUM(Name) Name,
171#undef AMDGPU_EVENT_ENUM
// Return an iterable sequence of wait-event enum values in
// [VMEM_ACCESS, MaxEvent); defaults to all events.
185auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
186 return enum_seq(VMEM_ACCESS, MaxEvent);
189#define AMDGPU_EVENT_NAME(Name) #Name,
193#undef AMDGPU_EVENT_NAME
// Return the human-readable name of a wait event, looked up in the
// AMDGPU_EVENT_NAME-generated WaitEventTypeName string table.
194static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
195 return WaitEventTypeName[
Event];
218 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
219 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
220 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
234 assert(updateVMCntOnly(Inst));
236 return VMEM_NOSAMPLER;
250 return VMEM_NOSAMPLER;
264 WaitEventSet() =
default;
265 explicit constexpr WaitEventSet(WaitEventType Event) {
266 static_assert(NUM_WAIT_EVENTS <=
sizeof(Mask) * 8,
267 "Not enough bits in Mask for all the events");
270 constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
271 for (
auto &
E : Events) {
275 void insert(
const WaitEventType &Event) { Mask |= 1 <<
Event; }
276 void remove(
const WaitEventType &Event) { Mask &= ~(1 <<
Event); }
277 void remove(
const WaitEventSet &
Other) { Mask &= ~Other.Mask; }
278 bool contains(
const WaitEventType &Event)
const {
279 return Mask & (1 <<
Event);
283 return (~Mask &
Other.Mask) == 0;
308 return Mask ==
Other.Mask;
311 bool empty()
const {
return Mask == 0; }
313 bool twoOrMore()
const {
return Mask & (Mask - 1); }
314 operator bool()
const {
return !
empty(); }
315 void print(raw_ostream &OS)
const {
316 ListSeparator
LS(
", ");
317 for (WaitEventType Event : wait_events()) {
319 OS <<
LS << getWaitEventTypeName(Event);
325void WaitEventSet::dump()
const {
330class WaitcntBrackets;
338class WaitcntGenerator {
340 const GCNSubtarget &ST;
341 const SIInstrInfo &
TII;
342 AMDGPU::IsaVersion
IV;
345 bool ExpandWaitcntProfiling =
false;
346 const AMDGPU::HardwareLimits &Limits;
349 WaitcntGenerator() =
delete;
350 WaitcntGenerator(
const WaitcntGenerator &) =
delete;
351 WaitcntGenerator(
const MachineFunction &MF,
InstCounterType MaxCounter,
352 const AMDGPU::HardwareLimits &Limits)
353 :
ST(MF.getSubtarget<GCNSubtarget>()),
TII(*
ST.getInstrInfo()),
357 ExpandWaitcntProfiling(
358 MF.
getFunction().hasFnAttribute(
"amdgpu-expand-waitcnt-profiling")),
363 bool isOptNone()
const {
return OptNone; }
365 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
379 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
380 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
384 bool promoteSoftWaitCnt(MachineInstr *Waitcnt)
const;
389 virtual bool createNewWaitcnt(MachineBasicBlock &
Block,
391 AMDGPU::Waitcnt
Wait,
392 const WaitcntBrackets &ScoreBrackets) = 0;
408 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const = 0;
410 virtual ~WaitcntGenerator() =
default;
413class WaitcntGeneratorPreGFX12 final :
public WaitcntGenerator {
414 static constexpr const WaitEventSet
417 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
418 WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
419 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
420 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
421 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
430 using WaitcntGenerator::WaitcntGenerator;
432 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
433 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
436 bool createNewWaitcnt(MachineBasicBlock &
Block,
438 AMDGPU::Waitcnt
Wait,
439 const WaitcntBrackets &ScoreBrackets)
override;
442 return WaitEventMaskForInstPreGFX12[
T];
445 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
448class WaitcntGeneratorGFX12Plus final :
public WaitcntGenerator {
451 static constexpr const WaitEventSet
453 WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
454 WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
455 WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
456 EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
457 WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
458 WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
459 WaitEventSet({VMEM_BVH_READ_ACCESS}),
460 WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
461 WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
462 WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
464 WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
467 WaitcntGeneratorGFX12Plus() =
delete;
468 WaitcntGeneratorGFX12Plus(
const MachineFunction &MF,
470 const AMDGPU::HardwareLimits &Limits,
472 : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
475 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
476 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &
Wait,
479 bool createNewWaitcnt(MachineBasicBlock &
Block,
481 AMDGPU::Waitcnt
Wait,
482 const WaitcntBrackets &ScoreBrackets)
override;
485 return WaitEventMaskForInstGFX12Plus[
T];
488 AMDGPU::Waitcnt getAllZeroWaitcnt(
bool IncludeVSCnt)
const override;
// Flags describing which counters a loop preheader should flush (force a
// wait on) before entering the loop body; produced by
// getPreheaderFlushFlags / isPreheaderToFlush below.
492struct PreheaderFlushFlags {
  // NOTE(review): field names suggest VMEM-count and DS-count flushes
  // respectively — confirm against the flush logic in the full file.
493 bool FlushVmCnt =
false;
494 bool FlushDsCnt =
false;
497class SIInsertWaitcnts {
498 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
499 DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
500 MachineLoopInfo &MLI;
501 MachinePostDominatorTree &PDT;
506 std::unique_ptr<WaitcntBrackets> Incoming;
510 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
514 std::unique_ptr<WaitcntGenerator> WCG;
517 DenseSet<MachineInstr *> CallInsts;
518 DenseSet<MachineInstr *> ReturnInsts;
523 DenseMap<MachineInstr *, bool> EndPgmInsts;
525 AMDGPU::HardwareLimits Limits;
528 const GCNSubtarget &
ST;
529 const SIInstrInfo &
TII;
530 const SIRegisterInfo &
TRI;
531 const MachineRegisterInfo &MRI;
534 bool IsExpertMode =
false;
536 SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
538 : MLI(MLI), PDT(PDT), AA(AA), MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()),
539 TII(*
ST.getInstrInfo()),
TRI(
TII.getRegisterInfo()),
540 MRI(MF.getRegInfo()) {
541 (void)ForceExpCounter;
542 (void)ForceLgkmCounter;
543 (void)ForceVMCounter;
546 const AMDGPU::HardwareLimits &getLimits()
const {
return Limits; }
548 PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *
ML,
549 const WaitcntBrackets &Brackets);
550 PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &
MBB,
551 const WaitcntBrackets &ScoreBrackets);
552 bool isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const;
553 bool isDSRead(
const MachineInstr &
MI)
const;
554 bool mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const;
557 void setForceEmitWaitcnt() {
563 ForceEmitWaitcnt[
EXP_CNT] =
true;
565 ForceEmitWaitcnt[
EXP_CNT] =
false;
570 ForceEmitWaitcnt[
DS_CNT] =
true;
571 ForceEmitWaitcnt[
KM_CNT] =
true;
573 ForceEmitWaitcnt[
DS_CNT] =
false;
574 ForceEmitWaitcnt[
KM_CNT] =
false;
581 ForceEmitWaitcnt[
BVH_CNT] =
true;
585 ForceEmitWaitcnt[
BVH_CNT] =
false;
588 ForceEmitWaitcnt[
VA_VDST] =
false;
589 ForceEmitWaitcnt[
VM_VSRC] =
false;
595 WaitEventType getVmemWaitEventType(
const MachineInstr &Inst)
const {
598 case AMDGPU::GLOBAL_INV:
599 return GLOBAL_INV_ACCESS;
601 case AMDGPU::GLOBAL_WB:
602 case AMDGPU::GLOBAL_WBINV:
603 return VMEM_WRITE_ACCESS;
609 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
610 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
619 if (
TII.mayAccessScratch(Inst))
620 return SCRATCH_WRITE_ACCESS;
621 return VMEM_WRITE_ACCESS;
625 return VmemReadMapping[getVmemType(Inst)];
628 std::optional<WaitEventType>
629 getExpertSchedulingEventType(
const MachineInstr &Inst)
const;
631 bool isAsync(
const MachineInstr &
MI)
const {
636 const MachineOperand *
Async =
637 TII.getNamedOperand(
MI, AMDGPU::OpName::IsAsync);
641 bool isNonAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
645 bool isAsyncLdsDmaWrite(
const MachineInstr &
MI)
const {
649 bool isVmemAccess(
const MachineInstr &
MI)
const;
650 bool generateWaitcntInstBefore(MachineInstr &
MI,
651 WaitcntBrackets &ScoreBrackets,
652 MachineInstr *OldWaitcntInstr,
653 PreheaderFlushFlags FlushFlags);
654 bool generateWaitcnt(AMDGPU::Waitcnt
Wait,
656 MachineBasicBlock &
Block, WaitcntBrackets &ScoreBrackets,
657 MachineInstr *OldWaitcntInstr);
659 WaitEventSet getEventsFor(
const MachineInstr &Inst)
const;
660 void updateEventWaitcntAfter(MachineInstr &Inst,
661 WaitcntBrackets *ScoreBrackets);
663 MachineBasicBlock *
Block)
const;
664 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &
Block,
665 WaitcntBrackets &ScoreBrackets);
666 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &
Block,
667 WaitcntBrackets &ScoreBrackets);
670 bool removeRedundantSoftXcnts(MachineBasicBlock &
Block);
672 bool ExpertMode)
const;
674 return WCG->getWaitEvents(
T);
677 return WCG->getCounterFromEvent(
E);
689class WaitcntBrackets {
697 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
698 for (
auto &[
ID, Val] : VMem) {
702 for (
auto &[
ID, Val] : SGPRs) {
707 if (NumUnusedVmem || NumUnusedSGPRs) {
708 errs() <<
"WaitcntBracket had unused entries at destruction time: "
709 << NumUnusedVmem <<
" VMem and " << NumUnusedSGPRs
710 <<
" SGPR unused entries\n";
721 assert(isSmemCounter(
T) &&
"Invalid SMEM counter");
722 return T ==
X_CNT ? 1 : 0;
726 return ScoreUBs[
T] - ScoreLBs[
T];
730 return getVMemScore(
ID,
T) > getScoreLB(
T);
748 return getScoreUB(
T) - getScoreLB(
T);
752 auto It = SGPRs.find(RU);
753 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(
T)] : 0;
757 auto It = VMem.find(TID);
758 return It != VMem.end() ? It->second.Scores[
T] : 0;
765 void simplifyWaitcnt(AMDGPU::Waitcnt &
Wait)
const {
768 void simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
769 AMDGPU::Waitcnt &UpdateWait)
const;
772 void simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
773 AMDGPU::Waitcnt &UpdateWait)
const;
774 void simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
775 AMDGPU::Waitcnt &UpdateWait)
const;
778 AMDGPU::Waitcnt &
Wait)
const;
780 AMDGPU::Waitcnt &
Wait)
const;
781 AMDGPU::Waitcnt determineAsyncWait(
unsigned N);
782 void tryClearSCCWriteEvent(MachineInstr *Inst);
784 void applyWaitcnt(
const AMDGPU::Waitcnt &
Wait);
787 void updateByEvent(WaitEventType
E, MachineInstr &
MI);
788 void recordAsyncMark(MachineInstr &
MI);
790 bool hasPendingEvent()
const {
return !PendingEvents.empty(); }
791 bool hasPendingEvent(WaitEventType
E)
const {
792 return PendingEvents.contains(
E);
795 bool HasPending = PendingEvents &
Context->getWaitEvents(
T);
797 "Expected pending events iff scoreboard is not empty");
802 WaitEventSet Events = PendingEvents &
Context->getWaitEvents(
T);
804 return Events.twoOrMore();
807 bool hasPendingFlat()
const {
814 void setPendingFlat() {
819 bool hasPendingGDS()
const {
820 return LastGDS > ScoreLBs[
DS_CNT] && LastGDS <= ScoreUBs[
DS_CNT];
823 unsigned getPendingGDSWait()
const {
824 return std::min(getScoreUB(
DS_CNT) - LastGDS,
828 void setPendingGDS() { LastGDS = ScoreUBs[
DS_CNT]; }
832 bool hasOtherPendingVmemTypes(
MCPhysReg Reg, VmemType V)
const {
833 for (MCRegUnit RU : regunits(
Reg)) {
834 auto It = VMem.find(toVMEMID(RU));
835 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
842 for (MCRegUnit RU : regunits(
Reg)) {
843 if (
auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
844 It->second.VMEMTypes = 0;
845 if (It->second.empty())
851 void setStateOnFunctionEntryOrReturn() {
857 ArrayRef<const MachineInstr *> getLDSDMAStores()
const {
861 bool hasPointSampleAccel(
const MachineInstr &
MI)
const;
862 bool hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
865 void print(raw_ostream &)
const;
870 void purgeEmptyTrackingData();
880 using CounterValueArray = std::array<unsigned, NUM_INST_CNTS>;
883 AMDGPU::Waitcnt &
Wait)
const;
885 static bool mergeScore(
const MergeInfo &M,
unsigned &Score,
886 unsigned OtherScore);
891 assert(
Reg != AMDGPU::SCC &&
"Shouldn't be used on SCC");
894 const TargetRegisterClass *RC =
Context->TRI.getPhysRegBaseClass(
Reg);
895 unsigned Size =
Context->TRI.getRegSizeInBits(*RC);
896 if (
Size == 16 &&
Context->ST.hasD16Writes32BitVgpr())
920 if (
Reg == AMDGPU::SCC) {
923 for (MCRegUnit RU : regunits(
Reg))
924 VMem[toVMEMID(RU)].Scores[
T] = Val;
926 auto STy = getSgprScoresIdx(
T);
927 for (MCRegUnit RU : regunits(
Reg))
928 SGPRs[RU].Scores[STy] = Val;
935 VMem[TID].Scores[
T] = Val;
941 const SIInsertWaitcnts *
Context;
945 WaitEventSet PendingEvents;
949 unsigned LastGDS = 0;
966 CounterValueArray Scores{};
968 unsigned VMEMTypes = 0;
978 std::array<unsigned, 2> Scores = {0};
980 bool empty()
const {
return !Scores[0] && !Scores[1]; }
983 DenseMap<VMEMID, VMEMInfo> VMem;
984 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
987 unsigned SCCScore = 0;
989 const MachineInstr *PendingSCCWrite =
nullptr;
993 SmallVector<const MachineInstr *> LDSDMAStores;
// Cap on recorded async marks; when the list is full, older marks may be
// truncated — determineAsyncWait guards for this ("Possible truncation"
// path) by clamping N to MaxAsyncMarks - 1.
1002 static constexpr unsigned MaxAsyncMarks = 16;
// Per-counter score snapshot recorded for async operations (pushed onto
// AsyncMarks by recordAsyncMark).
1006 CounterValueArray AsyncScore{};
1009class SIInsertWaitcntsLegacy :
public MachineFunctionPass {
1012 SIInsertWaitcntsLegacy() : MachineFunctionPass(
ID) {}
1014 bool runOnMachineFunction(MachineFunction &MF)
override;
1016 StringRef getPassName()
const override {
1017 return "SI insert wait instructions";
1020 void getAnalysisUsage(AnalysisUsage &AU)
const override {
1023 AU.
addRequired<MachinePostDominatorTreeWrapperPass>();
1032void WaitcntBrackets::setScoreByOperand(
const MachineOperand &
Op,
1034 setRegScore(
Op.getReg().asMCReg(), CntTy, Score);
1042bool WaitcntBrackets::hasPointSampleAccel(
const MachineInstr &
MI)
const {
1047 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
1057bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
const MachineInstr &
MI,
1059 if (!hasPointSampleAccel(
MI))
1062 return hasOtherPendingVmemTypes(
Reg, VMEM_NOSAMPLER);
1065void WaitcntBrackets::updateByEvent(WaitEventType
E, MachineInstr &Inst) {
1069 unsigned UB = getScoreUB(
T);
1070 unsigned CurrScore = UB + 1;
1076 PendingEvents.insert(
E);
1077 setScoreUB(
T, CurrScore);
1080 const MachineRegisterInfo &MRI =
Context->MRI;
1089 if (
const auto *AddrOp =
TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
1090 setScoreByOperand(*AddrOp,
EXP_CNT, CurrScore);
1093 if (
const auto *Data0 =
1094 TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
1095 setScoreByOperand(*Data0,
EXP_CNT, CurrScore);
1096 if (
const auto *Data1 =
1097 TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
1098 setScoreByOperand(*Data1,
EXP_CNT, CurrScore);
1100 Inst.
getOpcode() != AMDGPU::DS_APPEND &&
1101 Inst.
getOpcode() != AMDGPU::DS_CONSUME &&
1102 Inst.
getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
1103 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1104 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
1105 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1108 }
else if (
TII.isFLAT(Inst)) {
1110 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1113 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1116 }
else if (
TII.isMIMG(Inst)) {
1120 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1123 }
else if (
TII.isMTBUF(Inst)) {
1126 }
else if (
TII.isMUBUF(Inst)) {
1130 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::data),
1133 }
else if (
TII.isLDSDIR(Inst)) {
1135 setScoreByOperand(*
TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
1138 if (
TII.isEXP(Inst)) {
1143 for (MachineOperand &DefMO : Inst.
all_defs()) {
1144 if (
TRI.isVGPR(MRI, DefMO.getReg())) {
1145 setScoreByOperand(DefMO,
EXP_CNT, CurrScore);
1149 for (
const MachineOperand &
Op : Inst.
all_uses()) {
1150 if (
TRI.isVectorRegister(MRI,
Op.getReg()))
1151 setScoreByOperand(
Op,
EXP_CNT, CurrScore);
1155 WaitEventType OtherEvent =
E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1156 if (PendingEvents.contains(OtherEvent)) {
1161 setScoreLB(
T, getScoreUB(
T) - 1);
1162 PendingEvents.remove(OtherEvent);
1164 for (
const MachineOperand &
Op : Inst.
all_uses())
1165 setScoreByOperand(
Op,
T, CurrScore);
1169 for (
const MachineOperand &
Op : Inst.
operands()) {
1174 setScoreByOperand(
Op,
T, CurrScore);
1186 for (
const MachineOperand &
Op : Inst.
defs()) {
1188 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
1190 if (updateVMCntOnly(Inst)) {
1195 VmemType
V = getVmemType(Inst);
1196 unsigned char TypesMask = 1 <<
V;
1199 if (hasPointSampleAccel(Inst))
1200 TypesMask |= 1 << VMEM_NOSAMPLER;
1201 for (MCRegUnit RU : regunits(
Op.getReg().asMCReg()))
1202 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1205 setScoreByOperand(
Op,
T, CurrScore);
1208 (
TII.isDS(Inst) ||
Context->isNonAsyncLdsDmaWrite(Inst))) {
1217 if (!MemOp->isStore() ||
1222 auto AAI = MemOp->getAAInfo();
1228 if (!AAI || !AAI.Scope)
1230 for (
unsigned I = 0,
E = LDSDMAStores.
size();
I !=
E && !Slot; ++
I) {
1231 for (
const auto *MemOp : LDSDMAStores[
I]->memoperands()) {
1232 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1247 setVMemScore(LDSDMA_BEGIN,
T, CurrScore);
1248 if (Slot && Slot < NUM_LDSDMA)
1249 setVMemScore(LDSDMA_BEGIN + Slot,
T, CurrScore);
1257 "unexpected GFX1250 instruction");
1258 AsyncScore[
T] = CurrScore;
1262 setRegScore(AMDGPU::SCC,
T, CurrScore);
1263 PendingSCCWrite = &Inst;
1268void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
1274 AsyncMarks.push_back(AsyncScore);
1277 dbgs() <<
"recordAsyncMark:\n" << Inst;
1278 for (
const auto &Mark : AsyncMarks) {
1285void WaitcntBrackets::print(raw_ostream &OS)
const {
1289 unsigned SR = getScoreRange(
T);
1292 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM") <<
"_CNT("
1296 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM") <<
"_CNT("
1300 OS <<
" EXP_CNT(" << SR <<
"):";
1303 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS") <<
"_CNT("
1307 OS <<
" SAMPLE_CNT(" << SR <<
"):";
1310 OS <<
" BVH_CNT(" << SR <<
"):";
1313 OS <<
" KM_CNT(" << SR <<
"):";
1316 OS <<
" X_CNT(" << SR <<
"):";
1319 OS <<
" VA_VDST(" << SR <<
"): ";
1322 OS <<
" VM_VSRC(" << SR <<
"): ";
1325 OS <<
" UNKNOWN(" << SR <<
"):";
1331 unsigned LB = getScoreLB(
T);
1334 sort(SortedVMEMIDs);
1336 for (
auto ID : SortedVMEMIDs) {
1337 unsigned RegScore = VMem.at(
ID).Scores[
T];
1340 unsigned RelScore = RegScore - LB - 1;
1341 if (
ID < REGUNITS_END) {
1342 OS <<
' ' << RelScore <<
":vRU" <<
ID;
1344 assert(
ID >= LDSDMA_BEGIN &&
ID < LDSDMA_END &&
1345 "Unhandled/unexpected ID value!");
1346 OS <<
' ' << RelScore <<
":LDSDMA" <<
ID;
1351 if (isSmemCounter(
T)) {
1353 sort(SortedSMEMIDs);
1354 for (
auto ID : SortedSMEMIDs) {
1355 unsigned RegScore = SGPRs.at(
ID).Scores[getSgprScoresIdx(
T)];
1358 unsigned RelScore = RegScore - LB - 1;
1359 OS <<
' ' << RelScore <<
":sRU" <<
static_cast<unsigned>(
ID);
1363 if (
T ==
KM_CNT && SCCScore > 0)
1364 OS <<
' ' << SCCScore <<
":scc";
1369 OS <<
"Pending Events: ";
1370 if (hasPendingEvent()) {
1372 for (
unsigned I = 0;
I != NUM_WAIT_EVENTS; ++
I) {
1373 if (hasPendingEvent((WaitEventType)
I)) {
1374 OS <<
LS << WaitEventTypeName[
I];
1382 OS <<
"Async score: ";
1383 if (AsyncScore.empty())
1389 OS <<
"Async marks: " << AsyncMarks.size() <<
'\n';
1391 for (
const auto &Mark : AsyncMarks) {
1393 unsigned MarkedScore = Mark[
T];
1396 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"LOAD" :
"VM")
1397 <<
"_CNT: " << MarkedScore;
1400 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"DS" :
"LGKM")
1401 <<
"_CNT: " << MarkedScore;
1404 OS <<
" EXP_CNT: " << MarkedScore;
1407 OS <<
" " << (
ST.hasExtendedWaitCounts() ?
"STORE" :
"VS")
1408 <<
"_CNT: " << MarkedScore;
1411 OS <<
" SAMPLE_CNT: " << MarkedScore;
1414 OS <<
" BVH_CNT: " << MarkedScore;
1417 OS <<
" KM_CNT: " << MarkedScore;
1420 OS <<
" X_CNT: " << MarkedScore;
1423 OS <<
" UNKNOWN: " << MarkedScore;
1434void WaitcntBrackets::simplifyWaitcnt(
const AMDGPU::Waitcnt &CheckWait,
1435 AMDGPU::Waitcnt &UpdateWait)
const {
1436 simplifyWaitcnt(UpdateWait,
LOAD_CNT);
1437 simplifyWaitcnt(UpdateWait,
EXP_CNT);
1438 simplifyWaitcnt(UpdateWait,
DS_CNT);
1441 simplifyWaitcnt(UpdateWait,
BVH_CNT);
1442 simplifyWaitcnt(UpdateWait,
KM_CNT);
1443 simplifyXcnt(CheckWait, UpdateWait);
1444 simplifyWaitcnt(UpdateWait,
VA_VDST);
1445 simplifyVmVsrc(CheckWait, UpdateWait);
1449 unsigned &
Count)
const {
1453 if (
Count >= getScoreRange(
T))
1458 unsigned Cnt =
Wait.get(
T);
1459 simplifyWaitcnt(
T, Cnt);
1463void WaitcntBrackets::simplifyXcnt(
const AMDGPU::Waitcnt &CheckWait,
1464 AMDGPU::Waitcnt &UpdateWait)
const {
1473 if (CheckWait.
get(
KM_CNT) == 0 && hasPendingEvent(SMEM_GROUP))
1478 if (CheckWait.
get(
LOAD_CNT) != ~0u && hasPendingEvent(VMEM_GROUP) &&
1482 simplifyWaitcnt(UpdateWait,
X_CNT);
1485void WaitcntBrackets::simplifyVmVsrc(
const AMDGPU::Waitcnt &CheckWait,
1486 AMDGPU::Waitcnt &UpdateWait)
const {
1491 std::min({CheckWait.get(LOAD_CNT), CheckWait.get(STORE_CNT),
1492 CheckWait.get(SAMPLE_CNT), CheckWait.get(BVH_CNT),
1493 CheckWait.get(DS_CNT)}))
1495 simplifyWaitcnt(UpdateWait,
VM_VSRC);
1498void WaitcntBrackets::purgeEmptyTrackingData() {
1510 unsigned ScoreToWait,
1511 AMDGPU::Waitcnt &
Wait)
const {
1512 const unsigned LB = getScoreLB(
T);
1513 const unsigned UB = getScoreUB(
T);
1516 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1518 !
Context->ST.hasFlatLgkmVMemCountInOrder()) {
1522 addWait(
Wait,
T, 0);
1523 }
else if (counterOutOfOrder(
T)) {
1527 addWait(
Wait,
T, 0);
1531 unsigned NeededWait = std::min(
1532 UB - ScoreToWait, getWaitCountMax(
Context->getLimits(),
T) - 1);
1533 addWait(
Wait,
T, NeededWait);
1538AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(
unsigned N) {
1540 dbgs() <<
"Need " <<
N <<
" async marks. Found " << AsyncMarks.size()
1542 for (
const auto &Mark : AsyncMarks) {
1548 if (AsyncMarks.size() == MaxAsyncMarks) {
1553 LLVM_DEBUG(
dbgs() <<
"Possible truncation. Ensuring a non-trivial wait.\n");
1554 N = std::min(
N, (
unsigned)MaxAsyncMarks - 1);
1557 AMDGPU::Waitcnt
Wait;
1558 if (AsyncMarks.size() <=
N) {
1563 size_t MarkIndex = AsyncMarks.size() -
N - 1;
1564 const auto &RequiredMark = AsyncMarks[MarkIndex];
1566 determineWaitForScore(
T, RequiredMark[
T],
Wait);
1572 dbgs() <<
"Removing " << (MarkIndex + 1)
1573 <<
" async marks after determining wait\n";
1575 AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1);
1582 AMDGPU::Waitcnt &
Wait)
const {
1583 if (
Reg == AMDGPU::SCC) {
1584 determineWaitForScore(
T, SCCScore,
Wait);
1587 for (MCRegUnit RU : regunits(
Reg))
1588 determineWaitForScore(
1589 T, IsVGPR ? getVMemScore(toVMEMID(RU),
T) : getSGPRScore(RU,
T),
1595 AMDGPU::Waitcnt &
Wait)
const {
1596 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1597 determineWaitForScore(
T, getVMemScore(TID,
T),
Wait);
1600void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1603 if (PendingSCCWrite &&
1604 PendingSCCWrite->
getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1606 WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
1609 SCC_WRITE_PendingEvent) {
1613 PendingEvents.remove(SCC_WRITE_PendingEvent);
1614 PendingSCCWrite =
nullptr;
1618void WaitcntBrackets::applyWaitcnt(
const AMDGPU::Waitcnt &
Wait) {
1620 applyWaitcnt(
Wait,
T);
1624 const unsigned UB = getScoreUB(
T);
1628 if (counterOutOfOrder(
T))
1630 setScoreLB(
T, std::max(getScoreLB(
T), UB -
Count));
1633 PendingEvents.remove(
Context->getWaitEvents(
T));
1636 if (
T ==
KM_CNT &&
Count == 0 && hasPendingEvent(SMEM_GROUP)) {
1637 if (!hasMixedPendingEvents(
X_CNT))
1638 applyWaitcnt(
X_CNT, 0);
1640 PendingEvents.remove(SMEM_GROUP);
1642 if (
T ==
LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
1644 if (!hasMixedPendingEvents(
X_CNT))
1646 else if (
Count == 0)
1647 PendingEvents.remove(VMEM_GROUP);
1652 unsigned Cnt =
Wait.get(
T);
1653 applyWaitcnt(
T, Cnt);
1660 if ((
T ==
Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1661 (
T ==
X_CNT && hasPendingEvent(SMEM_GROUP)))
1668 unsigned Events = hasPendingEvent(
T);
1671 Events &= ~(1 << GLOBAL_INV_ACCESS);
1674 return Events & (Events - 1);
1677 return hasMixedPendingEvents(
T);
1687char SIInsertWaitcntsLegacy::
ID = 0;
1692 return new SIInsertWaitcntsLegacy();
1697 int OpIdx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
OpName);
1702 if (NewEnc == MO.
getImm())
1713 case AMDGPU::S_WAIT_LOADCNT:
1715 case AMDGPU::S_WAIT_EXPCNT:
1717 case AMDGPU::S_WAIT_STORECNT:
1719 case AMDGPU::S_WAIT_SAMPLECNT:
1721 case AMDGPU::S_WAIT_BVHCNT:
1723 case AMDGPU::S_WAIT_DSCNT:
1725 case AMDGPU::S_WAIT_KMCNT:
1727 case AMDGPU::S_WAIT_XCNT:
1734bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt)
const {
1748bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1749 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1751 assert(isNormalMode(MaxCounter));
1754 MachineInstr *WaitcntInstr =
nullptr;
1755 MachineInstr *WaitcntVsCntInstr =
nullptr;
1758 dbgs() <<
"PreGFX12::applyPreexistingWaitcnt at: ";
1760 dbgs() <<
"end of block\n";
1768 if (
II.isMetaInstruction()) {
1774 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
1778 if (Opcode == AMDGPU::S_WAITCNT) {
1779 unsigned IEnc =
II.getOperand(0).getImm();
1782 ScoreBrackets.simplifyWaitcnt(OldWait);
1786 if (WaitcntInstr || (!
Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1787 II.eraseFromParent();
1791 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1794 <<
"Before: " <<
Wait <<
'\n';);
1795 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, LDSDMA_BEGIN,
Wait);
1804 II.eraseFromParent();
1805 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
1806 unsigned N =
II.getOperand(0).getImm();
1808 AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(
N);
1811 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1812 assert(
II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1815 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
1817 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1820 if (WaitcntVsCntInstr || (!
Wait.hasWaitStoreCnt() && TrySimplify)) {
1821 II.eraseFromParent();
1824 WaitcntVsCntInstr = &
II;
1831 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1840 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
1841 <<
"New Instr at block end: "
1842 << *WaitcntInstr <<
'\n'
1843 :
dbgs() <<
"applied pre-existing waitcnt\n"
1844 <<
"Old Instr: " << *It
1845 <<
"New Instr: " << *WaitcntInstr <<
'\n');
1848 if (WaitcntVsCntInstr) {
1850 *WaitcntVsCntInstr, AMDGPU::OpName::simm16,
Wait.get(
STORE_CNT));
1851 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1857 ?
dbgs() <<
"applied pre-existing waitcnt\n"
1858 <<
"New Instr at block end: " << *WaitcntVsCntInstr
1860 :
dbgs() <<
"applied pre-existing waitcnt\n"
1861 <<
"Old Instr: " << *It
1862 <<
"New Instr: " << *WaitcntVsCntInstr <<
'\n');
1870bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1872 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
1873 assert(isNormalMode(MaxCounter));
1881 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
1884 EmitWaitcnt(--Outstanding);
1885 }
while (Outstanding > Target);
1891 if (
Wait.hasWaitExceptStoreCnt()) {
1893 if (ExpandWaitcntProfiling) {
1897 bool AnyOutOfOrder =
false;
1899 unsigned WaitCnt =
Wait.get(CT);
1900 if (WaitCnt != ~0u && ScoreBrackets.counterOutOfOrder(CT)) {
1901 AnyOutOfOrder =
true;
1906 if (AnyOutOfOrder) {
1914 unsigned WaitCnt =
Wait.get(CT);
1918 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
1919 getWaitCountMax(getLimits(), CT) - 1);
1920 EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](
unsigned Count) {
1931 [[maybe_unused]]
auto SWaitInst =
1936 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1937 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1941 if (
Wait.hasWaitStoreCnt()) {
1945 !ScoreBrackets.counterOutOfOrder(
STORE_CNT)) {
1947 unsigned Outstanding =
1948 std::min(ScoreBrackets.getOutstanding(
STORE_CNT),
1949 getWaitCountMax(getLimits(),
STORE_CNT) - 1);
1950 EmitExpandedWaitcnt(
1952 BuildMI(Block, It, DL, TII.get(AMDGPU::S_WAITCNT_VSCNT))
1953 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1957 [[maybe_unused]]
auto SWaitInst =
1959 .
addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1964 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
1965 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
1973WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1974 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt &&
ST.hasVscnt() ? 0 : ~0u);
1978WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(
bool IncludeVSCnt)
const {
1979 unsigned ExpertVal = IsExpertMode ? 0 : ~0
u;
1980 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1981 ~0u , ExpertVal, ExpertVal);
1988bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1989 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1991 assert(!isNormalMode(MaxCounter));
1994 MachineInstr *CombinedLoadDsCntInstr =
nullptr;
1995 MachineInstr *CombinedStoreDsCntInstr =
nullptr;
1996 MachineInstr *WaitcntDepctrInstr =
nullptr;
2000 dbgs() <<
"GFX12Plus::applyPreexistingWaitcnt at: ";
2002 dbgs() <<
"end of block\n";
2008 AMDGPU::Waitcnt RequiredWait;
2013 if (
II.isMetaInstruction()) {
2022 bool TrySimplify = Opcode !=
II.getOpcode() && !OptNone;
2026 if (Opcode == AMDGPU::S_WAITCNT)
2029 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
2031 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2036 RequiredWait = RequiredWait.combined(OldWait);
2038 if (CombinedLoadDsCntInstr ==
nullptr) {
2039 CombinedLoadDsCntInstr = &
II;
2041 II.eraseFromParent();
2044 }
else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
2046 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2051 RequiredWait = RequiredWait.combined(OldWait);
2053 if (CombinedStoreDsCntInstr ==
nullptr) {
2054 CombinedStoreDsCntInstr = &
II;
2056 II.eraseFromParent();
2059 }
else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
2061 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2062 AMDGPU::Waitcnt OldWait;
2066 ScoreBrackets.simplifyWaitcnt(OldWait);
2068 if (WaitcntDepctrInstr ==
nullptr) {
2069 WaitcntDepctrInstr = &
II;
2078 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2086 II.eraseFromParent();
2090 }
else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
2093 II.eraseFromParent();
2095 }
else if (Opcode == AMDGPU::WAIT_ASYNCMARK) {
2101 TII.getNamedOperand(
II, AMDGPU::OpName::simm16)->getImm();
2103 addWait(
Wait, CT.value(), OldCnt);
2105 addWait(RequiredWait, CT.value(), OldCnt);
2107 if (WaitInstrs[CT.value()] ==
nullptr) {
2108 WaitInstrs[CT.value()] = &
II;
2110 II.eraseFromParent();
2116 ScoreBrackets.simplifyWaitcnt(
Wait.combined(RequiredWait),
Wait);
2117 Wait =
Wait.combined(RequiredWait);
2119 if (CombinedLoadDsCntInstr) {
2135 AMDGPU::OpName::simm16, NewEnc);
2136 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
2142 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2143 <<
"New Instr at block end: "
2144 << *CombinedLoadDsCntInstr <<
'\n'
2145 :
dbgs() <<
"applied pre-existing waitcnt\n"
2146 <<
"Old Instr: " << *It <<
"New Instr: "
2147 << *CombinedLoadDsCntInstr <<
'\n');
2154 if (CombinedStoreDsCntInstr) {
2159 AMDGPU::OpName::simm16, NewEnc);
2160 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
2166 LLVM_DEBUG(It.isEnd() ?
dbgs() <<
"applied pre-existing waitcnt\n"
2167 <<
"New Instr at block end: "
2168 << *CombinedStoreDsCntInstr <<
'\n'
2169 :
dbgs() <<
"applied pre-existing waitcnt\n"
2170 <<
"Old Instr: " << *It <<
"New Instr: "
2171 << *CombinedStoreDsCntInstr <<
'\n');
2201 for (MachineInstr **WI : WaitsToErase) {
2205 (*WI)->eraseFromParent();
2212 if (!WaitInstrs[CT])
2215 unsigned NewCnt =
Wait.get(CT);
2216 if (NewCnt != ~0u) {
2218 AMDGPU::OpName::simm16, NewCnt);
2219 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
2221 ScoreBrackets.applyWaitcnt(CT, NewCnt);
2222 setNoWait(
Wait, CT);
2225 ?
dbgs() <<
"applied pre-existing waitcnt\n"
2226 <<
"New Instr at block end: " << *WaitInstrs[CT]
2228 :
dbgs() <<
"applied pre-existing waitcnt\n"
2229 <<
"Old Instr: " << *It
2230 <<
"New Instr: " << *WaitInstrs[CT] <<
'\n');
2237 if (WaitcntDepctrInstr) {
2241 TII.getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
2256 AMDGPU::OpName::simm16, Enc);
2258 <<
"New Instr at block end: "
2259 << *WaitcntDepctrInstr <<
'\n'
2260 :
dbgs() <<
"applyPreexistingWaitcnt\n"
2261 <<
"Old Instr: " << *It <<
"New Instr: "
2262 << *WaitcntDepctrInstr <<
'\n');
2273bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
2275 AMDGPU::Waitcnt
Wait,
const WaitcntBrackets &ScoreBrackets) {
2276 assert(!isNormalMode(MaxCounter));
2282 auto EmitExpandedWaitcnt = [&](
unsigned Outstanding,
unsigned Target,
2284 for (
unsigned I = Outstanding - 1;
I >
Target &&
I != ~0
u; --
I)
2286 EmitWaitcnt(Target);
2292 if (ExpandWaitcntProfiling) {
2299 if (ScoreBrackets.counterOutOfOrder(CT)) {
2306 unsigned Outstanding = std::min(ScoreBrackets.getOutstanding(CT),
2307 getWaitCountMax(getLimits(), CT) - 1);
2308 EmitExpandedWaitcnt(Outstanding,
Count, [&](
unsigned Val) {
2319 MachineInstr *SWaitInst =
nullptr;
2343 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2344 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2356 [[maybe_unused]]
auto SWaitInst =
2363 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2364 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2367 if (
Wait.hasWaitDepctr()) {
2372 [[maybe_unused]]
auto SWaitInst =
2378 if (It !=
Block.instr_end())
dbgs() <<
"Old Instr: " << *It;
2379 dbgs() <<
"New Instr: " << *SWaitInst <<
'\n');
2398bool SIInsertWaitcnts::generateWaitcntInstBefore(
2399 MachineInstr &
MI, WaitcntBrackets &ScoreBrackets,
2400 MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
2402 setForceEmitWaitcnt();
2406 AMDGPU::Waitcnt
Wait;
2407 const unsigned Opc =
MI.getOpcode();
2410 case AMDGPU::BUFFER_WBINVL1:
2411 case AMDGPU::BUFFER_WBINVL1_SC:
2412 case AMDGPU::BUFFER_WBINVL1_VOL:
2413 case AMDGPU::BUFFER_GL0_INV:
2414 case AMDGPU::BUFFER_GL1_INV: {
2422 case AMDGPU::SI_RETURN_TO_EPILOG:
2423 case AMDGPU::SI_RETURN:
2424 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
2425 case AMDGPU::S_SETPC_B64_return: {
2430 AMDGPU::Waitcnt AllZeroWait =
2431 WCG->getAllZeroWaitcnt(
false);
2436 if (
ST.hasExtendedWaitCounts() &&
2437 !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
2442 case AMDGPU::S_ENDPGM:
2443 case AMDGPU::S_ENDPGM_SAVED: {
2452 EndPgmInsts[&
MI] = !ScoreBrackets.empty(
STORE_CNT) &&
2453 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
2456 case AMDGPU::S_SENDMSG:
2457 case AMDGPU::S_SENDMSGHALT: {
2458 if (
ST.hasLegacyGeometry() &&
2473 if (
MI.modifiesRegister(AMDGPU::EXEC, &
TRI)) {
2476 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
2477 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
2478 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
2479 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
2486 if (
TII.isAlwaysGDS(
Opc) && ScoreBrackets.hasPendingGDS())
2487 addWait(
Wait,
DS_CNT, ScoreBrackets.getPendingGDSWait());
2494 Wait = AMDGPU::Waitcnt();
2496 const MachineOperand &CallAddrOp =
TII.getCalleeOperand(
MI);
2497 if (CallAddrOp.
isReg()) {
2498 ScoreBrackets.determineWaitForPhysReg(
2501 if (
const auto *RtnAddrOp =
2502 TII.getNamedOperand(
MI, AMDGPU::OpName::dst)) {
2503 ScoreBrackets.determineWaitForPhysReg(
2504 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(),
Wait);
2507 }
else if (
Opc == AMDGPU::S_BARRIER_WAIT) {
2508 ScoreBrackets.tryClearSCCWriteEvent(&
MI);
2524 for (
const MachineMemOperand *Memop :
MI.memoperands()) {
2525 const Value *Ptr = Memop->getValue();
2526 if (Memop->isStore()) {
2527 if (
auto It = SLoadAddresses.
find(Ptr); It != SLoadAddresses.
end()) {
2528 addWait(
Wait, SmemAccessCounter, 0);
2530 SLoadAddresses.
erase(It);
2533 unsigned AS = Memop->getAddrSpace();
2537 if (
TII.mayWriteLDSThroughDMA(
MI))
2541 unsigned TID = LDSDMA_BEGIN;
2542 if (Ptr && Memop->getAAInfo()) {
2543 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2544 for (
unsigned I = 0,
E = LDSDMAStores.size();
I !=
E; ++
I) {
2545 if (
MI.mayAlias(AA, *LDSDMAStores[
I],
true)) {
2546 if ((
I + 1) >= NUM_LDSDMA) {
2549 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2553 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID +
I + 1,
Wait);
2557 ScoreBrackets.determineWaitForLDSDMA(
LOAD_CNT, TID,
Wait);
2559 if (Memop->isStore()) {
2560 ScoreBrackets.determineWaitForLDSDMA(
EXP_CNT, TID,
Wait);
2565 for (
const MachineOperand &
Op :
MI.operands()) {
2570 if (
Op.isTied() &&
Op.isUse() &&
TII.doesNotReadTiedSource(
MI))
2575 const bool IsVGPR =
TRI.isVectorRegister(MRI,
Op.getReg());
2582 if (
Op.isImplicit() &&
MI.mayLoadOrStore())
2594 if (
Op.isUse() || !updateVMCntOnly(
MI) ||
2595 ScoreBrackets.hasOtherPendingVmemTypes(
Reg, getVmemType(
MI)) ||
2596 ScoreBrackets.hasPointSamplePendingVmemTypes(
MI,
Reg) ||
2597 !
ST.hasVmemWriteVgprInOrder()) {
2601 ScoreBrackets.clearVgprVmemTypes(
Reg);
2604 if (
Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2608 }
else if (
Op.getReg() == AMDGPU::SCC) {
2611 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter,
Reg,
Wait);
2614 if (
ST.hasWaitXcnt() &&
Op.isDef())
2615 ScoreBrackets.determineWaitForPhysReg(
X_CNT,
Reg,
Wait);
2633 if (
Opc == AMDGPU::S_BARRIER && !
ST.hasAutoWaitcntBeforeBarrier() &&
2634 !
ST.hasBackOffBarrier()) {
2635 Wait =
Wait.combined(WCG->getAllZeroWaitcnt(
true));
2642 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2647 ScoreBrackets.simplifyWaitcnt(
Wait);
2660 ScoreBrackets.applyWaitcnt(
Wait,
X_CNT);
2667 Wait = WCG->getAllZeroWaitcnt(
false);
2671 if (!ForceEmitWaitcnt[
T])
2676 if (FlushFlags.FlushVmCnt) {
2681 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
2687 return generateWaitcnt(
Wait,
MI.getIterator(), *
MI.getParent(), ScoreBrackets,
2691bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt
Wait,
2693 MachineBasicBlock &
Block,
2694 WaitcntBrackets &ScoreBrackets,
2695 MachineInstr *OldWaitcntInstr) {
2698 if (OldWaitcntInstr)
2702 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
Wait, It);
2707 MachineOperand *WaitExp =
TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
2717 <<
"Update Instr: " << *It);
2720 if (WCG->createNewWaitcnt(
Block, It,
Wait, ScoreBrackets))
2725 ScoreBrackets.applyWaitcnt(
Wait);
2730std::optional<WaitEventType>
2731SIInsertWaitcnts::getExpertSchedulingEventType(
const MachineInstr &Inst)
const {
2732 if (
TII.isVALU(Inst)) {
2737 if (
TII.isXDL(Inst))
2738 return VGPR_XDL_WRITE;
2740 if (
TII.isTRANS(Inst))
2741 return VGPR_TRANS_WRITE;
2744 return VGPR_DPMACC_WRITE;
2746 return VGPR_CSMACC_WRITE;
2753 if (
TII.isFLAT(Inst))
2754 return VGPR_FLAT_READ;
2757 return VGPR_LDS_READ;
2759 if (
TII.isVMEM(Inst) ||
TII.isVIMAGE(Inst) ||
TII.isVSAMPLE(Inst))
2760 return VGPR_VMEM_READ;
2767bool SIInsertWaitcnts::isVmemAccess(
const MachineInstr &
MI)
const {
2768 return (
TII.isFLAT(
MI) &&
TII.mayAccessVMEMThroughFlat(
MI)) ||
2775 MachineBasicBlock *
Block)
const {
2776 auto BlockEnd =
Block->getParent()->end();
2777 auto BlockIter =
Block->getIterator();
2781 if (++BlockIter != BlockEnd) {
2782 It = BlockIter->instr_begin();
2789 if (!It->isMetaInstruction())
2797 return It->getOpcode() == AMDGPU::S_ENDPGM;
2801bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2802 MachineBasicBlock &
Block,
2803 WaitcntBrackets &ScoreBrackets) {
2804 AMDGPU::Waitcnt
Wait;
2805 bool NeedsEndPGMCheck =
false;
2813 NeedsEndPGMCheck =
true;
2816 ScoreBrackets.simplifyWaitcnt(
Wait);
2819 bool Result = generateWaitcnt(
Wait, SuccessorIt,
Block, ScoreBrackets,
2822 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &
Block)) {
2830WaitEventSet SIInsertWaitcnts::getEventsFor(
const MachineInstr &Inst)
const {
2831 WaitEventSet Events;
2833 if (
const auto ET = getExpertSchedulingEventType(Inst))
2837 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2839 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2840 Events.insert(GDS_ACCESS);
2841 Events.insert(GDS_GPR_LOCK);
2843 Events.insert(LDS_ACCESS);
2845 }
else if (
TII.isFLAT(Inst)) {
2847 Events.insert(getVmemWaitEventType(Inst));
2850 if (
TII.mayAccessVMEMThroughFlat(Inst)) {
2851 if (
ST.hasWaitXcnt())
2852 Events.insert(VMEM_GROUP);
2853 Events.insert(getVmemWaitEventType(Inst));
2855 if (
TII.mayAccessLDSThroughFlat(Inst))
2856 Events.insert(LDS_ACCESS);
2860 Inst.
getOpcode() == AMDGPU::BUFFER_WBL2)) {
2864 if (
ST.hasWaitXcnt())
2865 Events.insert(VMEM_GROUP);
2866 Events.insert(getVmemWaitEventType(Inst));
2867 if (
ST.vmemWriteNeedsExpWaitcnt() &&
2869 Events.insert(VMW_GPR_LOCK);
2871 }
else if (
TII.isSMRD(Inst)) {
2872 if (
ST.hasWaitXcnt())
2873 Events.insert(SMEM_GROUP);
2874 Events.insert(SMEM_ACCESS);
2876 Events.insert(EXP_LDS_ACCESS);
2878 unsigned Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2880 Events.insert(EXP_PARAM_ACCESS);
2882 Events.insert(EXP_POS_ACCESS);
2884 Events.insert(EXP_GPR_LOCK);
2886 Events.insert(SCC_WRITE);
2889 case AMDGPU::S_SENDMSG:
2890 case AMDGPU::S_SENDMSG_RTN_B32:
2891 case AMDGPU::S_SENDMSG_RTN_B64:
2892 case AMDGPU::S_SENDMSGHALT:
2893 Events.insert(SQ_MESSAGE);
2895 case AMDGPU::S_MEMTIME:
2896 case AMDGPU::S_MEMREALTIME:
2897 case AMDGPU::S_GET_BARRIER_STATE_M0:
2898 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2899 Events.insert(SMEM_ACCESS);
2906void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2907 WaitcntBrackets *ScoreBrackets) {
2909 WaitEventSet InstEvents = getEventsFor(Inst);
2910 for (WaitEventType
E : wait_events()) {
2911 if (InstEvents.contains(
E))
2912 ScoreBrackets->updateByEvent(
E, Inst);
2915 if (
TII.isDS(Inst) &&
TII.usesLGKM_CNT(Inst)) {
2917 TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2918 ScoreBrackets->setPendingGDS();
2920 }
else if (
TII.isFLAT(Inst)) {
2928 ScoreBrackets->setPendingFlat();
2929 }
else if (Inst.
isCall()) {
2931 ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(
false));
2932 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2933 }
else if (
TII.isVINTERP(Inst)) {
2934 int64_t
Imm =
TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2939bool WaitcntBrackets::mergeScore(
const MergeInfo &M,
unsigned &Score,
2940 unsigned OtherScore) {
2941 unsigned MyShifted = Score <=
M.OldLB ? 0 : Score +
M.MyShift;
2942 unsigned OtherShifted =
2943 OtherScore <=
M.OtherLB ? 0 : OtherScore +
M.OtherShift;
2944 Score = std::max(MyShifted, OtherShifted);
2945 return OtherShifted > MyShifted;
2950 bool StrictDom =
false;
2954 if (AsyncMarks.empty() && OtherMarks.
empty()) {
2961 auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.
size());
2962 MaxSize = std::min(MaxSize, MaxAsyncMarks);
2965 if (AsyncMarks.size() > MaxSize)
2966 AsyncMarks.erase(AsyncMarks.begin(),
2967 AsyncMarks.begin() + (AsyncMarks.size() - MaxSize));
2973 constexpr CounterValueArray ZeroMark{};
2974 AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark);
2977 dbgs() <<
"Before merge:\n";
2978 for (
const auto &Mark : AsyncMarks) {
2982 dbgs() <<
"Other marks:\n";
2983 for (
const auto &Mark : OtherMarks) {
2992 unsigned OtherSize = OtherMarks.size();
2993 unsigned OurSize = AsyncMarks.size();
2994 unsigned MergeCount = std::min(OtherSize, OurSize);
2997 StrictDom |= mergeScore(MergeInfos[
T], AsyncMarks[OurSize - Idx][
T],
2998 OtherMarks[OtherSize - Idx][
T]);
3003 dbgs() <<
"After merge:\n";
3004 for (
const auto &Mark : AsyncMarks) {
3018bool WaitcntBrackets::merge(
const WaitcntBrackets &
Other) {
3019 bool StrictDom =
false;
3023 for (
auto K :
Other.VMem.keys())
3024 VMem.try_emplace(K);
3025 for (
auto K :
Other.SGPRs.keys())
3026 SGPRs.try_emplace(K);
3033 const WaitEventSet &EventsForT =
Context->getWaitEvents(
T);
3034 const WaitEventSet OldEvents = PendingEvents & EventsForT;
3035 const WaitEventSet OtherEvents =
Other.PendingEvents & EventsForT;
3036 if (!OldEvents.contains(OtherEvents))
3038 PendingEvents |= OtherEvents;
3041 const unsigned MyPending = ScoreUBs[
T] - ScoreLBs[
T];
3042 const unsigned OtherPending =
Other.ScoreUBs[
T] -
Other.ScoreLBs[
T];
3043 const unsigned NewUB = ScoreLBs[
T] + std::max(MyPending, OtherPending);
3044 if (NewUB < ScoreLBs[
T])
3047 MergeInfo &
M = MergeInfos[
T];
3048 M.OldLB = ScoreLBs[
T];
3049 M.OtherLB =
Other.ScoreLBs[
T];
3050 M.MyShift = NewUB - ScoreUBs[
T];
3051 M.OtherShift = NewUB -
Other.ScoreUBs[
T];
3053 ScoreUBs[
T] = NewUB;
3055 StrictDom |= mergeScore(M, LastFlat[
T],
Other.LastFlat[
T]);
3058 StrictDom |= mergeScore(M, LastGDS,
Other.LastGDS);
3061 StrictDom |= mergeScore(M, SCCScore,
Other.SCCScore);
3062 if (
Other.hasPendingEvent(SCC_WRITE)) {
3063 if (!OldEvents.contains(SCC_WRITE)) {
3064 PendingSCCWrite =
Other.PendingSCCWrite;
3065 }
else if (PendingSCCWrite !=
Other.PendingSCCWrite) {
3066 PendingSCCWrite =
nullptr;
3071 for (
auto &[RegID, Info] : VMem)
3072 StrictDom |= mergeScore(M,
Info.Scores[
T],
Other.getVMemScore(RegID,
T));
3074 if (isSmemCounter(
T)) {
3075 unsigned Idx = getSgprScoresIdx(
T);
3076 for (
auto &[RegID, Info] : SGPRs) {
3077 auto It =
Other.SGPRs.find(RegID);
3078 unsigned OtherScore =
3079 (It !=
Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
3080 StrictDom |= mergeScore(M,
Info.Scores[Idx], OtherScore);
3085 for (
auto &[TID, Info] : VMem) {
3086 if (
auto It =
Other.VMem.find(TID); It !=
Other.VMem.end()) {
3087 unsigned char NewVmemTypes =
Info.VMEMTypes | It->second.VMEMTypes;
3088 StrictDom |= NewVmemTypes !=
Info.VMEMTypes;
3089 Info.VMEMTypes = NewVmemTypes;
3093 StrictDom |= mergeAsyncMarks(MergeInfos,
Other.AsyncMarks);
3095 StrictDom |= mergeScore(MergeInfos[
T], AsyncScore[
T],
Other.AsyncScore[
T]);
3097 purgeEmptyTrackingData();
3103 return Opcode == AMDGPU::S_WAITCNT ||
3106 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
3107 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
3108 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
3109 Opcode == AMDGPU::WAIT_ASYNCMARK ||
3113void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &
MBB,
3115 bool ExpertMode)
const {
3119 .
addImm(ExpertMode ? 2 : 0)
3137class VCCZWorkaround {
3138 const WaitcntBrackets &ScoreBrackets;
3139 const GCNSubtarget &
ST;
3140 const SIInstrInfo &
TII;
3141 const SIRegisterInfo &
TRI;
3142 bool VCCZCorruptionBug =
false;
3143 bool VCCZNotUpdatedByPartialWrites =
false;
3146 bool MustRecomputeVCCZ =
true;
3149 VCCZWorkaround(
const WaitcntBrackets &ScoreBrackets,
const GCNSubtarget &ST,
3150 const SIInstrInfo &
TII,
const SIRegisterInfo &
TRI)
3152 VCCZCorruptionBug =
ST.hasReadVCCZBug();
3153 VCCZNotUpdatedByPartialWrites = !
ST.partialVCCWritesUpdateVCCZ();
3160 bool tryRecomputeVCCZ(MachineInstr &
MI) {
3162 if (!VCCZCorruptionBug && !VCCZNotUpdatedByPartialWrites)
3172 MustRecomputeVCCZ |= VCCZCorruptionBug &&
TII.isSMRD(
MI);
3178 std::optional<bool> PartiallyWritesToVCCOpt;
3179 auto PartiallyWritesToVCC = [](MachineInstr &
MI) {
3180 return MI.definesRegister(AMDGPU::VCC_LO,
nullptr) ||
3181 MI.definesRegister(AMDGPU::VCC_HI,
nullptr);
3183 if (VCCZNotUpdatedByPartialWrites) {
3184 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3187 MustRecomputeVCCZ |= *PartiallyWritesToVCCOpt;
3193 if (!ScoreBrackets.hasPendingEvent(SMEM_ACCESS) || !VCCZCorruptionBug) {
3195 if (!PartiallyWritesToVCCOpt)
3196 PartiallyWritesToVCCOpt = PartiallyWritesToVCC(
MI);
3197 bool FullyWritesToVCC = !*PartiallyWritesToVCCOpt &&
3198 MI.definesRegister(AMDGPU::VCC,
nullptr);
3201 bool UpdatesVCCZ = FullyWritesToVCC || (!VCCZNotUpdatedByPartialWrites &&
3202 *PartiallyWritesToVCCOpt);
3204 MustRecomputeVCCZ =
false;
3214 TII.get(
ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
3217 MustRecomputeVCCZ =
false;
3227bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
3228 MachineBasicBlock &
Block,
3229 WaitcntBrackets &ScoreBrackets) {
3233 dbgs() <<
"*** Begin Block: ";
3235 ScoreBrackets.dump();
3237 VCCZWorkaround VCCZW(ScoreBrackets, ST,
TII,
TRI);
3240 MachineInstr *OldWaitcntInstr =
nullptr;
3245 Iter !=
E; ++Iter) {
3246 MachineInstr &Inst = *Iter;
3252 (IsExpertMode && Inst.
getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
3253 if (!OldWaitcntInstr)
3254 OldWaitcntInstr = &Inst;
3258 PreheaderFlushFlags FlushFlags;
3259 if (
Block.getFirstTerminator() == Inst)
3260 FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3263 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
3265 OldWaitcntInstr =
nullptr;
3267 if (Inst.
getOpcode() == AMDGPU::ASYNCMARK) {
3273 assert(
ST.getGeneration() < AMDGPUSubtarget::GFX12);
3274 ScoreBrackets.recordAsyncMark(Inst);
3278 if (
TII.isSMRD(Inst)) {
3279 for (
const MachineMemOperand *Memop : Inst.
memoperands()) {
3282 if (!Memop->isInvariant()) {
3283 const Value *Ptr = Memop->getValue();
3289 updateEventWaitcntAfter(Inst, &ScoreBrackets);
3293 Modified |= insertForcedWaitAfter(Inst,
Block, ScoreBrackets);
3297 ScoreBrackets.dump();
3302 Modified |= VCCZW.tryRecomputeVCCZ(Inst);
3307 AMDGPU::Waitcnt
Wait;
3308 if (
Block.getFirstTerminator() ==
Block.end()) {
3309 PreheaderFlushFlags FlushFlags = isPreheaderToFlush(
Block, ScoreBrackets);
3310 if (FlushFlags.FlushVmCnt) {
3311 if (ScoreBrackets.hasPendingEvent(
LOAD_CNT))
3313 if (ScoreBrackets.hasPendingEvent(
SAMPLE_CNT))
3315 if (ScoreBrackets.hasPendingEvent(
BVH_CNT))
3318 if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(
DS_CNT))
3327 dbgs() <<
"*** End Block: ";
3329 ScoreBrackets.dump();
3335bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &
Block) {
3336 if (
Block.size() <= 1)
3344 MachineInstr *LastAtomicWithSoftXcnt =
nullptr;
3350 if (!IsLDS && (
MI.mayLoad() ^
MI.mayStore()))
3351 LastAtomicWithSoftXcnt =
nullptr;
3354 MI.mayLoad() &&
MI.mayStore();
3355 MachineInstr &PrevMI = *
MI.getPrevNode();
3357 if (PrevMI.
getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
3360 if (LastAtomicWithSoftXcnt) {
3364 LastAtomicWithSoftXcnt = &
MI;
3372SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &
MBB,
3373 const WaitcntBrackets &ScoreBrackets) {
3374 auto [Iterator, IsInserted] =
3377 return Iterator->second;
3381 return PreheaderFlushFlags();
3385 return PreheaderFlushFlags();
3388 Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
3389 return Iterator->second;
3392 return PreheaderFlushFlags();
3395bool SIInsertWaitcnts::isVMEMOrFlatVMEM(
const MachineInstr &
MI)
const {
3397 return TII.mayAccessVMEMThroughFlat(
MI);
3401bool SIInsertWaitcnts::isDSRead(
const MachineInstr &
MI)
const {
3407bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(
const MachineInstr &
MI)
const {
3436SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *
ML,
3437 const WaitcntBrackets &Brackets) {
3438 PreheaderFlushFlags
Flags;
3439 bool HasVMemLoad =
false;
3440 bool HasVMemStore =
false;
3441 bool UsesVgprVMEMLoadedOutside =
false;
3442 bool UsesVgprDSReadOutside =
false;
3443 bool VMemInvalidated =
false;
3447 bool TrackSimpleDSOpt =
ST.hasExtendedWaitCounts();
3448 DenseSet<MCRegUnit> VgprUse;
3449 DenseSet<MCRegUnit> VgprDefVMEM;
3450 DenseSet<MCRegUnit> VgprDefDS;
3456 DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
3457 unsigned DSReadPosition = 0;
3458 bool IsSingleBlock =
ML->getNumBlocks() == 1;
3459 bool TrackDSFlushPoint =
ST.hasExtendedWaitCounts() && IsSingleBlock;
3460 unsigned LastDSFlushPosition = 0;
3462 for (MachineBasicBlock *
MBB :
ML->blocks()) {
3463 for (MachineInstr &
MI : *
MBB) {
3464 if (isVMEMOrFlatVMEM(
MI)) {
3465 HasVMemLoad |=
MI.mayLoad();
3466 HasVMemStore |=
MI.mayStore();
3470 if (mayStoreIncrementingDSCNT(
MI)) {
3473 if (VMemInvalidated)
3475 TrackSimpleDSOpt =
false;
3476 TrackDSFlushPoint =
false;
3478 bool IsDSRead = isDSRead(
MI);
3483 auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
3484 if (!TrackDSFlushPoint)
3486 if (
auto It = LastDSReadPositionMap.
find(RU);
3487 It != LastDSReadPositionMap.
end()) {
3491 LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
3495 for (
const MachineOperand &
Op :
MI.all_uses()) {
3496 if (
Op.isDebug() || !
TRI.isVectorRegister(MRI,
Op.getReg()))
3499 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3503 VMemInvalidated =
true;
3507 TrackSimpleDSOpt =
false;
3510 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3514 updateDSReadFlushTracking(RU);
3519 VMEMID
ID = toVMEMID(RU);
3523 UsesVgprVMEMLoadedOutside =
true;
3527 else if (Brackets.hasPendingVMEM(
ID,
DS_CNT))
3528 UsesVgprDSReadOutside =
true;
3533 if (isVMEMOrFlatVMEM(
MI) &&
MI.mayLoad()) {
3534 for (
const MachineOperand &
Op :
MI.all_defs()) {
3535 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3539 VMemInvalidated =
true;
3544 if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
3555 if (IsDSRead || TrackDSFlushPoint) {
3556 for (
const MachineOperand &
Op :
MI.all_defs()) {
3557 if (!
TRI.isVectorRegister(MRI,
Op.getReg()))
3559 for (MCRegUnit RU :
TRI.regunits(
Op.getReg().asMCReg())) {
3562 updateDSReadFlushTracking(RU);
3565 if (TrackDSFlushPoint)
3566 LastDSReadPositionMap[RU] = DSReadPosition;
3575 if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
3576 ((!
ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
3577 (HasVMemLoad &&
ST.hasVmemWriteVgprInOrder())))
3578 Flags.FlushVmCnt =
true;
3584 bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
3587 bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
3588 bool DSFlushPointPrefetch =
3589 TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
3591 if (SimpleDSOpt || DSFlushPointPrefetch)
3592 Flags.FlushDsCnt =
true;
3597bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3598 auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3600 getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3602 if (
auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3603 AA = &AAR->getAAResults();
3605 return SIInsertWaitcnts(MLI, PDT, AA, MF).run();
3617 if (!SIInsertWaitcnts(MLI, PDT,
AA, MF).
run())
3622 .preserve<AAManager>();
3625bool SIInsertWaitcnts::run() {
3633 if (ST.hasExtendedWaitCounts()) {
3634 IsExpertMode = ST.hasExpertSchedulingMode() &&
3642 WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
3651 SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
3655 MachineBasicBlock &EntryBB = MF.
front();
3665 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3668 if (
ST.hasExtendedWaitCounts()) {
3675 if (!
ST.hasImageInsts() &&
3680 TII.get(instrsForExtendedCounterTypes[CT]))
3693 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
this);
3694 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3695 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
3702 for (
auto *
MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3705 std::unique_ptr<WaitcntBrackets> Brackets;
3710 for (
auto BII = BlockInfos.
begin(), BIE = BlockInfos.
end(); BII != BIE;
3712 MachineBasicBlock *
MBB = BII->first;
3713 BlockInfo &BI = BII->second;
3719 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3721 *Brackets = *BI.Incoming;
3724 Brackets = std::make_unique<WaitcntBrackets>(
this);
3729 Brackets->~WaitcntBrackets();
3730 new (Brackets.get()) WaitcntBrackets(
this);
3734 if (
ST.hasWaitXcnt())
3736 Modified |= insertWaitcntInBlock(MF, *
MBB, *Brackets);
3739 if (Brackets->hasPendingEvent()) {
3740 BlockInfo *MoveBracketsToSucc =
nullptr;
3742 auto *SuccBII = BlockInfos.
find(Succ);
3743 BlockInfo &SuccBI = SuccBII->second;
3744 if (!SuccBI.Incoming) {
3745 SuccBI.Dirty =
true;
3746 if (SuccBII <= BII) {
3750 if (!MoveBracketsToSucc) {
3751 MoveBracketsToSucc = &SuccBI;
3753 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3757 dbgs() <<
"Try to merge ";
3763 if (SuccBI.Incoming->merge(*Brackets)) {
3764 SuccBI.Dirty =
true;
3765 if (SuccBII <= BII) {
3772 if (MoveBracketsToSucc)
3773 MoveBracketsToSucc->Incoming = std::move(Brackets);
3778 if (
ST.hasScalarStores()) {
3779 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3780 bool HaveScalarStores =
false;
3782 for (MachineBasicBlock &
MBB : MF) {
3783 for (MachineInstr &
MI :
MBB) {
3784 if (!HaveScalarStores &&
TII.isScalarStore(
MI))
3785 HaveScalarStores =
true;
3787 if (
MI.getOpcode() == AMDGPU::S_ENDPGM ||
3788 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3793 if (HaveScalarStores) {
3802 for (MachineBasicBlock *
MBB : EndPgmBlocks) {
3803 bool SeenDCacheWB =
false;
3807 if (
I->getOpcode() == AMDGPU::S_DCACHE_WB)
3808 SeenDCacheWB =
true;
3809 else if (
TII.isScalarStore(*
I))
3810 SeenDCacheWB =
false;
3813 if ((
I->getOpcode() == AMDGPU::S_ENDPGM ||
3814 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
3830 while (
I != EntryBB.
end() &&
I->isMetaInstruction())
3832 setSchedulingMode(EntryBB,
I,
true);
3834 for (MachineInstr *
MI : CallInsts) {
3835 MachineBasicBlock &
MBB = *
MI->getParent();
3836 setSchedulingMode(
MBB,
MI,
false);
3837 setSchedulingMode(
MBB, std::next(
MI->getIterator()),
true);
3840 for (MachineInstr *
MI : ReturnInsts)
3841 setSchedulingMode(*
MI->getParent(),
MI,
false);
3852 for (
auto [
MI,
_] : EndPgmInsts) {
3854 TII.get(AMDGPU::S_ALLOC_VGPR))
3858 }
else if (!WCG->isOptNone() &&
3859 ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
3860 (MF.getFrameInfo().hasCalls() ||
3861 ST.getOccupancyWithNumVGPRs(
3862 TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
3865 for (
auto [
MI, Flag] : EndPgmInsts) {
3867 if (
ST.requiresNopBeforeDeallocVGPRs()) {
3869 TII.get(AMDGPU::S_NOP))
3873 TII.get(AMDGPU::S_SENDMSG))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
#define AMDGPU_EVENT_NAME(Name)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is waiting on.
static cl::opt< bool > ExpertSchedulingModeFlag("amdgpu-expert-scheduling-mode", cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"), cl::init(false), cl::Hidden)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Provides some synthesis utilities to produce sequences of values.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
A manager for alias analyses.
bool isEntryFunction() const
Represents the counter values to wait for in an s_waitcnt instruction.
unsigned get(InstCounterType T) const
void set(InstCounterType T, unsigned Val)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
Represents analyses that only rely on functions' control flow.
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
LLVM_ABI void printName(raw_ostream &os, unsigned printNameFlags=PrintNameIr, ModuleSlotTracker *moduleSlotTracker=nullptr) const
Print the basic block's name as:
MachineInstrBundleIterator< MachineInstr > iterator
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI MachineInstrBundleIterator< MachineInstr > eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
virtual void print(raw_ostream &OS, const Module *M) const
print - Print out the internal state of the pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool usesASYNC_CNT(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isDynamicVGPREnabled() const
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned decodeFieldVaVdst(unsigned Encoded)
int getDefaultDepCtrEncoding(const MCSubtargetInfo &STI)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
@ ID_DEALLOC_VGPRS_GFX11Plus
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
bool isDPMACCInstruction(unsigned Opc)
iota_range< InstCounterType > inst_counter_types(InstCounterType MaxCounter)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
LLVM_ABI std::error_code remove(const Twine &path, bool IgnoreNonExisting=true)
Remove path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
auto seq_inclusive(T Begin, T End)
Iterate over an integral type from Begin to End inclusive.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
APInt operator&(APInt a, const APInt &b)
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
bool operator!=(uint64_t V1, const APInt &V2)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
@ Async
"Asynchronous" unwind tables (instr precise)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
FunctionAddr VTableAddr Count
CodeGenOptLevel
Code generation optimization level.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool operator|=(SparseBitVector< ElementSize > &LHS, const SparseBitVector< ElementSize > *RHS)
APInt operator|(APInt a, const APInt &b)
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
static constexpr ValueType Default
static constexpr uint64_t encode(Fields... Values)
Represents the hardware counter limits for different wait count types.
Instruction set architecture version.
static constexpr bool is_iterable