#define DEBUG_TYPE "si-insert-waitcnts"

              "Force emit s_waitcnt expcnt(0) instrs");
              "Force emit s_waitcnt lgkmcnt(0) instrs");
              "Force emit s_waitcnt vmcnt(0) instrs");

    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),

    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),

    "amdgpu-expert-scheduling-mode",
    cl::desc("Enable expert scheduling mode 2 for all functions (GFX12+ only)"),
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
  NUM_EXTENDED_INST_CNTS,
  VA_VDST = NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXPERT_INST_CNTS

auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
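
// Tracking IDs give every tracked entity a slot in one flat index space:
// register units occupy [REGUNITS_BEGIN, REGUNITS_END) and LDS-DMA stores
// occupy [LDSDMA_BEGIN, LDSDMA_END) immediately after them.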
  TRACKINGID_RANGE_LEN = (1 << 16),
  REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
  NUM_LDSDMA = TRACKINGID_RANGE_LEN,
  LDSDMA_BEGIN = REGUNITS_END,
  LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,

static constexpr VMEMID toVMEMID(MCRegUnit RU) {
  return static_cast<unsigned>(RU);
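
// X-macro listing the wait events tracked by the score brackets; it is
// expanded below into the WaitEventType enum and the matching name table
// used for debug printing.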
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
  DECL(VMEM_SAMPLER_READ_ACCESS) \
  DECL(VMEM_BVH_READ_ACCESS) \
  DECL(GLOBAL_INV_ACCESS) \
  DECL(VMEM_WRITE_ACCESS) \
  DECL(SCRATCH_WRITE_ACCESS) \
  DECL(EXP_POS_ACCESS) \
  DECL(EXP_PARAM_ACCESS) \
  DECL(EXP_LDS_ACCESS) \
  DECL(VGPR_CSMACC_WRITE) \
  DECL(VGPR_DPMACC_WRITE) \
  DECL(VGPR_TRANS_WRITE) \
  DECL(VGPR_XDL_WRITE) \
  DECL(VGPR_LDS_READ) \
  DECL(VGPR_FLAT_READ) \

#define AMDGPU_EVENT_ENUM(Name) Name,
#undef AMDGPU_EVENT_ENUM

#define AMDGPU_EVENT_NAME(Name) #Name,
#undef AMDGPU_EVENT_NAME
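
// Wait instruction opcode for each extended counter, indexed by
// InstCounterType (LOADCNT, DSCNT, EXPCNT, STORECNT, SAMPLECNT, BVHCNT,
// KMCNT, XCNT).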
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};

static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;

  assert(updateVMCntOnly(Inst));
    return VMEM_NOSAMPLER;

  return VMEM_NOSAMPLER;

    return Wait.StoreCnt;
    return Wait.SampleCnt;

  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count);

  getCounterRef(Wait, T) = ~0u;

  return getCounterRef(Wait, T);
InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
  for (auto T : inst_counter_types()) {
    if (masks[T] & (1 << E))

class WaitcntBrackets;
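
// Emits and rewrites the actual wait instructions once the required counts
// are known; subclassed below for the pre-GFX12 combined s_waitcnt encoding
// and for the GFX12+ split counters.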
class WaitcntGenerator {
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool ExpandWaitcntProfiling = false;
  const AMDGPU::HardwareLimits *Limits = nullptr;

  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
                   const AMDGPU::HardwareLimits *Limits)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        ExpandWaitcntProfiling(
            MF.getFunction().hasFnAttribute("amdgpu-expand-waitcnt-profiling")),

  bool isOptNone() const { return OptNone; }

  const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }

  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                AMDGPU::Waitcnt Wait,
                                WaitcntBrackets *ScoreBrackets = nullptr) = 0;

  virtual const unsigned *getWaitEventMask() const = 0;

  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    for (auto &E : Events)
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
  WaitcntGeneratorPreGFX12() = default;
  WaitcntGeneratorPreGFX12(const MachineFunction &MF,
                           const AMDGPU::HardwareLimits *Limits)
      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS, Limits) {}

  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        AMDGPU::Waitcnt Wait,
                        WaitcntBrackets *ScoreBrackets = nullptr) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),

    return WaitEventMaskForInstPreGFX12;

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
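
// On GFX12+ each counter has its own wait opcode, so every event class gets a
// dedicated slot in the event mask table below (including XCNT and, when the
// expert scheduling mode is enabled, the VA_VDST/VM_VSRC VGPR hazard events).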
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
  WaitcntGeneratorGFX12Plus() = default;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter,
                            const AMDGPU::HardwareLimits *Limits,
      : WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}

  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        AMDGPU::Waitcnt Wait,
                        WaitcntBrackets *ScoreBrackets = nullptr) override;

  const unsigned *getWaitEventMask() const override {
    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
        eventMask({VMEM_GROUP, SMEM_GROUP}),
        eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
        eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};

    return WaitEventMaskForInstGFX12Plus;

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;

struct PreheaderFlushFlags {
  bool FlushVmCnt = false;
  bool FlushDsCnt = false;
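
// The pass itself: it walks the function in reverse post-order, carries a
// WaitcntBrackets state per block, and asks the active WaitcntGenerator to
// materialize whatever waits each instruction needs.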
class SIInsertWaitcnts {
  const GCNSubtarget *ST;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  bool IsExpertMode = false;
  const unsigned *WaitEventMaskForInst;

  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;

  std::unique_ptr<WaitcntBrackets> Incoming;

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[NUM_INST_CNTS];

  WaitcntGeneratorPreGFX12 WCGPreGFX12;
  WaitcntGeneratorGFX12Plus WCGGFX12Plus;

  WaitcntGenerator *WCG = nullptr;

  DenseSet<MachineInstr *> CallInsts;
  DenseSet<MachineInstr *> ReturnInsts;
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

  AMDGPU::HardwareLimits Limits;

  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
      : MLI(MLI), PDT(PDT), AA(AA) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;

  const AMDGPU::HardwareLimits &getLimits() const { return Limits; }

  PreheaderFlushFlags getPreheaderFlushFlags(MachineLoop *ML,
                                             const WaitcntBrackets &Brackets);
  PreheaderFlushFlags isPreheaderToFlush(MachineBasicBlock &MBB,
                                         const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool isDSRead(const MachineInstr &MI) const;
  bool mayStoreIncrementingDSCNT(const MachineInstr &MI) const;
  bool run(MachineFunction &MF);

  void setForceEmitWaitcnt() {
      ForceEmitWaitcnt[EXP_CNT] = true;
      ForceEmitWaitcnt[EXP_CNT] = false;

      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;

      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;

    ForceEmitWaitcnt[VA_VDST] = false;
    ForceEmitWaitcnt[VM_VSRC] = false;
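
  // Map a VMEM instruction to the wait event it raises: global invalidates,
  // writes (scratch vs. other VMEM), or one of the read flavours from
  // VmemReadMapping.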
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    case AMDGPU::GLOBAL_INV:
      return GLOBAL_INV_ACCESS;
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS;

    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

      if (TII->mayAccessScratch(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;

    return VmemReadMapping[getVmemType(Inst)];

  std::optional<WaitEventType>
  getExpertSchedulingEventType(const MachineInstr &Inst) const;

  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 PreheaderFlushFlags FlushFlags);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
                     MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
                          bool ExpertMode) const;
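
// Per-block scoreboard. Each counter keeps a [ScoreLB, ScoreUB] window of
// outstanding events, and every tracked register unit, LDS-DMA slot, and SCC
// records the score of the last event that touched it; the gap between a
// score and the current upper bound is the wait count needed to make that
// access safe.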
class WaitcntBrackets {
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
    assert(Context->TRI->getNumRegUnits() < REGUNITS_END);

    unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
    for (auto &[ID, Val] : VMem) {
    for (auto &[ID, Val] : SGPRs) {
    if (NumUnusedVmem || NumUnusedSGPRs) {
      errs() << "WaitcntBracket had unused entries at destruction time: "
             << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
             << " SGPR unused entries\n";

  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;

  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;

  unsigned getScoreLB(InstCounterType T) const {
  unsigned getScoreUB(InstCounterType T) const {
  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);

  unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
    auto It = SGPRs.find(RU);
    return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;

  unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
    auto It = VMem.find(TID);
    return It != VMem.end() ? It->second.Scores[T] : 0;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  void simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                       AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
  void simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                    AMDGPU::Waitcnt &UpdateWait) const;
  void simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                      AMDGPU::Waitcnt &UpdateWait) const;

  void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                               AMDGPU::Waitcnt &Wait) const;
  void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                              AMDGPU::Waitcnt &Wait) const;
  void tryClearSCCWriteEvent(MachineInstr *Inst);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void updateByEvent(WaitEventType E, MachineInstr &MI);

  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);

  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    return Events & (Events - 1);

  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));

  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];

  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];

  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    getWaitCountMax(Context->getLimits(), DS_CNT) - 1);

  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
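
  // Whether any register unit of Reg still has an outstanding VMEM access of
  // a type other than V; mixed access types on the same register force a
  // conservative wait before reuse.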
  bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
    for (MCRegUnit RU : regunits(Reg)) {
      auto It = VMem.find(toVMEMID(RU));
      if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))

    for (MCRegUnit RU : regunits(Reg)) {
      if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
        It->second.VMEMTypes = 0;
        if (It->second.empty())

  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) +
                              getWaitCountMax(Context->getLimits(), STORE_CNT));
    PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {

  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,

  void print(raw_ostream &) const;

  void purgeEmptyTrackingData();

  void determineWaitForScore(InstCounterType T, unsigned Score,
                             AMDGPU::Waitcnt &Wait) const;

  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

    assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
    if (!Context->TRI->isInAllocatableClass(Reg))

    const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
    unsigned Size = Context->TRI->getRegSizeInBits(*RC);
    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
      Reg = Context->TRI->get32BitRegister(Reg);
    return Context->TRI->regunits(Reg);

  void setScoreLB(InstCounterType T, unsigned Val) {

  void setScoreUB(InstCounterType T, unsigned Val) {
    if (getScoreRange(EXP_CNT) > getWaitCountMax(Context->getLimits(), EXP_CNT))
          ScoreUBs[EXP_CNT] - getWaitCountMax(Context->getLimits(), EXP_CNT);

  void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
    const SIRegisterInfo *TRI = Context->TRI;
    if (Reg == AMDGPU::SCC) {
    } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
      for (MCRegUnit RU : regunits(Reg))
        VMem[toVMEMID(RU)].Scores[T] = Val;
    } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
      auto STy = getSgprScoresIdx(T);
      for (MCRegUnit RU : regunits(Reg))
        SGPRs[RU].Scores[STy] = Val;

  void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
    VMem[TID].Scores[T] = Val;

  void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,

  const SIInsertWaitcnts *Context;

  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  unsigned LastGDS = 0;

    std::array<unsigned, NUM_INST_CNTS> Scores = {0};
    unsigned VMEMTypes = 0;

    std::array<unsigned, 2> Scores = {0};
    bool empty() const { return !Scores[0] && !Scores[1]; }
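
  // Scores are kept sparsely: an entry exists only while something is
  // outstanding for that tracking ID, and purgeEmptyTrackingData() drops
  // entries that no longer carry information.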
  DenseMap<VMEMID, VMEMInfo> VMem;
  DenseMap<MCRegUnit, SGPRInfo> SGPRs;

  unsigned SCCScore = 0;
  const MachineInstr *PendingSCCWrite = nullptr;

  SmallVector<const MachineInstr *> LDSDMAStores;

  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();

                                        InstCounterType CntTy, unsigned Score) {
  setRegScore(Op.getReg().asMCReg(), CntTy, Score);
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =

bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
  if (!hasPointSampleAccel(MI))

  return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;

  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  const MachineRegisterInfo *MRI = Context->MRI;

      if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);

      if (const auto *Data0 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
        setScoreByOperand(*Data0, EXP_CNT, CurrScore);
      if (const auto *Data1 =
              TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
        setScoreByOperand(*Data1, EXP_CNT, CurrScore);

               Inst.getOpcode() != AMDGPU::DS_APPEND &&
               Inst.getOpcode() != AMDGPU::DS_CONSUME &&
               Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(*MRI, Op.getReg()))
          setScoreByOperand(Op, EXP_CNT, CurrScore);
    } else if (TII->isFLAT(Inst)) {
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isMIMG(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isMTBUF(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
        setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
        setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
    } else if (TII->isLDSDIR(Inst)) {
      setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
      if (TII->isEXP(Inst)) {
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(*MRI, DefMO.getReg())) {
            setScoreByOperand(DefMO, EXP_CNT, CurrScore);
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(*MRI, Op.getReg()))
            setScoreByOperand(Op, EXP_CNT, CurrScore);
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents & (1 << OtherEvent)) {
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents &= ~(1 << OtherEvent);
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(Op, T, CurrScore);
  } else if (T == VA_VDST || T == VM_VSRC) {
    for (const MachineOperand &Op : Inst.operands()) {
      if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
          (T == VM_VSRC && Op.isDef()))
      setScoreByOperand(Op, T, CurrScore);

    for (const MachineOperand &Op : Inst.defs()) {
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (!TRI->isVectorRegister(*MRI, Op.getReg()))
        if (updateVMCntOnly(Inst)) {
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
            VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
      setScoreByOperand(Op, T, CurrScore);

        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
      if (!MemOp->isStore() ||
      auto AAI = MemOp->getAAInfo();
      if (!AAI || !AAI.Scope)
      for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
        for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
          if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
      setVMemScore(LDSDMA_BEGIN, T, CurrScore);
      if (Slot && Slot < NUM_LDSDMA)
        setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);

    setRegScore(AMDGPU::SCC, T, CurrScore);
    PendingSCCWrite = &Inst;
void WaitcntBrackets::print(raw_ostream &OS) const {
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);

      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
      OS << " EXP_CNT(" << SR << "):";
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
      OS << " SAMPLE_CNT(" << SR << "):";
      OS << " BVH_CNT(" << SR << "):";
      OS << " KM_CNT(" << SR << "):";
      OS << " X_CNT(" << SR << "):";
      OS << " VA_VDST(" << SR << "): ";
      OS << " VM_VSRC(" << SR << "): ";
      OS << " UNKNOWN(" << SR << "):";

    unsigned LB = getScoreLB(T);

    sort(SortedVMEMIDs);
    for (auto ID : SortedVMEMIDs) {
      unsigned RegScore = VMem.at(ID).Scores[T];
      unsigned RelScore = RegScore - LB - 1;
      if (ID < REGUNITS_END) {
        OS << ' ' << RelScore << ":vRU" << ID;
        assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
               "Unhandled/unexpected ID value!");
        OS << ' ' << RelScore << ":LDSDMA" << ID;

    if (isSmemCounter(T)) {
      sort(SortedSMEMIDs);
      for (auto ID : SortedSMEMIDs) {
        unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
        unsigned RelScore = RegScore - LB - 1;
        OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);

      if (T == KM_CNT && SCCScore > 0)
        OS << ' ' << SCCScore << ":scc";

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
void WaitcntBrackets::simplifyWaitcnt(const AMDGPU::Waitcnt &CheckWait,
                                      AMDGPU::Waitcnt &UpdateWait) const {
  simplifyWaitcnt(LOAD_CNT, UpdateWait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, UpdateWait.ExpCnt);
  simplifyWaitcnt(DS_CNT, UpdateWait.DsCnt);
  simplifyWaitcnt(STORE_CNT, UpdateWait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, UpdateWait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, UpdateWait.BvhCnt);
  simplifyWaitcnt(KM_CNT, UpdateWait.KmCnt);
  simplifyXcnt(CheckWait, UpdateWait);
  simplifyWaitcnt(VA_VDST, UpdateWait.VaVdst);
  simplifyVmVsrc(CheckWait, UpdateWait);

void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  if (Count >= getScoreRange(T))

void WaitcntBrackets::simplifyVmVsrc(const AMDGPU::Waitcnt &CheckWait,
                                     AMDGPU::Waitcnt &UpdateWait) const {
      std::min({CheckWait.LoadCnt, CheckWait.StoreCnt, CheckWait.SampleCnt,
                CheckWait.BvhCnt, CheckWait.DsCnt}))
    simplifyWaitcnt(VM_VSRC, UpdateWait.VmVsrc);
void WaitcntBrackets::purgeEmptyTrackingData() {

void WaitcntBrackets::determineWaitForScore(InstCounterType T,
                                            unsigned ScoreToWait,
                                            AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !Context->ST->hasFlatLgkmVMemCountInOrder()) {
      addWait(Wait, T, 0);
    } else if (counterOutOfOrder(T)) {
      addWait(Wait, T, 0);
      unsigned NeededWait = std::min(
          UB - ScoreToWait, getWaitCountMax(Context->getLimits(), T) - 1);
      addWait(Wait, T, NeededWait);

void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
                                              AMDGPU::Waitcnt &Wait) const {
  if (Reg == AMDGPU::SCC) {
    determineWaitForScore(T, SCCScore, Wait);

  for (MCRegUnit RU : regunits(Reg))
    determineWaitForScore(
        T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),

void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
                                             AMDGPU::Waitcnt &Wait) const {
  assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
  determineWaitForScore(T, getVMemScore(TID, T), Wait);

void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
    if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    PendingEvents &= ~SCC_WRITE_PendingEvent;
    PendingSCCWrite = nullptr;
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  applyWaitcnt(X_CNT, Wait.XCnt);
  applyWaitcnt(VA_VDST, Wait.VaVdst);
  applyWaitcnt(VM_VSRC, Wait.VmVsrc);

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
    if (counterOutOfOrder(T))
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
    PendingEvents &= ~Context->WaitEventMaskForInst[T];

  if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, 0);
    PendingEvents &= ~(1 << SMEM_GROUP);

  if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT)) {
    if (!hasMixedPendingEvents(X_CNT))
      applyWaitcnt(X_CNT, Count);
    else if (Count == 0)
      PendingEvents &= ~(1 << VMEM_GROUP);

void WaitcntBrackets::simplifyXcnt(const AMDGPU::Waitcnt &CheckWait,
                                   AMDGPU::Waitcnt &UpdateWait) const {
  if (CheckWait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
    UpdateWait.XCnt = ~0u;

  if (CheckWait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT) && CheckWait.XCnt >= CheckWait.LoadCnt)
    UpdateWait.XCnt = ~0u;
  simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))

  if (T == LOAD_CNT) {
    unsigned Events = hasPendingEvent(T);
    Events &= ~(1 << GLOBAL_INV_ACCESS);
    return Events & (Events - 1);

  return hasMixedPendingEvents(T);

char SIInsertWaitcntsLegacy::ID = 0;

  return new SIInsertWaitcntsLegacy();
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  if (NewEnc == MO.getImm())

  case AMDGPU::S_WAIT_LOADCNT:
  case AMDGPU::S_WAIT_EXPCNT:
  case AMDGPU::S_WAIT_STORECNT:
  case AMDGPU::S_WAIT_SAMPLECNT:
  case AMDGPU::S_WAIT_BVHCNT:
  case AMDGPU::S_WAIT_DSCNT:
  case AMDGPU::S_WAIT_KMCNT:
  case AMDGPU::S_WAIT_XCNT:

bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(isNormalMode(MaxCounter));

  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
      dbgs() << "end of block\n";

    if (II.isMetaInstruction()) {

    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      ScoreBrackets.simplifyWaitcnt(OldWait);

      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
                 << "Before: " << Wait << '\n';);
      ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
      II.eraseFromParent();
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        WaitcntVsCntInstr = &II;

    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);

    LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                   << "New Instr at block end: "
                                   << *WaitcntInstr << '\n'
                          : dbgs() << "applied pre-existing waitcnt\n"
                                   << "Old Instr: " << *It
                                   << "New Instr: " << *WaitcntInstr << '\n');

  if (WaitcntVsCntInstr) {
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

               ? dbgs() << "applied pre-existing waitcnt\n"
                        << "New Instr at block end: " << *WaitcntVsCntInstr
               : dbgs() << "applied pre-existing waitcnt\n"
                        << "Old Instr: " << *It
                        << "New Instr: " << *WaitcntVsCntInstr << '\n');
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
  assert(isNormalMode(MaxCounter));

  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
    if (Outstanding > Target) {
      for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
    EmitWaitcnt(Target);

  if (Wait.hasWaitExceptStoreCnt()) {
    if (ExpandWaitcntProfiling && ScoreBrackets) {
      bool AnyOutOfOrder = false;
      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
        unsigned &WaitCnt = getCounterRef(Wait, CT);
        if (WaitCnt != ~0u && ScoreBrackets->counterOutOfOrder(CT)) {
          AnyOutOfOrder = true;

      if (AnyOutOfOrder) {
      for (auto CT : {LOAD_CNT, DS_CNT, EXP_CNT}) {
        unsigned &WaitCnt = getCounterRef(Wait, CT);
        unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
                                            ScoreBrackets->getScoreLB(CT),
                                        getWaitCountMax(getLimits(), CT) - 1);
        EmitExpandedWaitcnt(Outstanding, WaitCnt, [&](unsigned Count) {
          getCounterRef(W, CT) = Count;

    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');

  if (Wait.hasWaitStoreCnt()) {
    if (ExpandWaitcntProfiling && ScoreBrackets && Wait.StoreCnt != ~0u &&
        !ScoreBrackets->counterOutOfOrder(STORE_CNT)) {
      unsigned Outstanding =
          std::min(ScoreBrackets->getScoreUB(STORE_CNT) -
                       ScoreBrackets->getScoreLB(STORE_CNT),
                   getWaitCountMax(getLimits(), STORE_CNT) - 1);
      EmitExpandedWaitcnt(Outstanding, Wait.StoreCnt, [&](unsigned Count) {
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)

    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);

WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
                         ~0u, ExpertVal, ExpertVal);
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
  assert(!isNormalMode(MaxCounter));

  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitcntDepctrInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
      dbgs() << "end of block\n";

  AMDGPU::Waitcnt RequiredWait;

    if (II.isMetaInstruction()) {

    MachineInstr **UpdatableInstr;

    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    if (Opcode == AMDGPU::S_WAITCNT)

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      RequiredWait = RequiredWait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait;
      ScoreBrackets.simplifyWaitcnt(OldWait);
      UpdatableInstr = &WaitcntDepctrInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
        II.eraseFromParent();
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      addWait(Wait, CT.value(), OldCnt);
        addWait(RequiredWait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];

    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Enc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      II.eraseFromParent();
      II.eraseFromParent();

  ScoreBrackets.simplifyWaitcnt(Wait.combined(RequiredWait), Wait);
  Wait = Wait.combined(RequiredWait);

  if (CombinedLoadDsCntInstr) {
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
                                         AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedLoadDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedLoadDsCntInstr << '\n');

  if (CombinedStoreDsCntInstr) {
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
                                         AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;

      LLVM_DEBUG(It.isEnd() ? dbgs() << "applied pre-existing waitcnt\n"
                                     << "New Instr at block end: "
                                     << *CombinedStoreDsCntInstr << '\n'
                            : dbgs() << "applied pre-existing waitcnt\n"
                                     << "Old Instr: " << *It << "New Instr: "
                                     << *CombinedStoreDsCntInstr << '\n');

  if (Wait.DsCnt != ~0u) {
    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);

    for (MachineInstr **WI : WaitsToErase) {
      (*WI)->eraseFromParent();

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
                                         AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');

  if (WaitcntDepctrInstr) {
        TII->getNamedOperand(*WaitcntDepctrInstr, AMDGPU::OpName::simm16)
      ScoreBrackets.applyWaitcnt(VA_VDST, Wait.VaVdst);
      ScoreBrackets.applyWaitcnt(VM_VSRC, Wait.VmVsrc);
                                         AMDGPU::OpName::simm16, Enc);
                            << "New Instr at block end: "
                            << *WaitcntDepctrInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It << "New Instr: "
                            << *WaitcntDepctrInstr << '\n');
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    AMDGPU::Waitcnt Wait, WaitcntBrackets *ScoreBrackets) {
  assert(!isNormalMode(MaxCounter));

  auto EmitExpandedWaitcnt = [&](unsigned Outstanding, unsigned Target,
    if (Outstanding > Target) {
      for (unsigned i = Outstanding - 1; i >= Target && i != ~0u; --i) {
    EmitWaitcnt(Target);

  if (ExpandWaitcntProfiling && ScoreBrackets) {
    for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
      if (ScoreBrackets->counterOutOfOrder(CT)) {
      unsigned Outstanding = std::min(ScoreBrackets->getScoreUB(CT) -
                                          ScoreBrackets->getScoreLB(CT),
                                      getWaitCountMax(getLimits(), CT) - 1);
      EmitExpandedWaitcnt(Outstanding, Count, [&](unsigned Val) {

  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;
    if (Wait.LoadCnt != ~0u) {
    } else if (Wait.StoreCnt != ~0u) {
      Wait.StoreCnt = ~0u;

               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');

  if (Wait.hasWaitDepctr()) {
    [[maybe_unused]] auto SWaitInst =
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
bool SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
    MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) {
  setForceEmitWaitcnt();

  AMDGPU::Waitcnt Wait;
  const unsigned Opc = MI.getOpcode();

  if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
      Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
      Opc == AMDGPU::BUFFER_GL1_INV) {

  if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
      Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
      Opc == AMDGPU::S_SETPC_B64_return) {
    AMDGPU::Waitcnt AllZeroWait =
        WCG->getAllZeroWaitcnt(false);

    if (ST->hasExtendedWaitCounts() &&
        !ScoreBrackets.hasPendingEvent(VMEM_ACCESS))

  else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
    if (!WCG->isOptNone() &&
        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))

  else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&

    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {

    if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

      Wait = AMDGPU::Waitcnt();

      const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
      if (CallAddrOp.isReg()) {
        ScoreBrackets.determineWaitForPhysReg(

        if (const auto *RtnAddrOp =
                TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
          ScoreBrackets.determineWaitForPhysReg(
              SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
    } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
      ScoreBrackets.tryClearSCCWriteEvent(&MI);

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            SLoadAddresses.erase(It);

        unsigned AS = Memop->getAddrSpace();

        if (TII->mayWriteLDSThroughDMA(MI))

        unsigned TID = LDSDMA_BEGIN;
        if (Ptr && Memop->getAAInfo()) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
              if ((I + 1) >= NUM_LDSDMA) {
                ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
              ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
          ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);

      for (const MachineOperand &Op : MI.operands()) {
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());

          if (Op.isImplicit() && MI.mayLoadOrStore())

            ScoreBrackets.determineWaitForPhysReg(VA_VDST, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(VM_VSRC, Reg, Wait);

          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
            ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
            ScoreBrackets.clearVgprVmemTypes(Reg);

          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
          ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
        } else if (Op.getReg() == AMDGPU::SCC) {
          ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
          ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);

        if (ST->hasWaitXCnt() && Op.isDef())
          ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);

  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
      !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(true));

      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {

  ScoreBrackets.simplifyWaitcnt(Wait);

  if (TII->isVALU(MI))

  if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
    ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);

    Wait = WCG->getAllZeroWaitcnt(false);

  if (ForceEmitWaitcnt[LOAD_CNT])
  if (ForceEmitWaitcnt[EXP_CNT])
  if (ForceEmitWaitcnt[DS_CNT])
  if (ForceEmitWaitcnt[SAMPLE_CNT])
  if (ForceEmitWaitcnt[BVH_CNT])
  if (ForceEmitWaitcnt[KM_CNT])
  if (ForceEmitWaitcnt[X_CNT])
  if (ForceEmitWaitcnt[VA_VDST])
  if (ForceEmitWaitcnt[VM_VSRC])

  if (FlushFlags.FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))

  if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  if (OldWaitcntInstr)
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  AMDGPU::Waitcnt WaitForScore = Wait;

  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
               << "Update Instr: " << *It);

  if (WCG->createNewWaitcnt(Block, It, Wait, &ScoreBrackets))

  ScoreBrackets.applyWaitcnt(WaitForScore);
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
  if (TII->isVALU(Inst)) {
    if (TII->isXDL(Inst))
      return VGPR_XDL_WRITE;

    if (TII->isTRANS(Inst))
      return VGPR_TRANS_WRITE;

      return VGPR_DPMACC_WRITE;

    return VGPR_CSMACC_WRITE;

  if (TII->isFLAT(Inst))
    return VGPR_FLAT_READ;

  if (TII->isDS(Inst))
    return VGPR_LDS_READ;

  if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
    return VGPR_VMEM_READ;
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||

                                  MachineBasicBlock *Block) const {
  auto BlockEnd = Block->getParent()->end();
  auto BlockIter = Block->getIterator();

  if (++BlockIter != BlockEnd) {
    It = BlockIter->instr_begin();

    if (!It->isMetaInstruction())

  return It->getOpcode() == AMDGPU::S_ENDPGM;

bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;

    NeedsEndPGMCheck = true;

  ScoreBrackets.simplifyWaitcnt(Wait);

  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,

  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  bool IsVMEMAccess = false;
  bool IsSMEMAccess = false;

    if (const auto ET = getExpertSchedulingEventType(Inst))
      ScoreBrackets->updateByEvent(*ET, Inst);

  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
      ScoreBrackets->setPendingGDS();
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
  } else if (TII->isFLAT(Inst)) {
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

    int FlatASCount = 0;

    if (TII->mayAccessVMEMThroughFlat(Inst)) {
      IsVMEMAccess = true;
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

    if (TII->mayAccessLDSThroughFlat(Inst)) {
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);

      ScoreBrackets->setPendingFlat();

    IsVMEMAccess = true;
    ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
      ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
  } else if (TII->isSMRD(Inst)) {
    IsSMEMAccess = true;
    ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
      ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
      ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
      ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);

  if (!ST->hasWaitXCnt())

    ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);

    ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
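
// Merge the scoreboard of another predecessor into this one. Scores from the
// two sides are rebased onto a common upper bound via mergeScore(); the
// return value reports whether the merge strictly grew the pending state, in
// which case the successor block must be reprocessed.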
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  for (auto K : Other.VMem.keys())
    VMem.try_emplace(K);
  for (auto K : Other.SGPRs.keys())
    SGPRs.try_emplace(K);

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
    PendingEvents |= OtherEvents;

    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])

    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
        if (!OldEventsHasSCCWrite) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;

    for (auto &[RegID, Info] : VMem)
      StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (auto &[RegID, Info] : SGPRs) {
        auto It = Other.SGPRs.find(RegID);
        unsigned OtherScore =
            (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
        StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);

  for (auto &[TID, Info] : VMem) {
    if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
      unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
      StrictDom |= NewVmemTypes != Info.VMEMTypes;
      Info.VMEMTypes = NewVmemTypes;

  purgeEmptyTrackingData();
  return Opcode == AMDGPU::S_WAITCNT ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         Opcode == AMDGPU::S_WAITCNT_lds_direct ||

void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
                                         bool ExpertMode) const {
      .addImm(ExpertMode ? 2 : 0)
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
    dbgs() << "*** Begin Block: ";
    ScoreBrackets.dump();

  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    VCCZCorrect = false;

  MachineInstr *OldWaitcntInstr = nullptr;

    MachineInstr &Inst = *Iter;

        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;

    PreheaderFlushFlags FlushFlags;
    if (Block.getFirstTerminator() == Inst)
      FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);

    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
    OldWaitcntInstr = nullptr;

    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          VCCZCorrect = false;

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
      if (ST->hasReadVCCZBug()) {
        VCCZCorrect = false;

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    ScoreBrackets.dump();

              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),

  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end()) {
    PreheaderFlushFlags FlushFlags = isPreheaderToFlush(Block, ScoreBrackets);
    if (FlushFlags.FlushVmCnt) {
      if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      if (ScoreBrackets.hasPendingEvent(BVH_CNT))
    if (FlushFlags.FlushDsCnt && ScoreBrackets.hasPendingEvent(DS_CNT))

    dbgs() << "*** End Block: ";
    ScoreBrackets.dump();
SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
                                     const WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] =
    return Iterator->second;

    return PreheaderFlushFlags();

    return PreheaderFlushFlags();

  Iterator->second = getPreheaderFlushFlags(Loop, ScoreBrackets);
  return Iterator->second;

  return PreheaderFlushFlags();

bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
    return TII->mayAccessVMEMThroughFlat(MI);

bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {

bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool SeenDSStoreInLoop = false;
  bool UsesVgprLoadedOutsideVMEM = false;
  bool UsesVgprLoadedOutsideDS = false;
  bool VMemInvalidated = false;
  bool DSInvalidated = !ST->hasExtendedWaitCounts();
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDefVMEM;
  DenseSet<MCRegUnit> VgprDefDS;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    bool SeenDSStoreInCurrMBB = false;
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      if (mayStoreIncrementingDSCNT(MI))
        SeenDSStoreInCurrMBB = true;
      if (MI.getOpcode() == AMDGPU::S_BARRIER)
        SeenDSStoreInCurrMBB = false;
      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
        for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
            VMemInvalidated = true;
            DSInvalidated = true;
            if (VMemInvalidated && DSInvalidated)

          VMEMID ID = toVMEMID(RU);
          bool HasPendingVMEM =
              Brackets.getVMemScore(ID, LOAD_CNT) >
                  Brackets.getScoreLB(LOAD_CNT) ||
              Brackets.getVMemScore(ID, SAMPLE_CNT) >
                  Brackets.getScoreLB(SAMPLE_CNT) ||
              Brackets.getVMemScore(ID, BVH_CNT) > Brackets.getScoreLB(BVH_CNT);
            UsesVgprLoadedOutsideVMEM = true;

          if (!HasPendingVMEM &&
              Brackets.getVMemScore(ID, DS_CNT) > Brackets.getScoreLB(DS_CNT))
            UsesVgprLoadedOutsideDS = true;

      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
              VMemInvalidated = true;
          if (VMemInvalidated && DSInvalidated)

        for (const MachineOperand &Op : MI.all_defs()) {
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {

    SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;

  if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
      ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

  if (!DSInvalidated && !SeenDSStoreInLoop && UsesVgprLoadedOutsideDS)
    Flags.FlushDsCnt = true;
3225bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
3226 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
3228 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
3230   if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
3231     AA = &AAR->getAAResults();
3233   return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
3245   if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
3250       .preserve<AAManager>();
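// SIInsertWaitcnts::run() first picks the waitcnt generator that matches the
// subtarget: GFX12+ uses WaitcntGeneratorGFX12Plus with the extended counter
// set (plus the expert-mode counters when expert scheduling is enabled), while
// older targets fall back to WaitcntGeneratorPreGFX12 and the normal counters.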
3255 TII = ST->getInstrInfo();
3256   TRI = &TII->getRegisterInfo();
3265 if (ST->hasExtendedWaitCounts()) {
3266 IsExpertMode = ST->hasExpertSchedulingMode() &&
3272 MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
3274 WaitcntGeneratorGFX12Plus(MF, MaxCounter, &Limits, IsExpertMode);
3275 WCG = &WCGGFX12Plus;
3277 MaxCounter = NUM_NORMAL_INST_CNTS;
3278 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, &Limits);
3282   for (auto T : inst_counter_types())
3283     ForceEmitWaitcnt[T] = false;
3285 WaitEventMaskForInst = WCG->getWaitEventMask();
3287 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
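// Entry-block handling: skip any leading meta instructions, then, for kernel
// entry points on targets with extended wait counters, emit an explicit wait
// for each separately addressable counter (image-related counters are skipped
// when the target has no image instructions). Non-kernel functions instead
// start from a conservative state via setStateOnFunctionEntryOrReturn().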
3292   MachineBasicBlock &EntryBB = MF.front();
3302   while (I != EntryBB.end() && I->isMetaInstruction())
3305   if (ST->hasExtendedWaitCounts()) {
3308     for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
3309       if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
3312       if (!ST->hasImageInsts() &&
3313           (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
3317                TII->get(instrsForExtendedCounterTypes[CT]))
3330     auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
3331 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
3332 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
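// Main dataflow loop: visit the blocks in reverse post order and iterate until
// no block is dirty. Each block's brackets are seeded from its Incoming state
// (or a default state), insertWaitcntInBlock() rewrites the block, and the
// resulting brackets are merged into every successor; a merge that changes a
// successor marks it Dirty so it is processed again.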
3339   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
3342   std::unique_ptr<WaitcntBrackets> Brackets;
3347   for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
3349     MachineBasicBlock *MBB = BII->first;
3350     BlockInfo &BI = BII->second;
3356         Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
3358         *Brackets = *BI.Incoming;
3361         Brackets = std::make_unique<WaitcntBrackets>(this);
3363         *Brackets = WaitcntBrackets(this);
3366     Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
3369     if (Brackets->hasPendingEvent()) {
3370       BlockInfo *MoveBracketsToSucc = nullptr;
3372         auto *SuccBII = BlockInfos.find(Succ);
3373         BlockInfo &SuccBI = SuccBII->second;
3374         if (!SuccBI.Incoming) {
3375           SuccBI.Dirty = true;
3376           if (SuccBII <= BII) {
3380           if (!MoveBracketsToSucc) {
3381             MoveBracketsToSucc = &SuccBI;
3383             SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
3385         } else if (SuccBI.Incoming->merge(*Brackets)) {
3386           SuccBI.Dirty = true;
3387           if (SuccBII <= BII) {
3393       if (MoveBracketsToSucc)
3394         MoveBracketsToSucc->Incoming = std::move(Brackets);
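// Targets with scalar stores need an s_dcache_wb on the path to program end:
// scan the function for scalar stores and for S_ENDPGM / SI_RETURN_TO_EPILOG
// blocks, tracking whether a writeback is still pending before each exit.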
3399   if (ST->hasScalarStores()) {
3400     SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
3401     bool HaveScalarStores = false;
3403     for (MachineBasicBlock &MBB : MF) {
3404       for (MachineInstr &MI : MBB) {
3405         if (!HaveScalarStores && TII->isScalarStore(MI))
3406           HaveScalarStores = true;
3408         if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
3409             MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
3414     if (HaveScalarStores) {
3423       for (MachineBasicBlock *MBB : EndPgmBlocks) {
3424         bool SeenDCacheWB = false;
3428           if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
3429             SeenDCacheWB = true;
3430           else if (TII->isScalarStore(*I))
3431             SeenDCacheWB = false;
3434           if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
3435                I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
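// Scheduling-mode bookkeeping for expert mode: enable the mode at the start of
// the entry block (after any meta instructions), switch it off immediately
// before each call and return, and switch it back on right after each call.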
3451     while (I != EntryBB.end() && I->isMetaInstruction())
3453     setSchedulingMode(EntryBB, I, true);
3455     for (MachineInstr *MI : CallInsts) {
3456       MachineBasicBlock &MBB = *MI->getParent();
3457       setSchedulingMode(MBB, MI, false);
3458       setSchedulingMode(MBB, std::next(MI->getIterator()), true);
3461     for (MachineInstr *MI : ReturnInsts)
3462       setSchedulingMode(*MI->getParent(), MI, false);
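// Epilogue handling for the collected ReleaseVGPRInsts: one path rewrites them
// using S_ALLOC_VGPR, and, guarded by a check on call frames and VGPR
// occupancy, another emits an S_SENDMSG for VGPR deallocation, preceded by an
// S_NOP on subtargets that require one before deallocating VGPRs. The
// per-function bookkeeping containers are cleared at the end of run().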
3473   for (MachineInstr *MI : ReleaseVGPRInsts) {
3475                 TII->get(AMDGPU::S_ALLOC_VGPR))
3480   if (!ReleaseVGPRInsts.empty() &&
3481       (MF.getFrameInfo().hasCalls() ||
3482        ST->getOccupancyWithNumVGPRs(
3483            TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3486     for (MachineInstr *MI : ReleaseVGPRInsts) {
3487       if (ST->requiresNopBeforeDeallocVGPRs()) {
3489                   TII->get(AMDGPU::S_NOP))
3493               TII->get(AMDGPU::S_SENDMSG))
3501   ReturnInsts.clear();
3502   ReleaseVGPRInsts.clear();
3503   PreheadersToFlush.clear();
3504   SLoadAddresses.clear();