26struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
27 MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(
O) {}
29 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
31 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
34 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
44 cl::desc(
"Fill a percentage of the latency between "
45 "neighboring MFMA with s_nops."));
50 cl::desc(
"Insert a s_nop x before every instruction"));
60 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
61 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
62 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
63 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
64 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
69 EmittedInstrs.clear();
81 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
85 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
90 case AMDGPU::S_SETREG_B32:
91 case AMDGPU::S_SETREG_B32_mode:
92 case AMDGPU::S_SETREG_IMM32_B32:
93 case AMDGPU::S_SETREG_IMM32_B32_mode:
100 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
104 return Opcode == AMDGPU::S_RFE_B64;
109 case AMDGPU::S_MOVRELS_B32:
110 case AMDGPU::S_MOVRELS_B64:
111 case AMDGPU::S_MOVRELD_B32:
112 case AMDGPU::S_MOVRELD_B64:
121 if (
TII.isAlwaysGDS(
MI.getOpcode()))
124 switch (
MI.getOpcode()) {
125 case AMDGPU::S_SENDMSG:
126 case AMDGPU::S_SENDMSGHALT:
127 case AMDGPU::S_TTRACEDATA:
131 case AMDGPU::DS_PERMUTE_B32:
132 case AMDGPU::DS_BPERMUTE_B32:
135 if (
TII.isDS(
MI.getOpcode())) {
136 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
137 AMDGPU::OpName::gds);
138 if (
MI.getOperand(GDS).getImm())
146 unsigned Opcode =
MI.getOpcode();
147 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
148 Opcode == AMDGPU::V_PERMLANE64_B32 ||
149 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
150 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
151 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
152 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
153 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
154 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
155 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
156 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
157 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
158 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
159 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
160 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
170 AMDGPU::OpName::simm16);
187 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
190 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
194 if (!IsHazardRecognizerMode) {
195 if (checkWMMACoexecutionHazards(
MI) > 0)
199 if (ST.hasNoDataDepHazard())
211 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
214 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
219 checkMAIVALUHazards(
MI) > 0)
222 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
225 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
228 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
231 if (((ST.hasReadM0MovRelInterpHazard() &&
233 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
234 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
236 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
237 (ST.hasReadM0LdsDirectHazard() &&
238 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
239 checkReadM0Hazards(
MI) > 0)
246 checkMAILdStHazards(
MI) > 0)
249 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
257 while (Quantity > 0) {
258 unsigned Arg = std::min(Quantity, 8u);
266GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
267 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
268 assert(TSchedModel.getWriteProcResBegin(SC) !=
269 TSchedModel.getWriteProcResEnd(SC));
270 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
273void GCNHazardRecognizer::processBundle() {
277 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
278 CurrCycleInstr = &*
MI;
281 if (IsHazardRecognizerMode) {
282 fixHazards(CurrCycleInstr);
290 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
291 EmittedInstrs.push_front(
nullptr);
293 EmittedInstrs.push_front(CurrCycleInstr);
296 CurrCycleInstr =
nullptr;
300 assert(IsHazardRecognizerMode);
304 if (
MI->isInsideBundle())
314 IsHazardRecognizerMode =
true;
318 CurrCycleInstr =
nullptr;
329 return std::max(WaitStates, checkSMRDHazards(
MI));
331 if (ST.hasNSAtoVMEMBug())
332 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
334 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
336 if (ST.hasNoDataDepHazard())
340 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
343 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
346 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
349 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
352 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
356 checkMAIVALUHazards(
MI) > 0)
357 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
359 if (
MI->isInlineAsm())
360 return std::max(WaitStates, checkInlineAsmHazards(
MI));
363 return std::max(WaitStates, checkGetRegHazards(
MI));
366 return std::max(WaitStates, checkSetRegHazards(
MI));
369 return std::max(WaitStates, checkRFEHazards(
MI));
371 if ((ST.hasReadM0MovRelInterpHazard() &&
373 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
374 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
376 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
377 (ST.hasReadM0LdsDirectHazard() &&
378 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
379 return std::max(WaitStates, checkReadM0Hazards(
MI));
382 return std::max(WaitStates, checkMAIHazards(
MI));
385 return std::max(WaitStates, checkMAILdStHazards(
MI));
388 return std::max(WaitStates, checkPermlaneHazards(
MI));
394 EmittedInstrs.push_front(
nullptr);
400 if (!CurrCycleInstr) {
401 EmittedInstrs.push_front(
nullptr);
405 if (CurrCycleInstr->isBundle()) {
410 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
411 if (!NumWaitStates) {
412 CurrCycleInstr =
nullptr;
417 EmittedInstrs.push_front(CurrCycleInstr);
424 EmittedInstrs.push_front(
nullptr);
432 CurrCycleInstr =
nullptr;
436 assert(!IsHazardRecognizerMode &&
437 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
447template <
typename StateT>
457 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
462 static inline StateMapKey getEmptyKey() {
467 static inline StateMapKey getTombstoneKey() {
472 static unsigned getHashValue(
const StateMapKey &
Key) {
473 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
475 static unsigned getHashValue(
const StateT &State) {
476 return StateT::getHashValue(State);
478 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
479 const auto EKey = getEmptyKey();
480 const auto TKey = getTombstoneKey();
481 if (StateMapKey::isEqual(
LHS, EKey) || StateMapKey::isEqual(
RHS, EKey) ||
482 StateMapKey::isEqual(
LHS, TKey) || StateMapKey::isEqual(
RHS, TKey))
483 return StateMapKey::isEqual(
LHS,
RHS);
484 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
486 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
487 if (StateMapKey::isEqual(
RHS, getEmptyKey()) ||
488 StateMapKey::isEqual(
RHS, getTombstoneKey()))
490 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
499 StateT State = InitialState;
502 unsigned WorkIdx = 0;
504 bool Expired =
false;
505 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
510 auto Result = IsHazard(State, *
I);
518 if (
I->isInlineAsm() ||
I->isMetaInstruction())
521 UpdateState(State, *
I);
525 unsigned StateIdx = States.
size();
526 StateMapKey
Key = {&States, StateIdx};
527 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
528 if (Insertion.second) {
531 StateIdx = Insertion.first->second;
534 Worklist.
insert(std::pair(Pred, StateIdx));
537 if (WorkIdx == Worklist.
size())
541 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
542 State = States[StateIdx];
543 I =
MBB->instr_rbegin();
560 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
568 if (
I->isInlineAsm())
571 WaitStates += GetNumWaitStates(*
I);
573 if (IsExpired(*
I, WaitStates))
574 return std::numeric_limits<int>::max();
577 int MinWaitStates = std::numeric_limits<int>::max();
579 if (!Visited.
insert(Pred).second)
583 IsExpired, Visited, GetNumWaitStates);
585 MinWaitStates = std::min(MinWaitStates, W);
588 return MinWaitStates;
599 std::next(
MI->getReverseIterator()), 0, IsExpired,
600 Visited, GetNumWaitStates);
603int GCNHazardRecognizer::getWaitStatesSince(
604 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates) {
605 if (IsHazardRecognizerMode) {
606 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
607 return WaitStates >= Limit;
609 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
614 for (MachineInstr *
MI : EmittedInstrs) {
619 if (
MI->isInlineAsm())
622 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
624 if (WaitStates >= Limit)
627 return std::numeric_limits<int>::max();
630int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
int Limit) {
634int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
635 IsHazardFn IsHazardDef,
637 const SIRegisterInfo *TRI = ST.getRegisterInfo();
640 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
646int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
661 for (MCRegUnit Unit :
TRI.regunits(
Reg))
662 BV.
set(
static_cast<unsigned>(Unit));
686int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM) {
689 if (!ST.isXNACKEnabled())
692 bool IsSMRD = TII.isSMRD(*MEM);
706 for (MachineInstr *
MI : EmittedInstrs) {
718 if (ClauseDefs.none())
731 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
734int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD) {
735 int WaitStatesNeeded = 0;
737 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
740 if (!ST.hasSMRDReadVALUDefHazard())
741 return WaitStatesNeeded;
745 int SmrdSgprWaitStates = 4;
746 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
747 return TII.isVALU(
MI);
749 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
750 return TII.isSALU(
MI);
753 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
755 for (
const MachineOperand &Use :
SMRD->uses()) {
758 int WaitStatesNeededForUse =
759 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
761 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
771 int WaitStatesNeededForUse =
772 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
775 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
779 return WaitStatesNeeded;
782int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr* VMEM) {
783 if (!ST.hasVMEMReadSGPRVALUDefHazard())
786 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
790 const int VmemSgprWaitStates = 5;
791 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
792 return TII.isVALU(
MI);
794 for (
const MachineOperand &Use : VMEM->uses()) {
795 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
798 int WaitStatesNeededForUse =
799 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
801 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
803 return WaitStatesNeeded;
807 const SIRegisterInfo *TRI = ST.getRegisterInfo();
808 const SIInstrInfo *TII = ST.getInstrInfo();
811 int DppVgprWaitStates = 2;
812 int DppExecWaitStates = 5;
813 int WaitStatesNeeded = 0;
814 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
815 return TII->isVALU(
MI);
818 for (
const MachineOperand &Use :
DPP->uses()) {
819 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
821 int WaitStatesNeededForUse =
822 DppVgprWaitStates - getWaitStatesSinceDef(
824 [](
const MachineInstr &) { return true; },
826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
829 WaitStatesNeeded = std::max(
831 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
834 return WaitStatesNeeded;
837int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas) {
838 const SIInstrInfo *TII = ST.getInstrInfo();
842 const int DivFMasWaitStates = 4;
843 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
844 return TII->isVALU(
MI);
846 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
849 return DivFMasWaitStates - WaitStatesNeeded;
852int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr) {
853 const SIInstrInfo *TII = ST.getInstrInfo();
854 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
856 const int GetRegWaitStates = 2;
857 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
860 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
862 return GetRegWaitStates - WaitStatesNeeded;
865int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr) {
866 const SIInstrInfo *TII = ST.getInstrInfo();
867 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
869 const int SetRegWaitStates = ST.getSetRegWaitStates();
870 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
873 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
874 return SetRegWaitStates - WaitStatesNeeded;
877int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI) {
881 const SIInstrInfo *TII = ST.getInstrInfo();
882 unsigned Opcode =
MI.getOpcode();
883 const MCInstrDesc &
Desc =
MI.getDesc();
885 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
888 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
890 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
897 const MachineOperand *SOffset =
898 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
902 (!SOffset || !SOffset->
isReg()))
910 if (TII->isMIMG(
MI)) {
911 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
913 Desc.operands()[SRsrcIdx])) == 256);
917 if (TII->isFLAT(
MI)) {
930GCNHazardRecognizer::checkVALUHazardsHelper(
const MachineOperand &Def,
934 const SIRegisterInfo *TRI = ST.getRegisterInfo();
936 const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
937 int WaitStatesNeeded = 0;
939 if (!TRI->isVectorRegister(
MRI,
Def.getReg()))
940 return WaitStatesNeeded;
943 int DataIdx = createsVALUHazard(
MI);
944 return DataIdx >= 0 &&
945 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg);
948 int WaitStatesNeededForDef =
949 VALUWaitStates - getWaitStatesSince(
IsHazardFn, VALUWaitStates);
950 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
952 return WaitStatesNeeded;
968 unsigned Opcode =
MI.getOpcode();
978 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
980 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
986 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
988 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
992 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
994 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1000 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1021 for (
auto &Operand : VALU->operands()) {
1022 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1029int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU) {
1030 int WaitStatesNeeded = 0;
1033 const int TransDefWaitstates = 1;
1035 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1038 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1039 const SIInstrInfo *TII = ST.getInstrInfo();
1040 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1042 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1043 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1050 int WaitStatesNeededForDef =
1051 TransDefWaitstates -
1052 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1053 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1056 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1057 const int Shift16DefWaitstates = 1;
1059 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1060 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1061 const MachineOperand *ForwardedDst =
1067 if (ProducerMI.isInlineAsm()) {
1069 for (
auto &Def : ProducerMI.all_defs()) {
1078 int WaitStatesNeededForDef =
1079 Shift16DefWaitstates -
1080 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1081 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1084 if (ST.hasVDecCoExecHazard()) {
1085 const int VALUWriteSGPRVALUReadWaitstates = 2;
1086 const int VALUWriteEXECRWLane = 4;
1087 const int VALUWriteVGPRReadlaneRead = 1;
1089 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1090 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1092 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1095 return MI.modifiesRegister(
UseReg, TRI);
1098 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1104 int WaitStatesNeededForDef =
1105 VALUWriteSGPRVALUReadWaitstates -
1106 getWaitStatesSince(IsVALUDefSGPRFn,
1107 VALUWriteSGPRVALUReadWaitstates);
1108 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1112 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1114 int WaitStatesNeededForDef =
1115 VALUWriteSGPRVALUReadWaitstates -
1116 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1117 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1120 switch (
VALU->getOpcode()) {
1121 case AMDGPU::V_READLANE_B32:
1122 case AMDGPU::V_READFIRSTLANE_B32: {
1123 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1125 int WaitStatesNeededForDef =
1126 VALUWriteVGPRReadlaneRead -
1127 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1128 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1131 case AMDGPU::V_WRITELANE_B32: {
1133 int WaitStatesNeededForDef =
1134 VALUWriteEXECRWLane -
1135 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1136 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1146 if (!ST.has12DWordStoreHazard())
1147 return WaitStatesNeeded;
1149 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1151 for (
const MachineOperand &Def :
VALU->defs()) {
1152 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def,
MRI));
1155 return WaitStatesNeeded;
1158int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA) {
1167 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1168 !ST.hasCvtScaleForwardingHazard())
1171 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1172 int WaitStatesNeeded = 0;
1174 for (
const MachineOperand &
Op :
1176 if (
Op.isReg() &&
Op.isDef()) {
1177 if (!TRI.isVectorRegister(
MRI,
Op.getReg()))
1180 if (ST.has12DWordStoreHazard()) {
1182 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op,
MRI));
1187 if (ST.hasDstSelForwardingHazard()) {
1188 const int Shift16DefWaitstates = 1;
1190 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1194 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1195 IA->readsRegister(Dst->getReg(), &TRI);
1197 if (ProducerMI.isInlineAsm()) {
1199 for (
auto &Def : ProducerMI.all_defs()) {
1200 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1201 IA->readsRegister(
Def.getReg(), &TRI)) {
1210 int WaitStatesNeededForDef =
1211 Shift16DefWaitstates -
1212 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1213 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1216 return WaitStatesNeeded;
1219int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane) {
1220 const SIInstrInfo *TII = ST.getInstrInfo();
1221 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1222 const MachineRegisterInfo &
MRI = MF.getRegInfo();
1224 const MachineOperand *LaneSelectOp =
1225 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1227 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(
MRI, LaneSelectOp->
getReg()))
1231 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
1233 const int RWLaneWaitStates = 4;
1234 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1236 return RWLaneWaitStates - WaitStatesSince;
1239int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE) {
1240 if (!ST.hasRFEHazards())
1243 const SIInstrInfo *TII = ST.getInstrInfo();
1245 const int RFEWaitStates = 1;
1250 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1251 return RFEWaitStates - WaitStatesNeeded;
1255 const SIInstrInfo *TII = ST.getInstrInfo();
1256 const int ReadM0WaitStates = 1;
1257 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1258 return ReadM0WaitStates -
1259 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1264bool GCNHazardRecognizer::emitVNops(
MachineInstr *
MI,
int WaitStatesNeeded) {
1265 if (WaitStatesNeeded <= 0)
1268 const SIInstrInfo *TII = ST.getInstrInfo();
1269 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1271 TII->get(AMDGPU::V_NOP_e32));
1277 fixVMEMtoScalarWriteHazards(
MI);
1278 fixVcmpxPermlaneHazards(
MI);
1279 fixSMEMtoVectorWriteHazards(
MI);
1280 fixVcmpxExecWARHazard(
MI);
1281 fixLdsBranchVmemWARHazard(
MI);
1282 if (ST.hasLdsDirect()) {
1283 fixLdsDirectVALUHazard(
MI);
1284 fixLdsDirectVMEMHazard(
MI);
1286 fixVALUPartialForwardingHazard(
MI);
1287 fixVALUTransUseHazard(
MI);
1288 fixVALUTransCoexecutionHazards(
MI);
1290 emitVNops(
MI, checkWMMACoexecutionHazards(
MI));
1291 fixShift64HighRegBug(
MI);
1292 fixVALUMaskWriteHazard(
MI);
1293 fixRequiredExportPriority(
MI);
1294 if (ST.requiresWaitIdleBeforeGetReg())
1295 fixGetRegWaitIdle(
MI);
1296 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1297 fixDsAtomicAsyncBarrierArriveB64(
MI);
1298 if (ST.hasScratchBaseForwardingHazard())
1299 fixScratchBaseForwardingHazard(
MI);
1300 if (ST.setRegModeNeedsVNOPs())
1306 return (
TII.isVOPC(
MI) ||
1307 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1308 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1311bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1315 const SIInstrInfo *TII = ST.getInstrInfo();
1316 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1322 unsigned Opc =
MI.getOpcode();
1324 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1328 std::numeric_limits<int>::max())
1334 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1336 bool IsUndef = Src0->isUndef();
1338 TII->get(AMDGPU::V_MOV_B32_e32))
1345bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1346 if (!ST.hasVMEMtoScalarWriteHazard())
1348 assert(!ST.hasExtendedWaitCounts());
1353 if (
MI->getNumDefs() == 0)
1356 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1362 for (
const MachineOperand &Def :
MI->defs()) {
1363 const MachineOperand *
Op =
1364 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1374 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1375 !
MI.getOperand(0).getImm()) ||
1376 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1381 std::numeric_limits<int>::max())
1384 const SIInstrInfo *TII = ST.getInstrInfo();
1386 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1391bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1392 if (!ST.hasSMEMtoVectorWriteHazard())
1394 assert(!ST.hasExtendedWaitCounts());
1399 AMDGPU::OpName SDSTName;
1400 switch (
MI->getOpcode()) {
1401 case AMDGPU::V_READLANE_B32:
1402 case AMDGPU::V_READFIRSTLANE_B32:
1403 SDSTName = AMDGPU::OpName::vdst;
1406 SDSTName = AMDGPU::OpName::sdst;
1410 const SIInstrInfo *TII = ST.getInstrInfo();
1411 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1413 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1415 for (
const auto &MO :
MI->implicit_operands()) {
1416 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1427 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1432 if (TII->isSALU(
MI)) {
1433 switch (
MI.getOpcode()) {
1434 case AMDGPU::S_SETVSKIP:
1435 case AMDGPU::S_VERSION:
1436 case AMDGPU::S_WAITCNT_VSCNT:
1437 case AMDGPU::S_WAITCNT_VMCNT:
1438 case AMDGPU::S_WAITCNT_EXPCNT:
1441 case AMDGPU::S_WAITCNT_LGKMCNT:
1443 return (
MI.getOperand(1).getImm() == 0) &&
1444 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1445 case AMDGPU::S_WAITCNT: {
1446 const int64_t
Imm =
MI.getOperand(0).getImm();
1449 return (Decoded.
DsCnt == 0);
1453 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1454 "unexpected wait count instruction");
1456 if (TII->isSOPP(
MI))
1472 std::numeric_limits<int>::max())
1476 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1481bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1482 if (!ST.hasVcmpxExecWARHazard())
1484 assert(!ST.hasExtendedWaitCounts());
1489 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1490 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1496 return I.readsRegister(AMDGPU::EXEC, TRI);
1499 const SIInstrInfo *TII = ST.getInstrInfo();
1500 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1502 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1504 for (
auto MO :
MI.implicit_operands())
1505 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1508 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1515 std::numeric_limits<int>::max())
1519 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1526 if (!ST.hasLdsBranchVmemWARHazard())
1531 bool HasLds =
false;
1532 bool HasVmem =
false;
1533 for (
auto &
MBB : MF) {
1534 for (
auto &
MI :
MBB) {
1537 if (HasLds && HasVmem)
1545 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1546 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1547 !
I.getOperand(1).getImm();
1550bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1551 if (!RunLdsBranchVmemWARHazardFixup)
1554 assert(ST.hasLdsBranchVmemWARHazard());
1555 assert(!ST.hasExtendedWaitCounts());
1557 auto IsHazardInst = [](
const MachineInstr &
MI) {
1565 auto InstType = IsHazardInst(*
MI);
1569 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1573 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1577 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1578 auto InstType2 = IsHazardInst(
I);
1579 return InstType2 && InstType != InstType2;
1582 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1583 auto InstType2 = IsHazardInst(
I);
1584 if (InstType == InstType2)
1591 std::numeric_limits<int>::max();
1595 std::numeric_limits<int>::max())
1598 const SIInstrInfo *TII = ST.getInstrInfo();
1600 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1607bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1611 const int NoHazardWaitStates = 15;
1612 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1615 bool VisitedTrans =
false;
1616 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1621 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1623 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1624 if (WaitStates >= NoHazardWaitStates)
1630 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1634 DenseSet<const MachineBasicBlock *> Visited;
1636 std::next(
MI->getReverseIterator()), 0,
1644 MachineOperand *WaitVdstOp =
1645 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1646 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1651bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1655 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1658 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1661 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1663 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1666 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1668 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1669 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1672 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1676 std::numeric_limits<int>::max())
1679 if (LdsdirCanWait) {
1680 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1683 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1690bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1691 if (!ST.hasVALUPartialForwardingHazard())
1693 assert(!ST.hasExtendedWaitCounts());
1698 SmallSetVector<Register, 4> SrcVGPRs;
1700 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1701 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1706 if (SrcVGPRs.
size() <= 1)
1724 const int Intv1plus2MaxVALUs = 2;
1725 const int Intv3MaxVALUs = 4;
1726 const int IntvMaxVALUs = 6;
1727 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1730 SmallDenseMap<Register, int, 4> DefPos;
1731 int ExecPos = std::numeric_limits<int>::max();
1734 static unsigned getHashValue(
const StateType &State) {
1738 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1739 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1747 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1749 if (State.VALUs > NoHazardVALUWaitStates)
1755 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1763 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1764 State.DefPos[Src] = State.VALUs;
1769 if (State.ExecPos == std::numeric_limits<int>::max()) {
1770 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1771 State.ExecPos = State.VALUs;
1778 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1786 if (State.ExecPos == std::numeric_limits<int>::max())
1789 int PreExecPos = std::numeric_limits<int>::max();
1790 int PostExecPos = std::numeric_limits<int>::max();
1792 for (
auto Entry : State.DefPos) {
1793 int DefVALUs =
Entry.second;
1794 if (DefVALUs != std::numeric_limits<int>::max()) {
1795 if (DefVALUs >= State.ExecPos)
1796 PreExecPos = std::min(PreExecPos, DefVALUs);
1798 PostExecPos = std::min(PostExecPos, DefVALUs);
1803 if (PostExecPos == std::numeric_limits<int>::max())
1807 int Intv3VALUs = PostExecPos;
1808 if (Intv3VALUs > Intv3MaxVALUs)
1812 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1813 if (Intv2VALUs > Intv1plus2MaxVALUs)
1817 if (PreExecPos == std::numeric_limits<int>::max())
1821 int Intv1VALUs = PreExecPos - State.ExecPos;
1822 if (Intv1VALUs > Intv1plus2MaxVALUs)
1826 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1831 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1837 std::next(
MI->getReverseIterator())))
1841 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1847bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1848 if (!ST.hasVALUTransUseHazard())
1850 assert(!ST.hasExtendedWaitCounts());
1855 SmallSet<Register, 4> SrcVGPRs;
1857 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1858 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1872 const int IntvMaxVALUs = 5;
1873 const int IntvMaxTRANS = 1;
1879 static unsigned getHashValue(
const StateType &State) {
1882 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1883 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
1890 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1892 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1898 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1905 if (
I.modifiesRegister(Src, &TRI)) {
1913 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1921 std::next(
MI->getReverseIterator())))
1927 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1933bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1938 const SIInstrInfo *TII = ST.getInstrInfo();
1939 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1941 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
1946 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1947 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1948 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1952 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1953 if (!ValuDst || !ValuDst->isReg())
1957 Register ValuDef = ValuDst->getReg();
1958 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
1959 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
1970 const int HasVALU = std::numeric_limits<int>::max();
1971 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
1974 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1982 const SIInstrInfo *TII = ST.getInstrInfo();
1983 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1985 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
1992 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
1994 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
1997 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1999 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2000 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2009 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2010 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2024 std::numeric_limits<int>::max())
2027 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2039 unsigned Category) {
2041 "Handle me if the xdl wmma instruction latency changes");
2078int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI) {
2082 const SIInstrInfo *TII = ST.getInstrInfo();
2086 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2093 const int WMMAWaitStates[] = {5, 9, 3, 5};
2094 const int VALUWaitStates[] = {4, 8, 2, 4};
2095 unsigned Category = 0;
2097 auto IsWMMAHazardFn = [
MI, TII, TRI, &Category,
this](
const MachineInstr &
I) {
2098 if (!TII->isXDLWMMA(
I))
2101 unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2105 Register D0 = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2106 Register A1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2107 Register B1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2110 if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
2114 Register Idx1 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2115 if (TRI->regsOverlap(D0, Idx1))
2122 auto IsVALUHazardFn = [
MI, TII, TRI, &Category,
this](
const MachineInstr &
I) {
2123 if (!TII->isXDLWMMA(
I))
2126 unsigned Latency = TSchedModel.computeInstrLatency(&
I);
2131 Register D0 = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2132 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
2133 if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
2137 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
2138 if (!ValuDst || !ValuDst->isReg())
2143 if (TRI->regsOverlap(D0, D1))
2147 Register A0 = TII->getNamedOperand(
I, AMDGPU::OpName::src0)->getReg();
2148 Register B0 = TII->getNamedOperand(
I, AMDGPU::OpName::src1)->getReg();
2149 if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
2153 Register Idx0 = TII->getNamedOperand(
I, AMDGPU::OpName::src2)->getReg();
2154 if (TRI->regsOverlap(D1, Idx0))
2163 auto GetWaitStatesFn = [](
const MachineInstr &
I) {
2167 int WaitStatesNeeded = -1;
2168 if (TII->isXDLWMMA(*
MI)) {
2169 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2170 Limit = WMMAWaitStates[Category];
2176 Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
2179 for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
2180 Limit = VALUWaitStates[Category];
2186 Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
2190 return WaitStatesNeeded;
2193bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2194 if (!ST.hasShift64HighRegBug())
2196 assert(!ST.hasExtendedWaitCounts());
2198 switch (
MI->getOpcode()) {
2201 case AMDGPU::V_LSHLREV_B64_e64:
2202 case AMDGPU::V_LSHRREV_B64_e64:
2203 case AMDGPU::V_ASHRREV_I64_e64:
2207 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2212 const MachineRegisterInfo &
MRI = MF.getRegInfo();
2214 if (!TRI.isVGPR(
MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2217 if (AmtReg != AMDGPU::VGPR255 &&
MRI.isPhysRegUsed(AmtReg + 1))
2220 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2221 bool OverlappedSrc = Src1->
isReg() && TRI.regsOverlap(Src1->
getReg(), AmtReg);
2222 bool OverlappedDst =
MI->modifiesRegister(AmtReg, &TRI);
2223 bool Overlapped = OverlappedSrc || OverlappedDst;
2225 assert(!OverlappedDst || !OverlappedSrc ||
2226 Src1->
getReg() ==
MI->getOperand(0).getReg());
2227 assert(ST.needsAlignedVGPRs());
2228 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2231 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2232 : AMDGPU::VGPR_32RegClass) {
2233 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
2239 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2244 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2247 MachineBasicBlock *
MBB =
MI->getParent();
2259 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2266 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2272 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2286 MI->getOperand(0).setReg(NewReg);
2287 if (OverlappedSrc) {
2297 int NSAtoVMEMWaitStates = 1;
2299 if (!ST.hasNSAtoVMEMBug())
2305 const SIInstrInfo *TII = ST.getInstrInfo();
2306 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2314 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2315 TII->getInstSizeInBytes(
I) >= 16;
2318 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
2321int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
MachineInstr *
MI) {
2322 int FPAtomicToDenormModeWaitStates = 3;
2324 if (!ST.hasFPAtomicToDenormModeHazard())
2326 assert(!ST.hasExtendedWaitCounts());
2328 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2337 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2344 return FPAtomicToDenormModeWaitStates -
2351 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
2359 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2363 int NeighborMFMALatency = 0;
2364 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2365 this](
const MachineInstr &
MI) {
2369 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2373 const int MaxMFMAPipelineWaitStates = 16;
2374 int WaitStatesSinceNeighborMFMA =
2375 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2377 int NeighborMFMAPaddingNeeded =
2379 WaitStatesSinceNeighborMFMA;
2381 return std::max(0, NeighborMFMAPaddingNeeded);
2385 int WaitStatesNeeded = 0;
2386 unsigned Opc =
MI->getOpcode();
2388 auto IsVALUFn = [](
const MachineInstr &
MI) {
2392 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2393 const int LegacyVALUWritesVGPRWaitStates = 2;
2394 const int VALUWritesExecWaitStates = 4;
2395 const int MaxWaitStates = 4;
2397 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2398 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2399 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2401 if (WaitStatesNeeded < MaxWaitStates) {
2402 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2403 const int MaxWaitStates = 2;
2405 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2408 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2409 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2410 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2412 if (WaitStatesNeeded == MaxWaitStates)
2418 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2419 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2422 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2425 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2426 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2427 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2428 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2429 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2430 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2431 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2432 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2433 const int MaxWaitStates = 18;
2435 unsigned HazardDefLatency = 0;
2437 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2438 this](
const MachineInstr &
MI) {
2445 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2446 return TRI.regsOverlap(DstReg,
Reg);
2449 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
2451 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2452 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2453 int OpNo =
Op.getOperandNo();
2454 if (OpNo == SrcCIdx) {
2455 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2456 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2457 switch (HazardDefLatency) {
2458 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2460 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2462 case 16: [[fallthrough]];
2463 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2466 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2467 switch (HazardDefLatency) {
2468 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2470 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2472 case 16: [[fallthrough]];
2473 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2478 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2479 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2481 if (WaitStatesNeeded == MaxWaitStates)
2482 return WaitStatesNeeded;
2484 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2485 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2488 return TRI.regsOverlap(
Reg, DstReg);
2491 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2492 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2493 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2494 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2495 if (OpNo == SrcCIdx)
2496 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2497 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2498 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2500 WaitStatesNeededForUse = NeedWaitStates -
2501 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2502 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2504 if (WaitStatesNeeded == MaxWaitStates)
2505 return WaitStatesNeeded;
2508 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2509 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2510 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2511 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2512 const int MaxWaitStates = 13;
2513 Register DstReg =
MI->getOperand(0).getReg();
2514 unsigned HazardDefLatency = 0;
2516 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2517 this](
const MachineInstr &
MI) {
2520 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2522 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2523 return TRI.regsOverlap(
Reg, DstReg);
2526 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2528 switch (HazardDefLatency) {
2529 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2531 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2533 case 16: [[fallthrough]];
2534 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2538 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2539 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2543 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2545 return WaitStatesNeeded;
2556 return NumPasses + 1 + IsGFX950;
2567 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2585 return NumPasses + 2;
2595 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2599 int WaitStatesNeeded = 0;
2600 unsigned Opc =
MI->getOpcode();
2602 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2606 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2612 return WaitStatesNeeded;
2614 const int VALUWritesExecWaitStates = 4;
2615 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2616 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2617 VALUWritesExecWaitStates);
2618 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2620 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2623 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2624 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2625 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2626 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2627 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2628 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2629 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2630 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2631 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2632 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2633 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2634 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2635 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2636 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2637 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2638 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2639 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2640 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2641 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2642 const int MaxWaitStates = 19;
2648 const MachineInstr *MI1;
2650 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2651 this](
const MachineInstr &
MI) {
2655 FullReg = (DstReg ==
Reg);
2657 return TRI.regsOverlap(DstReg,
Reg);
2660 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2661 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2662 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2665 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2666 if (NumWaitStates == std::numeric_limits<int>::max())
2669 int OpNo =
Use.getOperandNo();
2671 int NeedWaitStates = 0;
2672 if (OpNo == SrcCIdx) {
2676 }
else if (FullReg) {
2677 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2678 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2679 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2680 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2681 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2682 else if (ST.hasGFX940Insts() &&
2683 TSchedModel.computeInstrLatency(MI1) == 2)
2684 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2687 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2688 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2689 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2690 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2691 if (!TII.isXDL(*
MI))
2694 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2695 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2697 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2698 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2699 if (!TII.isXDL(*
MI))
2700 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2703 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2704 if (ST.hasGFX940Insts()) {
2705 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2712 NumPasses, ST.hasGFX950Insts())
2714 NumPasses, ST.hasGFX950Insts()))
2720 switch (NumPasses) {
2724 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2725 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2730 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2731 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2736 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2737 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2746 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2747 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2748 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2749 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2752 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2753 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2755 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2756 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2757 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2760 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2762 if (ST.hasGFX940Insts()) {
2766 NumPasses, ST.hasGFX950Insts())
2772 switch (NumPasses) {
2774 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2779 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2783 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2787 if (WaitStatesNeeded >= NeedWaitStates)
2790 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2791 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2793 if (WaitStatesNeeded == MaxWaitStates)
2798 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2800 return WaitStatesNeeded;
2805 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2808 int WaitStatesNeeded = 0;
2810 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2811 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2814 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2815 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2820 const int AccVgprReadLdStWaitStates = 2;
2821 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2822 const int MaxWaitStates = 2;
2824 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2825 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2828 if (WaitStatesNeeded == MaxWaitStates)
2829 return WaitStatesNeeded;
2831 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2832 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2833 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2835 auto IsVALUFn = [](
const MachineInstr &
MI) {
2838 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2839 std::numeric_limits<int>::max();
2842 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2843 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2844 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2847 return WaitStatesNeeded;
2851 assert(!ST.hasVcmpxPermlaneHazard() &&
2852 "this is a different vcmpx+permlane hazard");
2853 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2854 const SIInstrInfo *TII = ST.getInstrInfo();
2856 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
2860 auto IsVALUFn = [](
const MachineInstr &
MI) {
2864 const int VCmpXWritesExecWaitStates = 4;
2865 const int VALUWritesVDstWaitStates = 2;
2866 int WaitStatesNeeded = 0;
2868 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2869 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
2873 int WaitStatesSinceDef =
2874 VALUWritesVDstWaitStates -
2875 getWaitStatesSinceDef(
Reg, IsVALUFn,
2876 VALUWritesVDstWaitStates);
2877 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2878 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2882 int VCmpXHazardWaits =
2883 VCmpXWritesExecWaitStates -
2884 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2886 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2887 return WaitStatesNeeded;
2895 return NumPasses + 2;
2905 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2915 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2923 return NumPasses + 2;
2927 if (!ST.hasGFX90AInsts())
2930 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
2938 const MachineRegisterInfo &
MRI = MF.getRegInfo();
2940 int WaitStatesNeeded = 0;
2946 const MachineInstr *
MFMA =
nullptr;
2948 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
2950 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2956 const MachineInstr *
DOT =
nullptr;
2957 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
2959 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2965 bool DGEMMAfterVALUWrite =
false;
2966 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
2969 DGEMMAfterVALUWrite =
true;
2973 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
2979 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
2980 AMDGPU::OpName::src2);
2982 if (IsMemOrExport || IsVALU) {
2983 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2984 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2985 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2986 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2987 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2988 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2989 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2990 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
2991 const int DotWriteSameDotReadSrcAB = 3;
2992 const int DotWriteDifferentVALURead = 3;
2993 const int DMFMABetweenVALUWriteVMEMRead = 2;
2994 const int MaxWaitStates = 19;
2996 for (
const MachineOperand &Use :
MI->explicit_uses()) {
3002 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3005 int NeedWaitStates = 0;
3006 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3007 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3008 NeedWaitStates = DotWriteSameDotReadSrcAB;
3010 NeedWaitStates = DotWriteDifferentVALURead;
3013 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3014 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3021 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3022 DGEMMAfterVALUWrite =
false;
3023 if (TRI.isVectorRegister(
MRI,
Reg)) {
3024 int WaitStatesNeededForUse =
3025 DMFMABetweenVALUWriteVMEMRead -
3026 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3027 DMFMABetweenVALUWriteVMEMRead);
3029 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3034 WaitStatesSinceDef =
3035 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3039 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3040 int NumPasses = HazardDefLatency;
3041 int NeedWaitStates = MaxWaitStates;
3044 switch (HazardDefLatency) {
3046 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3047 : DMFMA4x4WriteVgprVALUReadWaitStates;
3053 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3054 : (ST.hasGFX950Insts()
3055 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3056 : DMFMA16x16WriteVgprVALUReadWaitStates);
3061 }
else if (ST.hasGFX940Insts()) {
3065 NumPasses, ST.hasGFX950Insts())
3069 switch (HazardDefLatency) {
3071 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3074 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3077 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3084 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3087 if (WaitStatesNeeded == MaxWaitStates)
3092 unsigned Opc =
MI->getOpcode();
3093 const int DMFMAToFMA64WaitStates = 2;
3094 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3095 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3096 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3097 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3098 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3099 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3100 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3103 if (!IsVALU && !IsMemOrExport)
3104 return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
          break;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
             break;
    case 4:  assert(ST.hasGFX940Insts());
             NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
             break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
             break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
             break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
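
Everything above follows one bookkeeping pattern: each producer/consumer pair declares how many wait states it requires, the recognizer measures how many wait states already separate the two instructions, and the instruction being emitted must stall for the largest remaining deficit. The standalone sketch below illustrates just that arithmetic; the struct and function names are hypothetical and are not part of the LLVM sources.

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical illustration of the wait-state arithmetic used above: each
// hazard records how many wait states its producer requires and how many
// have already elapsed since that producer issued.
struct Hazard {
  int NeedWaitStates;     // required separation for this producer/consumer pair
  int WaitStatesSinceDef; // separation already present in the program
};

static int waitStatesNeeded(const std::vector<Hazard> &Hazards) {
  int WaitStatesNeeded = 0;
  for (const Hazard &H : Hazards) {
    // A non-positive deficit means this particular hazard has already expired.
    int Deficit = H.NeedWaitStates - H.WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, Deficit);
  }
  return WaitStatesNeeded;
}

int main() {
  // E.g. a 19-state requirement observed 11 states later still needs 8 more.
  std::vector<Hazard> Hazards = {{19, 11}, {5, 7}, {11, 4}};
  std::printf("s_nops to insert: %d\n", waitStatesNeeded(Hazards));
  return 0;
}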

// ...
    return MAI != nullptr;
  };

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }
  return false;
}

  while (I->isBundledWithPred())
    --I;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any global references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->instr_end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}

bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  bool IsSALU = SIInstrInfo::isSALU(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // ...
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs whose reads and writes never participate in the hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };

  // ... (helper predicate for the VCC mask registers)
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  // ...

  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;
    // ...
    static unsigned getHashValue(const StateType &State) {
      // ...
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Locate the SGPR written by MI that could feed a VALU mask read.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      return false;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (Op.isImplicit())
      continue;
    if (!TRI->isSGPRReg(MRI, Reg))
      continue;
    // ...
  }

  if (!HazardDef)
    return false;
  const Register HazardReg = HazardDef->getReg();

  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These read the carry/mask from VCC implicitly.
      // ...
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // These take the mask from an explicit SSRC operand.
      const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
      // ...
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazard;
    }
    default:
      return NoHazard;
    }
  };

  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Remember waits that could be merged with the one we will insert.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;
        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (Op.isImplicit())
          continue;
        if (!TRI->isSGPRReg(MRI, Reg))
          continue;
        // ...
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR)) {
            // ...
            State.HazardSGPRs.erase(SGPR);
          }
        }
      }
      break;
    }
  };

  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // ... (DepCtr, the wait encoding to emit, is set up before this point)
  if (!WaitInstrs.empty()) {
    unsigned Found = 0;
    SmallVector<MachineInstr *> ToErase;

    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      ++Found;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }

    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  auto NextMI = std::next(MI->getIterator());

  // Emit the (possibly merged) S_WAITCNT_DEPCTR immediately after MI.
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);
  // ...
  return true;
}
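
When an existing S_WAITCNT_DEPCTR is folded into the new wait above, every counter field is combined with std::min, because a smaller value is a stricter wait and the merged instruction has to satisfy both originals. The following is a minimal standalone illustration of that field-wise merge; the plain struct and its field names are stand-ins, not the real DEPCTR encoding, which is handled by the AMDGPU::DepCtr::encodeField*/decodeField* helpers used in the loop above.

#include <algorithm>
#include <cstdio>

// Illustrative stand-in for a depctr-style wait: each field is a counter and a
// smaller value is a stricter wait.
struct WaitFields {
  unsigned SaSdst;
  unsigned VaSdst;
  unsigned VaVcc;
};

// A merged wait must satisfy both originals, so take the minimum of every
// field, mirroring the std::min calls above.
static WaitFields merge(const WaitFields &A, const WaitFields &B) {
  return {std::min(A.SaSdst, B.SaSdst), std::min(A.VaSdst, B.VaSdst),
          std::min(A.VaVcc, B.VaVcc)};
}

int main() {
  WaitFields Existing{1, 0, 1};
  WaitFields Required{0, 1, 1};
  WaitFields Combined = merge(Existing, Required);
  std::printf("sa_sdst=%u va_sdst=%u va_vcc=%u\n", Combined.SaSdst,
              Combined.VaSdst, Combined.VaVcc);
  return 0;
}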

static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }
  // ...
}

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // ...
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  // ...

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // A shader with calls must raise priority on entry, as exports may occur
    // in a callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise an existing priority that is too low, unless it is the deliberate
    // post-export lowering (s_setprio 0 directly after an export).
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // ...
  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only the last export in a sequence needs the workaround.
    if (TII.isEXP(*NextMI))
      return false;
    // An s_setprio 0 after the export means the workaround is already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return false;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower priority after the export.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for outstanding exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }
  // ...
}
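
The export-priority workaround above boils down to a small per-export decision: patch only the last export of a run, treat an existing s_setprio 0 as already patched, and skip the expcnt wait when the shader ends right after the export. The toy function below mirrors that decision; the enum values and names are hypothetical placeholders, not AMDGPU opcodes.

#include <cstdio>

// Toy model of the post-export decision made above. The enum values are
// hypothetical placeholders, not AMDGPU opcodes.
enum NextOpcode { NextExport, NextSetPrioZero, NextEndPgm, NextOther };

struct Action {
  bool InsertSetPrio0;
  bool InsertExpcntWait;
};

static Action postExportAction(NextOpcode Next) {
  if (Next == NextExport)
    return {false, false}; // More exports follow; only the last one is patched.
  if (Next == NextSetPrioZero)
    return {false, false}; // Workaround already applied.
  bool EndOfShader = (Next == NextEndPgm);
  return {true, !EndOfShader}; // Lower priority; wait unless the shader ends.
}

int main() {
  Action A = postExportAction(NextOther);
  std::printf("insert s_setprio 0: %d, insert expcnt wait: %d\n",
              A.InsertSetPrio0, A.InsertExpcntWait);
  return 0;
}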

  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // ...
}

bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Insert waits on both sides of the barrier-arrive.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...
  return true;
}

bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // ...
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // ...
    ReadsFlatScrLo = true;
  // ...
    ReadsFlatScrHi = true;
  // ...

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // ...
  DenseSet<const MachineBasicBlock *> Visited;
  // ... (hazard predicate: does the instruction modify the register?)
    return MI.modifiesRegister(Reg, TRI);
  // ...

  // Count a wait state for any SALU or VALU that defines an SGPR.
  auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
    if (!TII->isSALU(MI) && !TII->isVALU(MI))
      return 0;
    for (const MachineOperand &MO : MI.all_defs()) {
      if (TRI->isSGPRReg(MRI, MO.getReg()))
        return 1;
    }
    return 0;
  };

  auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Wait = MI.getOperand(0).getImm();
      // ...
    }
    return SgprWrites >= FlatScrBaseWaitStates;
  };

  auto IsRegDefHazard = [&](MCRegister Reg) {
    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // Insert an S_WAITCNT_DEPCTR to break the forwarding.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...
}

// ...
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
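
Most fixers in this file, including fixScratchBaseForwardingHazard above, rely on the same backward scan: walk preceding instructions, accumulate wait states with a per-instruction weight, and stop once either a hazard-producing instruction is found or an expiry budget is exhausted. The toy model below sketches that search over plain integers; the names and the int-based instruction stream are hypothetical, the real helper being ::getWaitStatesSince() over MachineInstrs.

#include <cstdio>
#include <functional>
#include <vector>

// Toy model of the backward scan: walk earlier "instructions" from nearest to
// farthest, accumulate a per-instruction wait-state weight, and stop when
// either the hazard is found or the expiry budget is exhausted.
static int waitStatesSince(const std::vector<int> &PrecedingOps,
                           const std::function<bool(int)> &IsHazard,
                           const std::function<int(int)> &NumWaitStates,
                           int Budget) {
  int WaitStates = 0;
  for (auto It = PrecedingOps.rbegin(); It != PrecedingOps.rend(); ++It) {
    if (IsHazard(*It))
      return WaitStates; // Wait states separating the hazard from the reader.
    WaitStates += NumWaitStates(*It);
    if (WaitStates >= Budget)
      break; // The hazard would have expired before this point.
  }
  return Budget; // Treated as "no hazard within range".
}

int main() {
  // Opcode 7 stands in for "writes the scratch base SGPR"; everything else is
  // weighted as a single wait state.
  std::vector<int> Block = {7, 1, 1, 1};
  int Dist = waitStatesSince(
      Block, [](int Op) { return Op == 7; }, [](int) { return 1; },
      /*Budget=*/10);
  std::printf("wait states since hazard: %d (s_nops still needed: %d)\n", Dist,
              Dist < 10 ? 10 - Dist : 0);
  return 0;
}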
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a dest-sel forwarding issue, Dst.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
A dest-sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, unsigned Latency, unsigned Category)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
unsigned PreEmitNoopsCommon(MachineInstr *)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...