28#define DEBUG_TYPE "gcn-hazard-recognizer"
31 "Number of WMMA hazard V_NOPs hoisted from loops");
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
// Command-line value parser derived from cl::parser<unsigned>.
// The visible failure paths of parse() report through O.error():
//   - one for an argument that is not a valid unsigned integer, and
//   - one for a value outside the range [0, 100].
37struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
42 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
45 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
55 cl::desc(
"Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
61 cl::desc(
"Insert a s_nop x before every instruction"));
65 cl::desc(
"Hoist WMMA hazard V_NOPs from loops to preheaders"));
76 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
85 EmittedInstrs.clear();
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
120 return Opcode == AMDGPU::S_RFE_B64;
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
137 if (
TII.isAlwaysGDS(
MI.getOpcode()))
140 switch (
MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
151 if (
TII.isDS(
MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (
MI.getOperand(GDS).getImm())
162 unsigned Opcode =
MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
186 AMDGPU::OpName::simm16);
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
206 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(
MI) > 0)
215 if (ST.hasNoDataDepHazard())
227 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
230 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
235 checkMAIVALUHazards(
MI) > 0)
238 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
241 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
244 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
247 if (((ST.hasReadM0MovRelInterpHazard() &&
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
252 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
255 checkReadM0Hazards(
MI) > 0)
262 checkMAILdStHazards(
MI) > 0)
265 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
// Resolves the scheduling class of MI through TSchedModel and returns the
// ReleaseAtCycle field of the first write-proc-resource entry of that class.
// The assert requires that the entry list is non-empty (begin != end).
282GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
// Walks the instructions for which isInsideBundle() holds, making each one the
// current-cycle instruction in turn. In hazard recognizer mode fixHazards() is
// called on it. The instruction is then recorded at the front of EmittedInstrs,
// preceded by at most MaxLookAhead - 1 null placeholders (one per wait state).
// CurrCycleInstr is cleared afterwards.
289void GCNHazardRecognizer::processBundle() {
293 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
294 CurrCycleInstr = &*
MI;
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
// Null entries stand for wait states; the count is capped at MaxLookAhead - 1.
306 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
307 EmittedInstrs.push_front(
nullptr);
309 EmittedInstrs.push_front(CurrCycleInstr);
312 CurrCycleInstr =
nullptr;
316 assert(IsHazardRecognizerMode);
320 if (
MI->isInsideBundle())
330 IsHazardRecognizerMode =
true;
334 CurrCycleInstr =
nullptr;
349 return std::max(WaitStates, checkSMRDHazards(
MI));
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
356 if (ST.hasNoDataDepHazard())
360 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
363 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
366 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
376 checkMAIVALUHazards(
MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
379 if (
MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(
MI));
383 return std::max(WaitStates, checkGetRegHazards(
MI));
386 return std::max(WaitStates, checkSetRegHazards(
MI));
389 return std::max(WaitStates, checkRFEHazards(
MI));
391 if ((ST.hasReadM0MovRelInterpHazard() &&
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
396 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(
MI));
402 return std::max(WaitStates, checkMAIHazards(
MI));
405 return std::max(WaitStates, checkMAILdStHazards(
MI));
408 return std::max(WaitStates, checkPermlaneHazards(
MI));
414 EmittedInstrs.push_front(
nullptr);
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(
nullptr);
425 if (CurrCycleInstr->isBundle()) {
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr =
nullptr;
437 EmittedInstrs.push_front(CurrCycleInstr);
444 EmittedInstrs.push_front(
nullptr);
452 CurrCycleInstr =
nullptr;
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
467template <
typename StateT>
477 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
482 static unsigned getHashValue(
const StateMapKey &
Key) {
483 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
485 static unsigned getHashValue(
const StateT &State) {
486 return StateT::getHashValue(State);
488 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
489 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
491 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
492 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
501 StateT State = InitialState;
504 unsigned WorkIdx = 0;
506 bool Expired =
false;
507 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
512 auto Result = IsHazard(State, *
I);
520 if (
I->isInlineAsm() ||
I->isMetaInstruction())
523 UpdateState(State, *
I);
527 unsigned StateIdx = States.
size();
528 StateMapKey
Key = {&States, StateIdx};
529 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
530 if (Insertion.second) {
533 StateIdx = Insertion.first->second;
536 Worklist.
insert(std::pair(Pred, StateIdx));
539 if (WorkIdx == Worklist.
size())
543 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
544 State = States[StateIdx];
545 I =
MBB->instr_rbegin();
562 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
570 if (
I->isInlineAsm())
573 WaitStates += GetNumWaitStates(*
I);
575 if (IsExpired(*
I, WaitStates))
576 return std::numeric_limits<int>::max();
579 int MinWaitStates = std::numeric_limits<int>::max();
581 if (!Visited.
insert(Pred).second)
585 IsExpired, Visited, GetNumWaitStates);
587 MinWaitStates = std::min(MinWaitStates, W);
590 return MinWaitStates;
601 std::next(
MI->getReverseIterator()), 0, IsExpired,
602 Visited, GetNumWaitStates);
// Member overload taking a hazard predicate, a wait-state limit and a callback
// that yields the number of wait states of an instruction.
// In hazard recognizer mode it forwards to the file-scope
// ::getWaitStatesSince, starting from CurrCycleInstr, with an expiry predicate
// that is true once the accumulated wait states reach Limit.
// Otherwise it iterates EmittedInstrs, adding GetNumWaitStates(*MI) for a
// non-null entry and 1 for a null entry, and compares the running total
// against Limit.
605int GCNHazardRecognizer::getWaitStatesSince(
606 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates)
const {
607 if (IsHazardRecognizerMode) {
608 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
609 return WaitStates >= Limit;
611 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
616 for (MachineInstr *
MI : EmittedInstrs) {
621 if (
MI->isInlineAsm())
// A null entry in EmittedInstrs counts as a single wait state.
624 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
626 if (WaitStates >= Limit)
629 return std::numeric_limits<int>::max();
632int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
// Wait-state query keyed on a register definition: the visible predicate
// accepts an instruction only when IsHazardDef(MI) holds and MI modifies Reg
// (checked with the subtarget's register info).
637int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
638 IsHazardFn IsHazardDef,
640 const SIRegisterInfo *TRI = ST.getRegisterInfo();
643 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
649int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
664 for (MCRegUnit Unit :
TRI.regunits(
Reg))
665 BV.
set(
static_cast<unsigned>(Unit));
677void GCNHazardRecognizer::addClauseInst(
const MachineInstr &
MI)
const {
// Soft-clause check for the memory instruction MEM. The visible code tests
// ST.isXNACKEnabled(), classifies MEM with TII.isSMRD(), iterates
// EmittedInstrs, and finally returns 1 when ClauseDefs and ClauseUses have any
// bit in common, otherwise 0.
689int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM)
const {
692 if (!ST.isXNACKEnabled())
695 bool IsSMRD = TII.isSMRD(*MEM);
709 for (MachineInstr *
MI : EmittedInstrs) {
721 if (ClauseDefs.none())
734 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
// Computes the wait states needed before SMRD. The running maximum is assigned
// the soft-clause result and is returned unchanged when the subtarget does not
// report hasSMRDReadVALUDefHazard(). Otherwise each use operand contributes
// SmrdSgprWaitStates minus the wait states since a hazardous definition of the
// used register, and the largest value is returned.
737int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD)
const {
738 int WaitStatesNeeded = 0;
740 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
743 if (!ST.hasSMRDReadVALUDefHazard())
744 return WaitStatesNeeded;
748 int SmrdSgprWaitStates = 4;
// Hazardous producers: VALU instructions ...
749 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
750 return TII.isVALU(
MI);
// ... and, via a second predicate, SALU instructions.
752 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
753 return TII.isSALU(
MI);
756 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
758 for (
const MachineOperand &Use :
SMRD->uses()) {
761 int WaitStatesNeededForUse =
762 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
764 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
774 int WaitStatesNeededForUse =
775 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
782 return WaitStatesNeeded;
// Computes the wait states needed before VMEM. Starts from the soft-clause
// result. Operands that are not registers, or that are vector registers, are
// filtered by the visible condition. For the remaining uses the requirement is
// VmemSgprWaitStates minus the wait states since a VALU definition of the
// register; the maximum over all uses is returned.
785int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr *VMEM)
const {
786 if (!ST.hasVMEMReadSGPRVALUDefHazard())
789 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
793 const int VmemSgprWaitStates = 5;
794 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
795 return TII.isVALU(
MI);
797 for (
const MachineOperand &Use :
VMEM->uses()) {
798 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
801 int WaitStatesNeededForUse =
802 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
806 return WaitStatesNeeded;
810 const SIRegisterInfo *TRI = ST.getRegisterInfo();
811 const SIInstrInfo *TII = ST.getInstrInfo();
814 int DppVgprWaitStates = 2;
815 int DppExecWaitStates = 5;
816 int WaitStatesNeeded = 0;
817 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
818 return TII->isVALU(
MI);
821 for (
const MachineOperand &Use :
DPP->uses()) {
822 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
824 int WaitStatesNeededForUse =
825 DppVgprWaitStates - getWaitStatesSinceDef(
827 [](
const MachineInstr &) { return true; },
829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
832 WaitStatesNeeded = std::max(
834 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
837 return WaitStatesNeeded;
// Returns DivFMasWaitStates (4) minus the wait states elapsed since a VALU
// instruction defined VCC.
// Note: the local named WaitStatesNeeded holds the elapsed wait states
// returned by getWaitStatesSinceDef, not the requirement itself.
840int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas)
const {
841 const SIInstrInfo *TII = ST.getInstrInfo();
845 const int DivFMasWaitStates = 4;
846 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
847 return TII->isVALU(
MI);
849 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
852 return DivFMasWaitStates - WaitStatesNeeded;
// Returns GetRegWaitStates (2) minus the wait states reported by
// getWaitStatesSinceSetReg for the hazard predicate. The predicate captures
// the hardware register id obtained from GetRegInstr via getHWReg().
// Note: the local named WaitStatesNeeded holds the elapsed wait states.
855int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr)
const {
856 const SIInstrInfo *TII = ST.getInstrInfo();
857 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
859 const int GetRegWaitStates = 2;
860 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
863 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
865 return GetRegWaitStates - WaitStatesNeeded;
// Returns the subtarget's setreg wait-state count (ST.getSetRegWaitStates())
// minus the wait states reported by getWaitStatesSinceSetReg for the hazard
// predicate. The predicate captures the hardware register id obtained from
// SetRegInstr via getHWReg().
868int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr)
const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
870 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
872 const int SetRegWaitStates = ST.getSetRegWaitStates();
873 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
877 return SetRegWaitStates - WaitStatesNeeded;
// Inspects MI by instruction kind. The visible code looks up the vdata operand
// index and its register class id, then has separate branches for MUBUF/MTBUF,
// MIMG and FLAT instructions. The buffer branch consults
// ST.hasGFX940Insts() and the soffset operand; the MIMG branch looks up the
// srsrc operand index.
// NOTE(review): callers in this file use the result as an operand index
// (MI.getOperand(DataIdx)) — confirm the "no hazard" return convention.
880int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI)
const {
884 const SIInstrInfo *TII = ST.getInstrInfo();
885 unsigned Opcode =
MI.getOpcode();
886 const MCInstrDesc &
Desc =
MI.getDesc();
888 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
891 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
893 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
903 if (ST.hasGFX940Insts())
905 const MachineOperand *SOffset =
906 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
907 if (!SOffset || !SOffset->
isReg())
916 if (TII->isMIMG(
MI)) {
917 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
919 Desc.operands()[SRsrcIdx])) == 256);
923 if (TII->isFLAT(
MI)) {
// Helper for a single definition operand. Returns 0 immediately when the
// defined register is not a vector register. Otherwise it scans earlier
// instructions with getWaitStatesSince(Counter, MaxWaitStates), where
// MaxWaitStates is 2 with GFX940 instructions and 1 otherwise. For every
// earlier instruction whose hazard data operand (index from
// createsVALUHazard) overlaps Reg, Counter raises WaitStatesNeeded to at
// least WindowFor(MI) - Distance.
935int GCNHazardRecognizer::checkVALUHazardsHelper(
947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
948 const SIInstrInfo *TII = ST.getInstrInfo();
950 int WaitStatesNeeded = 0;
951 if (!TRI->isVectorRegister(MRI,
Def.getReg()))
952 return WaitStatesNeeded;
955 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
// Per-instruction window; depends on GFX940 support and, for buffer
// instructions, on whether soffset is a register.
960 auto WindowFor = [
this, TII](
const MachineInstr &
MI) ->
int {
961 if (!ST.hasGFX940Insts())
963 if (TII->isBUF(
MI)) {
964 const MachineOperand *SOffset =
965 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
966 if (SOffset && SOffset->
isReg())
// Visitor passed to getWaitStatesSince; it updates WaitStatesNeeded by
// reference as a side effect.
976 auto Counter = [&](
const MachineInstr &
MI) {
977 int DataIdx = createsVALUHazard(
MI);
979 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg)) {
980 int Need = WindowFor(
MI) - Distance;
981 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
985 if (!
MI.isInlineAsm())
989 getWaitStatesSince(Counter, MaxWaitStates);
991 return WaitStatesNeeded;
1007 unsigned Opcode =
MI.getOpcode();
1017 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
1019 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1025 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
1027 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1031 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
1033 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1039 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1060 for (
auto &Operand : VALU->operands()) {
1061 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
// Computes the wait states needed before VALU as the maximum over several
// independent checks, each of the form
//   <required wait states> - getWaitStatesSince(<predicate>, <limit>):
//   - a trans-definition check (limit 1): vdst of an earlier instruction
//     overlapping an explicit use of VALU;
//   - a dst-sel / cvt-scale forwarding check (limit 1), when the subtarget
//     reports either hazard;
//   - checks guarded by ST.hasVDecCoExecHazard() for SGPR uses, VCC reads and
//     the readlane / readfirstlane / writelane opcodes;
//   - when ST.has12DWordStoreHazard(), checkVALUHazardsHelper over every
//     definition of VALU.
1068int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU)
const {
1069 int WaitStatesNeeded = 0;
1072 const int TransDefWaitstates = 1;
1074 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1077 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1078 const SIInstrInfo *TII = ST.getInstrInfo();
1079 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1081 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1082 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1089 int WaitStatesNeededForDef =
1090 TransDefWaitstates -
1091 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1095 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1096 const int Shift16DefWaitstates = 1;
1098 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1100 const MachineOperand *ForwardedDst =
// Inline asm producers are handled by walking all of their definitions.
1106 if (ProducerMI.isInlineAsm()) {
1108 for (
auto &Def : ProducerMI.all_defs()) {
1117 int WaitStatesNeededForDef =
1118 Shift16DefWaitstates -
1119 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1123 if (ST.hasVDecCoExecHazard()) {
1124 const int VALUWriteSGPRVALUReadWaitstates = 2;
1125 const int VALUWriteEXECRWLane = 4;
1126 const int VALUWriteVGPRReadlaneRead = 1;
1128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1129 const MachineRegisterInfo &MRI = MF.getRegInfo();
// UseReg is captured by reference, so the same predicate is reused below for
// whichever register UseReg currently names.
1131 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1134 return MI.modifiesRegister(
UseReg, TRI);
1137 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1142 if (TRI->isSGPRReg(MRI,
UseReg)) {
1143 int WaitStatesNeededForDef =
1144 VALUWriteSGPRVALUReadWaitstates -
1145 getWaitStatesSince(IsVALUDefSGPRFn,
1146 VALUWriteSGPRVALUReadWaitstates);
1147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1151 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1159 switch (
VALU->getOpcode()) {
1160 case AMDGPU::V_READLANE_B32:
1161 case AMDGPU::V_READFIRSTLANE_B32: {
1162 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1164 int WaitStatesNeededForDef =
1165 VALUWriteVGPRReadlaneRead -
1166 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1170 case AMDGPU::V_WRITELANE_B32: {
1172 int WaitStatesNeededForDef =
1173 VALUWriteEXECRWLane -
1174 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1175 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
// The remaining check only applies to subtargets with the 12-dword store
// hazard; everything computed so far is returned otherwise.
1185 if (!ST.has12DWordStoreHazard())
1186 return WaitStatesNeeded;
1188 const MachineRegisterInfo &MRI = MF.getRegInfo();
1190 for (
const MachineOperand &Def :
VALU->defs()) {
1191 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1194 return WaitStatesNeeded;
// Computes the wait states needed before the inline asm instruction IA.
// Only relevant when the subtarget reports the 12-dword store hazard, the
// dst-sel forwarding hazard or the cvt-scale forwarding hazard (visible guard).
// For each vector-register definition operand, checkVALUHazardsHelper is
// applied when the 12-dword store hazard is present. With the dst-sel
// forwarding hazard, a further check (limit 1) looks for producers whose
// destination register IA reads or modifies.
1197int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA)
const {
1206 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1207 !ST.hasCvtScaleForwardingHazard())
1210 const MachineRegisterInfo &MRI = MF.getRegInfo();
1211 int WaitStatesNeeded = 0;
1213 for (
const MachineOperand &
Op :
1215 if (
Op.isReg() &&
Op.isDef()) {
1216 if (!TRI.isVectorRegister(MRI,
Op.getReg()))
1219 if (ST.has12DWordStoreHazard()) {
1221 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op, MRI));
1226 if (ST.hasDstSelForwardingHazard()) {
1227 const int Shift16DefWaitstates = 1;
1229 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
// IA is affected when it either writes or reads the producer's destination.
1233 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1234 IA->readsRegister(Dst->getReg(), &TRI);
1236 if (ProducerMI.isInlineAsm()) {
1238 for (
auto &Def : ProducerMI.all_defs()) {
1239 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1240 IA->readsRegister(
Def.getReg(), &TRI)) {
1249 int WaitStatesNeededForDef =
1250 Shift16DefWaitstates -
1251 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1252 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1255 return WaitStatesNeeded;
// Returns RWLaneWaitStates (4) minus the wait states since a VALU instruction
// defined the lane-select register. The lane select is the src1 operand of
// RWLane; the visible guard tests whether it is a register and an SGPR.
1258int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane)
const {
1259 const SIInstrInfo *TII = ST.getInstrInfo();
1260 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1261 const MachineRegisterInfo &MRI = MF.getRegInfo();
1263 const MachineOperand *LaneSelectOp =
1264 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1266 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->
getReg()))
1270 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
1272 const int RWLaneWaitStates = 4;
1273 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1275 return RWLaneWaitStates - WaitStatesSince;
// Guarded by ST.hasRFEHazards(). Returns RFEWaitStates (1) minus the wait
// states reported by getWaitStatesSinceSetReg for the hazard predicate.
// Note: the local named WaitStatesNeeded holds the elapsed wait states.
1278int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE)
const {
1279 if (!ST.hasRFEHazards())
1282 const SIInstrInfo *TII = ST.getInstrInfo();
1284 const int RFEWaitStates = 1;
1289 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1290 return RFEWaitStates - WaitStatesNeeded;
// Returns ReadM0WaitStates (1) minus the wait states since an SALU instruction
// defined M0.
1293int GCNHazardRecognizer::checkReadM0Hazards(
MachineInstr *
MI)
const {
1294 const SIInstrInfo *TII = ST.getInstrInfo();
1295 const int ReadM0WaitStates = 1;
1296 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
1297 return ReadM0WaitStates -
1298 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1303 int WaitStatesNeeded,
bool IsHoisting) {
1305 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1306 BuildMI(
MBB, InsertPt,
DL, TII.get(AMDGPU::V_NOP_e32));
1310 fixVMEMtoScalarWriteHazards(
MI);
1311 fixVcmpxPermlaneHazards(
MI);
1312 fixSMEMtoVectorWriteHazards(
MI);
1313 fixVcmpxExecWARHazard(
MI);
1314 fixLdsBranchVmemWARHazard(
MI);
1315 if (ST.hasLdsDirect()) {
1316 fixLdsDirectVALUHazard(
MI);
1317 fixLdsDirectVMEMHazard(
MI);
1319 fixVALUPartialForwardingHazard(
MI);
1320 fixVALUTransUseHazard(
MI);
1321 fixVALUTransCoexecutionHazards(
MI);
1323 fixWMMACoexecutionHazards(
MI);
1324 fixShift64HighRegBug(
MI);
1325 fixVALUMaskWriteHazard(
MI);
1326 fixRequiredExportPriority(
MI);
1327 if (ST.requiresWaitIdleBeforeGetReg())
1328 fixGetRegWaitIdle(
MI);
1329 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1330 fixDsAtomicAsyncBarrierArriveB64(
MI);
1331 if (ST.hasScratchBaseForwardingHazard())
1332 fixScratchBaseForwardingHazard(
MI);
1333 if (ST.setRegModeNeedsVNOPs())
1339 return (
TII.isVOPC(
MI) ||
1340 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1341 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
// Fix-up routine for MI. The visible code excludes the V_NOP_e64 and
// V_NOP_sdwa opcodes in a predicate, compares a search result against
// std::numeric_limits<int>::max(), reads the src0 operand of MI together with
// its undef flag, and references the V_MOV_B32_e32 instruction descriptor.
// NOTE(review): presumably a V_MOV_B32 of src0 is built as the fix — confirm.
1344bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1348 const SIInstrInfo *TII = ST.getInstrInfo();
1349 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1355 unsigned Opc =
MI.getOpcode();
1357 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1361 std::numeric_limits<int>::max())
1367 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1369 bool IsUndef = Src0->isUndef();
1371 TII->get(AMDGPU::V_MOV_B32_e32))
// Fix-up routine guarded by ST.hasVMEMtoScalarWriteHazard(); asserts that the
// subtarget does not have extended wait counts. The visible code tests for MI
// having no definitions, and looks in an earlier instruction for a use of each
// register MI defines. A second predicate matches S_WAITCNT with a zero
// immediate and S_WAITCNT_DEPCTR. The S_WAITCNT_DEPCTR descriptor is
// referenced at the end.
// NOTE(review): presumably an S_WAITCNT_DEPCTR is inserted as the fix — confirm.
1378bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1379 if (!ST.hasVMEMtoScalarWriteHazard())
1381 assert(!ST.hasExtendedWaitCounts());
1386 if (
MI->getNumDefs() == 0)
1389 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1395 for (
const MachineOperand &Def :
MI->defs()) {
1396 const MachineOperand *
Op =
1397 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1407 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1408 !
MI.getOperand(0).getImm()) ||
1409 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1414 std::numeric_limits<int>::max())
1417 const SIInstrInfo *TII = ST.getInstrInfo();
1419 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
// Fix-up routine guarded by ST.hasSMEMtoVectorWriteHazard(); asserts that the
// subtarget does not have extended wait counts. The scalar destination operand
// name is vdst for V_READLANE_B32 / V_READFIRSTLANE_B32 and sdst on the other
// visible path. Implicit operands of MI are scanned for definitions of SGPR
// class registers. A later predicate classifies SALU instructions by opcode,
// with special handling of the wait-count opcodes, and separately tests SOPP
// instructions. The S_MOV_B32 descriptor is referenced with SGPR_NULL.
// NOTE(review): presumably an S_MOV_B32 to SGPR_NULL is inserted as the fix —
// confirm.
1424bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1425 if (!ST.hasSMEMtoVectorWriteHazard())
1427 assert(!ST.hasExtendedWaitCounts());
1432 AMDGPU::OpName SDSTName;
1433 switch (
MI->getOpcode()) {
1434 case AMDGPU::V_READLANE_B32:
1435 case AMDGPU::V_READFIRSTLANE_B32:
1436 SDSTName = AMDGPU::OpName::vdst;
1439 SDSTName = AMDGPU::OpName::sdst;
1443 const SIInstrInfo *TII = ST.getInstrInfo();
1444 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1446 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1448 for (
const auto &MO :
MI->implicit_operands()) {
1449 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1460 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1465 if (TII->isSALU(
MI)) {
1466 switch (
MI.getOpcode()) {
1467 case AMDGPU::S_SETVSKIP:
1468 case AMDGPU::S_VERSION:
1469 case AMDGPU::S_WAITCNT_VSCNT:
1470 case AMDGPU::S_WAITCNT_VMCNT:
1471 case AMDGPU::S_WAITCNT_EXPCNT:
1474 case AMDGPU::S_WAITCNT_LGKMCNT:
// Matches only a zero immediate (operand 1) with SGPR_NULL as operand 0.
1476 return (
MI.getOperand(1).getImm() == 0) &&
1477 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1478 case AMDGPU::S_WAITCNT: {
1479 const int64_t
Imm =
MI.getOperand(0).getImm();
1486 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1487 "unexpected wait count instruction");
1489 if (TII->isSOPP(
MI))
1505 std::numeric_limits<int>::max())
1509 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
// Fix-up routine guarded by ST.hasVcmpxExecWARHazard(); asserts that the
// subtarget does not have extended wait counts. Only instructions that modify
// EXEC are of interest (visible guard). The hazard predicate looks for an
// instruction reading EXEC. The expiry predicate inspects an sdst operand,
// implicit SGPR-class definitions and S_WAITCNT_DEPCTR. The S_WAITCNT_DEPCTR
// descriptor is referenced at the end.
// NOTE(review): presumably an S_WAITCNT_DEPCTR is inserted as the fix — confirm.
1514bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1515 if (!ST.hasVcmpxExecWARHazard())
1517 assert(!ST.hasExtendedWaitCounts());
1522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1523 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1529 return I.readsRegister(AMDGPU::EXEC, TRI);
1532 const SIInstrInfo *TII = ST.getInstrInfo();
1533 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1535 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1537 for (
auto MO :
MI.implicit_operands())
1538 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1541 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1548 std::numeric_limits<int>::max())
1552 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1559 if (!ST.hasLdsBranchVmemWARHazard())
1564 bool HasLds =
false;
1565 bool HasVmem =
false;
1566 for (
auto &
MBB : MF) {
1567 for (
auto &
MI :
MBB) {
1570 if (HasLds && HasVmem)
1578 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1580 !
I.getOperand(1).getImm();
// Fix-up routine guarded by RunLdsBranchVmemWARHazardFixup; asserts that the
// subtarget has the LDS-branch-VMEM WAR hazard and no extended wait counts.
// IsHazardInst classifies an instruction into a type value. The outer hazard
// predicate runs a nested search whose inner predicate matches an instruction
// of a different, non-zero type than MI's, and whose inner expiry predicate
// fires on an instruction of the same type. The S_WAITCNT_VSCNT descriptor is
// referenced at the end.
// NOTE(review): presumably an S_WAITCNT_VSCNT is inserted as the fix — confirm.
1583bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1584 if (!RunLdsBranchVmemWARHazardFixup)
1587 assert(ST.hasLdsBranchVmemWARHazard());
1588 assert(!ST.hasExtendedWaitCounts());
1590 auto IsHazardInst = [](
const MachineInstr &
MI) {
1598 auto InstType = IsHazardInst(*
MI);
1602 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1606 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
// Inner predicates shadow the outer IsHazardFn / IsExpiredFn names.
1610 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1611 auto InstType2 = IsHazardInst(
I);
1612 return InstType2 && InstType != InstType2;
1615 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1616 auto InstType2 = IsHazardInst(
I);
1617 if (InstType == InstType2)
1624 std::numeric_limits<int>::max();
1628 std::numeric_limits<int>::max())
1631 const SIInstrInfo *TII = ST.getInstrInfo();
1633 TII->get(AMDGPU::S_WAITCNT_VSCNT))
// Fix-up routine for MI. Searches backwards from the instruction before MI for
// one that reads or modifies MI's vdst register, giving up once
// NoHazardWaitStates (15) wait states have accumulated. The waitvdst operand
// of MI is then set to min(Count, NoHazardWaitStates).
1640bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1644 const int NoHazardWaitStates = 15;
1645 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1648 bool VisitedTrans =
false;
1649 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1654 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1656 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1657 if (WaitStates >= NoHazardWaitStates)
1663 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1667 DenseSet<const MachineBasicBlock *> Visited;
1669 std::next(
MI->getReverseIterator()), 0,
1677 MachineOperand *WaitVdstOp =
1678 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
// Clamp the encoded count to the no-hazard distance.
1679 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
// Fix-up routine for MI. The hazard predicate matches an instruction that
// reads or modifies MI's vdst register. The expiry predicate recognises
// S_WAITCNT with a zero immediate, S_WAITCNT_DEPCTR, and a zero waitvsrc
// operand. When ST.hasLdsWaitVMSRC() is true the fix sets MI's waitvsrc
// operand to 0; the other visible path references the S_WAITCNT_DEPCTR
// descriptor.
// NOTE(review): presumably that path inserts an S_WAITCNT_DEPCTR — confirm.
1684bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1688 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1691 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1694 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1696 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1699 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1701 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1702 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1705 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1709 std::numeric_limits<int>::max())
1712 if (LdsdirCanWait) {
1713 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1716 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Fix-up routine guarded by ST.hasVALUPartialForwardingHazard(); asserts that
// the subtarget does not have extended wait counts. Collects the VGPRs among
// MI's explicit uses and tests for the case of at most one of them.
// The backward search keeps a state with the VALU count at which each source
// VGPR was defined (DefPos), the VALU count at which EXEC was written
// (ExecPos), and a running VALU count. Definitions are split into those at or
// before the EXEC write (PreExecPos) and those after it (PostExecPos). Three
// interval lengths are then compared against the limits:
//   Intv3 = PostExecPos                    (limit Intv3MaxVALUs = 4)
//   Intv2 = ExecPos - PostExecPos - 1      (limit Intv1plus2MaxVALUs = 2)
//   Intv1 = PreExecPos - ExecPos           (limit Intv1plus2MaxVALUs = 2)
// and Intv1 + Intv2 is also compared against Intv1plus2MaxVALUs.
// The S_WAITCNT_DEPCTR descriptor is referenced at the end.
// NOTE(review): presumably an S_WAITCNT_DEPCTR is inserted as the fix — confirm.
1723bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1724 if (!ST.hasVALUPartialForwardingHazard())
1726 assert(!ST.hasExtendedWaitCounts());
1731 SmallSetVector<Register, 4> SrcVGPRs;
1733 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1734 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1739 if (SrcVGPRs.
size() <= 1)
1757 const int Intv1plus2MaxVALUs = 2;
1758 const int Intv3MaxVALUs = 4;
1759 const int IntvMaxVALUs = 6;
1760 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1763 SmallDenseMap<Register, int, 4> DefPos;
// INT_MAX is the "not seen yet" marker for ExecPos.
1764 int ExecPos = std::numeric_limits<int>::max();
1767 static unsigned getHashValue(
const StateType &State) {
1771 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1772 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1780 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1782 if (State.VALUs > NoHazardVALUWaitStates)
1788 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
// Only the first definition found for each source register is recorded.
1796 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1797 State.DefPos[Src] = State.VALUs;
// An EXEC write is recorded only once, and only after some source definition
// has already been seen.
1802 if (State.ExecPos == std::numeric_limits<int>::max()) {
1803 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1804 State.ExecPos = State.VALUs;
1811 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1819 if (State.ExecPos == std::numeric_limits<int>::max())
1822 int PreExecPos = std::numeric_limits<int>::max();
1823 int PostExecPos = std::numeric_limits<int>::max();
1825 for (
auto Entry : State.DefPos) {
1826 int DefVALUs =
Entry.second;
1827 if (DefVALUs != std::numeric_limits<int>::max()) {
1828 if (DefVALUs >= State.ExecPos)
1829 PreExecPos = std::min(PreExecPos, DefVALUs);
1831 PostExecPos = std::min(PostExecPos, DefVALUs);
1836 if (PostExecPos == std::numeric_limits<int>::max())
1840 int Intv3VALUs = PostExecPos;
1841 if (Intv3VALUs > Intv3MaxVALUs)
1845 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1846 if (Intv2VALUs > Intv1plus2MaxVALUs)
1850 if (PreExecPos == std::numeric_limits<int>::max())
1854 int Intv1VALUs = PreExecPos - State.ExecPos;
1855 if (Intv1VALUs > Intv1plus2MaxVALUs)
1859 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1864 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1870 std::next(
MI->getReverseIterator())))
1874 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Fix-up routine guarded by ST.hasVALUTransUseHazard(); asserts that the
// subtarget does not have extended wait counts. Collects the VGPRs among MI's
// explicit uses. The backward search state carries two counters, VALUs and
// TRANS; the visible test compares them against IntvMaxVALUs (5) and
// IntvMaxTRANS (1). The hazard predicate also inspects S_WAITCNT_DEPCTR and
// tests whether an instruction modifies one of the source registers.
// The S_WAITCNT_DEPCTR descriptor is referenced at the end.
// NOTE(review): presumably an S_WAITCNT_DEPCTR is inserted as the fix — confirm.
1880bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1881 if (!ST.hasVALUTransUseHazard())
1883 assert(!ST.hasExtendedWaitCounts());
1888 SmallSet<Register, 4> SrcVGPRs;
1890 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1891 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1905 const int IntvMaxVALUs = 5;
1906 const int IntvMaxTRANS = 1;
1912 static unsigned getHashValue(
const StateType &State) {
// Two states are equal when both counters match.
1915 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1916 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
1923 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1925 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1931 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1938 if (
I.modifiesRegister(Src, &TRI)) {
1946 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1954 std::next(
MI->getReverseIterator())))
1960 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1966bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1967 if (!ST.hasGFX1250Insts() ||
1971 const SIInstrInfo *TII = ST.getInstrInfo();
1972 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1974 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
1979 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1980 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1981 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
1985 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1986 if (!ValuDst || !ValuDst->isReg())
1990 Register ValuDef = ValuDst->getReg();
1991 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
1992 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
2003 const int HasVALU = std::numeric_limits<int>::max();
2004 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
2007 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2015 const SIInstrInfo *TII = ST.getInstrInfo();
2016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2018 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
2025 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2027 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2030 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2032 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2033 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2042 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2043 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2057 std::numeric_limits<int>::max())
2060 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2114 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2115 unsigned Category = 0;
2117 unsigned Latency = SchedModel.computeInstrLatency(&
MI);
2120 Category = IsSWMMAC ? 2 : 0;
2123 Category = IsLowestRateWMMA ? 4 : (IsSWMMAC ? 3 : 1);
2126 assert(IsLowestRateWMMA &&
"latency 32 is not expected");
2136int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI)
const {
2137 if (!ST.hasGFX1250Insts())
2140 const SIInstrInfo *TII = ST.getInstrInfo();
2149 const int WMMAWaitStates[] = {5, 9, 3, 5, 9, 17};
2150 const int VALUWaitStates[] = {4, 8, 2, 4, 8, 16};
2151 unsigned Category = 0;
2153 auto IsWMMAHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2154 if (!TII->isXDLWMMA(
I))
2158 return hasWMMAToWMMARegOverlap(
I, *
MI);
2161 auto IsVALUHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2162 if (!TII->isXDLWMMA(
I))
2166 return hasWMMAToVALURegOverlap(
I, *
MI);
2169 auto GetWaitStatesFn = [](
const MachineInstr &
I) {
2173 int WaitStatesNeeded = -1;
2174 int ExistingVALUs = 0;
2175 bool IsLowestRateWMMA = ST.hasGFX125xLowestRateWMMA();
2182 if (TII->isXDLWMMA(*
MI)) {
2184 const int WMMAWaitsLimit = IsLowestRateWMMA ? 17 : 9;
2186 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2187 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
2190 const int VALUWaitsLimit = IsLowestRateWMMA ? 16 : 8;
2192 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2193 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2196 return WaitStatesNeeded;
2199bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2201 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2202 Register A1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src0)->getReg();
2203 Register B1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src1)->getReg();
2206 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
2210 Register Idx1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2211 if (TRI.regsOverlap(D0, Idx1))
2217bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2220 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2221 for (
const MachineOperand &ValuUse :
MI.explicit_uses()) {
2222 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
2227 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2228 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2232 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2233 WMMARegs.push_back(Idx0);
2236 for (
const MachineOperand &ValuDef :
MI.defs()) {
2237 Register VDstReg = ValuDef.getReg();
2238 for (
Register WMMAReg : WMMARegs) {
2239 if (TRI.regsOverlap(VDstReg, WMMAReg))
2246bool GCNHazardRecognizer::isCoexecutionHazardFor(
const MachineInstr &
I,
2250 if (!TII.isXDLWMMA(
I))
2254 if (TII.isXDLWMMA(
MI))
2255 return hasWMMAToWMMARegOverlap(
I,
MI);
2257 return hasWMMAToVALURegOverlap(
I,
MI);
2263 bool IncludeSubloops) {
2266 for (MachineBasicBlock *
MBB :
L->getBlocks()) {
2267 if (!IncludeSubloops && MLI->getLoopFor(
MBB) != L)
2269 for (MachineInstr &
I : *
MBB) {
2272 if (isCoexecutionHazardFor(
I, *
MI))
2279bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(
MachineInstr *
MI,
2280 int WaitStatesNeeded) {
2284 MachineLoop *
L = MLI->getLoopFor(
MI->getParent());
2286 ++NumWMMAHoistingBailed;
2291 if (hasWMMAHazardInLoop(L,
MI)) {
2292 ++NumWMMAHoistingBailed;
2297 MachineLoop *TargetLoop =
L;
2299 if (hasWMMAHazardInLoop(Parent,
MI,
false))
2301 TargetLoop = Parent;
2307 ++NumWMMAHoistingBailed;
2311 LLVM_DEBUG(
dbgs() <<
"WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2317 NumWMMANopsHoisted += WaitStatesNeeded;
2321bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2322 int WaitStatesNeeded = checkWMMACoexecutionHazards(
MI);
2323 if (WaitStatesNeeded <= 0)
2329 emitVNops(*
MI->getParent(),
MI->getIterator(), WaitStatesNeeded);
2333bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2334 if (!ST.hasShift64HighRegBug())
2336 assert(!ST.hasExtendedWaitCounts());
2338 switch (
MI->getOpcode()) {
2341 case AMDGPU::V_LSHLREV_B64_e64:
2342 case AMDGPU::V_LSHRREV_B64_e64:
2343 case AMDGPU::V_ASHRREV_I64_e64:
2347 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2352 const MachineRegisterInfo &MRI = MF.getRegInfo();
2354 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
2357 if (AmtReg != AMDGPU::VGPR255 && MRI.
isPhysRegUsed(AmtReg + 1))
2360 assert(ST.needsAlignedVGPRs());
2361 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2364 MachineBasicBlock *
MBB =
MI->getParent();
2365 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2376 Register DstReg =
MI->getOperand(0).getReg();
2378 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
2386 bool Overlapped =
MI->modifiesRegister(AmtReg, &TRI);
2388 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2389 : AMDGPU::VGPR_32RegClass) {
2390 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
2396 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2401 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
2414 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
2421 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2427 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2441 MI->getOperand(0).setReg(NewReg);
2450int GCNHazardRecognizer::checkNSAtoVMEMHazard(
MachineInstr *
MI)
const {
2451 int NSAtoVMEMWaitStates = 1;
2453 if (!ST.hasNSAtoVMEMBug())
2459 const SIInstrInfo *TII = ST.getInstrInfo();
2460 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2468 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2469 TII->getInstSizeInBytes(
I) >= 16;
2472 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
2475int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2477 int FPAtomicToDenormModeWaitStates = 3;
2479 if (!ST.hasFPAtomicToDenormModeHazard())
2481 assert(!ST.hasExtendedWaitCounts());
2483 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
2492 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2499 return FPAtomicToDenormModeWaitStates -
2503int GCNHazardRecognizer::checkMAIHazards(
MachineInstr *
MI)
const {
2506 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
2509int GCNHazardRecognizer::checkMFMAPadding(
MachineInstr *
MI)
const {
2514 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2518 int NeighborMFMALatency = 0;
2519 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2520 this](
const MachineInstr &
MI) {
2524 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2528 const int MaxMFMAPipelineWaitStates = 16;
2529 int WaitStatesSinceNeighborMFMA =
2530 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
2532 int NeighborMFMAPaddingNeeded =
2534 WaitStatesSinceNeighborMFMA;
2536 return std::max(0, NeighborMFMAPaddingNeeded);
2539int GCNHazardRecognizer::checkMAIHazards908(
MachineInstr *
MI)
const {
2540 int WaitStatesNeeded = 0;
2541 unsigned Opc =
MI->getOpcode();
2543 auto IsVALUFn = [](
const MachineInstr &
MI) {
2547 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2548 const int LegacyVALUWritesVGPRWaitStates = 2;
2549 const int VALUWritesExecWaitStates = 4;
2550 const int MaxWaitStates = 4;
2552 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2553 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2554 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2556 if (WaitStatesNeeded < MaxWaitStates) {
2557 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2558 const int MaxWaitStates = 2;
2560 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2563 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2564 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2565 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2567 if (WaitStatesNeeded == MaxWaitStates)
2573 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2574 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2577 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2580 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2581 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2582 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2583 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2584 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2585 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2586 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2587 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2588 const int MaxWaitStates = 18;
2590 unsigned HazardDefLatency = 0;
2592 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2593 this](
const MachineInstr &
MI) {
2600 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2601 return TRI.regsOverlap(DstReg,
Reg);
2604 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
2606 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2607 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2608 int OpNo =
Op.getOperandNo();
2609 if (OpNo == SrcCIdx) {
2610 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2611 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2612 switch (HazardDefLatency) {
2613 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2615 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2617 case 16: [[fallthrough]];
2618 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2621 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2622 switch (HazardDefLatency) {
2623 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2625 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2627 case 16: [[fallthrough]];
2628 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2633 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 return WaitStatesNeeded;
2639 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2640 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2643 return TRI.regsOverlap(
Reg, DstReg);
2646 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2647 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2648 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2649 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2650 if (OpNo == SrcCIdx)
2651 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2652 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2653 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2655 WaitStatesNeededForUse = NeedWaitStates -
2656 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2657 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2659 if (WaitStatesNeeded == MaxWaitStates)
2660 return WaitStatesNeeded;
2663 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2664 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2665 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2666 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2667 const int MaxWaitStates = 13;
2668 Register DstReg =
MI->getOperand(0).getReg();
2669 unsigned HazardDefLatency = 0;
2671 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2672 this](
const MachineInstr &
MI) {
2675 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2677 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2678 return TRI.regsOverlap(
Reg, DstReg);
2681 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2683 switch (HazardDefLatency) {
2684 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2686 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2688 case 16: [[fallthrough]];
2689 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2693 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2694 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2698 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2700 return WaitStatesNeeded;
2711 return NumPasses + 1 + IsGFX950;
2722 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2740 return NumPasses + 2;
2750 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2753int GCNHazardRecognizer::checkMAIHazards90A(
MachineInstr *
MI)
const {
2754 int WaitStatesNeeded = 0;
2755 unsigned Opc =
MI->getOpcode();
2757 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2761 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2767 return WaitStatesNeeded;
2769 const int VALUWritesExecWaitStates = 4;
2770 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2771 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2772 VALUWritesExecWaitStates);
2773 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2775 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2778 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2779 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2780 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2781 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2782 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2783 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2784 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2785 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2786 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2787 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2788 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2789 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2790 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2791 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2792 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2793 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2794 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2795 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2796 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2797 const int MaxWaitStates = 19;
2803 const MachineInstr *MI1;
2805 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2806 this](
const MachineInstr &
MI) {
2810 FullReg = (DstReg ==
Reg);
2812 return TRI.regsOverlap(DstReg,
Reg);
2815 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2816 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2817 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2820 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2821 if (NumWaitStates == std::numeric_limits<int>::max())
2824 int OpNo =
Use.getOperandNo();
2826 int NeedWaitStates = 0;
2827 if (OpNo == SrcCIdx) {
2831 }
else if (FullReg) {
2832 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2833 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2834 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2835 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2836 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2837 else if (ST.hasGFX940Insts() &&
2838 TSchedModel.computeInstrLatency(MI1) == 2)
2839 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2842 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2843 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2844 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2845 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2846 if (!TII.isXDL(*
MI))
2849 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2850 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2852 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2853 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2854 if (!TII.isXDL(*
MI))
2855 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2858 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2859 if (ST.hasGFX940Insts()) {
2860 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2867 NumPasses, ST.hasGFX950Insts())
2869 NumPasses, ST.hasGFX950Insts()))
2875 switch (NumPasses) {
2879 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2880 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2885 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2886 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2891 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2892 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2901 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2902 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2903 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2904 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2907 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2908 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2910 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2911 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2912 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2915 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2917 if (ST.hasGFX940Insts()) {
2921 NumPasses, ST.hasGFX950Insts())
2927 switch (NumPasses) {
2929 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2934 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2938 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2942 if (WaitStatesNeeded >= NeedWaitStates)
2945 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2946 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2948 if (WaitStatesNeeded == MaxWaitStates)
2953 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2955 return WaitStatesNeeded;
2958int GCNHazardRecognizer::checkMAILdStHazards(
MachineInstr *
MI)
const {
2960 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2963 int WaitStatesNeeded = 0;
2965 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2966 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2969 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2970 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2975 const int AccVgprReadLdStWaitStates = 2;
2976 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2977 const int MaxWaitStates = 2;
2979 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2980 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2981 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2983 if (WaitStatesNeeded == MaxWaitStates)
2984 return WaitStatesNeeded;
2986 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2987 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2988 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2990 auto IsVALUFn = [](
const MachineInstr &
MI) {
2993 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2994 std::numeric_limits<int>::max();
2997 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2998 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2999 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3002 return WaitStatesNeeded;
3005int GCNHazardRecognizer::checkPermlaneHazards(
MachineInstr *
MI)
const {
3006 assert(!ST.hasVcmpxPermlaneHazard() &&
3007 "this is a different vcmpx+permlane hazard");
3008 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3009 const SIInstrInfo *TII = ST.getInstrInfo();
3011 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
3015 auto IsVALUFn = [](
const MachineInstr &
MI) {
3019 const int VCmpXWritesExecWaitStates = 4;
3020 const int VALUWritesVDstWaitStates = 2;
3021 int WaitStatesNeeded = 0;
3023 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
3024 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
3028 int WaitStatesSinceDef =
3029 VALUWritesVDstWaitStates -
3030 getWaitStatesSinceDef(
Reg, IsVALUFn,
3031 VALUWritesVDstWaitStates);
3032 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3033 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
3037 int VCmpXHazardWaits =
3038 VCmpXWritesExecWaitStates -
3039 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3041 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3042 return WaitStatesNeeded;
3050 return NumPasses + 2;
3060 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3070 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3078 return NumPasses + 2;
3081int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3082 if (!ST.hasGFX90AInsts())
3085 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3093 const MachineRegisterInfo &MRI = MF.getRegInfo();
3095 int WaitStatesNeeded = 0;
3101 const MachineInstr *
MFMA =
nullptr;
3103 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3105 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3111 const MachineInstr *
DOT =
nullptr;
3112 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3114 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3120 bool DGEMMAfterVALUWrite =
false;
3121 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3124 DGEMMAfterVALUWrite =
true;
3128 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
3134 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3135 AMDGPU::OpName::src2);
3137 if (IsMemOrExport || IsVALU) {
3138 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3139 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3140 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3141 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3142 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3143 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3144 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3145 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3146 const int DotWriteSameDotReadSrcAB = 3;
3147 const int DotWriteDifferentVALURead = 3;
3148 const int DMFMABetweenVALUWriteVMEMRead = 2;
3149 const int MaxWaitStates = 19;
3151 for (
const MachineOperand &Use :
MI->explicit_uses()) {
3157 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3160 int NeedWaitStates = 0;
3161 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3162 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3163 NeedWaitStates = DotWriteSameDotReadSrcAB;
3165 NeedWaitStates = DotWriteDifferentVALURead;
3168 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3169 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3176 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3177 DGEMMAfterVALUWrite =
false;
3178 if (TRI.isVectorRegister(MRI,
Reg)) {
3179 int WaitStatesNeededForUse =
3180 DMFMABetweenVALUWriteVMEMRead -
3181 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3182 DMFMABetweenVALUWriteVMEMRead);
3184 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3189 WaitStatesSinceDef =
3190 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3194 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3195 int NumPasses = HazardDefLatency;
3196 int NeedWaitStates = MaxWaitStates;
3199 switch (HazardDefLatency) {
3201 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3202 : DMFMA4x4WriteVgprVALUReadWaitStates;
3208 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3209 : (ST.hasGFX950Insts()
3210 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3211 : DMFMA16x16WriteVgprVALUReadWaitStates);
3216 }
else if (ST.hasGFX940Insts()) {
3220 NumPasses, ST.hasGFX950Insts())
3224 switch (HazardDefLatency) {
3226 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3229 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3232 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3239 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3240 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3242 if (WaitStatesNeeded == MaxWaitStates)
3247 unsigned Opc =
MI->getOpcode();
3248 const int DMFMAToFMA64WaitStates = 2;
3249 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3250 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3251 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3252 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3253 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3254 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3255 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3258 if (!IsVALU && !IsMemOrExport)
3259 return WaitStatesNeeded;
3261 for (
const MachineOperand &Def :
MI->defs()) {
3262 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3263 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3264 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3265 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3266 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3267 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3268 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3269 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3270 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3271 const int DotWriteDifferentVALUWrite = 3;
3272 const int MaxWaitStates = 19;
3273 const int MaxWarWaitStates = 15;
3278 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3280 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
3281 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3282 WaitStatesSinceDef);
3285 WaitStatesSinceDef =
3286 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3288 int NeedWaitStates = MaxWaitStates;
3289 int NumPasses = TSchedModel.computeInstrLatency(
MFMA);
3292 switch (NumPasses) {
3294 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3298 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3303 }
else if (ST.hasGFX940Insts()) {
3307 NumPasses, ST.hasGFX950Insts())
3310 switch (NumPasses) {
3312 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3315 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3318 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3325 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3326 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3328 if (WaitStatesNeeded == MaxWaitStates)
3332 auto IsSMFMAReadAsCFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3334 !
MI.readsRegister(
Reg, &TRI))
3337 if (ST.hasGFX940Insts() && !TII.isXDL(
MI))
3340 const MachineOperand *SrcC =
3341 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
3351 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3356 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3357 int NeedWaitStates = MaxWaitStates;
3358 switch (HazardDefLatency) {
3359 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3361 case 4:
assert(ST.hasGFX940Insts());
3362 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3364 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3366 case 16: [[fallthrough]];
3367 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3371 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3372 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3375 return WaitStatesNeeded;
3388 return MAI !=
nullptr;
3392 if (IsMFMAFn(*
MI)) {
3393 int W = getWaitStatesSince(IsMFMAFn, 16);
3395 return W < (int)TSchedModel.computeInstrLatency(MAI);
3409 while (
I->isBundledWithPred())
3415 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
3419 const unsigned NewBytes = 4;
3421 "Unexpected instruction insertion in bundle");
3424 while (NextMI != End && NextMI->isBundledWithPred()) {
3425 for (
auto &Operand : NextMI->operands()) {
3426 if (Operand.isGlobal())
3427 Operand.setOffset(Operand.getOffset() + NewBytes);
3433bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
3434 if (!ST.hasVALUMaskWriteHazard())
3436 assert(!ST.hasExtendedWaitCounts());
3443 if (!IsSALU && !IsVALU)
3455 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3456 const MachineRegisterInfo &MRI = MF.getRegInfo();
3461 case AMDGPU::EXEC_LO:
3462 case AMDGPU::EXEC_HI:
3464 case AMDGPU::SGPR_NULL:
3465 case AMDGPU::SGPR_NULL64:
3473 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
3477 SmallSet<Register, 2> HazardSGPRs;
3479 static unsigned getHashValue(
const StateType &State) {
3482 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
3483 return LHS.HazardSGPRs ==
RHS.HazardSGPRs;
3487 SmallVector<const MachineInstr *> WaitInstrs;
3488 bool HasSGPRRead =
false;
3489 StateType InitialState;
3492 MachineOperand *HazardDef =
nullptr;
3493 for (MachineOperand &
Op :
MI->operands()) {
3496 if (
Op.isDef() && HazardDef)
3500 if (IgnoreableSGPR(
Reg))
3503 if (
Op.isImplicit())
3505 if (!TRI->isSGPRReg(MRI,
Reg))
3523 if (AMDGPU::SReg_32RegClass.
contains(HazardReg)) {
3524 InitialState.HazardSGPRs.insert(HazardReg);
3527 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3528 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
3531 auto IsHazardFn = [&](StateType &State,
const MachineInstr &
I) {
3532 if (State.HazardSGPRs.empty())
3535 switch (
I.getOpcode()) {
3536 case AMDGPU::V_ADDC_U32_e32:
3537 case AMDGPU::V_ADDC_U32_dpp:
3538 case AMDGPU::V_CNDMASK_B16_t16_e32:
3539 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3540 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3541 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3542 case AMDGPU::V_CNDMASK_B32_e32:
3543 case AMDGPU::V_CNDMASK_B32_dpp:
3544 case AMDGPU::V_DIV_FMAS_F32_e64:
3545 case AMDGPU::V_DIV_FMAS_F64_e64:
3546 case AMDGPU::V_SUBB_U32_e32:
3547 case AMDGPU::V_SUBB_U32_dpp:
3548 case AMDGPU::V_SUBBREV_U32_e32:
3549 case AMDGPU::V_SUBBREV_U32_dpp: {
3553 case AMDGPU::V_ADDC_U32_e64:
3554 case AMDGPU::V_ADDC_U32_e64_dpp:
3555 case AMDGPU::V_CNDMASK_B16_t16_e64:
3556 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3557 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3558 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3559 case AMDGPU::V_CNDMASK_B32_e64:
3560 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3561 case AMDGPU::V_SUBB_U32_e64:
3562 case AMDGPU::V_SUBB_U32_e64_dpp:
3563 case AMDGPU::V_SUBBREV_U32_e64:
3564 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3566 const MachineOperand *SSRCOp = TII.getNamedOperand(
I, AMDGPU::OpName::src2);
3568 bool Result = TRI->regsOverlap(SSRCOp->
getReg(), HazardReg);
3580 auto UpdateStateFn = [&](StateType &State,
const MachineInstr &
I) {
3581 switch (
I.getOpcode()) {
3582 case AMDGPU::S_WAITCNT_DEPCTR:
3584 if (!HasSGPRRead &&
I.getParent() ==
MI->getParent() && !
I.isBundled() &&
3585 (
I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3590 for (
auto &
Op :
I.operands()) {
3595 if (IgnoreableSGPR(
Reg))
3598 if (
Op.isImplicit())
3600 if (!TRI->isSGPRReg(MRI,
Reg))
3611 for (
Register SGPR : State.HazardSGPRs) {
3612 if (
Reg == SGPR || TRI->regsOverlap(
Reg, SGPR))
3616 State.HazardSGPRs.erase(SGPR);
3625 std::next(
MI->getReverseIterator())))
3635 if (!WaitInstrs.
empty()) {
3639 SmallVector<MachineInstr *> ToErase;
3641 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3642 End = MI->getParent()->rend();
3643 Found < WaitInstrs.size() && It != End; ++It) {
3644 MachineInstr *WaitMI = &*It;
3646 if (std::as_const(WaitMI) != WaitInstrs[Found])
3649 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3650 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3651 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3652 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3653 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3654 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3655 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3656 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3657 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3658 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3659 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3660 ToErase.push_back(WaitMI);
3663 for (MachineInstr *WaitMI : ToErase)
3664 WaitMI->eraseFromParent();
3668 auto NextMI = std::next(
MI->getIterator());
3669 auto NewMI =
BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
3670 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3682 if (EntryMBB.
begin() != EntryMBB.
end()) {
3683 auto &EntryMI = *EntryMBB.
begin();
3684 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3685 EntryMI.getOperand(0).getImm() >= Priority)
3694bool GCNHazardRecognizer::fixRequiredExportPriority(
MachineInstr *
MI) {
3695 if (!ST.hasRequiredExportPriority())
3700 MachineBasicBlock *
MBB =
MI->getParent();
3713 const int MaxPriority = 3;
3714 const int NormalPriority = 2;
3715 const int PostExportPriority = 0;
3717 auto It =
MI->getIterator();
3718 switch (
MI->getOpcode()) {
3719 case AMDGPU::S_ENDPGM:
3720 case AMDGPU::S_ENDPGM_SAVED:
3721 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3722 case AMDGPU::SI_RETURN_TO_EPILOG:
3725 if (MF->getFrameInfo().hasCalls())
3728 case AMDGPU::S_SETPRIO: {
3730 auto &PrioOp =
MI->getOperand(0);
3731 int Prio = PrioOp.getImm();
3732 bool InWA = (Prio == PostExportPriority) &&
3733 (It !=
MBB->
begin() && TII.isEXP(*std::prev(It)));
3734 if (InWA || Prio >= NormalPriority)
3736 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3740 if (!TII.isEXP(*
MI))
3751 auto NextMI = std::next(It);
3752 bool EndOfShader =
false;
3753 if (NextMI !=
MBB->
end()) {
3755 if (TII.isEXP(*NextMI))
3758 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3759 NextMI->getOperand(0).getImm() == PostExportPriority)
3761 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3768 .
addImm(PostExportPriority);
3772 BuildMI(*
MBB, NextMI,
DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3773 .
addReg(AMDGPU::SGPR_NULL)
3793 const SIInstrInfo *TII = ST.getInstrInfo();
3805 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3810bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(
MachineInstr *
MI) {
3811 if (
MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3814 const SIInstrInfo *TII = ST.getInstrInfo();
3816 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3818 BuildMI(*
MI->getParent(), std::next(
MI->getIterator()),
MI->getDebugLoc(),
3819 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3825bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(
MachineInstr *
MI) {
3828 if (!IsHazardRecognizerMode)
3831 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3832 const SIInstrInfo *TII = ST.getInstrInfo();
3834 const int FlatScrBaseWaitStates = 10;
3836 bool ReadsFlatScrLo =
3837 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3838 bool ReadsFlatScrHi =
3839 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3845 ReadsFlatScrLo =
true;
3848 ReadsFlatScrHi =
true;
3853 const MachineRegisterInfo &MRI = MF.getRegInfo();
3856 DenseSet<const MachineBasicBlock *> Visited;
3858 return MI.modifiesRegister(
Reg, TRI);
3863 auto IsSGPRDef = [TII, TRI, &MRI](
const MachineInstr &
MI) ->
unsigned {
3864 if (!TII->isSALU(
MI) && !TII->isVALU(
MI))
3866 for (
const MachineOperand &MO :
MI.all_defs()) {
3867 if (TRI->isSGPRReg(MRI, MO.getReg()))
3873 auto IsExpiredFn = [=](
const MachineInstr &
MI,
int SgprWrites) {
3874 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3875 unsigned Wait =
MI.getOperand(0).getImm();
3880 return SgprWrites >= FlatScrBaseWaitStates;
3883 return ::getWaitStatesSince(
3884 IsHazardFn,
MI->getParent(), std::next(
MI->getReverseIterator()),
3885 0,
IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3889 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3891 !IsRegDefHazard(AMDGPU::SGPR103)))
3895 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3906 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3907 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand Dst, which has a dest sel forwarding issue.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel, const GCNSubtarget &ST)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const uint32_t IV[8]
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
NodeAddr< UseNode * > Use
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
constexpr RegState getDeadRegState(bool B)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...