28#define DEBUG_TYPE "gcn-hazard-recognizer"
31 "Number of WMMA hazard V_NOPs hoisted from loops");
33 "Number of WMMA hazards where V_NOP hoisting was not possible");
// Command-line value parser built on cl::parser<unsigned>.
// The two error paths visible in parse() report (a) an argument that is not a
// valid unsigned integer and (b) a value outside the range [0, 100].
// NOTE(review): the conditions guarding each O.error() call are not visible in
// this listing — confirm against the complete definition.
37struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
40 bool parse(cl::Option &O, StringRef ArgName, StringRef Arg,
unsigned &
Value) {
// Error path: Arg was not accepted as a uint.
42 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
// Error path: the value is not within [0, 100].
45 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
55 cl::desc(
"Fill a percentage of the latency between "
56 "neighboring MFMA with s_nops."));
61 cl::desc(
"Insert a s_nop x before every instruction"));
65 cl::desc(
"Hoist WMMA hazard V_NOPs from loops to preheaders"));
// Member initializers: hazard-recognizer mode starts as false and there is no
// current-cycle instruction; the subtarget, instruction info, register info
// and scheduling model are taken from MF/ST/TII; ClauseUses and ClauseDefs are
// each constructed from TRI.getNumRegUnits().
// NOTE(review): the constructor's parameter list is not visible in this
// listing.
76 : IsHazardRecognizerMode(
false), CurrCycleInstr(nullptr), MF(MF),
77 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()),
78 TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
79 ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
// Look-ahead window: 19 when AGPR0 is reported as a used physical register
// for this function, otherwise 5.
80 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
85 EmittedInstrs.clear();
97 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
101 return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
106 case AMDGPU::S_SETREG_B32:
107 case AMDGPU::S_SETREG_B32_mode:
108 case AMDGPU::S_SETREG_IMM32_B32:
109 case AMDGPU::S_SETREG_IMM32_B32_mode:
116 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
120 return Opcode == AMDGPU::S_RFE_B64;
125 case AMDGPU::S_MOVRELS_B32:
126 case AMDGPU::S_MOVRELS_B64:
127 case AMDGPU::S_MOVRELD_B32:
128 case AMDGPU::S_MOVRELD_B64:
137 if (
TII.isAlwaysGDS(
MI.getOpcode()))
140 switch (
MI.getOpcode()) {
141 case AMDGPU::S_SENDMSG:
142 case AMDGPU::S_SENDMSGHALT:
143 case AMDGPU::S_TTRACEDATA:
147 case AMDGPU::DS_PERMUTE_B32:
148 case AMDGPU::DS_BPERMUTE_B32:
151 if (
TII.isDS(
MI.getOpcode())) {
152 int GDS = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
153 AMDGPU::OpName::gds);
154 if (
MI.getOperand(GDS).getImm())
162 unsigned Opcode =
MI.getOpcode();
163 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
164 Opcode == AMDGPU::V_PERMLANE64_B32 ||
165 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
167 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
169 Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
170 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
171 Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
172 Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
173 Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
174 Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
175 Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
176 Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
186 AMDGPU::OpName::simm16);
203 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(
MI) > 0)
206 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
210 if (!IsHazardRecognizerMode) {
211 if (checkWMMACoexecutionHazards(
MI) > 0)
215 if (ST.hasNoDataDepHazard())
227 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
230 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
235 checkMAIVALUHazards(
MI) > 0)
238 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
241 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
244 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
247 if (((ST.hasReadM0MovRelInterpHazard() &&
249 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
250 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
252 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
253 (ST.hasReadM0LdsDirectHazard() &&
254 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr))) &&
255 checkReadM0Hazards(
MI) > 0)
262 checkMAILdStHazards(
MI) > 0)
265 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
273 while (Quantity > 0) {
274 unsigned Arg = std::min(Quantity, 8u);
// Returns the ReleaseAtCycle value of the first write-proc-resource entry of
// MI's scheduling class, as resolved by TSchedModel.
// NOTE(review): the return type and the closing brace are not visible in this
// listing.
282GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
// Resolve the scheduling class for this instruction.
283 const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&
MI);
// The class must have at least one write-proc-resource entry; the return
// below dereferences the first one.
284 assert(TSchedModel.getWriteProcResBegin(SC) !=
285 TSchedModel.getWriteProcResEnd(SC));
286 return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
289void GCNHazardRecognizer::processBundle() {
293 for (;
MI !=
E &&
MI->isInsideBundle(); ++
MI) {
294 CurrCycleInstr = &*
MI;
297 if (IsHazardRecognizerMode) {
298 fixHazards(CurrCycleInstr);
306 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
307 EmittedInstrs.push_front(
nullptr);
309 EmittedInstrs.push_front(CurrCycleInstr);
312 CurrCycleInstr =
nullptr;
316 assert(IsHazardRecognizerMode);
320 if (
MI->isInsideBundle())
330 IsHazardRecognizerMode =
true;
334 CurrCycleInstr =
nullptr;
349 return std::max(WaitStates, checkSMRDHazards(
MI));
351 if (ST.hasNSAtoVMEMBug())
352 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
354 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
356 if (ST.hasNoDataDepHazard())
360 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
363 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
366 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
369 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
372 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
376 checkMAIVALUHazards(
MI) > 0)
377 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
379 if (
MI->isInlineAsm())
380 return std::max(WaitStates, checkInlineAsmHazards(
MI));
383 return std::max(WaitStates, checkGetRegHazards(
MI));
386 return std::max(WaitStates, checkSetRegHazards(
MI));
389 return std::max(WaitStates, checkRFEHazards(
MI));
391 if ((ST.hasReadM0MovRelInterpHazard() &&
393 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
394 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
396 (ST.hasReadM0LdsDmaHazard() &&
isLdsDma(*
MI)) ||
397 (ST.hasReadM0LdsDirectHazard() &&
398 MI->readsRegister(AMDGPU::LDS_DIRECT,
nullptr)))
399 return std::max(WaitStates, checkReadM0Hazards(
MI));
402 return std::max(WaitStates, checkMAIHazards(
MI));
405 return std::max(WaitStates, checkMAILdStHazards(
MI));
408 return std::max(WaitStates, checkPermlaneHazards(
MI));
414 EmittedInstrs.push_front(
nullptr);
420 if (!CurrCycleInstr) {
421 EmittedInstrs.push_front(
nullptr);
425 if (CurrCycleInstr->isBundle()) {
430 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
431 if (!NumWaitStates) {
432 CurrCycleInstr =
nullptr;
437 EmittedInstrs.push_front(CurrCycleInstr);
444 EmittedInstrs.push_front(
nullptr);
452 CurrCycleInstr =
nullptr;
456 assert(!IsHazardRecognizerMode &&
457 "Bottom-up scheduling shouldn't run in hazard recognizer mode");
467template <
typename StateT>
477 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
482 static unsigned getHashValue(
const StateMapKey &
Key) {
483 return StateT::getHashValue((*
Key.States)[
Key.Idx]);
485 static unsigned getHashValue(
const StateT &State) {
486 return StateT::getHashValue(State);
488 static bool isEqual(
const StateMapKey &
LHS,
const StateMapKey &
RHS) {
489 return StateT::isEqual((*
LHS.States)[
LHS.Idx], (*
RHS.States)[
RHS.Idx]);
491 static bool isEqual(
const StateT &
LHS,
const StateMapKey &
RHS) {
492 return StateT::isEqual(
LHS, (*
RHS.States)[
RHS.Idx]);
501 StateT State = InitialState;
504 unsigned WorkIdx = 0;
506 bool Expired =
false;
507 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
512 auto Result = IsHazard(State, *
I);
520 if (
I->isInlineAsm() ||
I->isMetaInstruction())
523 UpdateState(State, *
I);
527 unsigned StateIdx = States.
size();
528 StateMapKey
Key = {&States, StateIdx};
529 auto Insertion = StateMap.
insert_as(std::pair(
Key, StateIdx), State);
530 if (Insertion.second) {
533 StateIdx = Insertion.first->second;
536 Worklist.
insert(std::pair(Pred, StateIdx));
539 if (WorkIdx == Worklist.
size())
543 std::tie(
MBB, StateIdx) = Worklist[WorkIdx++];
544 State = States[StateIdx];
545 I =
MBB->instr_rbegin();
562 for (
auto E =
MBB->instr_rend();
I !=
E; ++
I) {
570 if (
I->isInlineAsm())
573 WaitStates += GetNumWaitStates(*
I);
575 if (IsExpired(*
I, WaitStates))
576 return std::numeric_limits<int>::max();
579 int MinWaitStates = std::numeric_limits<int>::max();
581 if (!Visited.
insert(Pred).second)
585 IsExpired, Visited, GetNumWaitStates);
587 MinWaitStates = std::min(MinWaitStates, W);
590 return MinWaitStates;
601 std::next(
MI->getReverseIterator()), 0, IsExpired,
602 Visited, GetNumWaitStates);
605int GCNHazardRecognizer::getWaitStatesSince(
606 IsHazardFn IsHazard,
int Limit, GetNumWaitStatesFn GetNumWaitStates)
const {
607 if (IsHazardRecognizerMode) {
608 auto IsExpiredFn = [Limit](
const MachineInstr &,
int WaitStates) {
609 return WaitStates >= Limit;
611 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn,
616 for (MachineInstr *
MI : EmittedInstrs) {
621 if (
MI->isInlineAsm())
624 WaitStates +=
MI ? GetNumWaitStates(*
MI) : 1;
626 if (WaitStates >= Limit)
629 return std::numeric_limits<int>::max();
632int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
637int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
638 IsHazardFn IsHazardDef,
640 const SIRegisterInfo *TRI = ST.getRegisterInfo();
643 return IsHazardDef(
MI) &&
MI.modifiesRegister(
Reg, TRI);
649int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
664 for (MCRegUnit Unit :
TRI.regunits(
Reg))
665 BV.
set(
static_cast<unsigned>(Unit));
677void GCNHazardRecognizer::addClauseInst(
const MachineInstr &
MI)
const {
689int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM)
const {
692 if (!ST.isXNACKEnabled())
695 bool IsSMRD = TII.isSMRD(*MEM);
709 for (MachineInstr *
MI : EmittedInstrs) {
721 if (ClauseDefs.none())
734 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
737int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD)
const {
738 int WaitStatesNeeded = 0;
740 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
743 if (!ST.hasSMRDReadVALUDefHazard())
744 return WaitStatesNeeded;
748 int SmrdSgprWaitStates = 4;
749 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
750 return TII.isVALU(
MI);
752 auto IsBufferHazardDefFn = [
this](
const MachineInstr &
MI) {
753 return TII.isSALU(
MI);
756 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
758 for (
const MachineOperand &Use :
SMRD->uses()) {
761 int WaitStatesNeededForUse =
762 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
764 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
774 int WaitStatesNeededForUse =
775 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
778 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
782 return WaitStatesNeeded;
// Computes a wait-state count for VMEM: starts from the soft-clause check and
// raises it, per register use that is not a vector register, to
// VmemSgprWaitStates minus the wait states since that register was last
// defined by an instruction for which TII.isVALU() holds.
// NOTE(review): several lines of this definition (guard bodies, `continue`,
// trailing call arguments, closing braces) are not visible in this listing.
785int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr *VMEM)
const {
// Subtarget feature guard; the statement it controls is not visible here.
786 if (!ST.hasVMEMReadSGPRVALUDefHazard())
// Baseline requirement comes from the soft-clause check.
789 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
// Window (in wait states) that each per-use requirement is subtracted from.
793 const int VmemSgprWaitStates = 5;
// Predicate selecting the defining instructions of interest: VALU only.
794 auto IsHazardDefFn = [
this](
const MachineInstr &
MI) {
795 return TII.isVALU(
MI);
797 for (
const MachineOperand &Use :
VMEM->uses()) {
// Filter on non-register operands and vector registers.
// NOTE(review): presumably these are skipped — the controlled statement is
// not visible.
798 if (!
Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(),
Use.getReg()))
// Requirement for this use = window - wait states since its matching def.
801 int WaitStatesNeededForUse =
802 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
// Keep the largest requirement seen so far.
804 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
806 return WaitStatesNeeded;
810 const SIRegisterInfo *TRI = ST.getRegisterInfo();
811 const SIInstrInfo *TII = ST.getInstrInfo();
814 int DppVgprWaitStates = 2;
815 int DppExecWaitStates = 5;
816 int WaitStatesNeeded = 0;
817 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
818 return TII->isVALU(
MI);
821 for (
const MachineOperand &Use :
DPP->uses()) {
822 if (!
Use.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Use.getReg()))
824 int WaitStatesNeededForUse =
825 DppVgprWaitStates - getWaitStatesSinceDef(
827 [](
const MachineInstr &) { return true; },
829 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
832 WaitStatesNeeded = std::max(
834 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
837 return WaitStatesNeeded;
// Returns DivFMasWaitStates minus the number of wait states since VCC was
// last defined by an instruction for which TII->isVALU() holds.
// NOTE(review): the DivFMas parameter is not referenced in the lines visible
// here; parts of this definition are missing from this listing.
840int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas)
const {
841 const SIInstrInfo *TII = ST.getInstrInfo();
// Window (in wait states) that the elapsed count is subtracted from.
845 const int DivFMasWaitStates = 4;
// Predicate selecting the defining instructions of interest: VALU only.
846 auto IsHazardDefFn = [TII](
const MachineInstr &
MI) {
847 return TII->isVALU(
MI);
// Despite its name, this local holds the wait states elapsed since the
// matching VCC def; the subtraction below yields the actual requirement.
// NOTE(review): the remaining call arguments are not visible here.
849 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
// May be zero or negative; the visible call site tests the result with `> 0`.
852 return DivFMasWaitStates - WaitStatesNeeded;
// Returns GetRegWaitStates minus the value reported by
// getWaitStatesSinceSetReg() for IsHazardFn.
// NOTE(review): the body of IsHazardFn is not visible in this listing; it
// captures TII and the hardware register id obtained from GetRegInstr.
855int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr)
const {
856 const SIInstrInfo *TII = ST.getInstrInfo();
// Hardware register id extracted from this instruction by getHWReg().
857 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
// Window (in wait states) that the elapsed count is subtracted from.
859 const int GetRegWaitStates = 2;
860 auto IsHazardFn = [TII, GetRegHWReg](
const MachineInstr &
MI) {
// Despite its name, this local holds the value returned by the look-back
// query; the subtraction below yields the actual requirement.
863 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
865 return GetRegWaitStates - WaitStatesNeeded;
// Returns SetRegWaitStates minus the value reported by
// getWaitStatesSinceSetReg() for IsHazardFn.
// NOTE(review): the body of IsHazardFn is not visible in this listing; it
// captures TII and the hardware register id obtained from SetRegInstr.
868int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr)
const {
869 const SIInstrInfo *TII = ST.getInstrInfo();
// Hardware register id extracted from this instruction by getHWReg().
870 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
// Unlike the fixed constants used by neighbouring checks, this window is
// queried from the subtarget.
872 const int SetRegWaitStates = ST.getSetRegWaitStates();
873 auto IsHazardFn = [TII, HWReg](
const MachineInstr &
MI) {
// Despite its name, this local holds the value returned by the look-back
// query; the subtraction below yields the actual requirement.
876 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
877 return SetRegWaitStates - WaitStatesNeeded;
880int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI)
const {
884 const SIInstrInfo *TII = ST.getInstrInfo();
885 unsigned Opcode =
MI.getOpcode();
886 const MCInstrDesc &
Desc =
MI.getDesc();
888 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
891 VDataRCID = TII->getOpRegClassID(
Desc.operands()[VDataIdx]);
893 if (TII->isMUBUF(
MI) || TII->isMTBUF(
MI)) {
903 if (ST.hasGFX940Insts())
905 const MachineOperand *SOffset =
906 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
907 if (!SOffset || !SOffset->
isReg())
916 if (TII->isMIMG(
MI)) {
917 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
919 Desc.operands()[SRsrcIdx])) == 256);
923 if (TII->isFLAT(
MI)) {
935int GCNHazardRecognizer::checkVALUHazardsHelper(
947 const SIRegisterInfo *TRI = ST.getRegisterInfo();
948 const SIInstrInfo *TII = ST.getInstrInfo();
950 int WaitStatesNeeded = 0;
951 if (!TRI->isVectorRegister(MRI,
Def.getReg()))
952 return WaitStatesNeeded;
955 const int MaxWaitStates = ST.hasGFX940Insts() ? 2 : 1;
960 auto WindowFor = [
this, TII](
const MachineInstr &
MI) ->
int {
961 if (!ST.hasGFX940Insts())
963 if (TII->isBUF(
MI)) {
964 const MachineOperand *SOffset =
965 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
966 if (SOffset && SOffset->
isReg())
976 auto Counter = [&](
const MachineInstr &
MI) {
977 int DataIdx = createsVALUHazard(
MI);
979 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(),
Reg)) {
980 int Need = WindowFor(
MI) - Distance;
981 WaitStatesNeeded = std::max(WaitStatesNeeded, Need);
985 if (!
MI.isInlineAsm())
989 getWaitStatesSince(Counter, MaxWaitStates);
991 return WaitStatesNeeded;
1007 unsigned Opcode =
MI.getOpcode();
1017 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
1019 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1025 if (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src0_modifiers) &
1027 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1031 (
TII->getNamedImmOperand(
MI, AMDGPU::OpName::src2_modifiers) &
1033 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1039 return TII->getNamedOperand(
MI, AMDGPU::OpName::vdst);
1060 for (
auto &Operand : VALU->operands()) {
1061 if (Operand.isReg() &&
TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
1068int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU)
const {
1069 int WaitStatesNeeded = 0;
1072 const int TransDefWaitstates = 1;
1074 auto IsTransDefFn = [
this,
VALU](
const MachineInstr &
MI) {
1077 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1078 const SIInstrInfo *TII = ST.getInstrInfo();
1079 Register Def = TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)->getReg();
1081 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1082 if (
Use.isReg() && TRI->regsOverlap(Def,
Use.getReg()))
1089 int WaitStatesNeededForDef =
1090 TransDefWaitstates -
1091 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
1092 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1095 if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
1096 const int Shift16DefWaitstates = 1;
1098 auto IsShift16BitDefFn = [
this,
VALU](
const MachineInstr &ProducerMI) {
1099 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1100 const MachineOperand *ForwardedDst =
1106 if (ProducerMI.isInlineAsm()) {
1108 for (
auto &Def : ProducerMI.all_defs()) {
1117 int WaitStatesNeededForDef =
1118 Shift16DefWaitstates -
1119 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1120 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1123 if (ST.hasVDecCoExecHazard()) {
1124 const int VALUWriteSGPRVALUReadWaitstates = 2;
1125 const int VALUWriteEXECRWLane = 4;
1126 const int VALUWriteVGPRReadlaneRead = 1;
1128 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1129 const MachineRegisterInfo &MRI = MF.getRegInfo();
1131 auto IsVALUDefSGPRFn = [&
UseReg, TRI](
const MachineInstr &
MI) {
1134 return MI.modifiesRegister(
UseReg, TRI);
1137 for (
const MachineOperand &Use :
VALU->explicit_uses()) {
1142 if (TRI->isSGPRReg(MRI,
UseReg)) {
1143 int WaitStatesNeededForDef =
1144 VALUWriteSGPRVALUReadWaitstates -
1145 getWaitStatesSince(IsVALUDefSGPRFn,
1146 VALUWriteSGPRVALUReadWaitstates);
1147 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1151 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
1153 int WaitStatesNeededForDef =
1154 VALUWriteSGPRVALUReadWaitstates -
1155 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
1156 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1159 switch (
VALU->getOpcode()) {
1160 case AMDGPU::V_READLANE_B32:
1161 case AMDGPU::V_READFIRSTLANE_B32: {
1162 MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
1164 int WaitStatesNeededForDef =
1165 VALUWriteVGPRReadlaneRead -
1166 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
1167 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1170 case AMDGPU::V_WRITELANE_B32: {
1172 int WaitStatesNeededForDef =
1173 VALUWriteEXECRWLane -
1174 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1175 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1185 if (!ST.has12DWordStoreHazard())
1186 return WaitStatesNeeded;
1188 const MachineRegisterInfo &MRI = MF.getRegInfo();
1190 for (
const MachineOperand &Def :
VALU->defs()) {
1191 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1194 return WaitStatesNeeded;
1197int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA)
const {
1206 if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1207 !ST.hasCvtScaleForwardingHazard())
1210 const MachineRegisterInfo &MRI = MF.getRegInfo();
1211 int WaitStatesNeeded = 0;
1213 for (
const MachineOperand &
Op :
1215 if (
Op.isReg() &&
Op.isDef()) {
1216 if (!TRI.isVectorRegister(MRI,
Op.getReg()))
1219 if (ST.has12DWordStoreHazard()) {
1221 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op, MRI));
1226 if (ST.hasDstSelForwardingHazard()) {
1227 const int Shift16DefWaitstates = 1;
1229 auto IsShift16BitDefFn = [
this, &
IA](
const MachineInstr &ProducerMI) {
1233 return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1234 IA->readsRegister(Dst->getReg(), &TRI);
1236 if (ProducerMI.isInlineAsm()) {
1238 for (
auto &Def : ProducerMI.all_defs()) {
1239 if (
IA->modifiesRegister(
Def.getReg(), &TRI) ||
1240 IA->readsRegister(
Def.getReg(), &TRI)) {
1249 int WaitStatesNeededForDef =
1250 Shift16DefWaitstates -
1251 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1252 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1255 return WaitStatesNeeded;
// Returns RWLaneWaitStates minus the wait states since LaneSelectReg was last
// defined by an instruction for which TII->isVALU() holds.
// NOTE(review): the definition of LaneSelectReg, the statement controlled by
// the guard below, and the trailing call arguments are not visible in this
// listing.
1258int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane)
const {
1259 const SIInstrInfo *TII = ST.getInstrInfo();
1260 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1261 const MachineRegisterInfo &MRI = MF.getRegInfo();
// The lane-select operand is looked up as the named operand src1.
1263 const MachineOperand *LaneSelectOp =
1264 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
// Guard: the lane select is not a register, or is not an SGPR.
// NOTE(review): presumably an early return — the controlled statement is not
// visible.
1266 if (!LaneSelectOp->
isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->
getReg()))
// Predicate selecting the defining instructions of interest: VALU only.
1270 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isVALU(
MI); };
// Window (in wait states) that the elapsed count is subtracted from.
1272 const int RWLaneWaitStates = 4;
1273 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1275 return RWLaneWaitStates - WaitStatesSince;
// Returns RFEWaitStates minus the value reported by
// getWaitStatesSinceSetReg() for IsHazardFn.
// NOTE(review): the definition of IsHazardFn, the statement controlled by the
// subtarget guard, and any use of the RFE parameter are not visible in this
// listing.
1278int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE)
const {
// Subtarget feature guard; the statement it controls is not visible here.
1279 if (!ST.hasRFEHazards())
1282 const SIInstrInfo *TII = ST.getInstrInfo();
// Window (in wait states) that the elapsed count is subtracted from.
1284 const int RFEWaitStates = 1;
// Despite its name, this local holds the value returned by the look-back
// query; the subtraction below yields the actual requirement.
1289 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1290 return RFEWaitStates - WaitStatesNeeded;
// Returns ReadM0WaitStates minus the wait states since M0 was last defined by
// an instruction for which TII->isSALU() holds.
// The outer MI parameter is not referenced in the visible body; the lambda's
// own MI parameter shadows it.
1293int GCNHazardRecognizer::checkReadM0Hazards(
MachineInstr *
MI)
const {
1294 const SIInstrInfo *TII = ST.getInstrInfo();
// Window (in wait states) that the elapsed count is subtracted from.
1295 const int ReadM0WaitStates = 1;
// Predicate selecting the defining instructions of interest: SALU only.
1296 auto IsHazardFn = [TII](
const MachineInstr &
MI) {
return TII->isSALU(
MI); };
// May be zero or negative; the visible call site tests the result with `> 0`.
1297 return ReadM0WaitStates -
1298 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1303 int WaitStatesNeeded,
bool IsHoisting) {
1305 for (
int I = 0;
I < WaitStatesNeeded; ++
I)
1306 BuildMI(
MBB, InsertPt,
DL, TII.get(AMDGPU::V_NOP_e32));
1310 fixVMEMtoScalarWriteHazards(
MI);
1311 fixVcmpxPermlaneHazards(
MI);
1312 fixSMEMtoVectorWriteHazards(
MI);
1313 fixVcmpxExecWARHazard(
MI);
1314 fixLdsBranchVmemWARHazard(
MI);
1315 if (ST.hasLdsDirect()) {
1316 fixLdsDirectVALUHazard(
MI);
1317 fixLdsDirectVMEMHazard(
MI);
1319 fixVALUPartialForwardingHazard(
MI);
1320 fixVALUTransUseHazard(
MI);
1321 fixVALUTransCoexecutionHazards(
MI);
1323 fixWMMACoexecutionHazards(
MI);
1324 fixShift64HighRegBug(
MI);
1325 fixVALUMaskWriteHazard(
MI);
1326 fixRequiredExportPriority(
MI);
1327 if (ST.requiresWaitIdleBeforeGetReg())
1328 fixGetRegWaitIdle(
MI);
1329 if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
1330 fixDsAtomicAsyncBarrierArriveB64(
MI);
1331 if (ST.hasScratchBaseForwardingHazard())
1332 fixScratchBaseForwardingHazard(
MI);
1333 if (ST.setRegModeNeedsVNOPs())
1339 return (
TII.isVOPC(
MI) ||
1340 (
MI.isCompare() && (
TII.isVOP3(
MI) ||
TII.isSDWA(
MI)))) &&
1341 MI.modifiesRegister(AMDGPU::EXEC, &
TRI);
1344bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1348 const SIInstrInfo *TII = ST.getInstrInfo();
1349 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1355 unsigned Opc =
MI.getOpcode();
1357 Opc != AMDGPU::V_NOP_e64 &&
Opc != AMDGPU::V_NOP_sdwa;
1361 std::numeric_limits<int>::max())
1367 auto *Src0 = TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1369 bool IsUndef = Src0->isUndef();
1371 TII->get(AMDGPU::V_MOV_B32_e32))
1378bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1379 if (!ST.hasVMEMtoScalarWriteHazard())
1381 assert(!ST.hasExtendedWaitCounts());
1386 if (
MI->getNumDefs() == 0)
1389 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1395 for (
const MachineOperand &Def :
MI->defs()) {
1396 const MachineOperand *
Op =
1397 I.findRegisterUseOperand(
Def.getReg(), TRI,
false);
1407 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1408 !
MI.getOperand(0).getImm()) ||
1409 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1414 std::numeric_limits<int>::max())
1417 const SIInstrInfo *TII = ST.getInstrInfo();
1419 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1424bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1425 if (!ST.hasSMEMtoVectorWriteHazard())
1427 assert(!ST.hasExtendedWaitCounts());
1432 AMDGPU::OpName SDSTName;
1433 switch (
MI->getOpcode()) {
1434 case AMDGPU::V_READLANE_B32:
1435 case AMDGPU::V_READFIRSTLANE_B32:
1436 SDSTName = AMDGPU::OpName::vdst;
1439 SDSTName = AMDGPU::OpName::sdst;
1443 const SIInstrInfo *TII = ST.getInstrInfo();
1444 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1446 const MachineOperand *SDST = TII->getNamedOperand(*
MI, SDSTName);
1448 for (
const auto &MO :
MI->implicit_operands()) {
1449 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1460 auto IsHazardFn = [SDSTReg, TRI](
const MachineInstr &
I) {
1465 if (TII->isSALU(
MI)) {
1466 switch (
MI.getOpcode()) {
1467 case AMDGPU::S_SETVSKIP:
1468 case AMDGPU::S_VERSION:
1469 case AMDGPU::S_WAITCNT_VSCNT:
1470 case AMDGPU::S_WAITCNT_VMCNT:
1471 case AMDGPU::S_WAITCNT_EXPCNT:
1474 case AMDGPU::S_WAITCNT_LGKMCNT:
1476 return (
MI.getOperand(1).getImm() == 0) &&
1477 (
MI.getOperand(0).
getReg() == AMDGPU::SGPR_NULL);
1478 case AMDGPU::S_WAITCNT: {
1479 const int64_t
Imm =
MI.getOperand(0).getImm();
1486 MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
1487 "unexpected wait count instruction");
1489 if (TII->isSOPP(
MI))
1505 std::numeric_limits<int>::max())
1509 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1514bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1515 if (!ST.hasVcmpxExecWARHazard())
1517 assert(!ST.hasExtendedWaitCounts());
1522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1523 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1529 return I.readsRegister(AMDGPU::EXEC, TRI);
1532 const SIInstrInfo *TII = ST.getInstrInfo();
1533 auto IsExpiredFn = [TII, TRI](
const MachineInstr &
MI, int) {
1535 if (TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1537 for (
auto MO :
MI.implicit_operands())
1538 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1541 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1548 std::numeric_limits<int>::max())
1552 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1559 if (!ST.hasLdsBranchVmemWARHazard())
1564 bool HasLds =
false;
1565 bool HasVmem =
false;
1566 for (
auto &
MBB : MF) {
1567 for (
auto &
MI :
MBB) {
1570 if (HasLds && HasVmem)
1578 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1580 !
I.getOperand(1).getImm();
1583bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1584 if (!RunLdsBranchVmemWARHazardFixup)
1587 assert(ST.hasLdsBranchVmemWARHazard());
1588 assert(!ST.hasExtendedWaitCounts());
1590 auto IsHazardInst = [](
const MachineInstr &
MI) {
1598 auto InstType = IsHazardInst(*
MI);
1602 auto IsExpiredFn = [&IsHazardInst](
const MachineInstr &
I, int) {
1606 auto IsHazardFn = [InstType, &IsHazardInst](
const MachineInstr &
I) {
1610 auto IsHazardFn = [InstType, IsHazardInst](
const MachineInstr &
I) {
1611 auto InstType2 = IsHazardInst(
I);
1612 return InstType2 && InstType != InstType2;
1615 auto IsExpiredFn = [InstType, &IsHazardInst](
const MachineInstr &
I, int) {
1616 auto InstType2 = IsHazardInst(
I);
1617 if (InstType == InstType2)
1624 std::numeric_limits<int>::max();
1628 std::numeric_limits<int>::max())
1631 const SIInstrInfo *TII = ST.getInstrInfo();
1633 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1640bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1644 const int NoHazardWaitStates = 15;
1645 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1648 bool VisitedTrans =
false;
1649 auto IsHazardFn = [
this, VDSTReg, &VisitedTrans](
const MachineInstr &
I) {
1654 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1656 auto IsExpiredFn = [&](
const MachineInstr &
I,
int WaitStates) {
1657 if (WaitStates >= NoHazardWaitStates)
1663 auto GetWaitStatesFn = [](
const MachineInstr &
MI) {
1667 DenseSet<const MachineBasicBlock *> Visited;
1669 std::next(
MI->getReverseIterator()), 0,
1677 MachineOperand *WaitVdstOp =
1678 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1679 WaitVdstOp->
setImm(std::min(
Count, NoHazardWaitStates));
1684bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1688 const MachineOperand *VDST = TII.getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1691 auto IsHazardFn = [
this, VDSTReg](
const MachineInstr &
I) {
1694 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1696 bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
1699 auto IsExpiredFn = [
this, LdsdirCanWait](
const MachineInstr &
I, int) {
1701 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1702 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1705 !TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1709 std::numeric_limits<int>::max())
1712 if (LdsdirCanWait) {
1713 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1716 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1723bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1724 if (!ST.hasVALUPartialForwardingHazard())
1726 assert(!ST.hasExtendedWaitCounts());
1731 SmallSetVector<Register, 4> SrcVGPRs;
1733 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1734 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
1739 if (SrcVGPRs.
size() <= 1)
1757 const int Intv1plus2MaxVALUs = 2;
1758 const int Intv3MaxVALUs = 4;
1759 const int IntvMaxVALUs = 6;
1760 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1763 SmallDenseMap<Register, int, 4> DefPos;
1764 int ExecPos = std::numeric_limits<int>::max();
1767 static unsigned getHashValue(
const StateType &State) {
1771 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1772 return LHS.DefPos ==
RHS.DefPos &&
LHS.ExecPos ==
RHS.ExecPos &&
1780 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1782 if (State.VALUs > NoHazardVALUWaitStates)
1788 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1796 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1797 State.DefPos[Src] = State.VALUs;
1802 if (State.ExecPos == std::numeric_limits<int>::max()) {
1803 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1804 State.ExecPos = State.VALUs;
1811 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1819 if (State.ExecPos == std::numeric_limits<int>::max())
1822 int PreExecPos = std::numeric_limits<int>::max();
1823 int PostExecPos = std::numeric_limits<int>::max();
1825 for (
auto Entry : State.DefPos) {
1826 int DefVALUs =
Entry.second;
1827 if (DefVALUs != std::numeric_limits<int>::max()) {
1828 if (DefVALUs >= State.ExecPos)
1829 PreExecPos = std::min(PreExecPos, DefVALUs);
1831 PostExecPos = std::min(PostExecPos, DefVALUs);
1836 if (PostExecPos == std::numeric_limits<int>::max())
1840 int Intv3VALUs = PostExecPos;
1841 if (Intv3VALUs > Intv3MaxVALUs)
1845 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1846 if (Intv2VALUs > Intv1plus2MaxVALUs)
1850 if (PreExecPos == std::numeric_limits<int>::max())
1854 int Intv1VALUs = PreExecPos - State.ExecPos;
1855 if (Intv1VALUs > Intv1plus2MaxVALUs)
1859 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1864 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1870 std::next(
MI->getReverseIterator())))
1874 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Handles the "VALU trans use" hazard for MI.
// Applies only when the subtarget reports hasVALUTransUseHazard(); the
// assert documents that this path is not expected on subtargets with
// extended wait counts.
// NOTE(review): this listing omits lines (returns, closing braces), so the
// exact return values and the full hazard condition are not visible here.
1880bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1881 if (!ST.hasVALUTransUseHazard())
1883 assert(!ST.hasExtendedWaitCounts());
// VGPR registers explicitly used by MI.
1888 SmallSet<Register, 4> SrcVGPRs;
1890 for (
const MachineOperand &Use :
MI->explicit_uses()) {
1891 if (
Use.isReg() && TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
// Search bounds: the search state counts VALU and TRANS instructions and is
// compared against these limits in IsHazardFn below.
1905 const int IntvMaxVALUs = 5;
1906 const int IntvMaxTRANS = 1;
// Hashing/equality helpers for the search state; two states are equal when
// both their VALUs and TRANS counters match.
1912 static unsigned getHashValue(
const StateType &State) {
1915 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
1916 return LHS.VALUs ==
RHS.VALUs &&
LHS.TRANS ==
RHS.TRANS;
// Per-instruction hazard predicate: checks the counters against the limits,
// looks at S_WAITCNT_DEPCTR instructions, and tests whether I modifies one of
// the tracked source registers.
1923 auto IsHazardFn = [&,
this](StateType &State,
const MachineInstr &
I) {
1925 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1931 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1938 if (
I.modifiesRegister(Src, &TRI)) {
// State update callback applied per visited instruction (body not visible).
1946 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
// The search starts from the instruction preceding MI (reverse iteration).
1954 std::next(
MI->getReverseIterator())))
// An S_WAITCNT_DEPCTR is built as the mitigation (operands not visible here).
1960 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
// Handles VALU/TRANS co-execution hazards for MI; gated on
// ST.hasGFX1250Insts() (the rest of the guard condition is not visible in
// this listing).
// A V_NOP_e32 is built at MI's position when the backward search reports a
// hazard.
1966bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(
MachineInstr *
MI) {
1967 if (!ST.hasGFX1250Insts() ||
1971 const SIInstrInfo *TII = ST.getInstrInfo();
1972 const SIRegisterInfo *TRI = ST.getRegisterInfo();
// Predicate over an earlier instruction I (presumably a TRANS instruction;
// the filtering lines are not visible).
1974 auto IsTransHazardFn = [
MI, TII, TRI](
const MachineInstr &
I) {
// Case 1: I's vdst overlaps a register explicitly used by MI.
1979 Register TransDef = TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1980 for (
const MachineOperand &ValuUse :
MI->explicit_uses()) {
1981 if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
// Case 2: MI's vdst (when present and a register) overlaps a register
// explicitly used by I.
1985 auto *ValuDst = TII->getNamedOperand(*
MI, AMDGPU::OpName::vdst);
1986 if (!ValuDst || !ValuDst->isReg())
1990 Register ValuDef = ValuDst->getReg();
1991 for (
const MachineOperand &TransUse :
I.explicit_uses()) {
1992 if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
// The search result is compared against int max, named HasVALU here.
// NOTE(review): IsExpiredFn is defined on lines missing from this listing.
2003 const int HasVALU = std::numeric_limits<int>::max();
2004 if (::getWaitStatesSince(IsTransHazardFn,
MI,
IsExpiredFn) == HasVALU)
// Build a V_NOP_e32 in MI's parent block at MI's position.
2007 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2015 const SIInstrInfo *TII = ST.getInstrInfo();
2016 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2018 auto IsHazardFn = [
MI, TII, TRI,
this](
const MachineInstr &
I) {
2025 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
2027 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
2030 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
2032 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
2033 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
2042 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
2043 if (TRI->regsOverlap(PrevDstReg, CurIndex))
2057 std::numeric_limits<int>::max())
2060 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
2099 unsigned Category = 0;
2101 unsigned Latency = SchedModel.computeInstrLatency(&
MI);
2104 Category = IsSWMMAC ? 2 : 0;
2107 Category = IsSWMMAC ? 3 : 1;
// Computes how many additional wait states are needed before MI because of
// an earlier XDL WMMA instruction. Only checked when ST.hasGFX1250Insts().
// Returns WaitStatesNeeded, which starts at -1 and is set to
// <table>[Category] - ExistingVALUs on the paths shown below.
2116int GCNHazardRecognizer::checkWMMACoexecutionHazards(
MachineInstr *
MI)
const {
2117 if (!ST.hasGFX1250Insts())
2120 const SIInstrInfo *TII = ST.getInstrInfo();
// Required wait states indexed by Category, for a dependent WMMA and for a
// dependent VALU respectively.
// NOTE(review): Category is captured by reference in the lambdas below; the
// lines that assign it are not visible in this listing.
2129 const int WMMAWaitStates[] = {5, 9, 3, 5};
2130 const int VALUWaitStates[] = {4, 8, 2, 4};
2131 unsigned Category = 0;
// Hazard predicate used when MI is itself an XDL WMMA: I must be an XDL WMMA
// with a WMMA-to-WMMA register overlap against MI.
2133 auto IsWMMAHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2134 if (!TII->isXDLWMMA(
I))
2138 return hasWMMAToWMMARegOverlap(
I, *
MI);
// Hazard predicate for the other path: I must be an XDL WMMA with a
// WMMA-to-VALU register overlap against MI.
2141 auto IsVALUHazardFn = [
MI, TII, &Category,
this](
const MachineInstr &
I) {
2142 if (!TII->isXDLWMMA(
I))
2146 return hasWMMAToVALURegOverlap(
I, *
MI);
// Per-instruction wait-state contribution callback (body not visible).
2149 auto GetWaitStatesFn = [](
const MachineInstr &
I) {
2153 int WaitStatesNeeded = -1;
2154 int ExistingVALUs = 0;
// WMMA path: look back at most WMMAWaitsLimit wait states.
2161 if (TII->isXDLWMMA(*
MI)) {
2162 const int WMMAWaitsLimit = 9;
2164 getWaitStatesSince(IsWMMAHazardFn, WMMAWaitsLimit, GetWaitStatesFn);
2165 WaitStatesNeeded = WMMAWaitStates[Category] - ExistingVALUs;
// Second path (presumably the non-WMMA/VALU case; the branch line is not
// visible): look back at most VALUWaitsLimit wait states.
2167 const int VALUWaitsLimit = 8;
2169 getWaitStatesSince(IsVALUHazardFn, VALUWaitsLimit, GetWaitStatesFn);
2170 WaitStatesNeeded = VALUWaitStates[Category] - ExistingVALUs;
2173 return WaitStatesNeeded;
// Tests whether the destination (vdst) of an earlier instruction WMMA
// overlaps the src0/src1 operands of MI, and additionally MI's src2 (named
// Idx1 here) on a path whose guard is not visible in this listing.
// NOTE(review): the parameter list is on lines missing from this view.
2176bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(
2178 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2179 Register A1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src0)->getReg();
2180 Register B1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src1)->getReg();
// Destination of WMMA feeds the A or B operand of MI.
2183 if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
// Destination of WMMA feeds MI's src2.
2187 Register Idx1 = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2188 if (TRI.regsOverlap(D0, Idx1))
// Tests register overlap between an earlier instruction WMMA and a later
// instruction MI in two directions:
//   1. WMMA's vdst against any explicit register use of MI;
//   2. the registers collected in WMMARegs (src2 is pushed below; src0/src1
//      are read just before) against any def of MI.
// NOTE(review): the parameter list and the declaration of WMMARegs are on
// lines missing from this view.
2194bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(
2197 Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
2198 for (
const MachineOperand &ValuUse :
MI.explicit_uses()) {
2199 if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
// Source operands of WMMA.
2204 Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
2205 Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
2209 Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
2210 WMMARegs.push_back(Idx0);
// Any def of MI overlapping one of the collected WMMA registers.
2213 for (
const MachineOperand &ValuDef :
MI.defs()) {
2214 Register VDstReg = ValuDef.getReg();
2215 for (
Register WMMAReg : WMMARegs) {
2216 if (TRI.regsOverlap(VDstReg, WMMAReg))
// Decides whether instruction I is a co-execution hazard for MI.
// I is first tested with isXDLWMMA (the result of the negative case is not
// visible in this listing). Then the overlap check is chosen by MI's kind:
// WMMA-to-WMMA when MI is an XDL WMMA, WMMA-to-VALU otherwise.
2223bool GCNHazardRecognizer::isCoexecutionHazardFor(
const MachineInstr &
I,
2227 if (!TII.isXDLWMMA(
I))
2231 if (TII.isXDLWMMA(
MI))
2232 return hasWMMAToWMMARegOverlap(
I,
MI);
2234 return hasWMMAToVALURegOverlap(
I,
MI);
2240 bool IncludeSubloops) {
2243 for (MachineBasicBlock *
MBB :
L->getBlocks()) {
2244 if (!IncludeSubloops && MLI->getLoopFor(
MBB) != L)
2246 for (MachineInstr &
I : *
MBB) {
2249 if (isCoexecutionHazardFor(
I, *
MI))
// Attempts to move the V_NOPs required for a WMMA hazard on MI out of the
// loop containing MI.
// The statistic NumWMMAHoistingBailed is incremented on each visible bail-out
// path; NumWMMANopsHoisted is increased by WaitStatesNeeded on the path that
// performs the hoist.
// NOTE(review): return statements and the insertion code are on lines
// missing from this listing.
2256bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(
MachineInstr *
MI,
2257 int WaitStatesNeeded) {
// Loop that contains MI's block, as reported by MachineLoopInfo.
2261 MachineLoop *
L = MLI->getLoopFor(
MI->getParent());
2263 ++NumWMMAHoistingBailed;
// Bail out when the loop itself contains a hazard for MI.
2268 if (hasWMMAHazardInLoop(L,
MI)) {
2269 ++NumWMMAHoistingBailed;
// TargetLoop starts at L and is moved to Parent; hasWMMAHazardInLoop is
// called on Parent with the third argument false (the surrounding loop
// construct and the definition of Parent are not visible).
2274 MachineLoop *TargetLoop =
L;
2276 if (hasWMMAHazardInLoop(Parent,
MI,
false))
2278 TargetLoop = Parent;
2284 ++NumWMMAHoistingBailed;
2288 LLVM_DEBUG(
dbgs() <<
"WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
2294 NumWMMANopsHoisted += WaitStatesNeeded;
// Fixes WMMA co-execution hazards for MI: asks
// checkWMMACoexecutionHazards() how many wait states are needed and, when
// the count is positive, emits that many V_NOPs at MI's iterator in its
// parent block.
// NOTE(review): lines between the check and emitVNops are missing from this
// listing (a hoisting attempt may sit there — confirm against the full file).
2298bool GCNHazardRecognizer::fixWMMACoexecutionHazards(
MachineInstr *
MI) {
2299 int WaitStatesNeeded = checkWMMACoexecutionHazards(
MI);
2300 if (WaitStatesNeeded <= 0)
2306 emitVNops(*
MI->getParent(),
MI->getIterator(), WaitStatesNeeded);
// Workaround for the 64-bit shift "high register" bug
// (ST.hasShift64HighRegBug()); not expected on subtargets with extended wait
// counts (see the assert).
// Applies to V_LSHLREV_B64_e64, V_LSHRREV_B64_e64 and V_ASHRREV_I64_e64 whose
// shift amount (src0) is a VGPR with (index & 7) == 7. The visible code picks
// a replacement register that MI neither reads nor modifies and builds
// V_SWAP_B32 instructions before and after MI.
// NOTE(review): many lines are missing from this listing; the comments below
// describe only what is visible.
2310bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
2311 if (!ST.hasShift64HighRegBug())
2313 assert(!ST.hasExtendedWaitCounts());
2315 switch (
MI->getOpcode()) {
2318 case AMDGPU::V_LSHLREV_B64_e64:
2319 case AMDGPU::V_LSHRREV_B64_e64:
2320 case AMDGPU::V_ASHRREV_I64_e64:
// Shift amount operand.
2324 MachineOperand *Amt = TII.getNamedOperand(*
MI, AMDGPU::OpName::src0);
2329 const MachineRegisterInfo &MRI = MF.getRegInfo();
// Only VGPR amounts whose index relative to VGPR0 has its low three bits set
// are affected.
2331 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
// Checks whether the register following AmtReg is in use (VGPR255 has no
// successor).
2334 if (AmtReg != AMDGPU::VGPR255 && MRI.
isPhysRegUsed(AmtReg + 1))
2337 assert(ST.needsAlignedVGPRs());
// The AmtReg + 1 arithmetic above relies on consecutive VGPR enum values.
2338 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
2341 MachineBasicBlock *
MBB =
MI->getParent();
2342 MachineOperand *Src1 = TII.getNamedOperand(*
MI, AMDGPU::OpName::src1);
2353 Register DstReg =
MI->getOperand(0).getReg();
2355 Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
// Overlapped: MI also writes the amount register. In that case candidates
// come from VReg_64_Align2, otherwise from VGPR_32.
2363 bool Overlapped =
MI->modifiesRegister(AmtReg, &TRI);
// Pick a candidate register that MI neither modifies nor reads.
2365 for (MCRegister
Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
2366 : AMDGPU::VGPR_32RegClass) {
2367 if (!
MI->modifiesRegister(
Reg, &TRI) && !
MI->readsRegister(
Reg, &TRI)) {
// In the overlapped case the new amount register is sub1 of the chosen
// 64-bit register, and sub0 is kept as NewAmtLo.
2373 Register NewAmt = Overlapped ? (
Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
2378 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
// Swap built before MI; the new instruction is passed to runOnInstruction.
2391 runOnInstruction(
BuildMI(*
MBB,
MI,
DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
// Swaps built after MI (inserted at the position following MI).
2398 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
2404 BuildMI(*
MBB, std::next(
MI->getIterator()),
DL, TII.get(AMDGPU::V_SWAP_B32),
// MI's destination operand is redirected to NewReg (guard not visible).
2418 MI->getOperand(0).setReg(NewReg);
// Computes the wait states needed for the NSA-to-VMEM hazard
// (ST.hasNSAtoVMEMBug()). The result is NSAtoVMEMWaitStates (1) minus the
// wait states since an instruction matching IsHazardFn, searching at most
// one wait state back.
// The visible part of the hazard predicate matches instructions with
// MIMGEncGfx10NSA encoding whose encoded size is at least 16 bytes.
// NOTE(review): the checks on MI itself (including how Offset is used) are
// on lines missing from this listing.
2427int GCNHazardRecognizer::checkNSAtoVMEMHazard(
MachineInstr *
MI)
const {
2428 int NSAtoVMEMWaitStates = 1;
2430 if (!ST.hasNSAtoVMEMBug())
2436 const SIInstrInfo *TII = ST.getInstrInfo();
2437 const auto *
Offset = TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
2445 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
2446 TII->getInstSizeInBytes(
I) >= 16;
2449 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
// Computes the wait states needed between an earlier hazardous instruction
// and an S_DENORM_MODE instruction. Only relevant when
// ST.hasFPAtomicToDenormModeHazard(), and only when MI is S_DENORM_MODE.
// The result is FPAtomicToDenormModeWaitStates (3) minus a value computed on
// lines missing from this listing.
// NOTE(review): the hazard predicate and the body of IsExpiredFn are not
// visible here.
2452int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
2454 int FPAtomicToDenormModeWaitStates = 3;
2456 if (!ST.hasFPAtomicToDenormModeHazard())
2458 assert(!ST.hasExtendedWaitCounts());
2460 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
// Expiry callback for the backward search; receives the wait-state count
// accumulated so far.
2469 auto IsExpiredFn = [](
const MachineInstr &
MI,
int WaitStates) {
2476 return FPAtomicToDenormModeWaitStates -
// Dispatches MAI hazard checking by subtarget: checkMAIHazards90A() when the
// subtarget has GFX90A instructions, checkMAIHazards908() otherwise.
// Returns the value produced by the selected checker.
2480int GCNHazardRecognizer::checkMAIHazards(
MachineInstr *
MI)
const {
2483 return ST.hasGFX90AInsts() ? checkMAIHazards90A(
MI) : checkMAIHazards908(
MI);
// Computes padding wait states to place between MI and a neighboring MFMA.
// The backward search covers at most MaxMFMAPipelineWaitStates (16) wait
// states; IsNeighboringMFMA records the neighbor's pipeline wait states in
// NeighborMFMALatency. The result is clamped to be non-negative.
// NOTE(review): the guard conditions and the first operand of the padding
// subtraction are on lines missing from this listing.
2486int GCNHazardRecognizer::checkMFMAPadding(
MachineInstr *
MI)
const {
2491 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2495 int NeighborMFMALatency = 0;
// Predicate that also captures the latency of the matched MFMA as a side
// effect.
2496 auto IsNeighboringMFMA = [&NeighborMFMALatency,
2497 this](
const MachineInstr &
MI) {
2501 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
2505 const int MaxMFMAPipelineWaitStates = 16;
2506 int WaitStatesSinceNeighborMFMA =
2507 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
// Padding still required after subtracting the wait states already elapsed.
2509 int NeighborMFMAPaddingNeeded =
2511 WaitStatesSinceNeighborMFMA;
2513 return std::max(0, NeighborMFMAPaddingNeeded);
// MAI hazard check used when the subtarget lacks GFX90A instructions (see
// the dispatch in checkMAIHazards). Returns the maximum number of wait
// states required by any of the hazards examined below, including the MFMA
// padding from checkMFMAPadding().
// NOTE(review): this listing omits lines (continue/break/return statements
// and some declarations such as Reg and DstReg).
2516int GCNHazardRecognizer::checkMAIHazards908(
MachineInstr *
MI)
const {
2517 int WaitStatesNeeded = 0;
2518 unsigned Opc =
MI->getOpcode();
2520 auto IsVALUFn = [](
const MachineInstr &
MI) {
// For every opcode except V_ACCVGPR_READ_B32_e64: hazards against an earlier
// VALU write of EXEC and against VALU writes of VGPRs used by MI.
2524 if (
Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
2525 const int LegacyVALUWritesVGPRWaitStates = 2;
2526 const int VALUWritesExecWaitStates = 4;
2527 const int MaxWaitStates = 4;
2529 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2530 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// The per-operand scan is skipped once the maximum is already reached.
2533 if (WaitStatesNeeded < MaxWaitStates) {
2534 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2535 const int MaxWaitStates = 2;
2537 if (!
Use.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Use.getReg()))
2540 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2541 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2542 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2544 if (WaitStatesNeeded == MaxWaitStates)
// Scan explicit AGPR operands of MI. Defs are filtered with a check that
// involves V_ACCVGPR_WRITE_B32_e64 (the action taken is not visible).
2550 for (
const MachineOperand &
Op :
MI->explicit_operands()) {
2551 if (!
Op.isReg() || !TRI.isAGPR(MF.getRegInfo(),
Op.getReg()))
2554 if (
Op.isDef() &&
Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
// Wait-state tables for an MFMA writing an AGPR that overlaps this operand.
2557 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2558 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2559 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2560 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2561 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2562 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2563 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2564 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2565 const int MaxWaitStates = 18;
2567 unsigned HazardDefLatency = 0;
// Predicate that also records the largest latency among visited MFMAs in
// HazardDefLatency.
2569 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2570 this](
const MachineInstr &
MI) {
2577 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2578 return TRI.regsOverlap(DstReg,
Reg);
2581 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn,
// Select the required wait states: src2 (SrcC) operands use the SrcC value;
// V_ACCVGPR_READ / V_ACCVGPR_WRITE select by the recorded latency (2, 8, or
// 16/default).
2583 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2584 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
2585 int OpNo =
Op.getOperandNo();
2586 if (OpNo == SrcCIdx) {
2587 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2588 }
else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2589 switch (HazardDefLatency) {
2590 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2592 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2594 case 16: [[fallthrough]];
2595 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2598 }
else if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2599 switch (HazardDefLatency) {
2600 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2602 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2604 case 16: [[fallthrough]];
2605 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2610 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2611 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Early exit once the maximum possible value is reached.
2613 if (WaitStatesNeeded == MaxWaitStates)
2614 return WaitStatesNeeded;
// Hazard against an earlier V_ACCVGPR_WRITE_B32_e64 whose destination
// overlaps this operand's register.
2616 auto IsAccVgprWriteFn = [
Reg,
this](
const MachineInstr &
MI) {
2617 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2620 return TRI.regsOverlap(
Reg, DstReg);
2623 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2624 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2625 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2626 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2627 if (OpNo == SrcCIdx)
2628 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2629 else if (
Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2630 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2632 WaitStatesNeededForUse = NeedWaitStates -
2633 getWaitStatesSinceDef(
Reg, IsAccVgprWriteFn, MaxWaitStates);
2634 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2636 if (WaitStatesNeeded == MaxWaitStates)
2637 return WaitStatesNeeded;
// MI is V_ACCVGPR_WRITE_B32_e64: hazard against an earlier MFMA whose src2
// overlaps MI's destination; the required wait states are selected by the
// recorded MFMA latency.
2640 if (
Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2641 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2642 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2643 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2644 const int MaxWaitStates = 13;
2645 Register DstReg =
MI->getOperand(0).getReg();
2646 unsigned HazardDefLatency = 0;
2648 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2649 this](
const MachineInstr &
MI) {
2652 Register Reg = TII.getNamedOperand(
MI, AMDGPU::OpName::src2)->getReg();
2654 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2655 return TRI.regsOverlap(
Reg, DstReg);
2658 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2660 switch (HazardDefLatency) {
2661 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2663 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2665 case 16: [[fallthrough]];
2666 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2670 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2671 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Fold in the configured MFMA padding.
2675 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2677 return WaitStatesNeeded;
2688 return NumPasses + 1 + IsGFX950;
2699 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
2717 return NumPasses + 2;
2727 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
// MAI hazard check used when the subtarget has GFX90A instructions (see the
// dispatch in checkMAIHazards). Returns the maximum number of wait states
// required by the hazards examined below, including the MFMA padding from
// checkMFMAPadding().
// NOTE(review): this listing omits many lines (case labels, returns, helper
// calls whose names are not visible, declarations of Reg/DstReg/Opc1/FullReg/
// NumWaitStates), so the comments describe only the visible structure.
2730int GCNHazardRecognizer::checkMAIHazards90A(
MachineInstr *
MI)
const {
2731 int WaitStatesNeeded = 0;
2732 unsigned Opc =
MI->getOpcode();
2734 auto IsLegacyVALUFn = [](
const MachineInstr &
MI) {
2738 auto IsLegacyVALUNotDotFn = [](
const MachineInstr &
MI) {
2744 return WaitStatesNeeded;
// Hazard against an earlier write of EXEC matched by IsLegacyVALUFn.
2746 const int VALUWritesExecWaitStates = 4;
2747 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2748 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2749 VALUWritesExecWaitStates);
2750 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2752 int SrcCIdx = AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::src2);
// Per explicit use of MI: wait-state tables keyed by producer kind and by
// which operand (src2 versus the others) consumes the value.
2755 for (
const MachineOperand &Use :
MI->explicit_uses()) {
2756 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2757 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2758 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2759 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2760 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2761 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2762 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2763 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2764 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
2765 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2766 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2767 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2768 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2769 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2770 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2771 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
2772 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2773 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2774 const int MaxWaitStates = 19;
// MI1 is captured by reference by the predicate below and later passed to
// computeInstrLatency (presumably the matched producer MFMA; its assignment
// is not visible).
2780 const MachineInstr *MI1;
// Predicate for a producer whose destination overlaps Reg; FullReg records
// whether the destination equals Reg exactly.
2782 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2783 this](
const MachineInstr &
MI) {
2787 FullReg = (DstReg ==
Reg);
2789 return TRI.regsOverlap(DstReg,
Reg);
// Hazard against an earlier def of Reg matched by IsLegacyVALUNotDotFn.
2792 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2793 getWaitStatesSinceDef(
Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2794 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// Hazard against an overlapping MFMA write; int max is tested explicitly
// (the action taken for it is not visible).
2797 getWaitStatesSinceDef(
Reg, IsOverlappedMFMAFn, MaxWaitStates);
2798 if (NumWaitStates == std::numeric_limits<int>::max())
2801 int OpNo =
Use.getOperandNo();
// Choose NeedWaitStates from the tables above based on the operand index,
// whether the full register matches, and the opcodes/latency involved.
2803 int NeedWaitStates = 0;
2804 if (OpNo == SrcCIdx) {
2808 }
else if (FullReg) {
2809 if ((
Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2810 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2811 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2812 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2813 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2814 else if (ST.hasGFX940Insts() &&
2815 TSchedModel.computeInstrLatency(MI1) == 2)
2816 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2819 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2820 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2821 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2822 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2823 if (!TII.isXDL(*
MI))
2826 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
2827 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2829 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2830 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2831 if (!TII.isXDL(*
MI))
2832 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
// NumPasses is taken from the scheduling-model latency of MI1.
2835 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2836 if (ST.hasGFX940Insts()) {
2837 if (TII.isXDL(*
MI) && !TII.isXDL(*MI1))
2844 NumPasses, ST.hasGFX950Insts())
2846 NumPasses, ST.hasGFX950Insts()))
2852 switch (NumPasses) {
2856 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2857 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2862 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2863 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2868 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2869 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
// Second opcode switch; its values come from the *SrcAB tables.
2878 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2879 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2880 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2881 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2884 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
2885 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2887 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2888 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2889 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2892 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2894 if (ST.hasGFX940Insts()) {
2898 NumPasses, ST.hasGFX950Insts())
2904 switch (NumPasses) {
2906 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2911 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2915 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
// Checks whether the running maximum already covers this requirement (the
// action taken is not visible).
2919 if (WaitStatesNeeded >= NeedWaitStates)
2922 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2923 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2925 if (WaitStatesNeeded == MaxWaitStates)
// Fold in the configured MFMA padding.
2930 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2932 return WaitStatesNeeded;
// Computes wait states needed before the load/store MI because of earlier
// accumulator-VGPR accesses. The guard tests for a subtarget without MAI
// instructions or with GFX90A instructions (the action taken is not visible).
// For each explicit VGPR use of MI, two hazards are evaluated:
//   1. a V_ACCVGPR_READ_B32_e64 that defined the register;
//   2. a V_ACCVGPR_READ/WRITE for which a VALU definition of the register is
//      found within 2 wait states.
// Returns the maximum requirement found.
// NOTE(review): the declaration of Reg and some control-flow lines are
// missing from this listing.
2935int GCNHazardRecognizer::checkMAILdStHazards(
MachineInstr *
MI)
const {
2937 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2940 int WaitStatesNeeded = 0;
2942 auto IsAccVgprReadFn = [](
const MachineInstr &
MI) {
2943 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2946 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
2947 if (!
Op.isReg() || !TRI.isVGPR(MF.getRegInfo(),
Op.getReg()))
2952 const int AccVgprReadLdStWaitStates = 2;
2953 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2954 const int MaxWaitStates = 2;
// Hazard 1: register defined by V_ACCVGPR_READ.
2956 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2957 getWaitStatesSinceDef(
Reg, IsAccVgprReadFn, MaxWaitStates);
2958 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2960 if (WaitStatesNeeded == MaxWaitStates)
2961 return WaitStatesNeeded;
// Hazard 2: an accvgpr read/write instruction for which a nested search
// finds a VALU def of Reg within 2 wait states.
2963 auto IsVALUAccVgprRdWrCheckFn = [
Reg,
this](
const MachineInstr &
MI) {
2964 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2965 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2967 auto IsVALUFn = [](
const MachineInstr &
MI) {
2970 return getWaitStatesSinceDef(
Reg, IsVALUFn, 2 ) <
2971 std::numeric_limits<int>::max();
2974 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2975 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2976 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2979 return WaitStatesNeeded;
// Computes wait states needed before MI for two hazards:
//   - a VALU write of a VGPR that MI explicitly uses
//     (VALUWritesVDstWaitStates = 2);
//   - an earlier instruction matched by IsVCmpXWritesExecFn
//     (VCmpXWritesExecWaitStates = 4).
// Returns the larger of the two requirements. The assert distinguishes this
// from the hazard reported by ST.hasVcmpxPermlaneHazard().
// NOTE(review): the lambda bodies and the declaration of Reg are on lines
// missing from this listing.
2982int GCNHazardRecognizer::checkPermlaneHazards(
MachineInstr *
MI)
const {
2983 assert(!ST.hasVcmpxPermlaneHazard() &&
2984 "this is a different vcmpx+permlane hazard");
2985 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2986 const SIInstrInfo *TII = ST.getInstrInfo();
2988 auto IsVCmpXWritesExecFn = [TII, TRI](
const MachineInstr &
MI) {
2992 auto IsVALUFn = [](
const MachineInstr &
MI) {
2996 const int VCmpXWritesExecWaitStates = 4;
2997 const int VALUWritesVDstWaitStates = 2;
2998 int WaitStatesNeeded = 0;
// VGPR sources of MI that were recently written by a VALU.
3000 for (
const MachineOperand &
Op :
MI->explicit_uses()) {
3001 if (!
Op.isReg() || !TRI->isVGPR(MF.getRegInfo(),
Op.getReg()))
// Despite its name, WaitStatesSinceDef holds the remaining requirement
// (limit minus elapsed wait states).
3005 int WaitStatesSinceDef =
3006 VALUWritesVDstWaitStates -
3007 getWaitStatesSinceDef(
Reg, IsVALUFn,
3008 VALUWritesVDstWaitStates);
3009 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
3010 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
// Hazard against the instruction matched by IsVCmpXWritesExecFn.
3014 int VCmpXHazardWaits =
3015 VCmpXWritesExecWaitStates -
3016 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
3018 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
3019 return WaitStatesNeeded;
3027 return NumPasses + 2;
3037 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3047 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
3055 return NumPasses + 2;
// Computes wait states needed before MI for hazards between MFMA/DOT
// producers and VALU, memory or export consumers. Only checked when the
// subtarget has GFX90A instructions. Three groups are visible:
//   - read-after-write on each explicit use of MI (DOT and MFMA producers);
//   - DGEMM followed by a 64-bit FMA/FMAC;
//   - write-after-write and write-after-read on each def of MI.
// Returns the maximum requirement found.
// NOTE(review): this listing omits many lines (case labels, guards, helper
// calls whose names are not visible, declarations of Reg/IsVALU/IsMem/
// IsMemOrExport), so the comments describe only the visible structure.
3058int GCNHazardRecognizer::checkMAIVALUHazards(
MachineInstr *
MI)
const {
3059 if (!ST.hasGFX90AInsts())
3062 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
3070 const MachineRegisterInfo &MRI = MF.getRegInfo();
3072 int WaitStatesNeeded = 0;
// MFMA and DOT are captured by reference by the predicates below and read
// afterwards (presumably set to the matched producer; the assignments are
// not visible).
3078 const MachineInstr *
MFMA =
nullptr;
3080 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3082 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
3088 const MachineInstr *
DOT =
nullptr;
3089 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
3091 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
// Stateful predicate: DGEMMAfterVALUWrite is set on one path and tested
// together with TII.isVALU on another.
3097 bool DGEMMAfterVALUWrite =
false;
3098 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
3101 DGEMMAfterVALUWrite =
true;
3105 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
3111 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
3112 AMDGPU::OpName::src2);
// Read-after-write hazards when MI is a memory/export instruction or a VALU.
3114 if (IsMemOrExport || IsVALU) {
3115 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
3116 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
3117 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3118 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3119 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3120 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3121 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3122 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3123 const int DotWriteSameDotReadSrcAB = 3;
3124 const int DotWriteDifferentVALURead = 3;
3125 const int DMFMABetweenVALUWriteVMEMRead = 2;
3126 const int MaxWaitStates = 19;
3128 for (
const MachineOperand &Use :
MI->explicit_uses()) {
// DOT producer: when the producer has the same opcode as MI, operands other
// than src2 use DotWriteSameDotReadSrcAB; DotWriteDifferentVALURead is
// assigned on another path (its branch line is not visible).
3134 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3137 int NeedWaitStates = 0;
3138 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3139 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3140 NeedWaitStates = DotWriteSameDotReadSrcAB;
3142 NeedWaitStates = DotWriteDifferentVALURead;
3145 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3146 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// GFX90A without GFX940 only: hazard for a memory instruction reading a
// vector register, evaluated with IsDGEMMHazard.
3153 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3154 DGEMMAfterVALUWrite =
false;
3155 if (TRI.isVectorRegister(MRI,
Reg)) {
3156 int WaitStatesNeededForUse =
3157 DMFMABetweenVALUWriteVMEMRead -
3158 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3159 DMFMABetweenVALUWriteVMEMRead);
3161 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// MFMA producer: the requirement is selected by the producer's latency.
3166 WaitStatesSinceDef =
3167 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3171 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3172 int NumPasses = HazardDefLatency;
3173 int NeedWaitStates = MaxWaitStates;
3176 switch (HazardDefLatency) {
3178 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3179 : DMFMA4x4WriteVgprVALUReadWaitStates;
3185 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3186 : (ST.hasGFX950Insts()
3187 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3188 : DMFMA16x16WriteVgprVALUReadWaitStates);
3193 }
else if (ST.hasGFX940Insts()) {
3197 NumPasses, ST.hasGFX950Insts())
3201 switch (HazardDefLatency) {
3203 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3206 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3209 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3216 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3217 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3219 if (WaitStatesNeeded == MaxWaitStates)
// DGEMM followed by a 64-bit FMA/FMAC: evaluated only while the running
// maximum is below DMFMAToFMA64WaitStates.
3224 unsigned Opc =
MI->getOpcode();
3225 const int DMFMAToFMA64WaitStates = 2;
3226 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3227 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3228 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3229 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3230 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3231 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3232 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// The def-side hazards below apply only to VALU or memory/export MI.
3235 if (!IsVALU && !IsMemOrExport)
3236 return WaitStatesNeeded;
// Write-after-write and write-after-read hazards on each def of MI.
3238 for (
const MachineOperand &Def :
MI->defs()) {
3239 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
3240 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
3241 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
3242 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
3243 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
3244 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
3245 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
3246 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
3247 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
3248 const int DotWriteDifferentVALUWrite = 3;
3249 const int MaxWaitStates = 19;
3250 const int MaxWarWaitStates = 15;
// WAW against a DOT producer with a different opcode than MI.
3255 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3257 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
3258 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
3259 WaitStatesSinceDef);
// WAW against an MFMA producer; the requirement is selected by NumPasses.
3262 WaitStatesSinceDef =
3263 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3265 int NeedWaitStates = MaxWaitStates;
3266 int NumPasses = TSchedModel.computeInstrLatency(
MFMA);
3269 switch (NumPasses) {
3271 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
3275 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
3280 }
else if (ST.hasGFX940Insts()) {
3284 NumPasses, ST.hasGFX950Insts())
3287 switch (NumPasses) {
3289 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
3292 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
3295 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
3302 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3303 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3305 if (WaitStatesNeeded == MaxWaitStates)
// WAR against an MFMA that reads Reg; the predicate fetches the MFMA's src2
// operand (how it is used is not visible). On GFX940 the visible check also
// involves TII.isXDL.
3309 auto IsSMFMAReadAsCFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
3311 !
MI.readsRegister(
Reg, &TRI))
3314 if (ST.hasGFX940Insts() && !TII.isXDL(
MI))
3317 const MachineOperand *SrcC =
3318 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
3328 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
3333 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3334 int NeedWaitStates = MaxWaitStates;
// Latency 4 asserts that the subtarget has GFX940 instructions.
3335 switch (HazardDefLatency) {
3336 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
3338 case 4:
assert(ST.hasGFX940Insts());
3339 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
3341 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
3343 case 16: [[fallthrough]];
3344 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
3348 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
3349 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3352 return WaitStatesNeeded;
3365 return MAI !=
nullptr;
3369 if (IsMFMAFn(*
MI)) {
3370 int W = getWaitStatesSince(IsMFMAFn, 16);
3372 return W < (int)TSchedModel.computeInstrLatency(MAI);
3386 while (
I->isBundledWithPred())
3392 if (
I->getOpcode() != AMDGPU::S_GETPC_B64)
3396 const unsigned NewBytes = 4;
3398 "Unexpected instruction insertion in bundle");
3401 while (NextMI != End && NextMI->isBundledWithPred()) {
3402 for (
auto &Operand : NextMI->operands()) {
3403 if (Operand.isGlobal())
3404 Operand.setOffset(Operand.getOffset() + NewBytes);
// Handles the VALU mask-write hazard (ST.hasVALUMaskWriteHazard()); not
// expected on subtargets with extended wait counts (see the assert).
// Visible structure:
//   1. MI must be a SALU or VALU instruction; its operands are scanned for an
//      SGPR hazard definition (certain registers are filtered out first).
//   2. A backward search from MI looks for listed VALU instructions (carry /
//      cndmask / div_fmas variants) that consume a tracked SGPR.
//   3. Existing S_WAITCNT_DEPCTR instructions recorded in WaitInstrs are
//      merged into one DepCtr value and erased, and a new S_WAITCNT_DEPCTR is
//      built after MI.
// NOTE(review): many lines are missing from this listing (returns, the
// StateType definition, declarations of Reg/HazardReg/DepCtr/Found/
// ConstantMaskBits); the comments describe only what is visible.
3410bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
3411 if (!ST.hasVALUMaskWriteHazard())
3413 assert(!ST.hasExtendedWaitCounts());
3420 if (!IsSALU && !IsVALU)
3432 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3433 const MachineRegisterInfo &MRI = MF.getRegInfo();
// Register filter (used below as IgnoreableSGPR): EXEC halves, the null
// registers and the VCC registers are listed.
3438 case AMDGPU::EXEC_LO:
3439 case AMDGPU::EXEC_HI:
3441 case AMDGPU::SGPR_NULL:
3442 case AMDGPU::SGPR_NULL64:
3450 return Reg == AMDGPU::VCC ||
Reg == AMDGPU::VCC_LO ||
Reg == AMDGPU::VCC_HI;
// Search state: the set of SGPRs still considered hazardous. Two states are
// equal when their HazardSGPRs sets are equal.
3454 SmallSet<Register, 2> HazardSGPRs;
3456 static unsigned getHashValue(
const StateType &State) {
3459 static bool isEqual(
const StateType &
LHS,
const StateType &
RHS) {
3460 return LHS.HazardSGPRs ==
RHS.HazardSGPRs;
3464 SmallVector<const MachineInstr *> WaitInstrs;
3465 bool HasSGPRRead =
false;
3466 StateType InitialState;
// Find the SGPR operand of MI that acts as the hazard definition.
3469 MachineOperand *HazardDef =
nullptr;
3470 for (MachineOperand &
Op :
MI->operands()) {
3473 if (
Op.isDef() && HazardDef)
3477 if (IgnoreableSGPR(
Reg))
3480 if (
Op.isImplicit())
3482 if (!TRI->isSGPRReg(MRI,
Reg))
// Seed the state: a register in SReg_32 is inserted directly; otherwise its
// sub0 and sub1 subregisters are inserted.
3500 if (AMDGPU::SReg_32RegClass.
contains(HazardReg)) {
3501 InitialState.HazardSGPRs.insert(HazardReg);
3504 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
3505 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
// Hazard predicate: checks for an empty tracked set, then classifies I by
// opcode. The first case group is the e32/dpp encodings; the second is the
// e64 encodings, for which src2 is tested for overlap with HazardReg.
3508 auto IsHazardFn = [&](StateType &State,
const MachineInstr &
I) {
3509 if (State.HazardSGPRs.empty())
3512 switch (
I.getOpcode()) {
3513 case AMDGPU::V_ADDC_U32_e32:
3514 case AMDGPU::V_ADDC_U32_dpp:
3515 case AMDGPU::V_CNDMASK_B16_t16_e32:
3516 case AMDGPU::V_CNDMASK_B16_fake16_e32:
3517 case AMDGPU::V_CNDMASK_B16_t16_dpp:
3518 case AMDGPU::V_CNDMASK_B16_fake16_dpp:
3519 case AMDGPU::V_CNDMASK_B32_e32:
3520 case AMDGPU::V_CNDMASK_B32_dpp:
3521 case AMDGPU::V_DIV_FMAS_F32_e64:
3522 case AMDGPU::V_DIV_FMAS_F64_e64:
3523 case AMDGPU::V_SUBB_U32_e32:
3524 case AMDGPU::V_SUBB_U32_dpp:
3525 case AMDGPU::V_SUBBREV_U32_e32:
3526 case AMDGPU::V_SUBBREV_U32_dpp: {
3530 case AMDGPU::V_ADDC_U32_e64:
3531 case AMDGPU::V_ADDC_U32_e64_dpp:
3532 case AMDGPU::V_CNDMASK_B16_t16_e64:
3533 case AMDGPU::V_CNDMASK_B16_fake16_e64:
3534 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
3535 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
3536 case AMDGPU::V_CNDMASK_B32_e64:
3537 case AMDGPU::V_CNDMASK_B32_e64_dpp:
3538 case AMDGPU::V_SUBB_U32_e64:
3539 case AMDGPU::V_SUBB_U32_e64_dpp:
3540 case AMDGPU::V_SUBBREV_U32_e64:
3541 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
3543 const MachineOperand *SSRCOp = TII.getNamedOperand(
I, AMDGPU::OpName::src2);
3545 bool Result = TRI->regsOverlap(SSRCOp->
getReg(), HazardReg);
// State update: inspects S_WAITCNT_DEPCTR instructions (same block as MI,
// not bundled, with all ConstantMaskBits set in the immediate), and erases
// tracked SGPRs that overlap a register operand of I.
3557 auto UpdateStateFn = [&](StateType &State,
const MachineInstr &
I) {
3558 switch (
I.getOpcode()) {
3559 case AMDGPU::S_WAITCNT_DEPCTR:
3561 if (!HasSGPRRead &&
I.getParent() ==
MI->getParent() && !
I.isBundled() &&
3562 (
I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
3567 for (
auto &
Op :
I.operands()) {
3572 if (IgnoreableSGPR(
Reg))
3575 if (
Op.isImplicit())
3577 if (!TRI->isSGPRReg(MRI,
Reg))
3588 for (
Register SGPR : State.HazardSGPRs) {
3589 if (
Reg == SGPR || TRI->regsOverlap(
Reg, SGPR))
3593 State.HazardSGPRs.erase(SGPR);
// The search starts from the instruction preceding MI (reverse iteration).
3602 std::next(
MI->getReverseIterator())))
// Merge previously seen wait instructions: walk backwards from MI, match the
// entries of WaitInstrs in order, keep the minimum of each DepCtr field
// (SaSdst, VaSdst, VaVcc), and erase the merged instructions afterwards.
3612 if (!WaitInstrs.
empty()) {
3616 SmallVector<MachineInstr *> ToErase;
3618 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
3619 End = MI->getParent()->rend();
3620 Found < WaitInstrs.size() && It != End; ++It) {
3621 MachineInstr *WaitMI = &*It;
3623 if (std::as_const(WaitMI) != WaitInstrs[Found])
3626 unsigned WaitMask = WaitMI->getOperand(0).getImm();
3627 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
3628 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
3629 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
3630 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
3631 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
3632 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
3633 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
3634 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
3635 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
3636 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
3637 ToErase.push_back(WaitMI);
// Erasure is deferred until the iteration above has finished.
3640 for (MachineInstr *WaitMI : ToErase)
3641 WaitMI->eraseFromParent();
// Build the new S_WAITCNT_DEPCTR at the position following MI.
3645 auto NextMI = std::next(
MI->getIterator());
3646 auto NewMI =
BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
3647 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3659 if (EntryMBB.
begin() != EntryMBB.
end()) {
3660 auto &EntryMI = *EntryMBB.
begin();
3661 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
3662 EntryMI.getOperand(0).getImm() >= Priority)
3671bool GCNHazardRecognizer::fixRequiredExportPriority(
MachineInstr *
MI) {
3672 if (!ST.hasRequiredExportPriority())
3677 MachineBasicBlock *
MBB =
MI->getParent();
3690 const int MaxPriority = 3;
3691 const int NormalPriority = 2;
3692 const int PostExportPriority = 0;
3694 auto It =
MI->getIterator();
3695 switch (
MI->getOpcode()) {
3696 case AMDGPU::S_ENDPGM:
3697 case AMDGPU::S_ENDPGM_SAVED:
3698 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
3699 case AMDGPU::SI_RETURN_TO_EPILOG:
3702 if (MF->getFrameInfo().hasCalls())
3705 case AMDGPU::S_SETPRIO: {
3707 auto &PrioOp =
MI->getOperand(0);
3708 int Prio = PrioOp.getImm();
3709 bool InWA = (Prio == PostExportPriority) &&
3710 (It !=
MBB->
begin() && TII.isEXP(*std::prev(It)));
3711 if (InWA || Prio >= NormalPriority)
3713 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
3717 if (!TII.isEXP(*
MI))
3728 auto NextMI = std::next(It);
3729 bool EndOfShader =
false;
3730 if (NextMI !=
MBB->
end()) {
3732 if (TII.isEXP(*NextMI))
3735 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
3736 NextMI->getOperand(0).getImm() == PostExportPriority)
3738 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
3745 .
addImm(PostExportPriority);
3749 BuildMI(*
MBB, NextMI,
DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
3750 .
addReg(AMDGPU::SGPR_NULL)
3770 const SIInstrInfo *TII = ST.getInstrInfo();
3782 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3787bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(
MachineInstr *
MI) {
3788 if (
MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
3791 const SIInstrInfo *TII = ST.getInstrInfo();
3793 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3795 BuildMI(*
MI->getParent(), std::next(
MI->getIterator()),
MI->getDebugLoc(),
3796 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3802bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(
MachineInstr *
MI) {
3805 if (!IsHazardRecognizerMode)
3808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3809 const SIInstrInfo *TII = ST.getInstrInfo();
3811 const int FlatScrBaseWaitStates = 10;
3813 bool ReadsFlatScrLo =
3814 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
3815 bool ReadsFlatScrHi =
3816 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
3822 ReadsFlatScrLo =
true;
3825 ReadsFlatScrHi =
true;
3830 const MachineRegisterInfo &MRI = MF.getRegInfo();
3833 DenseSet<const MachineBasicBlock *> Visited;
3835 return MI.modifiesRegister(
Reg, TRI);
3840 auto IsSGPRDef = [TII, TRI, &MRI](
const MachineInstr &
MI) ->
unsigned {
3841 if (!TII->isSALU(
MI) && !TII->isVALU(
MI))
3843 for (
const MachineOperand &MO :
MI.all_defs()) {
3844 if (TRI->isSGPRReg(MRI, MO.getReg()))
3850 auto IsExpiredFn = [=](
const MachineInstr &
MI,
int SgprWrites) {
3851 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
3852 unsigned Wait =
MI.getOperand(0).getImm();
3857 return SgprWrites >= FlatScrBaseWaitStates;
3860 return ::getWaitStatesSince(
3861 IsHazardFn,
MI->getParent(), std::next(
MI->getReverseIterator()),
3862 0,
IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
3866 !IsRegDefHazard(AMDGPU::SGPR102)) &&
3868 !IsRegDefHazard(AMDGPU::SGPR103)))
3872 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
3883 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
3884 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static cl::opt< bool > EnableWMMAVnopHoisting("amdgpu-wmma-vnop-hoisting", cl::init(true), cl::Hidden, cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"))
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel forwarding issue Dst.
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isLdsDma(const MachineInstr &MI)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static unsigned getWMMAHazardInstInCategory(const MachineInstr &MI, const SIInstrInfo *TII, const TargetSchedModel &SchedModel)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static bool hasHazard(StateT InitialState, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *InitialMBB, MachineBasicBlock::const_reverse_instr_iterator InitialI)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, GCNHazardRecognizer::IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GCNHazardRecognizer::GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool isCoexecutableVALUInst(const MachineInstr &MI)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden, cl::desc("Insert a s_nop x before every instruction"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static llvm::Error parse(GsymDataExtractor &Data, uint64_t BaseAddr, LineEntryCallback const &Callback)
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const uint32_t IV[8]
unsigned get(InstCounterType T) const
BitVector & set()
Set all bits in the bitvector.
std::pair< iterator, bool > insert_as(std::pair< KeyT, ValueT > &&KV, const LookupKeyT &Val)
Alternate version of insert() which allows a different, and possibly less expensive,...
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
unsigned getHazardWaitStates(MachineInstr *MI) const
Returns the number of wait states until all hazards for MI are resolved.
unsigned PreEmitNoopsCommon(MachineInstr *) const
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be scheduled.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
bool ShouldPreferAnother(SUnit *SU) const override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
GCNHazardRecognizer(const MachineFunction &MF, MachineLoopInfo *MLI=nullptr)
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isDGEMM(unsigned Opcode)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isWaitcnt(unsigned Opcode)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isMAI(const MCInstrDesc &Desc)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Provide an instruction scheduling machine model to CodeGen passes.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc)
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned decodeFieldVaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
This namespace contains all of the command line option processing machinery.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
NodeAddr< UseNode * > Use
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
constexpr RegState getDeadRegState(bool B)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
DWARFExpression::Operation Op
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
An information struct used to provide DenseMap with the various necessary components for a given valu...