#define DEBUG_TYPE "gcn-hazard-recognizer"

STATISTIC(NumWMMANopsHoisted,
          "Number of WMMA hazard V_NOPs hoisted from loops");
STATISTIC(NumWMMAHoistingBailed,
          "Number of WMMA hazards where V_NOP hoisting was not possible");
struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
                     cl::desc("Insert a s_nop x before every instruction"));
                     cl::desc("Hoist WMMA hazard V_NOPs from loops to preheaders"));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF,
                                         const MachineLoopInfo *MLI)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()), MLI(MLI),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
}
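// The lookahead window is sized for the worst hazard latency that can occur:
// if AGPRs are in use (i.e. MFMA code is present), MAI hazards can require up
// to 19 wait states, so the deeper history is only paid for in that case;
// plain shaders keep the cheap 5-entry window.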
void GCNHazardRecognizer::Reset() { EmittedInstrs.clear(); }
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}
static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
}
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp =
      TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}
ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  // WMMA V_NOP hoisting needs the hazard recognizer mode; the scheduler path
  // only reports the hazard here.
  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return HazardType;
  }

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
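// Note: s_nop's immediate encodes "wait states minus one", and a single s_nop
// can cover at most 8 wait states, so larger requests are emitted as a chain
// of s_nops, chunked via std::min(Quantity, 8u) above.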
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
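// The ReleaseAtCycle of the first write resource is used as a proxy for the
// MFMA pipeline occupancy (the "number of passes" that the MAI hazard checks
// below switch on).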
void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI =
      std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E =
      CurrCycleInstr->getParent()->instr_end();

  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, /*TRI=*/nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (ST.hasGFX950Insts() && isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() { EmittedInstrs.push_front(nullptr); }

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first, up to
  // getMaxLookAhead() entries, since the list is truncated right after.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); i < e;
       ++i)
    EmittedInstrs.push_front(nullptr);

  // getMaxLookAhead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more instructions.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}
enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound };

// Key for deduplicating traversal states in hasHazard(): a state is
// identified by the vector that records it plus its index in that vector.
template <typename StateT> struct StateMapKey {
  SmallVectorImpl<StateT> *States;
  unsigned Idx;

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
    return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
  }
};

template <typename StateT> struct StateMapKeyInfo {
  using KeyT = StateMapKey<StateT>;

  static inline KeyT getEmptyKey() {
    return {nullptr, DenseMapInfo<unsigned>::getEmptyKey()};
  }

  static inline KeyT getTombstoneKey() {
    return {nullptr, DenseMapInfo<unsigned>::getTombstoneKey()};
  }

  static unsigned getHashValue(const KeyT &Key) {
    return StateT::getHashValue((*Key.States)[Key.Idx]);
  }
  static unsigned getHashValue(const StateT &State) {
    return StateT::getHashValue(State);
  }

  static bool isEqual(const KeyT &LHS, const KeyT &RHS) {
    const auto EKey = getEmptyKey();
    const auto TKey = getTombstoneKey();
    if (KeyT::isEqual(LHS, EKey) || KeyT::isEqual(RHS, EKey) ||
        KeyT::isEqual(LHS, TKey) || KeyT::isEqual(RHS, TKey))
      return KeyT::isEqual(LHS, RHS);
    return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
  }

  static bool isEqual(const StateT &LHS, const KeyT &RHS) {
    if (KeyT::isEqual(RHS, getEmptyKey()) ||
        KeyT::isEqual(RHS, getTombstoneKey()))
      return false;
    return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
  }
};
template <typename StateT>
static bool
hasHazard(StateT InitialState,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I) {
  SmallVector<StateT> States;
  SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;
  SmallDenseMap<StateMapKey<StateT>, unsigned, 8, StateMapKeyInfo<StateT>>
      StateMap;

  StateT State = InitialState;
  unsigned StateIdx = 0;
  unsigned WorkIdx = 0;

  for (;;) {
    bool Expired = false;
    for (auto E = MBB->instr_rend(); I != E; ++I) {
      auto Result = IsHazard(State, *I);
      if (Result == HazardFound)
        return true;
      if (Result == HazardExpired) {
        Expired = true;
        break;
      }

      if (I->isInlineAsm() || I->isMetaInstruction())
        continue;

      UpdateState(State, *I);
    }

    if (!Expired) {
      // Record the state and queue all predecessors, reusing the index of an
      // identical previously recorded state if there is one.
      unsigned StateIdx = States.size();
      StateMapKey<StateT> Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        States.push_back(State);
      } else {
        StateIdx = Insertion.first->second;
      }
      for (const MachineBasicBlock *Pred : MBB->predecessors())
        Worklist.insert(std::pair(Pred, StateIdx));
    }

    if (WorkIdx == Worklist.size())
      break;

    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  }

  return false;
}
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited,
                              GetNumWaitStatesFn GetNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (const MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired,
                              GetNumWaitStatesFn GetNumWaitStates =
                                  SIInstrInfo::getNumWaitStates) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()), 0, IsExpired,
                            Visited, GetNumWaitStates);
}
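// When a hazard search crosses a block boundary, every predecessor is scanned
// and the smallest wait-state count wins: the hazard has to be assumed
// present if it is present on any incoming path.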
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) const {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
                                GetNumWaitStates);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
                                            int Limit) const {
  return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
}
int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}
int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) const {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
}
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) const {
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) const {
  // There is no hazard if the instruction group is not a soft clause, which
  // can only happen with XNACK enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  for (MachineInstr *MI : EmittedInstrs) {
    // The clause is broken by a gap or by switching between SMEM and VMEM.
    if (!MI)
      break;
    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
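// With XNACK, a memory clause can be replayed from the start, so a clause
// that both defines and uses the same register units could observe partially
// updated inputs on replay; returning 1 above forces a wait state, which
// breaks the clause.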
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) const {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
  // was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // Buffer SMRDs reading their descriptor additionally need the descriptor
    // write (an SALU def) to have settled.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) const {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}
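// The number of wait states after s_setreg varies by generation, so it is
// taken from the subtarget (ST.getSetRegWaitStates()) rather than hardcoded
// like the 2-cycle s_getreg case above.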
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) const {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T#, the
  // store size is greater than 8 bytes, and they have more than two bits of
  // their dmask set. All our MIMG definitions use a 256-bit T#, so we can
  // skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(TII->getOpRegClassID(
               Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
int GCNHazardRecognizer::checkVALUHazardsHelper(
    const MachineOperand &Def, const MachineRegisterInfo &MRI) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // Instructions which produce a forwarded dest: SDWA with dst_sel != DWORD,
  // VOP3 which writes the hi bits of the dest, and FP8/FP4 conversions with
  // non-zero op_sel bits.
  if (SIInstrInfo::isSDWA(MI)) {
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
        return nullptr;
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // VOP3 writing the hi half of the dest (op_sel on the dest).
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) &&
      (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
       SISrcMods::DST_OP_SEL))
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8) {
    // FP8 conversion with op_sel[3:2] != 0.
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
        SISrcMods::OP_SEL_0)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    return nullptr;
  }
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from the forwarded dest.
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) const {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst)
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has a dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) const {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If the producer is inline asm, assume it has a dst forwarding
        // hazard on all of its defs.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) const {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}
int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}
void GCNHazardRecognizer::emitVNops(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertPt,
                                    int WaitStatesNeeded, bool IsHoisting) {
  DebugLoc DL; // The inserted V_NOPs carry no source location.
  (void)IsHoisting;
  for (int I = 0; I < WaitStatesNeeded; ++I)
    BuildMI(MBB, InsertPt, DL, TII.get(AMDGPU::V_NOP_e32));
}
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI);
  fixWMMACoexecutionHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ. Use V_MOV_B32 v?, v? instead: the register
  // must be alive, so use src0 of V_PERMLANE*, which is always a VGPR.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() &&
          TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard, with the exception of
        // s_wait_idle.
        assert((!TII->isSOPP(MI) ||
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");
        if (TII->isSOPP(MI))
          return MI.getOpcode() == AMDGPU::S_WAIT_IDLE;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at-risk SMEM (breaking the chain), or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt must exist between it and the at-risk SMEM.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Scan the function for LDS and VMEM instructions; the fixup is only needed
  // when both kinds are present.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |= SIInstrInfo::isVMEM(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1; // LDS
    if (SIInstrInfo::isVMEM(MI))
      return 2; // VMEM
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
  return true;
}
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire the hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
           SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs, which makes the
  // va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  // where intv1 + intv2 <= 2 VALUs and intv3 <= 4 VALUs.
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.ExecPos, State.VALUs);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             LHS.VALUs == RHS.VALUs;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
    } else if (SIInstrInfo::isSALU(I)) {
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate the state if something changed.
    if (!Changed)
      return NoHazardFound;

    // An Exec change is required for the hazard.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU def post exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU def pre exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0x0fff);
  return true;
}
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  // where intv <= 5 VALUs / 1 TRANS.
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.VALUs, State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  // This overloads expiry testing with all the hazard detection.
  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    // Too many VALU states have passed.
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire the hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
        SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track registers written by TRANS instructions.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI))
          return HazardFound;
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on the va_vdst counter.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
  return true;
}
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!ST.hasGFX1250Insts() || // Coexecution disabled.
      !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isTRANS(I))
      return false;

    // RAW: TRANS(I) writes, VALU(MI) reads.
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;

    // WAR: TRANS(I) reads, VALU(MI) writes.
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0 (matrix A) or Src1 (matrix B) of the current WMMA instruction
    // overlaps with the dest (matrix D) of the previous WMMA.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // SWMMAC's index operand must not overlap the previous dest either.
    if (SIInstrInfo::isSWMMAC(*MI)) {
      const Register CurIndex =
          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}
static bool isWMMAHazardInstInCategory(const MachineInstr &I,
                                       const SIInstrInfo *TII, unsigned Latency,
                                       unsigned Category) {
  assert((Latency == 8 || Latency == 16) &&
         "Handle me if the xdl wmma instruction latency changes");
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) const {
  if (!ST.hasGFX1250Insts())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (!TII->isXDLWMMA(*MI) && !SIInstrInfo::isVALU(*MI))
    return 0;

  // Wait states required for each hazard category, for a WMMA consumer and
  // for a VALU consumer respectively.
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!isWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    return hasWMMAToWMMARegOverlap(I, *MI);
  };

  auto IsVALUHazardFn = [MI, TII, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!isWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    return hasWMMAToVALURegOverlap(I, *MI);
  };

  auto GetWaitStatesFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVALU(I) ? 1 : 0;
  };

  int Limit = 0;
  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
    }
  } else { // VALU consumer.
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}
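// The wait-state tables above are indexed by hazard category; a WMMA consumer
// tolerates one more wait state than a plain VALU consumer in each category
// (compare WMMAWaitStates and VALUWaitStates), and categories are tried in
// order until one reports a positive requirement.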
bool GCNHazardRecognizer::hasWMMAToWMMARegOverlap(const MachineInstr &WMMA,
                                                  const MachineInstr &MI) const {
  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
  Register A1 = TII.getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
  Register B1 = TII.getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();

  // The previous dest (D0) overlaps matrix A or B of the consumer.
  if (TRI.regsOverlap(D0, A1) || TRI.regsOverlap(D0, B1))
    return true;

  // For SWMMAC, D0 must not overlap the index operand either.
  if (SIInstrInfo::isSWMMAC(MI)) {
    Register Idx1 = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
    if (TRI.regsOverlap(D0, Idx1))
      return true;
  }

  return false;
}
bool GCNHazardRecognizer::hasWMMAToVALURegOverlap(const MachineInstr &WMMA,
                                                  const MachineInstr &MI) const {
  // RAW: the VALU reads the WMMA's D matrix.
  Register D0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::vdst)->getReg();
  for (const MachineOperand &ValuUse : MI.explicit_uses()) {
    if (ValuUse.isReg() && TRI.regsOverlap(D0, ValuUse.getReg()))
      return true;
  }

  // WAR: the VALU writes one of the WMMA's source matrices.
  SmallVector<Register, 3> WMMARegs;
  Register A0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src0)->getReg();
  Register B0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src1)->getReg();
  WMMARegs.push_back(A0);
  WMMARegs.push_back(B0);
  if (SIInstrInfo::isSWMMAC(WMMA)) {
    Register Idx0 = TII.getNamedOperand(WMMA, AMDGPU::OpName::src2)->getReg();
    WMMARegs.push_back(Idx0);
  }

  for (const MachineOperand &ValuDef : MI.defs()) {
    Register VDstReg = ValuDef.getReg();
    for (Register WMMAReg : WMMARegs) {
      if (TRI.regsOverlap(VDstReg, WMMAReg))
        return true;
    }
  }

  return false;
}
bool GCNHazardRecognizer::isCoexecutionHazardFor(const MachineInstr &I,
                                                 const MachineInstr &MI) const {
  if (!TII.isXDLWMMA(I))
    return false;

  if (TII.isXDLWMMA(MI))
    return hasWMMAToWMMARegOverlap(I, MI);

  return hasWMMAToVALURegOverlap(I, MI);
}
bool GCNHazardRecognizer::hasWMMAHazardInLoop(const MachineLoop *L,
                                              const MachineInstr *MI,
                                              bool IncludeSubloops) {
  // Scan every instruction of the loop (optionally restricted to blocks of
  // this loop nest level) for a co-execution hazard against MI.
  for (MachineBasicBlock *MBB : L->getBlocks()) {
    if (!IncludeSubloops && MLI->getLoopFor(MBB) != L)
      continue;
    for (MachineInstr &I : *MBB) {
      if (isCoexecutionHazardFor(I, *MI))
        return true;
    }
  }
  return false;
}
bool GCNHazardRecognizer::tryHoistWMMAVnopsFromLoop(MachineInstr *MI,
                                                    int WaitStatesNeeded) {
  // Only hoist when MI is inside a loop and no hazard source can occur inside
  // that loop; otherwise the V_NOPs must stay next to MI.
  MachineLoop *L = MLI ? MLI->getLoopFor(MI->getParent()) : nullptr;
  if (!L) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  if (hasWMMAHazardInLoop(L, MI)) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  // Hoist out of as many enclosing loops as possible.
  MachineLoop *TargetLoop = L;
  for (MachineLoop *Parent = L->getParentLoop(); Parent;
       Parent = Parent->getParentLoop()) {
    if (hasWMMAHazardInLoop(Parent, MI, false))
      break;
    TargetLoop = Parent;
  }

  MachineBasicBlock *Preheader = TargetLoop->getLoopPreheader();
  if (!Preheader) {
    ++NumWMMAHoistingBailed;
    return false;
  }

  LLVM_DEBUG(dbgs() << "WMMA V_NOP Hoisting: Moving " << WaitStatesNeeded
                    << " V_NOPs to loop preheader\n");

  emitVNops(*Preheader, Preheader->getFirstTerminator(), WaitStatesNeeded,
            /*IsHoisting=*/true);
  NumWMMANopsHoisted += WaitStatesNeeded;
  return true;
}
bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
  int WaitStatesNeeded = checkWMMACoexecutionHazards(MI);
  if (WaitStatesNeeded <= 0)
    return false;

  if (tryHoistWMMAVnopsFromLoop(MI, WaitStatesNeeded))
    return true;

  emitVNops(*MI->getParent(), MI->getIterator(), WaitStatesNeeded);
  return true;
}
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // Check if this is the last VGPR in a register pair.
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  const DebugLoc &DL = MI->getDebugLoc();

  Register DstReg = MI->getOperand(0).getReg();
  Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);
  (void)DstLo;

  // If the shift also reads or writes the amount register, the amount has to
  // be moved through an aligned scratch pair with V_SWAP_B32.
  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  // Swap the amount into the scratch register, running the hazard recognizer
  // on the inserted V_SWAP_B32s, then swap back after the shift.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Rewrite the shift to use the scratch register. Liveness is not updated,
  // so the verifier may see the new registers as undef.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  Amt->setIsUndef();
  if (Overlapped)
    MI->getOperand(0).setReg(NewReg);
  if (Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg)) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) const {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
    MachineInstr *MI) const {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVMEM(I) && SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      return false;
    }
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}
int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) const {
  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}
int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) const {
  // Early exit if no padding is requested.
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write.
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcCWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 1 + IsGFX950;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcCWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
    int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
    int NumPasses, bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}
int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) const {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
  };

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
           !SIInstrInfo::isDOT(MI);
  };

  if (!SIInstrInfo::isMFMA(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    Register Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates =
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = Use.getOperandNo();
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
        else if (ST.hasGFX940Insts() &&
                 TSchedModel.computeInstrLatency(MI1) == 2)
          NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates =
                ST.hasGFX950Insts()
                    ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
                    : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!TII.isXDL(*MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default: {
          int NumPasses = TSchedModel.computeInstrLatency(MI1);
          if (ST.hasGFX940Insts()) {
            if (TII.isXDL(*MI) && !TII.isXDL(*MI1))
              break;

            NeedWaitStates =
                TII.isXDL(*MI1)
                    ? GFX940_XDL_N_PassWritesVGPROverlappedSrcCWaitStates(
                          NumPasses, ST.hasGFX950Insts())
                    : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcCWaitStates(
                          NumPasses, ST.hasGFX950Insts());
            break;
          }

          switch (NumPasses) {
          case 2:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16:
            NeedWaitStates =
                SIInstrInfo::isDGEMM(Opc)
                    ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                    : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          default:
            llvm_unreachable("unexpected number of passes");
          }
          break;
        }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates =
            ST.hasGFX950Insts()
                ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
                : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default: {
        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
          NeedWaitStates =
              TII.isXDL(*MI1)
                  ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses, ST.hasGFX950Insts())
                  : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
                        NumPasses);
          break;
        }

        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes");
        }
        break;
      }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  // Pad neighboring MFMA with noops for better inter-wave performance.
  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) const {
  // On GFX90A+ the relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) const {
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;
    Register Reg = Op.getReg();

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn,
                              VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
      break;
  }

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
}
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
  return NumPasses + 2;
}

static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
                                                              bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
                                                       bool IsGFX950) {
  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
}

static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
  return NumPasses + 2;
}
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) const {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isDGEMM(MI.getOpcode());
  };

  // MFMA-to-MFMA hazards are checked in checkMAIHazards90A().
  if (SIInstrInfo::isMFMA(*MI))
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  int WaitStatesNeeded = 0;

  bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
  bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  Register Reg;
  auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    MFMA = &MI;
    return true;
  };

  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
    // Found a DGEMM on the reverse traversal to the def.
    if (SIInstrInfo::isDGEMM(MI.getOpcode()))
      DGEMMAfterVALUWrite = true;

    // Only hazardous if the register is defined by a VALU and a DGEMM is
    // found after the def.
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
      return false;

    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      // Workaround for a HW data hazard bug observed only on GFX90A: when a
      // DGEMM instruction sits in-between a VALU and a VMEM instruction, the
      // SQ incorrectly fails to insert the two wait states needed between
      // the two instructions to avoid the data hazard.
      if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
        DGEMMAfterVALUWrite = false;
        if (TRI.isVectorRegister(MRI, Reg)) {
          int WaitStatesNeededForUse =
              DMFMABetweenVALUWriteVMEMRead -
              getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                    DMFMABetweenVALUWriteVMEMRead);

          WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        }
      }

      MFMA = nullptr;
      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
        case 4:
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates =
              IsMemOrExport
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                  : (ST.hasGFX950Insts()
                         ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                         : DMFMA16x16WriteVgprVALUReadWaitStates);
          break;
        default:
          llvm_unreachable("unexpected dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                      NumPasses);
      } else {
        switch (HazardDefLatency) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse =
        DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                        WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        default:
          llvm_unreachable("unexpected number of passes for dgemm");
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:  assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: [[fallthrough]];
    default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
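// Note: checkMAIVALUHazards() covers three orderings around an MFMA: a
// read-after-write on the operand uses, a write-after-write on the defs, and
// a write-after-read against an MFMA still consuming the register as its
// src2 (SrcC) accumulator input.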
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) const {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;

  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMFMA(MI))
      MAI = &MI;
    return MAI != nullptr;
  };

  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
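// The effect of ShouldPreferAnother() is to steer the scheduler away from
// issuing back-to-back MFMAs: if fewer wait states have elapsed than the
// previous MFMA's latency, another SUnit is preferred instead.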
// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
  if (!NewMI->isBundled())
    return;

  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    I--;
  if (I->isBundle())
    I++;

  // Bail if this is not an S_GETPC bundle.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update offsets of any references in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
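// Illustration of the fixup above (schematic bundle, not taken from a real
// test): an S_GETPC_B64 bundle materializes PC-relative addresses, e.g.
//   BUNDLE {
//     $sgpr0_sgpr1 = S_GETPC_B64
//     $sgpr0 = S_ADD_U32 $sgpr0, @sym + 4, ...
//     $sgpr1 = S_ADDC_U32 $sgpr1, @sym + 12, ...
//   }
// A 4-byte S_WAITCNT_DEPCTR inserted after S_GETPC_B64 shifts the PC the
// bundle observes, so every global operand offset is bumped by NewBytes.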
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  // The hazard sequence is three instructions:
  //   1. VALU reads SGPR as mask
  //   2. VALU/SALU writes SGPR
  //   3. VALU/SALU reads SGPR
  // The hazard can expire if the distance between 2 and 3 is sufficient.
  // In practice this happens <10% of the time, hence the hazard is always
  // assumed to exist if 1 and 2 are present, to avoid searching for 3.

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // Writes to these registers never create this hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::M0:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };

  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs.begin(),
                                State.HazardSGPRs.end());
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for an SGPR write.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      continue;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (!IsVCC(Reg)) {
      if (Op.isImplicit())
        continue;
      if (!TRI->isSGPRReg(MRI, Reg))
        continue;
    }
    // Also record SGPR reads, which limit later wait merging.
    if (Op.isUse()) {
      HasSGPRRead = true;
      continue;
    }

    assert(!HazardDef);
    HazardDef = &Op;
  }

  if (!HazardDef)
    return false;

  // Set up to track writes to the individual hazard SGPRs.
  const Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    assert(AMDGPU::SReg_64RegClass.contains(HazardReg));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }

  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      // These read the mask implicitly through VCC.
      return IsVCC(HazardReg) ? HazardFound : NoHazard;
    }
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Check the explicit mask operand (carry-in / condition source).
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      assert(SSRCOp);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazard;
    }
    default:
      return NoHazard;
    }
  };

  // Mask with the mergeable DepCtr fields (sa_sdst, va_sdst, va_vcc) zeroed
  // and every other field left at its no-wait value.
  const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
      AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
      0);

  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record mergeable waits within a region free of SGPR reads.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      // Update tracking of SGPR reads and writes.
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;

        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (!IsVCC(Reg)) {
          if (Op.isImplicit())
            continue;
          if (!TRI->isSGPRReg(MRI, Reg))
            continue;
        }
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }

        // Stop tracking any SGPRs which are written.
        SmallVector<Register, 2> Found;
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            Found.push_back(SGPR);
        }
        for (Register SGPR : Found)
          State.HazardSGPRs.erase(SGPR);
      }
      break;
    }
  };

  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Compute the counter mask for the new wait.
  unsigned DepCtr =
      IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
                                 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
             : AMDGPU::DepCtr::encodeFieldSaSdst(0);

  // Try to merge previous waits into this one, as the region between them is
  // free of SGPR reads.
  if (!WaitInstrs.empty()) {
    unsigned Found = 0;
    SmallVector<MachineInstr *> ToErase;
    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;
      // Find the next recorded wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      Found++;
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }
    assert(Found == WaitInstrs.size());
    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }

  // Add s_waitcnt_depctr after the SGPR write.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // The SGPR write may be the S_GETPC_B64 inside a bundle.
  updateGetPCBundle(NewMI);

  return true;
}
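// Merging note: each DepCtr field (sa_sdst, va_sdst, va_vcc) counts
// outstanding events, and a smaller value is a stronger wait. Taking the min
// of the decoded field from every absorbed s_waitcnt_depctr and from the new
// mask preserves the strictest wait each field requested.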
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  // Assume the following shader types will never have exports,
  // and avoid adding or adjusting S_SETPRIO.
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  auto CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_KERNEL:
    return false;
  default:
    break;
  }

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // Ensure a shader with calls raises priority at entry.
    // This gives the correct priority if exports exist in the callee.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    return false;
  case AMDGPU::S_SETPRIO: {
    // Raise the minimum priority unless already in the workaround sequence.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Check the entry priority at each export (there will only be a few).
  // Note: amdgpu_gfx can only be a callee, so defer to the caller's setprio.
  bool Changed = false;
  if (CC != CallingConv::AMDGPU_Gfx)
    Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Only apply the workaround at the end of a sequence of exports.
    if (TII.isEXP(*NextMI))
      return Changed;
    // An appropriate S_SETPRIO after the export means the workaround was
    // already applied.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return Changed;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Lower the priority.
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  if (!EndOfShader) {
    // Wait for the exports to complete.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }

  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);

  if (!EndOfShader) {
    // Return to normal (higher) priority.
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
        .addImm(NormalPriority);
  }

  return true;
}
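// The emitted post-export sequence is therefore: s_setprio 0, then (unless
// the shader is about to end) s_waitcnt_expcnt null, 0 to let the exports
// drain, two s_nop 0 padding instructions, and finally s_setprio 2 to
// restore normal priority when more work follows.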
// ...
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
// ...
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  // Bracket the barrier-arrive with dependency-counter waits, one before and
  // one after the instruction.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  // ...
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR));
  // ...

  return true;
}
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // Only run this fix in hazard recognizer mode, after wait insertion.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // S_GETREG of the flat scratch base also counts as a read.
  if (isSGetReg(MI->getOpcode())) {
    switch (getHWReg(TII, *MI)) {
    case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
      ReadsFlatScrLo = true;
      break;
    case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
      ReadsFlatScrHi = true;
      break;
    default:
      break;
    }
  }

  if (!ReadsFlatScrLo && !ReadsFlatScrHi)
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  auto IsRegDefHazard = [&](MCRegister Reg) -> bool {
    DenseSet<const MachineBasicBlock *> Visited;
    auto IsHazardFn = [Reg, TRI](const MachineInstr &MI) {
      return MI.modifiesRegister(Reg, TRI);
    };
    // Count SALU/VALU instructions which write an SGPR as the wait states
    // between the def and MI.
    auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
      if (!TII->isSALU(MI) && !TII->isVALU(MI))
        return 0;
      for (const MachineOperand &MO : MI.all_defs()) {
        if (TRI->isSGPRReg(MRI, MO.getReg()))
          return 1;
      }
      return 0;
    };
    auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
      if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned Wait = MI.getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
            AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
          return true;
      }
      return SgprWrites >= FlatScrBaseWaitStates;
    };
    return ::getWaitStatesSince(
               IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),
               0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;
  };

  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
          AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
  return true;
}
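// Note: the hazard pairs reads of SRC_FLAT_SCRATCH_BASE_LO/HI with recent
// writes of SGPR102/SGPR103, the registers the scratch base is forwarded
// from; MRI.isConstantPhysReg() screens out functions where those SGPRs are
// never redefined, so no wait is needed there.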
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
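// A V_NOP pair like the one above is the generic mitigation for VALU
// co-execution hazards such as the WMMA hazards: two independent VALU slots
// separate the hazardous producer and consumer when no useful instruction
// can be placed between them.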