25struct MFMAPaddingRatioParser :
public cl::parser<unsigned> {
30 return O.error(
"'" + Arg +
"' value invalid for uint argument!");
33 return O.error(
"'" + Arg +
"' value must be in the range [0, 100]!");
43 cl::desc(
"Fill a percentage of the latency between "
44 "neighboring MFMA with s_nops."));
54 IsHazardRecognizerMode(
false),
55 CurrCycleInstr(nullptr),
58 TII(*ST.getInstrInfo()),
59 TRI(
TII.getRegisterInfo()),
60 ClauseUses(
TRI.getNumRegUnits()),
61 ClauseDefs(
TRI.getNumRegUnits()) {
63 TSchedModel.
init(&ST);
68 EmittedInstrs.clear();
80 return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
84 return Opcode == AMDGPU::S_GETREG_B32;
89 case AMDGPU::S_SETREG_B32:
90 case AMDGPU::S_SETREG_B32_mode:
91 case AMDGPU::S_SETREG_IMM32_B32:
92 case AMDGPU::S_SETREG_IMM32_B32_mode:
99 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
103 return Opcode == AMDGPU::S_RFE_B64;
108 case AMDGPU::S_MOVRELS_B32:
109 case AMDGPU::S_MOVRELS_B64:
110 case AMDGPU::S_MOVRELD_B32:
111 case AMDGPU::S_MOVRELD_B64:
123 unsigned Opcode =
MI.getOpcode();
127 Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128 Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
131 if (!ST.hasGFX940Insts())
139 if (
TII.isAlwaysGDS(
MI.getOpcode()))
142 switch (
MI.getOpcode()) {
143 case AMDGPU::S_SENDMSG:
144 case AMDGPU::S_SENDMSGHALT:
145 case AMDGPU::S_TTRACEDATA:
149 case AMDGPU::DS_PERMUTE_B32:
150 case AMDGPU::DS_BPERMUTE_B32:
153 if (
TII.isDS(
MI.getOpcode())) {
155 AMDGPU::OpName::gds);
156 if (
MI.getOperand(GDS).getImm())
164 unsigned Opcode =
MI.getOpcode();
165 return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166 Opcode == AMDGPU::V_PERMLANE64_B32 ||
167 Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
168 Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
169 Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
179 AMDGPU::OpName::simm16);
199 if (checkFPAtomicToDenormModeHazard(
MI) > 0)
208 && checkVMEMHazards(
MI) > 0)
217 if (
isDivFMas(
MI->getOpcode()) && checkDivFMasHazards(
MI) > 0)
220 if (
isRWLane(
MI->getOpcode()) && checkRWLaneHazards(
MI) > 0)
228 if (
isSGetReg(
MI->getOpcode()) && checkGetRegHazards(
MI) > 0)
231 if (
isSSetReg(
MI->getOpcode()) && checkSetRegHazards(
MI) > 0)
234 if (
isRFE(
MI->getOpcode()) && checkRFEHazards(
MI) > 0)
239 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
240 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
244 MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
245 checkReadM0Hazards(
MI) > 0)
256 if (
MI->isInlineAsm() && checkInlineAsmHazards(
MI) > 0)
264 while (Quantity > 0) {
265 unsigned Arg = std::min(Quantity, 8u);
273GCNHazardRecognizer::getMFMAPipelineWaitStates(
const MachineInstr &
MI)
const {
280void GCNHazardRecognizer::processBundle() {
284 for (;
MI != E &&
MI->isInsideBundle(); ++
MI) {
285 CurrCycleInstr = &*
MI;
288 if (IsHazardRecognizerMode) {
289 fixHazards(CurrCycleInstr);
297 for (
unsigned i = 0, e = std::min(WaitStates,
MaxLookAhead - 1); i <
e; ++i)
298 EmittedInstrs.push_front(
nullptr);
300 EmittedInstrs.push_front(CurrCycleInstr);
303 CurrCycleInstr =
nullptr;
307 assert(IsHazardRecognizerMode);
311 if (
MI->isInsideBundle())
321 IsHazardRecognizerMode =
true;
325 CurrCycleInstr =
nullptr;
336 return std::max(WaitStates, checkSMRDHazards(
MI));
339 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(
MI));
341 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(
MI));
347 WaitStates = std::max(WaitStates, checkVMEMHazards(
MI));
350 WaitStates = std::max(WaitStates, checkVALUHazards(
MI));
353 WaitStates = std::max(WaitStates, checkDPPHazards(
MI));
356 WaitStates = std::max(WaitStates, checkDivFMasHazards(
MI));
359 WaitStates = std::max(WaitStates, checkRWLaneHazards(
MI));
364 WaitStates = std::max(WaitStates, checkMAIVALUHazards(
MI));
366 if (
MI->isInlineAsm())
367 return std::max(WaitStates, checkInlineAsmHazards(
MI));
370 return std::max(WaitStates, checkGetRegHazards(
MI));
373 return std::max(WaitStates, checkSetRegHazards(
MI));
376 return std::max(WaitStates, checkRFEHazards(
MI));
380 MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
381 MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
385 return std::max(WaitStates, checkReadM0Hazards(
MI));
388 return std::max(WaitStates, checkMAIHazards(
MI));
393 return std::max(WaitStates, checkMAILdStHazards(
MI));
399 EmittedInstrs.push_front(
nullptr);
405 if (!CurrCycleInstr) {
406 EmittedInstrs.push_front(
nullptr);
416 if (!NumWaitStates) {
417 CurrCycleInstr =
nullptr;
422 EmittedInstrs.push_front(CurrCycleInstr);
429 EmittedInstrs.push_front(
nullptr);
437 CurrCycleInstr =
nullptr;
441 llvm_unreachable(
"hazard recognizer does not support bottom-up scheduling.");
454template <
typename StateT>
467 switch (IsHazard(State, *
I)) {
477 if (
I->isInlineAsm() ||
I->isMetaInstruction())
480 UpdateState(State, *
I);
484 if (!Visited.
insert(Pred).second)
487 if (
hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
511 if (
I->isInlineAsm())
514 WaitStates += GetNumWaitStates(*
I);
516 if (IsExpired(*
I, WaitStates))
517 return std::numeric_limits<int>::max();
520 int MinWaitStates = std::numeric_limits<int>::max();
522 if (!Visited.
insert(Pred).second)
526 IsExpired, Visited, GetNumWaitStates);
528 MinWaitStates = std::min(MinWaitStates, W);
531 return MinWaitStates;
538 std::next(
MI->getReverseIterator()),
539 0, IsExpired, Visited);
542int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard,
int Limit) {
543 if (IsHazardRecognizerMode) {
545 return WaitStates >= Limit;
547 return ::getWaitStatesSince(IsHazard, CurrCycleInstr,
IsExpiredFn);
556 if (
MI->isInlineAsm())
561 if (WaitStates >= Limit)
564 return std::numeric_limits<int>::max();
567int GCNHazardRecognizer::getWaitStatesSinceDef(
unsigned Reg,
568 IsHazardFn IsHazardDef,
573 return IsHazardDef(
MI) &&
MI.modifiesRegister(Reg, TRI);
579int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
619int GCNHazardRecognizer::checkSoftClauseHazards(
MachineInstr *MEM) {
625 bool IsSMRD = TII.
isSMRD(*MEM);
651 if (ClauseDefs.
none())
664 return ClauseDefs.
anyCommon(ClauseUses) ? 1 : 0;
667int GCNHazardRecognizer::checkSMRDHazards(
MachineInstr *SMRD) {
668 int WaitStatesNeeded = 0;
670 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
674 return WaitStatesNeeded;
678 int SmrdSgprWaitStates = 4;
691 int WaitStatesNeededForUse =
692 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
694 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
704 int WaitStatesNeededForUse =
705 SmrdSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(),
708 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
712 return WaitStatesNeeded;
715int GCNHazardRecognizer::checkVMEMHazards(
MachineInstr* VMEM) {
719 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
723 const int VmemSgprWaitStates = 5;
731 int WaitStatesNeededForUse =
732 VmemSgprWaitStates - getWaitStatesSinceDef(
Use.getReg(), IsHazardDefFn,
734 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
736 return WaitStatesNeeded;
744 int DppVgprWaitStates = 2;
745 int DppExecWaitStates = 5;
746 int WaitStatesNeeded = 0;
748 return TII->isVALU(
MI);
754 int WaitStatesNeededForUse =
755 DppVgprWaitStates - getWaitStatesSinceDef(
759 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
762 WaitStatesNeeded = std::max(
764 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
767 return WaitStatesNeeded;
770int GCNHazardRecognizer::checkDivFMasHazards(
MachineInstr *DivFMas) {
775 const int DivFMasWaitStates = 4;
777 return TII->isVALU(
MI);
779 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
782 return DivFMasWaitStates - WaitStatesNeeded;
785int GCNHazardRecognizer::checkGetRegHazards(
MachineInstr *GetRegInstr) {
787 unsigned GetRegHWReg =
getHWReg(TII, *GetRegInstr);
789 const int GetRegWaitStates = 2;
793 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, GetRegWaitStates);
795 return GetRegWaitStates - WaitStatesNeeded;
798int GCNHazardRecognizer::checkSetRegHazards(
MachineInstr *SetRegInstr) {
800 unsigned HWReg =
getHWReg(TII, *SetRegInstr);
806 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, SetRegWaitStates);
807 return SetRegWaitStates - WaitStatesNeeded;
810int GCNHazardRecognizer::createsVALUHazard(
const MachineInstr &
MI) {
815 unsigned Opcode =
MI.getOpcode();
821 VDataRCID =
Desc.operands()[VDataIdx].RegClass;
831 TII->getNamedOperand(
MI, AMDGPU::OpName::soffset);
835 (!SOffset || !SOffset->
isReg()))
843 if (
TII->isMIMG(
MI)) {
850 if (
TII->isFLAT(
MI)) {
860GCNHazardRecognizer::checkVALUHazardsHelper(
const MachineOperand &Def,
867 int WaitStatesNeeded = 0;
869 if (!
TRI->isVectorRegister(
MRI,
Def.getReg()))
870 return WaitStatesNeeded;
873 int DataIdx = createsVALUHazard(
MI);
874 return DataIdx >= 0 &&
875 TRI->regsOverlap(
MI.getOperand(DataIdx).getReg(), Reg);
877 int WaitStatesNeededForDef =
878 VALUWaitStates - getWaitStatesSince(
IsHazardFn, VALUWaitStates);
879 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
881 return WaitStatesNeeded;
884int GCNHazardRecognizer::checkVALUHazards(
MachineInstr *VALU) {
885 int WaitStatesNeeded = 0;
888 const int TransDefWaitstates = 1;
898 if (
Use.isReg() &&
TRI->regsOverlap(Def,
Use.getReg()))
905 int WaitStatesNeededForDef =
907 getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
908 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
912 const int Shift16DefWaitstates = 1;
919 if (
auto *DstSel =
TII->getNamedOperand(
MI, AMDGPU::OpName::dst_sel))
924 !(
TII->getNamedOperand(
MI, AMDGPU::OpName::src0_modifiers)
930 if (
auto *Dst =
TII->getNamedOperand(
MI, AMDGPU::OpName::vdst)) {
934 if (
Use.isReg() &&
TRI->regsOverlap(Def,
Use.getReg()))
942 int WaitStatesNeededForDef =
943 Shift16DefWaitstates -
944 getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
945 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
949 const int VALUWriteSGPRVALUReadWaitstates = 2;
950 const int VALUWriteEXECRWLane = 4;
951 const int VALUWriteVGPRReadlaneRead = 1;
959 return MI.modifiesRegister(
UseReg, TRI);
968 int WaitStatesNeededForDef =
969 VALUWriteSGPRVALUReadWaitstates -
970 getWaitStatesSince(IsVALUDefSGPRFn,
971 VALUWriteSGPRVALUReadWaitstates);
972 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
976 if (
VALU->readsRegister(AMDGPU::VCC, TRI)) {
978 int WaitStatesNeededForDef =
979 VALUWriteSGPRVALUReadWaitstates -
980 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
981 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
984 switch (
VALU->getOpcode()) {
985 case AMDGPU::V_READLANE_B32:
986 case AMDGPU::V_READFIRSTLANE_B32: {
989 int WaitStatesNeededForDef =
990 VALUWriteVGPRReadlaneRead -
991 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
992 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
995 case AMDGPU::V_WRITELANE_B32: {
997 int WaitStatesNeededForDef =
998 VALUWriteEXECRWLane -
999 getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
1000 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1011 return WaitStatesNeeded;
1016 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def,
MRI));
1019 return WaitStatesNeeded;
1022int GCNHazardRecognizer::checkInlineAsmHazards(
MachineInstr *IA) {
1035 int WaitStatesNeeded = 0;
1039 if (
Op.isReg() &&
Op.isDef()) {
1041 std::max(WaitStatesNeeded, checkVALUHazardsHelper(
Op,
MRI));
1045 return WaitStatesNeeded;
1048int GCNHazardRecognizer::checkRWLaneHazards(
MachineInstr *RWLane) {
1054 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1062 const int RWLaneWaitStates = 4;
1063 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg,
IsHazardFn,
1065 return RWLaneWaitStates - WaitStatesSince;
1068int GCNHazardRecognizer::checkRFEHazards(
MachineInstr *RFE) {
1074 const int RFEWaitStates = 1;
1079 int WaitStatesNeeded = getWaitStatesSinceSetReg(
IsHazardFn, RFEWaitStates);
1080 return RFEWaitStates - WaitStatesNeeded;
1085 const int ReadM0WaitStates = 1;
1087 return ReadM0WaitStates -
1088 getWaitStatesSinceDef(AMDGPU::M0,
IsHazardFn, ReadM0WaitStates);
1092 fixVMEMtoScalarWriteHazards(
MI);
1093 fixVcmpxPermlaneHazards(
MI);
1094 fixSMEMtoVectorWriteHazards(
MI);
1095 fixVcmpxExecWARHazard(
MI);
1096 fixLdsBranchVmemWARHazard(
MI);
1098 fixLdsDirectVALUHazard(
MI);
1099 fixLdsDirectVMEMHazard(
MI);
1101 fixVALUPartialForwardingHazard(
MI);
1102 fixVALUTransUseHazard(
MI);
1104 fixShift64HighRegBug(
MI);
1105 fixVALUMaskWriteHazard(
MI);
1108bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(
MachineInstr *
MI) {
1115 return (
TII->isVOPC(
MI) ||
1116 ((
TII->isVOP3(
MI) ||
TII->isSDWA(
MI)) &&
MI.isCompare())) &&
1117 MI.modifiesRegister(AMDGPU::EXEC, TRI);
1121 unsigned Opc =
MI.getOpcode();
1123 Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1127 std::numeric_limits<int>::max())
1133 auto *Src0 =
TII->getNamedOperand(*
MI, AMDGPU::OpName::src0);
1135 bool IsUndef = Src0->isUndef();
1137 TII->get(AMDGPU::V_MOV_B32_e32))
1144bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(
MachineInstr *
MI) {
1152 if (
MI->getNumDefs() == 0)
1164 I.findRegisterUseOperand(
Def.getReg(),
false,
TRI);
1174 (
MI.getOpcode() == AMDGPU::S_WAITCNT &&
1175 !
MI.getOperand(0).getImm()) ||
1176 (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1181 std::numeric_limits<int>::max())
1186 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1191bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(
MachineInstr *
MI) {
1200 switch (
MI->getOpcode()) {
1201 case AMDGPU::V_READLANE_B32:
1202 case AMDGPU::V_READFIRSTLANE_B32:
1203 SDSTName = AMDGPU::OpName::vdst;
1206 SDSTName = AMDGPU::OpName::sdst;
1215 for (
const auto &MO :
MI->implicit_operands()) {
1216 if (MO.isDef() &&
TRI->isSGPRClass(
TRI->getPhysRegBaseClass(MO.getReg()))) {
1232 if (
TII->isSALU(
MI)) {
1233 switch (
MI.getOpcode()) {
1234 case AMDGPU::S_SETVSKIP:
1235 case AMDGPU::S_VERSION:
1236 case AMDGPU::S_WAITCNT_VSCNT:
1237 case AMDGPU::S_WAITCNT_VMCNT:
1238 case AMDGPU::S_WAITCNT_EXPCNT:
1241 case AMDGPU::S_WAITCNT_LGKMCNT:
1243 return (
MI.getOperand(1).getImm() == 0) &&
1244 (
MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1245 case AMDGPU::S_WAITCNT: {
1246 const int64_t
Imm =
MI.getOperand(0).getImm();
1249 return (Decoded.
DsCnt == 0);
1253 if (
TII->isSOPP(
MI))
1269 std::numeric_limits<int>::max())
1273 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1278bool GCNHazardRecognizer::fixVcmpxExecWARHazard(
MachineInstr *
MI) {
1287 if (!
MI->modifiesRegister(AMDGPU::EXEC, TRI))
1293 return I.readsRegister(AMDGPU::EXEC, TRI);
1299 if (
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst))
1301 for (
auto MO :
MI.implicit_operands())
1302 if (MO.isDef() &&
TRI->isSGPRClass(
TRI->getPhysRegBaseClass(MO.getReg())))
1305 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1312 std::numeric_limits<int>::max())
1316 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1323 if (!ST.hasLdsBranchVmemWARHazard())
1328 bool HasLds =
false;
1329 bool HasVmem =
false;
1330 for (
auto &
MBB : MF) {
1331 for (
auto &
MI :
MBB) {
1335 if (HasLds && HasVmem)
1343 return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1344 I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1345 !
I.getOperand(1).getImm();
1348bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(
MachineInstr *
MI) {
1349 if (!RunLdsBranchVmemWARHazardFixup)
1363 auto InstType = IsHazardInst(*
MI);
1376 auto InstType2 = IsHazardInst(
I);
1377 return InstType2 && InstType != InstType2;
1381 auto InstType2 = IsHazardInst(
I);
1382 if (InstType == InstType2)
1389 std::numeric_limits<int>::max();
1393 std::numeric_limits<int>::max())
1398 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1405bool GCNHazardRecognizer::fixLdsDirectVALUHazard(
MachineInstr *
MI) {
1409 const int NoHazardWaitStates = 15;
1413 bool VisitedTrans =
false;
1419 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1422 if (WaitStates >= NoHazardWaitStates)
1433 auto Count = ::getWaitStatesSince(
IsHazardFn,
MI->getParent(),
1434 std::next(
MI->getReverseIterator()), 0,
1443 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvdst);
1444 WaitVdstOp->
setImm(std::min(Count, NoHazardWaitStates));
1449bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(
MachineInstr *
MI) {
1460 return I.readsRegister(VDSTReg, &TRI) ||
I.modifiesRegister(VDSTReg, &TRI);
1467 (
I.getOpcode() == AMDGPU::S_WAITCNT && !
I.getOperand(0).getImm()) ||
1468 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1471 !
TII.getNamedOperand(
I, AMDGPU::OpName::waitvsrc)->getImm());
1475 std::numeric_limits<int>::max())
1478 if (LdsdirCanWait) {
1479 TII.getNamedOperand(*
MI, AMDGPU::OpName::waitvsrc)->setImm(0);
1482 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1489bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(
MachineInstr *
MI) {
1505 if (SrcVGPRs.
size() <= 1)
1523 const int Intv1plus2MaxVALUs = 2;
1524 const int Intv3MaxVALUs = 4;
1525 const int IntvMaxVALUs = 6;
1526 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1530 int ExecPos = std::numeric_limits<int>::max();
1539 if (State.VALUs > NoHazardVALUWaitStates)
1545 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1550 bool Changed =
false;
1553 if (!State.DefPos.count(Src) &&
I.modifiesRegister(Src, &TRI)) {
1554 State.DefPos[Src] = State.VALUs;
1559 if (State.ExecPos == std::numeric_limits<int>::max()) {
1560 if (!State.DefPos.empty() &&
I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1561 State.ExecPos = State.VALUs;
1568 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1576 if (State.ExecPos == std::numeric_limits<int>::max())
1579 int PreExecPos = std::numeric_limits<int>::max();
1580 int PostExecPos = std::numeric_limits<int>::max();
1582 for (
auto Entry : State.DefPos) {
1583 int DefVALUs = Entry.second;
1584 if (DefVALUs != std::numeric_limits<int>::max()) {
1585 if (DefVALUs >= State.ExecPos)
1586 PreExecPos = std::min(PreExecPos, DefVALUs);
1588 PostExecPos = std::min(PostExecPos, DefVALUs);
1593 if (PostExecPos == std::numeric_limits<int>::max())
1597 int Intv3VALUs = PostExecPos;
1598 if (Intv3VALUs > Intv3MaxVALUs)
1602 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1603 if (Intv2VALUs > Intv1plus2MaxVALUs)
1607 if (PreExecPos == std::numeric_limits<int>::max())
1611 int Intv1VALUs = PreExecPos - State.ExecPos;
1612 if (Intv1VALUs > Intv1plus2MaxVALUs)
1616 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1621 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1627 if (!hasHazard<StateType>(State,
IsHazardFn, UpdateStateFn,
MI->getParent(),
1628 std::next(
MI->getReverseIterator()), Visited))
1632 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1638bool GCNHazardRecognizer::fixVALUTransUseHazard(
MachineInstr *
MI) {
1663 const int IntvMaxVALUs = 5;
1664 const int IntvMaxTRANS = 1;
1676 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1682 (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1683 I.getOperand(0).getImm() == 0x0fff))
1689 if (
I.modifiesRegister(Src, &TRI)) {
1697 auto UpdateStateFn = [](StateType &State,
const MachineInstr &
MI) {
1705 if (!hasHazard<StateType>(State,
IsHazardFn, UpdateStateFn,
MI->getParent(),
1706 std::next(
MI->getReverseIterator()), Visited))
1712 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1732 TII->getNamedOperand(*
MI, AMDGPU::OpName::src0)->getReg();
1734 TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
1737 TII->getNamedOperand(
I, AMDGPU::OpName::vdst)->getReg();
1739 if (
TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1740 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1749 TII->getNamedOperand(*
MI, AMDGPU::OpName::src2)->getReg();
1750 if (
TRI->regsOverlap(PrevDstReg, CurIndex))
1764 std::numeric_limits<int>::max())
1767 BuildMI(*
MI->getParent(),
MI,
MI->getDebugLoc(),
TII->get(AMDGPU::V_NOP_e32));
1772bool GCNHazardRecognizer::fixShift64HighRegBug(
MachineInstr *
MI) {
1777 switch (
MI->getOpcode()) {
1780 case AMDGPU::V_LSHLREV_B64_e64:
1781 case AMDGPU::V_LSHRREV_B64_e64:
1782 case AMDGPU::V_ASHRREV_I64_e64:
1793 if (!
TRI.isVGPR(
MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1796 if (AmtReg != AMDGPU::VGPR255 &&
MRI.isPhysRegUsed(AmtReg + 1))
1800 bool OverlappedSrc = Src1->
isReg() &&
TRI.regsOverlap(Src1->
getReg(), AmtReg);
1801 bool OverlappedDst =
MI->modifiesRegister(AmtReg, &TRI);
1802 bool Overlapped = OverlappedSrc || OverlappedDst;
1804 assert(!OverlappedDst || !OverlappedSrc ||
1805 Src1->
getReg() ==
MI->getOperand(0).getReg());
1807 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1810 for (
MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1811 : AMDGPU::VGPR_32RegClass) {
1812 if (!
MI->modifiesRegister(Reg, &TRI) && !
MI->readsRegister(Reg, &TRI)) {
1823 NewAmtLo =
TRI.getSubReg(NewReg, AMDGPU::sub0);
1865 MI->getOperand(0).setReg(NewReg);
1866 if (OverlappedSrc) {
1876 int NSAtoVMEMWaitStates = 1;
1885 const auto *
Offset =
TII->getNamedOperand(*
MI, AMDGPU::OpName::offset);
1893 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1894 TII->getInstSizeInBytes(
I) >= 16;
1897 return NSAtoVMEMWaitStates - getWaitStatesSince(
IsHazardFn, 1);
1900int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(
MachineInstr *
MI) {
1901 int FPAtomicToDenormModeWaitStates = 3;
1907 if (
MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1920 switch (
MI.getOpcode()) {
1921 case AMDGPU::S_WAITCNT:
1922 case AMDGPU::S_WAITCNT_VSCNT:
1923 case AMDGPU::S_WAITCNT_VMCNT:
1924 case AMDGPU::S_WAITCNT_EXPCNT:
1925 case AMDGPU::S_WAITCNT_LGKMCNT:
1926 case AMDGPU::S_WAIT_IDLE:
1935 return FPAtomicToDenormModeWaitStates -
1954 int NeighborMFMALatency = 0;
1955 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1960 NeighborMFMALatency = this->getMFMAPipelineWaitStates(
MI);
1964 const int MaxMFMAPipelineWaitStates = 16;
1965 int WaitStatesSinceNeighborMFMA =
1966 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1968 int NeighborMFMAPaddingNeeded =
1970 WaitStatesSinceNeighborMFMA;
1972 return std::max(0, NeighborMFMAPaddingNeeded);
1976 int WaitStatesNeeded = 0;
1977 unsigned Opc =
MI->getOpcode();
1983 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
1984 const int LegacyVALUWritesVGPRWaitStates = 2;
1985 const int VALUWritesExecWaitStates = 4;
1986 const int MaxWaitStates = 4;
1988 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1989 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1990 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1992 if (WaitStatesNeeded < MaxWaitStates) {
1994 const int MaxWaitStates = 2;
1999 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
2000 getWaitStatesSinceDef(
Use.getReg(), IsVALUFn, MaxWaitStates);
2001 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2003 if (WaitStatesNeeded == MaxWaitStates)
2013 if (
Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2016 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2017 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2018 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2019 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2020 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2021 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2022 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2023 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2024 const int MaxWaitStates = 18;
2026 unsigned HazardDefLatency = 0;
2028 auto IsOverlappedMFMAFn = [
Reg, &HazardDefLatency,
2036 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2037 return TRI.regsOverlap(DstReg, Reg);
2040 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2042 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2044 int OpNo =
Op.getOperandNo();
2045 if (OpNo == SrcCIdx) {
2046 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2047 }
else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2048 switch (HazardDefLatency) {
2049 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2051 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2053 case 16: [[fallthrough]];
2054 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2057 }
else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2058 switch (HazardDefLatency) {
2059 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2061 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2063 case 16: [[fallthrough]];
2064 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2069 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2070 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2072 if (WaitStatesNeeded == MaxWaitStates)
2073 return WaitStatesNeeded;
2076 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2079 return TRI.regsOverlap(Reg, DstReg);
2082 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2083 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2084 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2085 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2086 if (OpNo == SrcCIdx)
2087 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2088 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2089 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2091 WaitStatesNeededForUse = NeedWaitStates -
2092 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2093 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2095 if (WaitStatesNeeded == MaxWaitStates)
2096 return WaitStatesNeeded;
2099 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2100 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2101 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2102 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2103 const int MaxWaitStates = 13;
2104 Register DstReg =
MI->getOperand(0).getReg();
2105 unsigned HazardDefLatency = 0;
2107 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2113 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&
MI));
2114 return TRI.regsOverlap(Reg, DstReg);
2117 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2119 switch (HazardDefLatency) {
2120 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2122 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2124 case 16: [[fallthrough]];
2125 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2129 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2130 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2134 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(
MI));
2136 return WaitStatesNeeded;
2145 return NumPasses + 1;
2163 return NumPasses + 2;
2171 return NumPasses + 3;
2175 int WaitStatesNeeded = 0;
2176 unsigned Opc =
MI->getOpcode();
2188 return WaitStatesNeeded;
2190 const int VALUWritesExecWaitStates = 4;
2191 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2192 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2193 VALUWritesExecWaitStates);
2194 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2200 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2201 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2202 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2203 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2204 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2205 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2206 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2207 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2208 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2209 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2210 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2211 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2212 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2213 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2214 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2215 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2216 const int MaxWaitStates = 19;
2224 auto IsOverlappedMFMAFn = [
Reg, &FullReg, &MI1,
2229 FullReg = (DstReg ==
Reg);
2231 return TRI.regsOverlap(DstReg, Reg);
2234 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2235 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2236 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2239 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2240 if (NumWaitStates == std::numeric_limits<int>::max())
2245 int NeedWaitStates = 0;
2246 if (OpNo == SrcCIdx) {
2249 }
else if (FullReg) {
2250 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2251 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2252 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2253 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2254 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2256 TSchedModel.computeInstrLatency(MI1) == 2)
2257 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2260 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2261 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2262 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2263 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2265 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2267 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2268 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2270 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2273 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2287 switch (NumPasses) {
2290 isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2291 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2296 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2297 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2302 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2303 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2312 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2313 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2314 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2315 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2316 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2318 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2319 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2320 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2323 int NumPasses = TSchedModel.computeInstrLatency(MI1);
2335 switch (NumPasses) {
2337 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2342 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2344 case 16: [[fallthrough]];
2346 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2350 if (WaitStatesNeeded >= NeedWaitStates)
2353 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2354 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2356 if (WaitStatesNeeded == MaxWaitStates)
2360 return WaitStatesNeeded;
2368 int WaitStatesNeeded = 0;
2371 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2380 const int AccVgprReadLdStWaitStates = 2;
2381 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2382 const int MaxWaitStates = 2;
2384 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2385 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2386 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2388 if (WaitStatesNeeded == MaxWaitStates)
2389 return WaitStatesNeeded;
2392 if (
MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2393 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2398 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 ) <
2399 std::numeric_limits<int>::max();
2402 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2403 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2404 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2407 return WaitStatesNeeded;
2415 return NumPasses + 2;
2423 return NumPasses + 3;
2431 return NumPasses + 3;
2439 return NumPasses + 2;
2456 int WaitStatesNeeded = 0;
2468 !
TRI.regsOverlap(
MI.getOperand(0).getReg(), Reg))
2477 !
TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2483 bool DGEMMAfterVALUWrite =
false;
2484 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
2487 DGEMMAfterVALUWrite =
true;
2491 if (!
TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
2498 AMDGPU::OpName::src2);
2500 if (IsMemOrExport || IsVALU) {
2501 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2502 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2503 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2504 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2505 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2506 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2507 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2508 const int DotWriteSameDotReadSrcAB = 3;
2509 const int DotWriteDifferentVALURead = 3;
2510 const int DMFMABetweenVALUWriteVMEMRead = 2;
2511 const int MaxWaitStates = 19;
2519 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2522 int NeedWaitStates = 0;
2523 if (
DOT->getOpcode() ==
MI->getOpcode()) {
2524 if (&
Use - &
MI->getOperand(0) != SrcCIdx)
2525 NeedWaitStates = DotWriteSameDotReadSrcAB;
2527 NeedWaitStates = DotWriteDifferentVALURead;
2530 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2531 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2539 DGEMMAfterVALUWrite =
false;
2540 if (
TRI.isVectorRegister(
MRI, Reg)) {
2541 int WaitStatesNeededForUse =
2542 DMFMABetweenVALUWriteVMEMRead -
2543 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2544 DMFMABetweenVALUWriteVMEMRead);
2546 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2551 WaitStatesSinceDef =
2552 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2556 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2557 int NumPasses = HazardDefLatency;
2558 int NeedWaitStates = MaxWaitStates;
2560 if (
isDGEMM(MFMA->getOpcode())) {
2561 switch (HazardDefLatency) {
2563 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2564 : DMFMA4x4WriteVgprVALUReadWaitStates;
2568 NeedWaitStates = IsMemOrExport
2569 ? DMFMA16x16WriteVgprMemExpReadWaitStates
2570 : DMFMA16x16WriteVgprVALUReadWaitStates;
2582 switch (HazardDefLatency) {
2584 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2587 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2590 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2597 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2598 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2600 if (WaitStatesNeeded == MaxWaitStates)
2605 unsigned Opc =
MI->getOpcode();
2606 const int DMFMAToFMA64WaitStates = 2;
2607 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2608 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2609 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2610 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2611 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2612 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2613 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2616 if (!IsVALU && !IsMemOrExport)
2617 return WaitStatesNeeded;
2620 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2621 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2622 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2623 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2624 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2625 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2626 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2627 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2628 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2629 const int DotWriteDifferentVALUWrite = 3;
2630 const int MaxWaitStates = 19;
2631 const int MaxWarWaitStates = 15;
2636 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2638 if (DOT &&
DOT->getOpcode() !=
MI->getOpcode())
2639 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2640 WaitStatesSinceDef);
2643 WaitStatesSinceDef =
2644 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2646 int NeedWaitStates = MaxWaitStates;
2647 int NumPasses = TSchedModel.computeInstrLatency(MFMA);
2649 if (
isDGEMM(MFMA->getOpcode())) {
2650 switch (NumPasses) {
2652 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
2656 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
2667 switch (NumPasses) {
2669 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
2672 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
2675 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
2682 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2683 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2685 if (WaitStatesNeeded == MaxWaitStates)
2691 !
MI.readsRegister(Reg, &TRI))
2698 TII.getNamedOperand(
MI, AMDGPU::OpName::src2);
2708 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2713 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2714 int NeedWaitStates = MaxWaitStates;
2715 switch (HazardDefLatency) {
2716 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2719 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2721 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2723 case 16: [[fallthrough]];
2724 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2728 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2729 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2732 return WaitStatesNeeded;
2745 return MAI !=
nullptr;
2749 if (IsMFMAFn(*
MI)) {
2750 int W = getWaitStatesSince(IsMFMAFn, 16);
2752 return W < (int)TSchedModel.computeInstrLatency(MAI);
2758bool GCNHazardRecognizer::fixVALUMaskWriteHazard(
MachineInstr *
MI) {
2775 if (!SDSTOp || !SDSTOp->
isReg())
2779 if (HazardReg == AMDGPU::EXEC ||
2780 HazardReg == AMDGPU::EXEC_LO ||
2781 HazardReg == AMDGPU::EXEC_HI ||
2782 HazardReg == AMDGPU::M0)
2786 switch (
I.getOpcode()) {
2787 case AMDGPU::V_ADDC_U32_e32:
2788 case AMDGPU::V_ADDC_U32_dpp:
2789 case AMDGPU::V_CNDMASK_B16_e32:
2790 case AMDGPU::V_CNDMASK_B16_dpp:
2791 case AMDGPU::V_CNDMASK_B32_e32:
2792 case AMDGPU::V_CNDMASK_B32_dpp:
2793 case AMDGPU::V_DIV_FMAS_F32_e64:
2794 case AMDGPU::V_DIV_FMAS_F64_e64:
2795 case AMDGPU::V_SUBB_U32_e32:
2796 case AMDGPU::V_SUBB_U32_dpp:
2797 case AMDGPU::V_SUBBREV_U32_e32:
2798 case AMDGPU::V_SUBBREV_U32_dpp:
2800 return HazardReg == AMDGPU::VCC ||
2801 HazardReg == AMDGPU::VCC_LO ||
2802 HazardReg == AMDGPU::VCC_HI;
2803 case AMDGPU::V_ADDC_U32_e64:
2804 case AMDGPU::V_ADDC_U32_e64_dpp:
2805 case AMDGPU::V_CNDMASK_B16_e64:
2806 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2807 case AMDGPU::V_CNDMASK_B32_e64:
2808 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2809 case AMDGPU::V_SUBB_U32_e64:
2810 case AMDGPU::V_SUBB_U32_e64_dpp:
2811 case AMDGPU::V_SUBBREV_U32_e64:
2812 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2816 return TRI.regsOverlap(SSRCOp->
getReg(), HazardReg);
2826 if (
I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2835 for (
int OpNo = 0,
End =
I.getNumOperands(); OpNo <
End; ++OpNo) {
2843 if (OpReg == AMDGPU::EXEC ||
2844 OpReg == AMDGPU::EXEC_LO ||
2845 OpReg == AMDGPU::EXEC_HI)
2848 if (
Op.isImplicit()) {
2849 if (OpReg == AMDGPU::VCC ||
2850 OpReg == AMDGPU::VCC_LO ||
2851 OpReg == AMDGPU::VCC_HI)
2855 if (
TRI.isSGPRReg(
MRI, OpReg))
2860 if (!
TII.isInlineConstant(
Op, OpInfo))
2869 std::numeric_limits<int>::max())
2872 auto NextMI = std::next(
MI->getIterator());
2875 BuildMI(*
MI->getParent(), NextMI,
MI->getDebugLoc(),
2876 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2880 if (
MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2882 while (NextMI !=
MI->getParent()->end() &&
2883 NextMI->isBundledWithPred()) {
2884 for (
auto &Operand : NextMI->operands()) {
2885 if (Operand.isGlobal())
2886 Operand.setOffset(Operand.getOffset() + 4);
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
Analysis containing CSE Info
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static bool isSMovRel(unsigned Opcode)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
function_ref< unsigned int(const MachineInstr &)> GetNumWaitStatesFn
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
function_ref< bool(const MachineInstr &, int WaitStates)> IsExpiredFn
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
bool none() const
none - Returns true if none of the bits are set.
This class represents an Operation in the Expression.
Implements a dense probed hash-table based set.
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
Describe properties that are true of each instruction in the target description file.
ArrayRef< MCOperandInfo > operands() const
This holds information about one operand of a machine instruction, indicating the register class for ...
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
iterator_range< pred_iterator > predecessors()
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
static bool isMAI(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
StringRef - Represent a constant reference to a string, i.e.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relatively to the cycle in which the ...