struct MFMAPaddingRatioParser : public cl::parser<unsigned> {

    return O.error("'" + Arg + "' value invalid for uint argument!");

    return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    cl::desc("Fill a percentage of the latency between "
             "neighboring MFMA with s_nops."));

    cl::desc("Maximum function size for exhaustive hazard search"));

    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      UseVALUReadHazardExhaustiveSearch(false),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  EmittedInstrs.clear();

  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;

  return Opcode == AMDGPU::S_GETREG_B32;

  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:

  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;

  return Opcode == AMDGPU::S_RFE_B64;

  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:

  unsigned Opcode = MI.getOpcode();

      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)

  if (!ST.hasGFX940Insts())

  if (TII.isAlwaysGDS(MI.getOpcode()))

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:

  if (TII.isDS(MI.getOpcode())) {
                                        AMDGPU::OpName::gds);
    if (MI.getOperand(GDS).getImm())

  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;

                                  AMDGPU::OpName::simm16);

  if (checkFPAtomicToDenormModeHazard(MI) > 0)

      && checkVMEMHazards(MI) > 0)

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)

        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)

  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
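// processBundle() below walks the instructions inside a bundle, runs the fix*
// hazard workarounds on each one when IsHazardRecognizerMode is set, and
// pushes entries onto EmittedInstrs so the scoreboard reflects the bundle.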
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {

void GCNHazardRecognizer::processBundle() {
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;

  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle())

  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;

    return std::max(WaitStates, checkSMRDHazards(MI));

    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

    WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

      WaitStates = std::max(WaitStates, checkVALUHazards(MI));

      WaitStates = std::max(WaitStates, checkDPPHazards(MI));

      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

      WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

      WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

    if (MI->isInlineAsm())
      return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));

    return std::max(WaitStates, checkSetRegHazards(MI));

    return std::max(WaitStates, checkRFEHazards(MI));

        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));

    return std::max(WaitStates, checkMAILdStHazards(MI));

    return std::max(WaitStates, checkPermlaneHazards(MI));

  EmittedInstrs.push_front(nullptr);

  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;

  EmittedInstrs.push_front(CurrCycleInstr);

    EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;

  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");

template <typename StateT>

    switch (IsHazard(State, *I)) {

    if (I->isInlineAsm() || I->isMetaInstruction())

    UpdateState(State, *I);

    if (!Visited.insert(Pred).second)

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),

    if (I->isInlineAsm())

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();

  int MinWaitStates = std::numeric_limits<int>::max();
    if (!Visited.insert(Pred).second)
                                IsExpired, Visited, GetNumWaitStates);
    MinWaitStates = std::min(MinWaitStates, W);

  return MinWaitStates;

                              std::next(MI->getReverseIterator()),
                              0, IsExpired, Visited);
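// The member getWaitStatesSince() below uses the recursive CFG walk
// (::getWaitStatesSince above) when running in hazard-recognizer mode, and
// otherwise scans the EmittedInstrs scoreboard, giving up once Limit is hit.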
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
      return WaitStates >= Limit;
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);

      if (MI->isInlineAsm())
    if (WaitStates >= Limit)
  return std::numeric_limits<int>::max();

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);

  if (ClauseDefs.none())

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;

  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
    return TII->isVALU(MI);

    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,

  return WaitStatesNeeded;

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const int DivFMasWaitStates = 4;
    return TII->isVALU(MI);
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,

  return DivFMasWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
  const int GetRegWaitStates = 2;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

    VDataRCID = Desc.operands()[VDataIdx].RegClass;

        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);

        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {

  if (TII->isFLAT(MI)) {

GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;

  unsigned Opcode = MI.getOpcode();

    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

    const int TransDefWaitstates = 1;

        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

    int WaitStatesNeededForDef =
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int Shift16DefWaitstates = 1;

      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

      return MI.modifiesRegister(UseReg, TRI);

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    case AMDGPU::V_WRITELANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    return WaitStatesNeeded;

    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  int WaitStatesNeeded = 0;

    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))

          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));

    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);
      if (ProducerMI.isInlineAsm()) {
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,

  return RWLaneWaitStates - WaitStatesSince;

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;

  const int ReadM0WaitStates = 1;
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
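// fixHazards() dispatches to the fix* routines below; each routine rewrites,
// pads, or annotates the code around MI when its specific hazard applies and
// returns without making changes otherwise.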
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);

  fixLdsDirectVALUHazard(MI);
  fixLdsDirectVMEMHazard(MI);

  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);

  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixVALUReadSGPRHazard(MI);
  fixRequiredExportPriority(MI);

  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

      std::numeric_limits<int>::max())

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();
          TII->get(AMDGPU::V_MOV_B32_e32))
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (MI->getNumDefs() == 0)

          I.findRegisterUseOperand(Def.getReg(), TRI, false);

           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    SDSTName = AMDGPU::OpName::sdst;

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
      case AMDGPU::S_WAITCNT_LGKMCNT:
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        return (Decoded.DsCnt == 0);

    if (TII->isSOPP(MI))

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

    return I.readsRegister(AMDGPU::EXEC, TRI);

    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
    for (auto MO : MI.implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))

    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))

  if (!ST.hasLdsBranchVmemWARHazard())

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)

    return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
           I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
           !I.getOperand(1).getImm();

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)

  auto InstType = IsHazardInst(*MI);

    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;

    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)

        std::numeric_limits<int>::max();

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  const int NoHazardWaitStates = 15;

  bool VisitedTrans = false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

    if (WaitStates >= NoHazardWaitStates)

  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,

      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());

      std::numeric_limits<int>::max())

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
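// fixVALUPartialForwardingHazard() below uses hasHazard() to walk backwards
// for writes to this VALU's source VGPRs on both sides of an EXEC write,
// bounded by the small VALU-count windows (Intv1plus2MaxVALUs, Intv3MaxVALUs);
// when the pattern is found an s_waitcnt_depctr is inserted.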
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

    int ExecPos = std::numeric_limits<int>::max();

    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;

    bool Changed = false;
      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;

    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;

    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

      return NoHazardFound;

    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
          PostExecPos = std::min(PostExecPos, DefVALUs);

    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

      if (I.modifiesRegister(Src, &TRI)) {

    return NoHazardFound;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))

      TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

        TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))

      std::numeric_limits<int>::max())

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
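// fixShift64HighRegBug(): 64-bit shifts misbehave when the shift-amount VGPR
// has register index 7 modulo 8. The code below finds an unused VGPR (or an
// aligned VGPR pair when the amount overlaps the shift's other operands) and
// rewrites the instruction to use it.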
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))

  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());

  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {

  int NSAtoVMEMWaitStates = 1;

  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:

  return FPAtomicToDenormModeWaitStates -

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
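// The MAI hazard checks below select NeedWaitStates from the producer MFMA's
// latency class (2, 8, or 16 passes) and from whether the overlapping operand
// is SrcC or SrcA/B of the consuming instruction.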
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
        const int MaxWaitStates = 2;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;

    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;

    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
      case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
      case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      return TRI.regsOverlap(Reg, DstReg);

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;

    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;

    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    switch (HazardDefLatency) {
    case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  return NumPasses + 1 + IsGFX950;

  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);

  return NumPasses + 2;

  return NumPasses + 3;

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
      FullReg = (DstReg == Reg);
      return TRI.regsOverlap(DstReg, Reg);

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
        getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
            ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
            : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);
                NumPasses, ST.hasGFX950Insts())

        switch (NumPasses) {
            isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                         : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
            ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
            : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;

    if (WaitStatesNeeded >= NeedWaitStates)

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  int WaitStatesNeeded = 0;

    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
        getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;

         "this is a different vcmpx+permlane hazard");

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn, VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;

  return NumPasses + 2;

  return NumPasses + 3;

  return NumPasses + 3;

  return NumPasses + 2;
  int WaitStatesNeeded = 0;

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
        DGEMMAfterVALUWrite = true;

    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)

                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
          NeedWaitStates = DotWriteDifferentVALURead;

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      DGEMMAfterVALUWrite = false;
      if (TRI.isVectorRegister(MRI, Reg)) {
        int WaitStatesNeededForUse =
            DMFMABetweenVALUWriteVMEMRead -
            getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                  DMFMABetweenVALUWriteVMEMRead);

        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;

      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
                  ? DMFMA16x16WriteVgprMemExpReadWaitStates
                        ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
                        : DMFMA16x16WriteVgprVALUReadWaitStates);

        switch (HazardDefLatency) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

    unsigned Opc = MI->getOpcode();
    const int DMFMAToFMA64WaitStates = 2;
    if ((Opc == AMDGPU::V_FMA_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
         Opc == AMDGPU::V_FMAC_F64_dpp) &&
        WaitStatesNeeded < DMFMAToFMA64WaitStates) {
      int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
          getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;

        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)

          !MI.readsRegister(Reg, &TRI))

          TII.getNamedOperand(MI, AMDGPU::OpName::src2);

    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
        NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
    return MAI != nullptr;

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
      return W < (int)TSchedModel.computeInstrLatency(MAI);

  while (I->isBundledWithPred())

  if (I->getOpcode() != AMDGPU::S_GETPC_B64)

  const unsigned NewBytes = 4;
         "Unexpected instruction insertion in bundle");

  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
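// fixVALUMaskWriteHazard(): the backwards search below looks for a VALU that
// reads MI's sdst register as a carry/lane-mask input (the VCC-using and _e64
// forms listed in the switch); when one is found, an s_waitcnt_depctr is
// inserted after MI.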
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!SDSTOp || !SDSTOp->isReg())

  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      if (OpReg == AMDGPU::EXEC ||
          OpReg == AMDGPU::EXEC_LO ||
          OpReg == AMDGPU::EXEC_HI)

      if (Op.isImplicit()) {
        if (OpReg == AMDGPU::VCC ||
            OpReg == AMDGPU::VCC_LO ||
            OpReg == AMDGPU::VCC_HI)

      if (TRI.isSGPRReg(MRI, OpReg))

      if (!TII.isInlineConstant(Op, OpInfo))

      std::numeric_limits<int>::max())

  auto NextMI = std::next(MI->getIterator());

  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::SGPR_NULL:
  case AMDGPU::SGPR_NULL64:

  unsigned RegN = TRI.getEncodingValue(Reg);
  return (RegN >> 1) & 0x3f;
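// computeVALUHazardSGPRs(): precomputes, per SGPR pair (see sgprPairNumber
// above), which SGPRs can actually participate in the VALU-read-SGPR hazard in
// this function, so fixVALUReadSGPRHazard() can skip registers that never can.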
void GCNHazardRecognizer::computeVALUHazardSGPRs(MachineFunction *MMF) {
  if (!VALUReadHazardSGPRs.empty())

  UseVALUReadHazardExhaustiveSearch =

  bool UseVALUUseCache =
  VALUReadHazardSGPRs.resize(64, !UseVALUUseCache);
  if (!UseVALUUseCache)

  BitVector SALUWriteSGPRs(64), ReadSGPRs(64);

      if (!IsVALU && !IsSALU)

        if (Op.isImplicit() && !(Reg == AMDGPU::VCC_LO ||
                                 Reg == AMDGPU::VCC_HI || Reg == AMDGPU::VCC))
        if (!TRI.isSGPRReg(MRI, Reg))

        if (IsVALU && Op.isUse()) {
          if (InCycle || (ReadSGPRs[*RegN] && SALUWriteSGPRs[*RegN]))
            VALUReadHazardSGPRs.set(*RegN);
          ReadSGPRs.set(*RegN);
        } else if (IsSALU) {
            SALUWriteSGPRs.set(*RegN);
            ReadSGPRs.set(*RegN);
bool GCNHazardRecognizer::fixVALUReadSGPRHazard(MachineInstr *MI) {
  if (!(MIIsSALU || MIIsVALU))

        TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
    if (!SDSTOp || !SDSTOp->isReg())

    if (HazardReg == AMDGPU::EXEC || HazardReg == AMDGPU::EXEC_LO ||
        HazardReg == AMDGPU::EXEC_HI || HazardReg == AMDGPU::M0)

    auto NextMI = std::next(MI->getIterator());
    auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                         TII.get(AMDGPU::S_WAITCNT_DEPCTR))

  computeVALUHazardSGPRs(MI->getMF());

  if (VALUReadHazardSGPRs.none())

  const bool IsSetPC = (MI->isCall() || MI->isReturn()) &&
                       !(MI->getOpcode() == AMDGPU::S_ENDPGM ||
                         MI->getOpcode() == AMDGPU::S_ENDPGM_SAVED);

    if (MIIsSALU && Op.isImplicit())

    if (!TRI.isSGPRReg(MRI, OpReg))

    if (!VALUReadHazardSGPRs[*RegN])

  if (SGPRsUsed.empty())

    if (IsSetPC && I.getNumDefs() > 0)

        return I.modifiesRegister(Reg, &TRI);

    if (Count >= SALUExpiryCount)

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

  auto WaitStatesFn = [this, &SGPRsUsed](const MachineInstr &I) {

                     [this, &I](Register Reg) { return I.readsRegister(Reg, &TRI); }))

  int WaitStates = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                        std::next(MI->getReverseIterator()), 0,

  if (WaitStates >= SALUExpiryCount)

  if (UseVALUReadHazardExhaustiveSearch) {
      if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
      return Register(AMDGPU::SGPR0_SGPR1 + *RegN);

    auto SearchHazardFn = [this, hazardPair,
        return I.readsRegister(hazardPair(Reg), &TRI);

    auto SearchExpiredFn = [&](const MachineInstr &I, int Count) {

    if (::getWaitStatesSince(SearchHazardFn, MI, SearchExpiredFn) ==
        std::numeric_limits<int>::max())

  auto NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
  case AMDGPU::S_SETPRIO: {
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));

    if (!TII.isEXP(*MI))

  bool Changed = false;

  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    if (TII.isEXP(*NextMI))

    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)

    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;

      .addImm(PostExportPriority);

      .addReg(AMDGPU::SGPR_NULL)
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
static cl::opt< unsigned, false, MFMAPaddingRatioParser > MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden, cl::desc("Fill a percentage of the latency between " "neighboring MFMA with s_nops."))
static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF, const GCNSubtarget &ST)
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU, const MachineOperand *Dst, const SIRegisterInfo *TRI)
Checks whether the provided MI "consumes" the operand with a Dest sel fowarding issue Dst .
static bool isSGetReg(unsigned Opcode)
static bool breaksSMEMSoftClause(MachineInstr *MI)
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI)
static bool isLdsDma(const MachineInstr &MI)
static bool isRFE(unsigned Opcode)
static bool isRWLane(unsigned Opcode)
static std::optional< unsigned > sgprPairNumber(Register Reg, const SIRegisterInfo &TRI)
static bool isSMovRel(unsigned Opcode)
static const MachineOperand * getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST)
Dest sel forwarding issue occurs if additional logic is needed to swizzle / pack the computed value i...
static int GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses, bool IsGFX950)
static void updateGetPCBundle(MachineInstr *NewMI)
static bool isStoreCountWaitZero(const MachineInstr &I)
static bool breaksVMEMSoftClause(MachineInstr *MI)
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, const MachineInstr &MI)
static bool isSSetReg(unsigned Opcode)
static bool hasHazard(StateT State, function_ref< HazardFnResult(StateT &, const MachineInstr &)> IsHazard, function_ref< void(StateT &, const MachineInstr &)> UpdateState, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, DenseSet< const MachineBasicBlock * > &Visited)
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg)
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr)
static bool isDivFMas(unsigned Opcode)
static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult
static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses)
static bool isDGEMM(unsigned Opcode)
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB, MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates, IsExpiredFn IsExpired, DenseSet< const MachineBasicBlock * > &Visited, GetNumWaitStatesFn GetNumWaitStates=SIInstrInfo::getNumWaitStates)
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses)
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority, const SIInstrInfo &TII)
static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range< MachineInstr::const_mop_iterator > Ops, BitVector &DefSet, BitVector &UseSet)
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, unsigned Quantity)
static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI)
static cl::opt< unsigned > MaxExhaustiveHazardSearch("amdgpu-max-exhaustive-hazard-search", cl::init(128), cl::Hidden, cl::desc("Maximum function size for exhausive hazard search"))
static bool isPermlane(const MachineInstr &MI)
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses)
static int GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses, bool IsGFX950)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static const uint32_t IV[8]
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
bool anyCommon(const BitVector &RHS) const
Test if any common bits are set.
bool none() const
none - Returns true if none of the bits are set.
bool empty() const
empty - Tests whether there are no bits in this bitvector.
This class represents an Operation in the Expression.
Implements a dense probed hash-table based set.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
void EmitNoop() override
EmitNoop - This callback is invoked when a noop was added to the instruction stream.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
unsigned PreEmitNoops(MachineInstr *) override
This overload will be used when the hazard recognizer is being used by a non-scheduling pass,...
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
function_ref< bool(const MachineInstr &)> IsHazardFn
void AdvanceCycle() override
AdvanceCycle - This callback is invoked whenever the next top-down instruction to be scheduled cannot...
unsigned PreEmitNoopsCommon(MachineInstr *)
bool ShouldPreferAnother(SUnit *SU) override
ShouldPreferAnother - This callback may be invoked if getHazardType returns NoHazard.
HazardType getHazardType(SUnit *SU, int Stalls) override
getHazardType - Return the hazard type of emitting this node.
GCNHazardRecognizer(const MachineFunction &MF)
void RecedeCycle() override
RecedeCycle - This callback is invoked whenever the next bottom-up instruction to be scheduled cannot...
bool hasShift64HighRegBug() const
bool hasFPAtomicToDenormModeHazard() const
bool hasLdsBranchVmemWARHazard() const
bool hasGFX90AInsts() const
bool hasDstSelForwardingHazard() const
bool hasCvtScaleForwardingHazard() const
const SIInstrInfo * getInstrInfo() const override
bool hasVALUMaskWriteHazard() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasVcmpxExecWARHazard() const
bool hasReadM0MovRelInterpHazard() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasRequiredExportPriority() const
bool hasLdsWaitVMSRC() const
bool hasExtendedWaitCounts() const
bool hasVcmpxPermlaneHazard() const
bool hasGFX950Insts() const
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
bool hasNoDataDepHazard() const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
bool hasTransForwardingHazard() const
bool hasGFX940Insts() const
bool hasReadM0LdsDmaHazard() const
bool hasVALUReadSGPRHazard() const
bool hasSMEMtoVectorWriteHazard() const
bool hasVMEMtoScalarWriteHazard() const
bool hasNSAtoVMEMBug() const
bool hasVDecCoExecHazard() const
bool hasReadM0SendMsgHazard() const
bool hasReadM0LdsDirectHazard() const
bool isXNACKEnabled() const
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasVALUTransUseHazard() const
bool hasLdsDirect() const
void compute(FunctionT &F)
Compute the cycle info for a function.
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
Describe properties that are true of each instruction in the target description file.
ArrayRef< MCOperandInfo > operands() const
This holds information about one operand of a machine instruction, indicating the register class for ...
Wrapper class representing physical registers. Should be passed by value.
Instructions::const_reverse_iterator const_reverse_instr_iterator
reverse_instr_iterator instr_rend()
Instructions::iterator instr_iterator
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< pred_iterator > predecessors()
bool hasCalls() const
Return true if the current function has any function calls.
unsigned getInstructionCount() const
Return the number of MachineInstrs in this MachineFunction.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
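A hedged sketch of the builder methods just listed, used together with BuildMI (described further below); the opcode, register, and helper name are illustrative, and the AMDGPU backend-private headers are assumed to be visible.

#include "SIInstrInfo.h"   // AMDGPU backend-private header
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Insert "s_mov_b32 sgpr0, 0" immediately before MI.
static void insertZeroMove(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           const SIInstrInfo &TII) {
  BuildMI(MBB, MI, DL, TII.get(AMDGPU::S_MOV_B32))
      .addDef(AMDGPU::SGPR0)   // destination register operand
      .addImm(0);              // immediate source operand
}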
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isBundled() const
Return true if this instruction is part of a bundle.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
void setIsKill(bool Val=true)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
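A hedged example of the MachineOperand mutators above; the helper name and the policy of clearing flags are assumptions made for illustration.

#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

// Point a register operand at NewReg and conservatively drop stale flags.
static void retargetRegOperand(MachineOperand &MO, Register NewReg) {
  if (!MO.isReg())
    return;                // only meaningful for MO_Register operands
  MO.setReg(NewReg);
  MO.setIsKill(false);     // the old kill/undef information no longer applies
  MO.setIsUndef(false);
}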
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Wrapper class representing virtual and physical registers.
static bool isMAI(const MachineInstr &MI)
static bool isDS(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isSOPP(const MachineInstr &MI)
static bool isSMRD(const MachineInstr &MI)
static bool isMTBUF(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isSDWA(const MachineInstr &MI)
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isVINTRP(const MachineInstr &MI)
static bool isDOT(const MachineInstr &MI)
static bool isSWMMAC(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
bool isBufferSMRD(const MachineInstr &MI) const
static bool isTRANS(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
static bool isDPP(const MachineInstr &MI)
static bool isMFMA(const MachineInstr &MI)
static bool isFPAtomic(const MachineInstr &MI)
static bool isMIMG(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isWMMA(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
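The static SIInstrInfo predicates above classify instructions by encoding family. A hedged sketch of using them to pre-filter hazard candidates; the grouping is illustrative, not the pass's actual policy.

#include "SIInstrInfo.h"   // AMDGPU backend-private header
using namespace llvm;

// Illustrative filter: instructions that issue on the vector pipelines and
// can therefore participate in VALU-related hazards.
static bool isVectorIssueCandidate(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) || SIInstrInfo::isMFMA(MI) ||
         SIInstrInfo::isDPP(MI) || SIInstrInfo::isTRANS(MI);
}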
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
unsigned getOccupancy() const
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
unsigned getMaxLookAhead() const
unsigned MaxLookAhead
MaxLookAhead - Indicate the number of cycles in the scoreboard state.
virtual void EmitNoops(unsigned Quantity)
EmitNoops - This callback is invoked when noops were added to the instruction stream.
size_type size() const
Determine the number of elements in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
A Use represents the edge between a Value definition and its users.
unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM Value Representation.
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst)
unsigned decodeFieldSaSdst(unsigned Encoded)
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc)
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
unsigned decodeFieldVaVdst(unsigned Encoded)
unsigned decodeFieldVmVsrc(unsigned Encoded)
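A hedged sketch of the field helpers above, which pack and unpack the immediate of s_waitcnt_depctr; the enclosing namespace AMDGPU::DepCtr and the helper name are assumptions.

#include "Utils/AMDGPUBaseInfo.h"   // AMDGPU backend-private header
using namespace llvm;

// Force the va_vdst field of an s_waitcnt_depctr immediate to zero while
// leaving the remaining fields untouched.
static unsigned clearVaVdst(unsigned Encoded) {
  if (AMDGPU::DepCtr::decodeFieldVaVdst(Encoded) == 0)
    return Encoded;   // already zero, nothing to rewrite
  return AMDGPU::DepCtr::encodeFieldVaVdst(Encoded, 0);
}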
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
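And a hedged example of decodeWaitcnt itself; the GPU name handed to getIsaVersion is an arbitrary choice for the example.

#include "Utils/AMDGPUBaseInfo.h"   // AMDGPU backend-private header
using namespace llvm;

// Split a raw s_waitcnt immediate into its three counters for a gfx90a ISA.
static void splitWaitcnt(unsigned WaitcntImm, unsigned &Vmcnt,
                         unsigned &Expcnt, unsigned &Lgkmcnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx90a");
  AMDGPU::decodeWaitcnt(IV, WaitcntImm, Vmcnt, Expcnt, Lgkmcnt);
}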
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
FPType getFPDstSelType(unsigned Opc)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool getMAIIsGFX940XDL(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
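A hedged sketch combining hasNamedOperand and getNamedOperandIdx (the helper name is invented): query the gds bit of a DS instruction by name rather than by fixed operand position.

#include "Utils/AMDGPUBaseInfo.h"   // AMDGPU backend-private header
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Return the value of MI's 'gds' immediate operand, or false if MI has none.
static bool readsGDS(const MachineInstr &MI) {
  if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::gds))
    return false;
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::gds);
  return MI.getOperand(Idx).getImm() != 0;
}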
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
bool getMAIIsDGEMM(unsigned Opc)
Returns true if MAI operation is a double precision GEMM.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< po_iterator< T > > post_order(const T &G)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
auto reverse(ContainerTy &&C)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.
Description of the encoding of one expression Op.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
uint16_t ReleaseAtCycle
Cycle at which the resource will be released by an instruction, relative to the cycle in which the instruction is issued (assuming no stalls in between).