struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");
    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");
    return false;
  }
};

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr),
      TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  TSchedModel.init(&ST);
}

void GCNHazardRecognizer::Reset() { EmittedInstrs.clear(); }
// isDivFMas:
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
// isSGetReg:
  return Opcode == AMDGPU::S_GETREG_B32;
// isSSetReg:
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
// isRWLane:
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
// isRFE:
  return Opcode == AMDGPU::S_RFE_B64;
// isSMovRel:
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
  unsigned Opcode = MI.getOpcode();
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
  if (!ST.hasGFX940Insts())

  if (TII.isAlwaysGDS(MI.getOpcode()))
  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
  if (TII.isDS(MI.getOpcode())) {
                                 AMDGPU::OpName::gds);
    if (MI.getOperand(GDS).getImm())

  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;

                                 AMDGPU::OpName::simm16);
  if (checkFPAtomicToDenormModeHazard(MI) > 0)
      && checkVMEMHazards(MI) > 0)
  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)
  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)

  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
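    // s_nop encodes (wait states - 1) in its immediate, so one instruction can
    // cover at most 8 wait states; larger requests are emitted as a chain.
    // Minimal sketch of the rest of the loop body (the exact BuildMI call is an
    // assumption, it is not visible in this excerpt):
    //   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
    //       .addImm(Arg - 1);
    //   Quantity -= Arg;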
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {

void GCNHazardRecognizer::processBundle() {
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);
    EmittedInstrs.push_front(CurrCycleInstr);
  CurrCycleInstr = nullptr;
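  // EmittedInstrs effectively serves as the scoreboard window: one entry per
  // cycle, newest at the front, with nullptr standing in for an idle wait
  // state.  The loop above pads the window with WaitStates empty slots, capped
  // at MaxLookAhead - 1 so it never grows past the lookahead depth.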
  assert(IsHazardRecognizerMode);
  if (MI->isInsideBundle())
  IsHazardRecognizerMode = true;
  CurrCycleInstr = nullptr;
  return std::max(WaitStates, checkSMRDHazards(MI));
  WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
  WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
  WaitStates = std::max(WaitStates, checkVALUHazards(MI));
  WaitStates = std::max(WaitStates, checkDPPHazards(MI));
  WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
  WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
  WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));
  return std::max(WaitStates, checkGetRegHazards(MI));
  return std::max(WaitStates, checkSetRegHazards(MI));
  return std::max(WaitStates, checkRFEHazards(MI));
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));
  return std::max(WaitStates, checkMAIHazards(MI));
  return std::max(WaitStates, checkMAILdStHazards(MI));

  EmittedInstrs.push_front(nullptr);

  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
  EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.push_front(nullptr);
  CurrCycleInstr = nullptr;

  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
template <typename StateT>
    switch (IsHazard(State, *I)) {
    if (I->isInlineAsm() || I->isMetaInstruction())
    UpdateState(State, *I);
    if (!Visited.insert(Pred).second)
    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),

    if (I->isInlineAsm())
    WaitStates += GetNumWaitStates(*I);
    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  int MinWaitStates = std::numeric_limits<int>::max();
    if (!Visited.insert(Pred).second)
                              IsExpired, Visited, GetNumWaitStates);
    MinWaitStates = std::min(MinWaitStates, W);
  return MinWaitStates;
                             std::next(MI->getReverseIterator()),
                             0, IsExpired, Visited);
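// Shape of the reverse search used by the helpers above (a sketch assembled
// from the visible fragments, not a verbatim copy of the full functions):
// starting from the instruction before MI, walk the block backwards, skipping
// inline asm and meta instructions; accumulate GetNumWaitStates(*I) per
// instruction; return the accumulated count as soon as IsHazard matches; give
// up with std::numeric_limits<int>::max() once IsExpired fires.  When the top
// of the block is reached, recurse into each predecessor (Visited prevents
// cycles) and keep the minimum wait-state count over all paths.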
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
      return WaitStates >= Limit;
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);

    if (MI->isInlineAsm())
    if (WaitStates >= Limit)
  return std::numeric_limits<int>::max();

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  bool IsSMRD = TII.isSMRD(*MEM);
  if (ClauseDefs.none())
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
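// ClauseUses and ClauseDefs are BitVectors over register units (they were
// sized with TRI.getNumRegUnits() in the constructor).  A soft-clause hazard
// is reported as a single wait state whenever a register defined earlier in
// the clause overlaps a register used by the current memory instruction;
// inserting that wait breaks the clause.  How the two bit vectors are
// populated happens in code elided from this excerpt.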
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;
  WaitStatesNeeded = checkSoftClauseHazards(SMRD);
    return WaitStatesNeeded;
  int SmrdSgprWaitStates = 4;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  return WaitStatesNeeded;

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
  const int VmemSgprWaitStates = 5;
    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  return WaitStatesNeeded;
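// These checkers all share one arithmetic pattern: Needed = Limit -
// getWaitStatesSinceDef(...).  The backwards search is capped at Limit, and
// when no hazardous def is found it returns std::numeric_limits<int>::max(),
// which makes Needed negative and therefore harmless under std::max.  Worked
// example with the constants above (the scenario is illustrative): a VMEM
// instruction reads an SGPR that a VALU wrote two wait states earlier, so
// VmemSgprWaitStates (5) - 2 = 3 extra wait states still have to be inserted.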
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
    return TII->isVALU(MI);
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  WaitStatesNeeded = std::max(
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
  return WaitStatesNeeded;

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const int DivFMasWaitStates = 4;
    return TII->isVALU(MI);
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
  return DivFMasWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
  const int GetRegWaitStates = 2;
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
  return GetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  unsigned HWReg = getHWReg(TII, *SetRegInstr);
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
    VDataRCID = Desc.operands()[VDataIdx].RegClass;
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
        (!SOffset || !SOffset->isReg()))
  if (TII->isMIMG(MI)) {
  if (TII->isFLAT(MI)) {

GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
  int WaitStatesNeeded = 0;
  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  return WaitStatesNeeded;
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  const int TransDefWaitstates = 1;
      if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
    int WaitStatesNeededForDef =
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  const int Shift16DefWaitstates = 1;
      if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  const int VALUWriteSGPRVALUReadWaitstates = 2;
  const int VALUWriteEXECRWLane = 4;
  const int VALUWriteVGPRReadlaneRead = 1;
      return MI.modifiesRegister(UseReg, TRI);
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    case AMDGPU::V_WRITELANE_B32: {
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    return WaitStatesNeeded;

    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  return WaitStatesNeeded;
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  int WaitStatesNeeded = 0;
    if (Op.isReg() && Op.isDef()) {
          std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
  return WaitStatesNeeded;

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
  return RWLaneWaitStates - WaitStatesSince;

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  const int RFEWaitStates = 1;
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;

  const int ReadM0WaitStates = 1;
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  fixLdsDirectVALUHazard(MI);
  fixLdsDirectVMEMHazard(MI);
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
      std::numeric_limits<int>::max())
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  bool IsUndef = Src0->isUndef();
          TII->get(AMDGPU::V_MOV_B32_e32))

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (MI->getNumDefs() == 0)
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      std::numeric_limits<int>::max())
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    SDSTName = AMDGPU::OpName::sdst;
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
      case AMDGPU::S_WAITCNT_LGKMCNT:
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        return (Decoded.DsCnt == 0);
    if (TII->isSOPP(MI))
      std::numeric_limits<int>::max())
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return I.readsRegister(AMDGPU::EXEC, TRI);
    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
    for (auto MO : MI.implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      std::numeric_limits<int>::max())
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  if (!ST.hasLdsBranchVmemWARHazard())
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      if (HasLds && HasVmem)

  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
  auto InstType = IsHazardInst(*MI);
    auto InstType2 = IsHazardInst(I);
    return InstType2 && InstType != InstType2;
    auto InstType2 = IsHazardInst(I);
    if (InstType == InstType2)
           std::numeric_limits<int>::max();
      std::numeric_limits<int>::max())
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  const int NoHazardWaitStates = 15;
  bool VisitedTrans = false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
    if (WaitStates >= NoHazardWaitStates)
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
      std::numeric_limits<int>::max())
  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
    int ExecPos = std::numeric_limits<int>::max();
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
      return HazardExpired;
    bool Changed = false;
      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;
    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;
      return NoHazardFound;
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();
    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
          PostExecPos = std::min(PostExecPos, DefVALUs);
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;
      if (I.modifiesRegister(Src, &TRI)) {
    return NoHazardFound;
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator()), Visited))
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
      TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))
      std::numeric_limits<int>::max())
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_NOP_e32));
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;
  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
  MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
  int NSAtoVMEMWaitStates = 1;
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;
  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
  return FPAtomicToDenormModeWaitStates -

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
  int NeighborMFMAPaddingNeeded =
      WaitStatesSinceNeighborMFMA;
  return std::max(0, NeighborMFMAPaddingNeeded);
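// The middle term of NeighborMFMAPaddingNeeded is elided in this excerpt, but
// the option description above ("fill a percentage of the latency between
// neighboring MFMA with s_nops") implies arithmetic of roughly:
//   target  = NeighborMFMALatency * MFMAPaddingRatio / 100;
//   padding = target - WaitStatesSinceNeighborMFMA;
// Illustrative numbers: a 16-pass neighbour with -amdgpu-mfma-padding-ratio=50
// gives a target of 8; if 3 wait states have already elapsed, 5 more are
// requested, and std::max(0, ...) clamps the result once enough cycles have
// passed on their own.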
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;
    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    if (WaitStatesNeeded < MaxWaitStates) {
      const int MaxWaitStates = 2;
        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        if (WaitStatesNeeded == MaxWaitStates)

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    unsigned HazardDefLatency = 0;
    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      return TRI.regsOverlap(Reg, DstReg);
    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;
    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
  return WaitStatesNeeded;
  return NumPasses + 1;
  return NumPasses + 2;
  return NumPasses + 3;
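// These one-line bodies belong to the GFX940_*_N_Pass* helpers; each converts
// an MFMA's pass count into the number of wait states a dependent instruction
// must observe (for instance, a 4-pass op under a "+ 2" rule needs 6 wait
// states).  Which return pairs with which helper is not visible in this
// excerpt.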
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
  const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
  const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
  const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
  const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
  const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
  const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
  const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
  const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
  const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
  const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
  const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
  const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
  const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
  const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
  const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
  const int MaxWaitStates = 19;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
      FullReg = (DstReg == Reg);
      return TRI.regsOverlap(DstReg, Reg);
    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
        getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        switch (NumPasses) {
              isDGEMM(Opc) ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
                           : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
                  ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
                  : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
                  ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
                  : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
    if (WaitStatesNeeded >= NeedWaitStates)
    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    if (WaitStatesNeeded == MaxWaitStates)

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
  return WaitStatesNeeded;
  int WaitStatesNeeded = 0;
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  const int AccVgprReadLdStWaitStates = 2;
  const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
  const int MaxWaitStates = 2;
    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();
    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
        getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  return WaitStatesNeeded;

  return NumPasses + 2;
  return NumPasses + 3;
  return NumPasses + 3;
  return NumPasses + 2;
  int WaitStatesNeeded = 0;
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

  bool DGEMMAfterVALUWrite = false;
  auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
      DGEMMAfterVALUWrite = true;
    if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
                                           AMDGPU::OpName::src2);

  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int DMFMABetweenVALUWriteVMEMRead = 2;
    const int MaxWaitStates = 19;

      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
      int NeedWaitStates = 0;
      if (DOT->getOpcode() == MI->getOpcode()) {
        if (&Use - &MI->getOperand(0) != SrcCIdx)
          NeedWaitStates = DotWriteSameDotReadSrcAB;
        NeedWaitStates = DotWriteDifferentVALURead;
      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      DGEMMAfterVALUWrite = false;
      if (TRI.isVectorRegister(MRI, Reg)) {
        int WaitStatesNeededForUse =
            DMFMABetweenVALUWriteVMEMRead -
            getWaitStatesSinceDef(Reg, IsDGEMMHazard,
                                  DMFMABetweenVALUWriteVMEMRead);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      WaitStatesSinceDef =
          getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NumPasses = HazardDefLatency;
      int NeedWaitStates = MaxWaitStates;
      if (isDGEMM(MFMA->getOpcode())) {
        switch (HazardDefLatency) {
          NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                                         : DMFMA4x4WriteVgprVALUReadWaitStates;
          NeedWaitStates = IsMemOrExport
                               ? DMFMA16x16WriteVgprMemExpReadWaitStates
                               : DMFMA16x16WriteVgprVALUReadWaitStates;
        switch (HazardDefLatency) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      if (WaitStatesNeeded == MaxWaitStates)

  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
        getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

  const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
  const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
  const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
  const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
  const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
  const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
  const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
  const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
  const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
  const int DotWriteDifferentVALUWrite = 3;
  const int MaxWaitStates = 19;
  const int MaxWarWaitStates = 15;

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                        WaitStatesSinceDef);

    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);
      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
        switch (NumPasses) {
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      if (WaitStatesNeeded == MaxWaitStates)

          !MI.readsRegister(Reg, &TRI))
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
        NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  return MAI != nullptr;
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    return W < (int)TSchedModel.computeInstrLatency(MAI);
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!SDSTOp || !SDSTOp->isReg())
  if (HazardReg == AMDGPU::EXEC ||
      HazardReg == AMDGPU::EXEC_LO ||
      HazardReg == AMDGPU::EXEC_HI ||
      HazardReg == AMDGPU::M0)

    switch (I.getOpcode()) {
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_e32:
    case AMDGPU::V_CNDMASK_B16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp:
      return HazardReg == AMDGPU::VCC ||
             HazardReg == AMDGPU::VCC_LO ||
             HazardReg == AMDGPU::VCC_HI;
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_e64:
    case AMDGPU::V_CNDMASK_B16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);

    if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
    for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
      if (OpReg == AMDGPU::EXEC ||
          OpReg == AMDGPU::EXEC_LO ||
          OpReg == AMDGPU::EXEC_HI)
      if (Op.isImplicit()) {
        if (OpReg == AMDGPU::VCC ||
            OpReg == AMDGPU::VCC_LO ||
            OpReg == AMDGPU::VCC_HI)
      if (TRI.isSGPRReg(MRI, OpReg))
      if (!TII.isInlineConstant(Op, OpInfo))
      std::numeric_limits<int>::max())

  auto NextMI = std::next(MI->getIterator());
  BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
  if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
    while (NextMI != MI->getParent()->end() &&
           NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + 4);
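      // The s_waitcnt_depctr inserted right after an s_getpc_b64 adds four
      // bytes between the captured PC and the instructions bundled behind it,
      // so every PC-relative global operand in the bundle is shifted by 4,
      // presumably to keep the addresses computed from the new PC correct.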
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)

bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
  case AMDGPU::S_SETPRIO: {
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    if (!TII.isEXP(*MI))

  bool Changed = false;
  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    if (TII.isEXP(*NextMI))
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
      .addImm(PostExportPriority);
      .addReg(AMDGPU::SGPR_NULL)