struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

    cl::desc("Insert a s_nop x before every instruction"));
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}
static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const;
}

// isSSetReg opcodes:
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

// S_MOVREL* opcodes, consulted by the read-M0 hazard checks below:
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;

                                                     AMDGPU::OpName::simm16);
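// The chain of checks that follows implements the per-instruction hazard
// query (ScheduleHazardRecognizer::getHazardType style): each subtarget bug
// is tested in turn, and the query reports a hazard as soon as any check
// returns a positive outstanding wait-state count.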
  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)

  if (checkFPAtomicToDenormModeHazard(MI) > 0)

  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)

  if (ST.hasNoDataDepHazard())

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)

      checkMAIVALUHazards(MI) > 0)

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)

  if (((ST.hasReadM0MovRelInterpHazard() &&
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)

      checkMAILdStHazards(MI) > 0)

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);

GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
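// processBundle() runs the hazard fixups on every instruction inside the
// current bundle (when in hazard-recognizer mode) and pushes the instruction,
// preceded by any required empty wait-state slots, onto the front of
// EmittedInstrs, bounded by MaxLookAhead.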
void GCNHazardRecognizer::processBundle() {
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);

  CurrCycleInstr = nullptr;

  assert(IsHazardRecognizerMode);

  if (MI->isInsideBundle())

  IsHazardRecognizerMode = true;

  CurrCycleInstr = nullptr;
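// The PreEmitNoops-style accounting below combines the individual checks:
// each one reports how many wait states are still outstanding for its hazard,
// and the caller pads with the maximum of those values, so overlapping
// hazards share the same nops instead of stacking.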
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())

    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

        checkMAIVALUHazards(MI) > 0)
      WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

    if (MI->isInlineAsm())
      return std::max(WaitStates, checkInlineAsmHazards(MI));

    return std::max(WaitStates, checkGetRegHazards(MI));

    return std::max(WaitStates, checkSetRegHazards(MI));

    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
       MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

    return std::max(WaitStates, checkMAIHazards(MI));

    return std::max(WaitStates, checkMAILdStHazards(MI));

    return std::max(WaitStates, checkPermlaneHazards(MI));

  EmittedInstrs.push_front(nullptr);
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);

  if (CurrCycleInstr->isBundle()) {

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;

  EmittedInstrs.push_front(CurrCycleInstr);

  EmittedInstrs.push_front(nullptr);

  CurrCycleInstr = nullptr;

  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
template <typename StateT>

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {

  static inline StateMapKey getEmptyKey() {

  static inline StateMapKey getTombstoneKey() {

  static unsigned getHashValue(const StateMapKey &Key) {
    return StateT::getHashValue((*Key.States)[Key.Idx]);
  }

  static unsigned getHashValue(const StateT &State) {
    return StateT::getHashValue(State);
  }

  static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
    const auto EKey = getEmptyKey();
    const auto TKey = getTombstoneKey();
    if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
        StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
      return StateMapKey::isEqual(LHS, RHS);
    return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
  }

  static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
    if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
        StateMapKey::isEqual(RHS, getTombstoneKey()))
      return false;
    return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
  }
  StateT State = InitialState;

  unsigned WorkIdx = 0;

  bool Expired = false;
  for (auto E = MBB->instr_rend(); I != E; ++I) {

    auto Result = IsHazard(State, *I);

    if (I->isInlineAsm() || I->isMetaInstruction())

    UpdateState(State, *I);

    unsigned StateIdx = States.size();
    StateMapKey Key = {&States, StateIdx};
    auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
    if (Insertion.second) {

      StateIdx = Insertion.first->second;

      Worklist.insert(std::pair(Pred, StateIdx));

    if (WorkIdx == Worklist.size())

    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();

  for (auto E = MBB->instr_rend(); I != E; ++I) {

    if (I->isInlineAsm())

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();

  int MinWaitStates = std::numeric_limits<int>::max();

    if (!Visited.insert(Pred).second)

                              IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);

  return MinWaitStates;

                              std::next(MI->getReverseIterator()), 0, IsExpired,
                              Visited, GetNumWaitStates);
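// getWaitStatesSince() has two modes: in hazard-recognizer mode it walks the
// basic blocks backwards through the static helper above until the IsExpired
// predicate fires (here, once Limit wait states have elapsed); otherwise it
// scans the EmittedInstrs history, counting an empty slot (nullptr) as a
// single wait state.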
int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,

  for (MachineInstr *MI : EmittedInstrs) {

    if (MI->isInlineAsm())

    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)

  return std::numeric_limits<int>::max();

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,

  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {

  if (!ST.isXNACKEnabled())

  bool IsSMRD = TII.isSMRD(*MEM);

  for (MachineInstr *MI : EmittedInstrs) {

  if (ClauseDefs.none())

  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
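// Example of the subtraction pattern used throughout: with
// SmrdSgprWaitStates = 4, an SGPR operand defined by a VALU two wait states
// earlier leaves 4 - 2 = 2 states still to pad, while a definition four or
// more states back yields a non-positive value and needs no padding.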
int GCNHazardRecognizer::checkVMEMHazards(MachineInstr *VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))

    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                [](const MachineInstr &) { return true; },

    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,

  return WaitStatesNeeded;
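// V_DIV_FMAS implicitly reads VCC, so it must trail any VALU write of VCC by
// DivFMasWaitStates (4); checkDivFMasHazards() returns that constant minus the
// wait states already elapsed since such a write.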
int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,

  return DivFMasWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);

    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {

    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);

        (!SOffset || !SOffset->isReg()))

  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
                Desc.operands()[SRsrcIdx])) == 256);

  if (TII->isFLAT(MI)) {

GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;

    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
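// The helpers below identify the forwarded operand for the dst_sel /
// shift-16 style hazards: when the producer carries a dst_sel operand or the
// relevant src0/src2 modifier bits, its vdst operand is the value whose
// consumers need the extra wait state.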
  unsigned Opcode = MI.getOpcode();

  if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

      (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {

      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =

      if (ProducerMI.isInlineAsm()) {

        for (auto &Def : ProducerMI.all_defs()) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();

    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {

      return MI.modifiesRegister(UseReg, TRI);

    for (const MachineOperand &Use : VALU->explicit_uses()) {

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn,
                             VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {

      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);

      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

    case AMDGPU::V_WRITELANE_B32: {

      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));

  return WaitStatesNeeded;
int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {

  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :

    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))

      if (ST.has12DWordStoreHazard()) {
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {

        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {

        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
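// V_READLANE/V_WRITELANE take their lane select in src1; if that SGPR was
// produced by a VALU within the last RWLaneWaitStates (4) instructions, the
// remainder must be padded before the lane access executes.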
int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))

  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,

  return RWLaneWaitStates - WaitStatesSince;

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);

bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
  if (WaitStatesNeeded <= 0)

  const SIInstrInfo *TII = ST.getInstrInfo();
  for (int I = 0; I < WaitStatesNeeded; ++I)
                TII->get(AMDGPU::V_NOP_e32));
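// fixHazards() is the dispatcher used in hazard-recognizer mode: each fix*
// routine below recognizes one subtarget-specific pattern and, when it fires,
// inserts the waitcnt/nop/move sequence that neutralizes it.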
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);

  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);

  emitVNops(MI, checkWMMACoexecutionHazards(MI));

  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    unsigned Opc = MI.getOpcode();
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;

      std::numeric_limits<int>::max())

  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);

  bool IsUndef = Src0->isUndef();

          TII->get(AMDGPU::V_MOV_B32_e32))
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())

  assert(!ST.hasExtendedWaitCounts());

  if (MI->getNumDefs() == 0)

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);

        (MI.getOpcode() == AMDGPU::S_WAITCNT &&
         !MI.getOperand(0).getImm()) ||
        (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

  const SIInstrInfo *TII = ST.getInstrInfo();
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())

  assert(!ST.hasExtendedWaitCounts());

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;

    SDSTName = AMDGPU::OpName::sdst;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);

    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {

  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {

    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:

      case AMDGPU::S_WAITCNT_LGKMCNT:

        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();

        return (Decoded.DsCnt == 0);

                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");

    if (TII->isSOPP(MI))

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())

  assert(!ST.hasExtendedWaitCounts());

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))

    return I.readsRegister(AMDGPU::EXEC, TRI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {

    if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))

    for (auto MO : MI.implicit_operands())
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))

    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      std::numeric_limits<int>::max())

          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  if (!ST.hasLdsBranchVmemWARHazard())

  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {

      if (HasLds && HasVmem)

    return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
           I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
           !I.getOperand(1).getImm();

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {

  auto InstType = IsHazardInst(*MI);

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)

           std::numeric_limits<int>::max();

      std::numeric_limits<int>::max())

  const SIInstrInfo *TII = ST.getInstrInfo();
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {

    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)

  auto GetWaitStatesFn = [](const MachineInstr &MI) {

  DenseSet<const MachineBasicBlock *> Visited;
                                   std::next(MI->getReverseIterator()), 0,

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {

    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);

  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();

  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {

           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());

      std::numeric_limits<int>::max())

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);

            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())

  assert(!ST.hasExtendedWaitCounts());

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

  if (SrcVGPRs.size() <= 1)

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();

    static unsigned getHashValue(const StateType &State) {

    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {

    if (State.VALUs > NoHazardVALUWaitStates)

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
        State.DefPos[Src] = State.VALUs;

    if (State.ExecPos == std::numeric_limits<int>::max()) {
      if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
        State.ExecPos = State.VALUs;

    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())

    if (State.ExecPos == std::numeric_limits<int>::max())

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);

          PostExecPos = std::min(PostExecPos, DefVALUs);

    if (PostExecPos == std::numeric_limits<int>::max())

    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)

    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)

    if (PreExecPos == std::numeric_limits<int>::max())

    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)

    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

                            std::next(MI->getReverseIterator())))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
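// fixVALUTransUseHazard() mirrors the partial-forwarding search above, but its
// per-block state is just two counters (VALUs and TRANS ops seen so far); the
// hazard window closes once more than IntvMaxVALUs (5) VALU or
// IntvMaxTRANS (1) transcendental instructions have been crossed.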
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())

  assert(!ST.hasExtendedWaitCounts());

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

    static unsigned getHashValue(const StateType &State) {

    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {

    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)

        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

      if (I.modifiesRegister(Src, &TRI)) {

  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {

                            std::next(MI->getReverseIterator())))

          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {

    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {

        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();

        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {

          TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(PrevDstReg, CurIndex))

      std::numeric_limits<int>::max())

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

                          unsigned Category) {

         "Handle me if the xdl wmma instruction latency changes");
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {

  const SIInstrInfo *TII = ST.getInstrInfo();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))

      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D0, Idx1))

  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))

    unsigned Latency = TSchedModel.computeInstrLatency(&I);

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())

    if (TRI->regsOverlap(D0, D1))

    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))

      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D1, Idx0))

  auto GetWaitStatesFn = [](const MachineInstr &I) {

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];

          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);

    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];

          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);

  return WaitStatesNeeded;
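// fixShift64HighRegBug() targets 64-bit shifts whose shift amount sits in the
// last VGPR of an eight-register group ((AmtReg - VGPR0) & 7 == 7): the amount
// is temporarily moved via V_SWAP_B32 into a free VGPR (an aligned 64-bit pair
// when the amount overlaps the destination) around the affected shift.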
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())

  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {

  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))

  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  MachineBasicBlock *MBB = MI->getParent();
  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);

  Register DstReg = MI->getOperand(0).getReg();

  Register DstLo = TRI.getSubReg(DstReg, AMDGPU::sub0);

  bool Overlapped = MI->modifiesRegister(AmtReg, &TRI);

  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)

    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  runOnInstruction(
      BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)

    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),

  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),

  MI->getOperand(0).setReg(NewReg);
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);

    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())

  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {

  return FPAtomicToDenormModeWaitStates -

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  // Padding target reconstructed from the option description above: fill the
  // requested percentage of the neighboring MFMA's latency with s_nops.
  int NeighborMFMAPaddingNeeded =
      MFMAPaddingRatio * NeighborMFMALatency / 100 -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
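// checkMAIHazards908() scales its wait-state requirements with the producing
// MFMA's latency: for example, an AGPR written by a 2-pass MFMA needs 4 wait
// states before V_ACCVGPR_READ, a 16-pass MFMA needs 18, and src2 (SrcC)
// overlaps only need 2.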
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
        getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
            getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;

    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
                               this](const MachineInstr &MI) {

          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,

    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = Op.getOperandNo();
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;

    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
      case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
      case 16: [[fallthrough]];
      default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      return TRI.regsOverlap(Reg, DstReg);

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
                         this](const MachineInstr &MI) {

      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

    switch (HazardDefLatency) {
    case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
    case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
    case 16: [[fallthrough]];
    default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  return NumPasses + 1 + IsGFX950;

  return NumPasses + 1 + (NumPasses != 2 && IsGFX950);

  return NumPasses + 2;

  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsLegacyVALUFn = [](const MachineInstr &MI) {

  auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {

    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                            VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
    const int MaxWaitStates = 19;

    const MachineInstr *MI1;

    auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
                               this](const MachineInstr &MI) {

      FullReg = (DstReg == Reg);

      return TRI.regsOverlap(DstReg, Reg);

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
        getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())

    int OpNo = Use.getOperandNo();

    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {

    } else if (FullReg) {
      if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
          (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
           Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
        NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      else if (ST.hasGFX940Insts() &&
               TSchedModel.computeInstrLatency(MI1) == 2)
        NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
        if (!TII.isXDL(*MI))
              ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates
              : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        if (!TII.isXDL(*MI))
          NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);
        if (ST.hasGFX940Insts()) {
          if (TII.isXDL(*MI) && !TII.isXDL(*MI1))

                      NumPasses, ST.hasGFX950Insts())
                      NumPasses, ST.hasGFX950Insts()))

        switch (NumPasses) {

              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;

              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;

              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
            ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates
            : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;

      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

        int NumPasses = TSchedModel.computeInstrLatency(MI1);

        if (ST.hasGFX940Insts()) {
                  NumPasses, ST.hasGFX950Insts())

        switch (NumPasses) {

          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;

          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;

          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;

    if (WaitStatesNeeded >= NeedWaitStates)

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)

  WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

  return WaitStatesNeeded;
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
        getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded;

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

      auto IsVALUFn = [](const MachineInstr &MI) {

      return getWaitStatesSinceDef(Reg, IsVALUFn, 2) <
             std::numeric_limits<int>::max();

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
        getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  return WaitStatesNeeded;
  assert(!ST.hasVcmpxPermlaneHazard() &&
         "this is a different vcmpx+permlane hazard");
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {

  auto IsVALUFn = [](const MachineInstr &MI) {

  const int VCmpXWritesExecWaitStates = 4;
  const int VALUWritesVDstWaitStates = 2;
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))

    int WaitStatesSinceDef =
        VALUWritesVDstWaitStates -
        getWaitStatesSinceDef(Reg, IsVALUFn, VALUWritesVDstWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
    if (WaitStatesNeeded >= VALUWritesVDstWaitStates)

  int VCmpXHazardWaits =
      VCmpXWritesExecWaitStates -
      getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

  WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
  return WaitStatesNeeded;
2909 return NumPasses + 2;
2919 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2929 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
2937 return NumPasses + 2;
2941 if (!ST.hasGFX90AInsts())
2944 auto IsDGEMMFn = [](
const MachineInstr &
MI) ->
bool {
2952 const MachineRegisterInfo &
MRI = MF.getRegInfo();
2954 int WaitStatesNeeded = 0;
2960 const MachineInstr *
MFMA =
nullptr;
2962 auto IsMFMAWriteFn = [&
Reg, &
MFMA,
this](
const MachineInstr &
MI) {
2964 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2970 const MachineInstr *
DOT =
nullptr;
2971 auto IsDotWriteFn = [&
Reg, &
DOT,
this](
const MachineInstr &
MI) {
2973 !TRI.regsOverlap(
MI.getOperand(0).getReg(),
Reg))
2979 bool DGEMMAfterVALUWrite =
false;
2980 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite,
this](
const MachineInstr &
MI) {
2983 DGEMMAfterVALUWrite =
true;
2987 if (!TII.isVALU(
MI) || !DGEMMAfterVALUWrite)
2993 int SrcCIdx = AMDGPU::getNamedOperandIdx(
MI->getOpcode(),
2994 AMDGPU::OpName::src2);
2996 if (IsMemOrExport || IsVALU) {
2997 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2998 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2999 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
3000 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
3001 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
3002 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
3003 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
3004 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;
3005 const int DotWriteSameDotReadSrcAB = 3;
3006 const int DotWriteDifferentVALURead = 3;
3007 const int DMFMABetweenVALUWriteVMEMRead = 2;
3008 const int MaxWaitStates = 19;
3010 for (
const MachineOperand &Use :
MI->explicit_uses()) {
3016 int WaitStatesSinceDef = getWaitStatesSinceDef(
Reg, IsDotWriteFn,
3019 int NeedWaitStates = 0;
3020 if (
DOT->getOpcode() ==
MI->getOpcode()) {
3021 if (&Use - &
MI->getOperand(0) != SrcCIdx)
3022 NeedWaitStates = DotWriteSameDotReadSrcAB;
3024 NeedWaitStates = DotWriteDifferentVALURead;
3027 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3028 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3035 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
3036 DGEMMAfterVALUWrite =
false;
3037 if (TRI.isVectorRegister(
MRI,
Reg)) {
3038 int WaitStatesNeededForUse =
3039 DMFMABetweenVALUWriteVMEMRead -
3040 getWaitStatesSinceDef(
Reg, IsDGEMMHazard,
3041 DMFMABetweenVALUWriteVMEMRead);
3043 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3048 WaitStatesSinceDef =
3049 getWaitStatesSinceDef(
Reg, IsMFMAWriteFn, MaxWaitStates);
3053 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(
MFMA);
3054 int NumPasses = HazardDefLatency;
3055 int NeedWaitStates = MaxWaitStates;
3058 switch (HazardDefLatency) {
3060 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
3061 : DMFMA4x4WriteVgprVALUReadWaitStates;
3067 ? DMFMA16x16WriteVgprMemExpReadWaitStates
3068 : (ST.hasGFX950Insts()
3069 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates
3070 : DMFMA16x16WriteVgprVALUReadWaitStates);
3075 }
else if (ST.hasGFX940Insts()) {
3079 NumPasses, ST.hasGFX950Insts())
3083 switch (HazardDefLatency) {
3085 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
3088 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
3091 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
3098 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
3099 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3101 if (WaitStatesNeeded == MaxWaitStates)
3106 unsigned Opc =
MI->getOpcode();
3107 const int DMFMAToFMA64WaitStates = 2;
3108 if ((
Opc == AMDGPU::V_FMA_F64_e64 ||
3109 Opc == AMDGPU::V_FMAC_F64_e32 ||
Opc == AMDGPU::V_FMAC_F64_e64 ||
3110 Opc == AMDGPU::V_FMAC_F64_dpp) &&
3111 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
3112 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
3113 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
3114 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
3117 if (!IsVALU && !IsMemOrExport)
3118 return WaitStatesNeeded;
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    // WAW hazard against a recent dot instruction of a different opcode.
    DOT = nullptr;
    int WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsDotWriteFn, MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                        WaitStatesSinceDef);
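
    // Illustrative: a dot instruction with a different opcode that wrote this
    // destination one wait state ago still leaves
    // DotWriteDifferentVALUWrite - 1 = 2 wait states to cover.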
    // WAW hazard against a recent MFMA write of the same destination.
    MFMA = nullptr;
    WaitStatesSinceDef =
        getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      int NumPasses = TSchedModel.computeInstrLatency(MFMA);

      if (isDGEMM(MFMA->getOpcode())) {
        switch (NumPasses) {
        case 4:
          NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
          break;
        case 8:
        case 16:
          NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;
          break;
        }
      } else if (ST.hasGFX940Insts()) {
        NeedWaitStates =
            TII.isXDL(*MFMA)
                ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
                      NumPasses, ST.hasGFX950Insts())
                : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
      } else {
        switch (NumPasses) {
        case 2:
          NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
          break;
        case 16:
          [[fallthrough]];
        default:
          NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;
        }
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
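
    // On GFX940+ the WAW spacing comes from the pass-count helpers
    // (GFX940_XDL_* / GFX940_SMFMA_*) above rather than from the fixed
    // per-latency tables used for older subtargets.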
    // WAR hazard: an MFMA is still reading this register as its SrcC
    // accumulator.
    auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
      if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      if (ST.hasGFX940Insts() && !TII.isXDL(MI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 4:
      assert(ST.hasGFX940Insts());
      NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
      break;
    case 8:
      NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16:
      [[fallthrough]];
    default:
      NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
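
  // For scale: overwriting a VGPR that a 2-pass (4x4) SMFMA reads as SrcC
  // needs only SMFMA4x4ReadVgprVALUWarWaitStates (1), while a 16-pass
  // (32x32) SMFMA needs SMFMA32x32ReadVgprVALUWarWaitStates (15), which is
  // why the WAR scan above is bounded by MaxWarWaitStates.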
  return WaitStatesNeeded;
}
    return MAI != nullptr;
  };

  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }
static void updateGetPCBundle(MachineInstr *NewMI) {
  // Find the start of the bundle.
  auto I = NewMI->getIterator();
  while (I->isBundledWithPred())
    --I;

  // Only S_GETPC_B64 bundles need fixing up.
  if (I->getOpcode() != AMDGPU::S_GETPC_B64)
    return;

  // Update the offsets of any global references later in the bundle.
  const unsigned NewBytes = 4;
  assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         "Unexpected instruction insertion in bundle");
  auto NextMI = std::next(NewMI->getIterator());
  auto End = NewMI->getParent()->end();
  while (NextMI != End && NextMI->isBundledWithPred()) {
    for (auto &Operand : NextMI->operands()) {
      if (Operand.isGlobal())
        Operand.setOffset(Operand.getOffset() + NewBytes);
    }
    ++NextMI;
  }
}
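
// Note on the helper above: the inserted S_WAITCNT_DEPCTR occupies four bytes
// of encoding inside the S_GETPC_B64 bundle, so every PC-relative global
// operand that follows it in the bundle must be shifted by NewBytes to keep
// addresses computed from the S_GETPC_B64 result correct.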
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
  if (!ST.hasVALUMaskWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  const bool IsSALU = SIInstrInfo::isSALU(*MI);
  const bool IsVALU = SIInstrInfo::isVALU(*MI);
  if (!IsSALU && !IsVALU)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  // SGPRs that never participate in this hazard.
  auto IgnoreableSGPR = [](const Register Reg) {
    switch (Reg) {
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return true;
    default:
      return false;
    }
  };
  auto IsVCC = [](const Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO ||
           Reg == AMDGPU::VCC_HI;
  };

  // State carried by the backwards scan: the 32-bit SGPRs whose pending
  // write has not yet been covered by a wait.
  struct StateType {
    SmallSet<Register, 2> HazardSGPRs;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine_range(State.HazardSGPRs.begin(),
                                State.HazardSGPRs.end());
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.HazardSGPRs == RHS.HazardSGPRs;
    }
  };

  SmallVector<const MachineInstr *> WaitInstrs;
  bool HasSGPRRead = false;
  StateType InitialState;

  // Look for the SGPR (or VCC) mask write that can feed a dependent VALU.
  MachineOperand *HazardDef = nullptr;
  for (MachineOperand &Op : MI->operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef() && HazardDef)
      return false;

    Register Reg = Op.getReg();
    if (IgnoreableSGPR(Reg))
      continue;
    if (Op.isImplicit())
      continue;
    if (!TRI->isSGPRReg(MRI, Reg))
      continue;
    // ...
  }

  if (!HazardDef)
    return false;

  // Track the written SGPR, or both 32-bit halves of a 64-bit mask.
  Register HazardReg = HazardDef->getReg();
  if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {
    InitialState.HazardSGPRs.insert(HazardReg);
  } else {
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));
    InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));
  }
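
  // The getHashValue/isEqual pair above mirrors the DenseMapInfo interface,
  // which suggests the backwards scan memoizes visited states so a block
  // reached again with the same HazardSGPRs set is not re-scanned (an
  // inference from the listing, not an original comment).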
  auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {
    if (State.HazardSGPRs.empty())
      return HazardExpired;

    switch (I.getOpcode()) {
    // VALUs that consume the mask through an implicit VCC read.
    case AMDGPU::V_ADDC_U32_e32:
    case AMDGPU::V_ADDC_U32_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e32:
    case AMDGPU::V_CNDMASK_B16_fake16_e32:
    case AMDGPU::V_CNDMASK_B16_t16_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
    case AMDGPU::V_CNDMASK_B32_e32:
    case AMDGPU::V_CNDMASK_B32_dpp:
    case AMDGPU::V_DIV_FMAS_F32_e64:
    case AMDGPU::V_DIV_FMAS_F64_e64:
    case AMDGPU::V_SUBB_U32_e32:
    case AMDGPU::V_SUBB_U32_dpp:
    case AMDGPU::V_SUBBREV_U32_e32:
    case AMDGPU::V_SUBBREV_U32_dpp: {
      return IsVCC(HazardReg) ? HazardFound : NoHazard;
    }
    // VALUs that consume the mask through an explicit carry-in operand.
    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_ADDC_U32_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_t16_e64:
    case AMDGPU::V_CNDMASK_B16_fake16_e64:
    case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:
    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
    case AMDGPU::V_CNDMASK_B32_e64:
    case AMDGPU::V_CNDMASK_B32_e64_dpp:
    case AMDGPU::V_SUBB_U32_e64:
    case AMDGPU::V_SUBB_U32_e64_dpp:
    case AMDGPU::V_SUBBREV_U32_e64:
    case AMDGPU::V_SUBBREV_U32_e64_dpp: {
      // Hazard if the explicit carry-in (src2) overlaps the hazard register.
      const MachineOperand *SSRCOp =
          TII.getNamedOperand(I, AMDGPU::OpName::src2);
      bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);
      return Result ? HazardFound : NoHazard;
    }
    default:
      return NoHazard;
    }
  };
  // Update the scan state for each instruction walked over.
  auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
    switch (I.getOpcode()) {
    case AMDGPU::S_WAITCNT_DEPCTR:
      // Record waits in the same block that already cover the relevant
      // counters so they can be reused instead of inserting a new one.
      if (!HasSGPRRead && I.getParent() == MI->getParent() && !I.isBundled() &&
          (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)
        WaitInstrs.push_back(&I);
      break;
    default:
      for (auto &Op : I.operands()) {
        if (!Op.isReg())
          continue;
        Register Reg = Op.getReg();
        if (IgnoreableSGPR(Reg))
          continue;
        if (Op.isImplicit())
          continue;
        if (!TRI->isSGPRReg(MRI, Reg))
          continue;
        if (Op.isUse()) {
          HasSGPRRead = true;
          continue;
        }
        // A new write to a tracked SGPR means the dependent VALU no longer
        // sees the hazardous def through that register.
        for (Register SGPR : State.HazardSGPRs) {
          if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
            State.HazardSGPRs.erase(SGPR);
        }
      }
      break;
    }
  };

  if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
                            MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;
  // Reuse any suitable S_WAITCNT_DEPCTR instructions recorded by the scan,
  // merging their fields into the required wait instead of stacking waits.
  if (!WaitInstrs.empty()) {
    unsigned Found = 0;
    SmallVector<MachineInstr *> ToErase;

    for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),
                                             End = MI->getParent()->rend();
         Found < WaitInstrs.size() && It != End; ++It) {
      MachineInstr *WaitMI = &*It;

      // Find the next recorded wait instruction.
      if (std::as_const(WaitMI) != WaitInstrs[Found])
        continue;
      ++Found;

      // Fold its counters into DepCtr, keeping the minimum of each field.
      unsigned WaitMask = WaitMI->getOperand(0).getImm();
      assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);
      DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));
      DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(
          DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),
                           AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));
      ToErase.push_back(WaitMI);
    }

    for (MachineInstr *WaitMI : ToErase)
      WaitMI->eraseFromParent();
  }
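
  // Illustrative only (hypothetical values A and B, not from the source):
  // merging two encoded DepCtr immediates field by field, e.g.
  //   unsigned Merged = AMDGPU::DepCtr::encodeFieldSaSdst(
  //       A, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(A),
  //                   AMDGPU::DepCtr::decodeFieldSaSdst(B)));
  // keeps the smaller count for each counter; a smaller field value waits for
  // more outstanding events to drain, so taking the minimum preserves the
  // stricter of the two requirements.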
  // Insert a new S_WAITCNT_DEPCTR immediately after MI carrying the merged
  // counter requirements.
  auto NextMI = std::next(MI->getIterator());
  auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
                       TII.get(AMDGPU::S_WAITCNT_DEPCTR))
                   .addImm(DepCtr);

  // Keep any enclosing S_GETPC_B64 bundle consistent with the added bytes.
  updateGetPCBundle(NewMI);

  return true;
}

// Ensure the entry block starts with an S_SETPRIO of at least Priority.
static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
                               const SIInstrInfo &TII) {
  MachineBasicBlock &EntryMBB = MF->front();
  if (EntryMBB.begin() != EntryMBB.end()) {
    auto &EntryMI = *EntryMBB.begin();
    if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&
        EntryMI.getOperand(0).getImm() >= Priority)
      return false;
  }

  BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
      .addImm(Priority);
  return true;
}
bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
  if (!ST.hasRequiredExportPriority())
    return false;

  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();

  const int MaxPriority = 3;
  const int NormalPriority = 2;
  const int PostExportPriority = 0;

  auto It = MI->getIterator();
  switch (MI->getOpcode()) {
  case AMDGPU::S_ENDPGM:
  case AMDGPU::S_ENDPGM_SAVED:
  case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:
  case AMDGPU::SI_RETURN_TO_EPILOG:
    // A shader with calls needs the raised priority at entry, since callees
    // may contain exports.
    if (MF->getFrameInfo().hasCalls())
      return ensureEntrySetPrio(MF, NormalPriority, TII);
    break;
  case AMDGPU::S_SETPRIO: {
    // Raise an existing S_SETPRIO to at least the normal priority, unless it
    // is the post-export drop emitted by this workaround itself.
    auto &PrioOp = MI->getOperand(0);
    int Prio = PrioOp.getImm();
    bool InWA = (Prio == PostExportPriority) &&
                (It != MBB->begin() && TII.isEXP(*std::prev(It)));
    if (InWA || Prio >= NormalPriority)
      return false;
    PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));
    return true;
  }
  default:
    if (!TII.isEXP(*MI))
      return false;
    break;
  }

  // Apply the workaround only after the last export in a sequence.
  auto NextMI = std::next(It);
  bool EndOfShader = false;
  if (NextMI != MBB->end()) {
    // Not the last export of the run.
    if (TII.isEXP(*NextMI))
      return false;
    // An S_SETPRIO to the post-export priority means the workaround is
    // already in place here.
    if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&
        NextMI->getOperand(0).getImm() == PostExportPriority)
      return false;
    EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;
  }

  const DebugLoc &DL = MI->getDebugLoc();

  // Drop to the post-export priority...
  BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))
      .addImm(PostExportPriority);

  // ...and, unless the shader ends immediately, wait for the exports.
  if (!EndOfShader) {
    BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))
        .addReg(AMDGPU::SGPR_NULL)
        .addImm(0);
  }
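
  // The sequence above lowers the wave priority immediately after the final
  // export of a run and, when more work follows, waits on the export counter
  // before execution continues at the reduced priority.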
  const SIInstrInfo *TII = ST.getInstrInfo();
  // ...
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
  // ...
bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();

  // Bracket the barrier-arrive with S_WAITCNT_DEPCTR: one wait inserted
  // before the instruction...
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...
  // ...and one inserted immediately after it.
  BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...

  return true;
}
bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
  // Only runs in hazard recognizer mode (pre-emit), not during scheduling.
  if (!IsHazardRecognizerMode)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  const int FlatScrBaseWaitStates = 10;

  bool ReadsFlatScrLo =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);
  bool ReadsFlatScrHi =
      MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);
  // ...
    ReadsFlatScrLo = true;
  // ...
    ReadsFlatScrHi = true;
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  MCRegister Reg;
  DenseSet<const MachineBasicBlock *> Visited;
  auto IsHazardFn = [&Reg, TRI](const MachineInstr &MI) {
    return MI.modifiesRegister(Reg, TRI);
  };

  // Count only SALU/VALU instructions that write an SGPR; anything else does
  // not advance the hazard window.
  auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {
    if (!TII->isSALU(MI) && !TII->isVALU(MI))
      return 0;
    for (const MachineOperand &MO : MI.all_defs()) {
      if (TRI->isSGPRReg(MRI, MO.getReg()))
        return 1;
    }
    return 0;
  };

  auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
      unsigned Wait = MI.getOperand(0).getImm();
      // ...
    }
    return SgprWrites >= FlatScrBaseWaitStates;
  };

  auto IsRegDefHazard = [&](MCRegister R) {
    Reg = R;
    return ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                std::next(MI->getReverseIterator()), 0,
                                IsExpiredFn, Visited,
                                IsSGPRDef) < FlatScrBaseWaitStates;
  };
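
  // Note: because IsSGPRDef is passed as the wait-state counting callback,
  // the "distance" measured here is the number of intervening SALU/VALU
  // instructions that write an SGPR, not the raw instruction count, so only
  // FlatScrBaseWaitStates such writes retire the hazard.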
  // Hazard only if the scratch base is read and the paired SGPR (102 for the
  // low half, 103 for the high half) has a recent, non-constant definition.
  if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||
       !IsRegDefHazard(AMDGPU::SGPR102)) &&
      (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||
       !IsRegDefHazard(AMDGPU::SGPR103)))
    return false;

  // Mitigate by inserting an S_WAITCNT_DEPCTR before the instruction.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      // ...

  return true;
}
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));