89#define DEBUG_TYPE "si-wqm"
98 StateStrict = StateStrictWWM | StateStrictWQM,
105 explicit PrintState(
int State) : State(State) {}
111 static const std::pair<char, const char *> Mapping[] = {
112 std::pair(StateWQM,
"WQM"), std::pair(StateStrictWWM,
"StrictWWM"),
113 std::pair(StateStrictWQM,
"StrictWQM"), std::pair(StateExact,
"Exact")};
114 char State = PS.State;
115 for (
auto M : Mapping) {
116 if (State & M.first) {
133 char MarkedStates = 0;
140 char InitialState = 0;
141 bool NeedsLowering =
false;
153class SIWholeQuadMode {
190 std::vector<WorkItem> &Worklist);
193 std::vector<WorkItem> &Worklist);
195 std::vector<WorkItem> &Worklist);
197 std::vector<WorkItem> &Worklist);
198 char scanInstructions(
MachineFunction &MF, std::vector<WorkItem> &Worklist,
200 void propagateInstruction(
MachineInstr &
MI, std::vector<WorkItem> &Worklist);
215 Register SaveOrig,
char StrictStateNeeded);
218 char NonStrictState,
char CurrentStrictState);
227 bool lowerLiveMaskQueries();
228 bool lowerCopyInstrs();
229 bool lowerKillInstrs(
bool IsWQM);
243 StringRef getPassName()
const override {
return "SI Whole Quad Mode"; }
260char SIWholeQuadModeLegacy::ID = 0;
273 return new SIWholeQuadModeLegacy;
278 for (
const auto &BII : Blocks) {
281 <<
" InNeeds = " << PrintState(BII.second.InNeeds)
282 <<
", Needs = " << PrintState(BII.second.Needs)
283 <<
", OutNeeds = " << PrintState(BII.second.OutNeeds) <<
"\n\n";
286 auto III = Instructions.find(&
MI);
287 if (III != Instructions.end()) {
288 dbgs() <<
" " <<
MI <<
" Needs = " << PrintState(III->second.Needs)
289 <<
", OutNeeds = " << PrintState(III->second.OutNeeds) <<
'\n';
296void SIWholeQuadMode::markInstruction(MachineInstr &
MI,
char Flag,
297 std::vector<WorkItem> &Worklist) {
298 InstrInfo &
II = Instructions[&
MI];
300 assert(!(Flag & StateExact) && Flag != 0);
309 Flag &= ~II.Disabled;
313 if ((
II.Needs & Flag) == Flag)
318 Worklist.emplace_back(&
MI);
322void SIWholeQuadMode::markDefs(
const MachineInstr &
UseMI,
LiveRange &LR,
323 VirtRegOrUnit VRegOrUnit,
unsigned SubReg,
324 char Flag, std::vector<WorkItem> &Worklist) {
334 const LaneBitmask UseLanes =
335 SubReg ?
TRI->getSubRegIndexLaneMask(SubReg)
346 LaneBitmask DefinedLanes;
348 PhiEntry(
const VNInfo *Phi,
unsigned PredIdx, LaneBitmask DefinedLanes)
349 :
Phi(
Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
351 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
353 SmallSet<VisitKey, 4> Visited;
354 LaneBitmask DefinedLanes;
355 unsigned NextPredIdx = 0;
357 const VNInfo *NextValue =
nullptr;
358 const VisitKey
Key(
Value, DefinedLanes);
365 if (
Value->isPHIDef()) {
368 assert(
MBB &&
"Phi-def has no defining MBB");
371 unsigned Idx = NextPredIdx;
374 for (; PI != PE && !NextValue; ++PI, ++Idx) {
376 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
386 assert(
MI &&
"Def has no defining instruction");
391 for (
const MachineOperand &
Op :
MI->all_defs()) {
396 LaneBitmask OpLanes =
398 :
TRI->getSubRegIndexLaneMask(
Op.getSubReg());
399 LaneBitmask Overlap = (UseLanes & OpLanes);
402 HasDef |= Overlap.
any();
405 DefinedLanes |= OpLanes;
409 if ((DefinedLanes & UseLanes) != UseLanes) {
412 if (
const VNInfo *VN = LRQ.
valueIn()) {
413 if (!Visited.
count(VisitKey(VN, DefinedLanes)))
420 markInstruction(*
MI, Flag, Worklist);
423 markInstruction(*
MI, Flag, Worklist);
427 if (!NextValue && !PhiStack.
empty()) {
430 NextValue =
Entry.Phi;
431 NextPredIdx =
Entry.PredIdx;
432 DefinedLanes =
Entry.DefinedLanes;
440void SIWholeQuadMode::markOperand(
const MachineInstr &
MI,
441 const MachineOperand &
Op,
char Flag,
442 std::vector<WorkItem> &Worklist) {
449 case AMDGPU::EXEC_LO:
459 markDefs(
MI, LR, VirtRegOrUnit(
Reg),
Op.getSubReg(), Flag, Worklist);
468 markDefs(
MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
475void SIWholeQuadMode::markInstructionUses(
const MachineInstr &
MI,
char Flag,
476 std::vector<WorkItem> &Worklist) {
477 LLVM_DEBUG(
dbgs() <<
"markInstructionUses " << PrintState(Flag) <<
": "
480 for (
const MachineOperand &Use :
MI.all_uses())
481 markOperand(
MI, Use, Flag, Worklist);
486char SIWholeQuadMode::scanInstructions(
487 MachineFunction &MF, std::vector<WorkItem> &Worklist,
489 char GlobalFlags = 0;
491 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
492 bool HasImplicitDerivatives =
499 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
500 for (MachineBasicBlock *
MBB : RPOT) {
501 BlockInfo &BBI = Blocks[
MBB];
503 for (MachineInstr &
MI : *
MBB) {
504 InstrInfo &III = Instructions[&
MI];
505 unsigned Opcode =
MI.getOpcode();
508 if (
TII->isWQM(Opcode)) {
513 if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
517 markInstructionUses(
MI, StateWQM, Worklist);
518 GlobalFlags |= StateWQM;
520 }
else if (Opcode == AMDGPU::WQM) {
524 LowerToCopyInstrs.insert(&
MI);
525 }
else if (Opcode == AMDGPU::SOFT_WQM) {
526 LowerToCopyInstrs.insert(&
MI);
528 }
else if (Opcode == AMDGPU::STRICT_WWM) {
532 markInstructionUses(
MI, StateStrictWWM, Worklist);
533 GlobalFlags |= StateStrictWWM;
535 }
else if (Opcode == AMDGPU::STRICT_WQM ||
536 TII->isDualSourceBlendEXP(
MI)) {
540 markInstructionUses(
MI, StateStrictWQM, Worklist);
541 GlobalFlags |= StateStrictWQM;
543 if (Opcode == AMDGPU::STRICT_WQM) {
549 BBI.Needs |= StateExact;
550 if (!(BBI.InNeeds & StateExact)) {
551 BBI.InNeeds |= StateExact;
552 Worklist.emplace_back(
MBB);
554 GlobalFlags |= StateExact;
555 III.Disabled = StateWQM | StateStrict;
557 }
else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
558 Opcode == AMDGPU::DS_PARAM_LOAD ||
559 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
560 Opcode == AMDGPU::DS_DIRECT_LOAD) {
563 III.Needs |= StateStrictWQM;
564 GlobalFlags |= StateStrictWQM;
565 }
else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
567 III.Disabled = StateStrict;
568 MachineOperand &Inactive =
MI.getOperand(4);
569 if (Inactive.
isReg()) {
570 if (Inactive.
isUndef() &&
MI.getOperand(3).getImm() == 0)
571 LowerToCopyInstrs.insert(&
MI);
573 markOperand(
MI, Inactive, StateStrictWWM, Worklist);
576 BBI.NeedsLowering =
true;
577 }
else if (
TII->isDisableWQM(
MI)) {
578 BBI.Needs |= StateExact;
579 if (!(BBI.InNeeds & StateExact)) {
580 BBI.InNeeds |= StateExact;
581 Worklist.emplace_back(
MBB);
583 GlobalFlags |= StateExact;
584 III.Disabled = StateWQM | StateStrict;
585 }
else if (Opcode == AMDGPU::SI_PS_LIVE ||
586 Opcode == AMDGPU::SI_LIVE_MASK) {
588 }
else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
589 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
590 Opcode == AMDGPU::SI_DEMOTE_I1) {
592 BBI.NeedsLowering =
true;
593 }
else if (Opcode == AMDGPU::SI_INIT_EXEC ||
594 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
595 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
597 }
else if (WQMOutputs) {
602 for (
const MachineOperand &MO :
MI.defs()) {
605 TRI->hasVectorRegisters(
TRI->getPhysRegBaseClass(
Reg))) {
612 if (
TII->hasUnwantedEffectsWhenEXECEmpty(
MI)) {
613 for (
auto &
Op :
MI.uses()) {
616 if (!
TRI->isVectorRegister(*MRI,
Op.getReg()))
625 markInstruction(
MI, Flags, Worklist);
626 GlobalFlags |=
Flags;
635 if (GlobalFlags & StateWQM) {
636 for (MachineInstr *
MI : SetInactiveInstrs)
637 markInstruction(*
MI, StateWQM, Worklist);
638 for (MachineInstr *
MI : SoftWQMInstrs)
639 markInstruction(*
MI, StateWQM, Worklist);
645void SIWholeQuadMode::propagateInstruction(MachineInstr &
MI,
646 std::vector<WorkItem>& Worklist) {
647 MachineBasicBlock *
MBB =
MI.getParent();
648 InstrInfo
II = Instructions[&
MI];
649 BlockInfo &BI = Blocks[
MBB];
653 if ((
II.OutNeeds & StateWQM) && !(
II.Disabled & StateWQM) &&
654 (
MI.isTerminator() || (
TII->usesVM_CNT(
MI) &&
MI.mayStore()))) {
655 Instructions[&
MI].Needs = StateWQM;
660 if (
II.Needs & StateWQM) {
661 BI.Needs |= StateWQM;
662 if (!(BI.InNeeds & StateWQM)) {
663 BI.InNeeds |= StateWQM;
664 Worklist.emplace_back(
MBB);
669 if (MachineInstr *PrevMI =
MI.getPrevNode()) {
670 char InNeeds = (
II.Needs & ~StateStrict) |
II.OutNeeds;
671 if (!PrevMI->isPHI()) {
672 InstrInfo &PrevII = Instructions[PrevMI];
673 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
674 PrevII.OutNeeds |= InNeeds;
675 Worklist.emplace_back(PrevMI);
684 markInstructionUses(
MI,
II.Needs, Worklist);
688 if (
II.Needs & StateStrictWWM)
689 BI.Needs |= StateStrictWWM;
690 if (
II.Needs & StateStrictWQM)
691 BI.Needs |= StateStrictWQM;
694void SIWholeQuadMode::propagateBlock(MachineBasicBlock &
MBB,
695 std::vector<WorkItem>& Worklist) {
696 BlockInfo BI = Blocks[&
MBB];
701 InstrInfo &LastII = Instructions[LastMI];
702 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
703 LastII.OutNeeds |= BI.OutNeeds;
704 Worklist.emplace_back(LastMI);
710 BlockInfo &PredBI = Blocks[Pred];
711 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
714 PredBI.OutNeeds |= BI.InNeeds;
715 PredBI.InNeeds |= BI.InNeeds;
716 Worklist.emplace_back(Pred);
721 BlockInfo &SuccBI = Blocks[Succ];
722 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
725 SuccBI.InNeeds |= BI.OutNeeds;
726 Worklist.emplace_back(Succ);
730char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
731 std::vector<WorkItem> Worklist;
733 char GlobalFlags = scanInstructions(MF, Worklist, ExeczSideEffectInstrs);
735 while (!Worklist.empty()) {
736 WorkItem WI = Worklist.back();
740 propagateInstruction(*WI.MI, Worklist);
742 propagateBlock(*WI.MBB, Worklist);
744 if (Worklist.empty()) {
750 for (
auto *
MI : ExeczSideEffectInstrs) {
751 InstrInfo
II = Instructions[
MI];
752 if (
II.OutNeeds & StateWQM)
753 markInstructionUses(*
MI, StateWQM, Worklist);
757 ExeczSideEffectInstrs.clear();
765SIWholeQuadMode::saveSCC(MachineBasicBlock &
MBB,
772 MachineInstr *Restore =
783void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
784 MachineBasicBlock *BB = TermMI->
getParent();
788 MachineBasicBlock *SplitBB =
789 BB->
splitAt(*TermMI,
true, LIS);
793 unsigned NewOpcode = 0;
795 case AMDGPU::S_AND_B32:
796 NewOpcode = AMDGPU::S_AND_B32_term;
798 case AMDGPU::S_AND_B64:
799 NewOpcode = AMDGPU::S_AND_B64_term;
801 case AMDGPU::S_MOV_B32:
802 NewOpcode = AMDGPU::S_MOV_B32_term;
804 case AMDGPU::S_MOV_B64:
805 NewOpcode = AMDGPU::S_MOV_B64_term;
807 case AMDGPU::S_ANDN2_B32:
808 NewOpcode = AMDGPU::S_ANDN2_B32_term;
810 case AMDGPU::S_ANDN2_B64:
811 NewOpcode = AMDGPU::S_ANDN2_B64_term;
825 for (MachineBasicBlock *Succ : SplitBB->
successors()) {
826 DTUpdates.
push_back({DomTreeT::Insert, SplitBB, Succ});
827 DTUpdates.
push_back({DomTreeT::Delete, BB, Succ});
829 DTUpdates.
push_back({DomTreeT::Insert, BB, SplitBB});
837MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &
MI) {
852 switch (
MI.getOperand(2).getImm()) {
854 Opcode = AMDGPU::V_CMP_LG_F32_e64;
857 Opcode = AMDGPU::V_CMP_GE_F32_e64;
860 Opcode = AMDGPU::V_CMP_GT_F32_e64;
863 Opcode = AMDGPU::V_CMP_LE_F32_e64;
866 Opcode = AMDGPU::V_CMP_LT_F32_e64;
869 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
872 Opcode = AMDGPU::V_CMP_O_F32_e64;
875 Opcode = AMDGPU::V_CMP_U_F32_e64;
879 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
883 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
887 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
891 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
895 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
899 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
905 MachineBasicBlock &
MBB = *
MI.getParent();
908 MachineInstr *VcmpMI;
909 const MachineOperand &Op0 =
MI.getOperand(0);
910 const MachineOperand &Op1 =
MI.getOperand(1);
926 MachineInstr *MaskUpdateMI =
933 MachineInstr *EarlyTermMI =
936 MachineInstr *ExecMaskMI =
954MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &
MI,
bool IsWQM) {
957 MachineBasicBlock &
MBB = *
MI.getParent();
960 MachineInstr *MaskUpdateMI =
nullptr;
962 const bool IsDemote = IsWQM && (
MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
963 const MachineOperand &
Op =
MI.getOperand(0);
964 int64_t KillVal =
MI.getOperand(1).getImm();
965 MachineInstr *ComputeKilledMaskMI =
nullptr;
971 if (
Op.getImm() == KillVal) {
978 bool IsLastTerminator = std::next(
MI.getIterator()) ==
MBB.
end();
979 if (!IsLastTerminator) {
1011 MachineInstr *EarlyTermMI =
1016 MachineInstr *NewTerm;
1017 MachineInstr *WQMMaskMI =
nullptr;
1032 }
else if (!IsWQM) {
1050 if (ComputeKilledMaskMI)
1073void SIWholeQuadMode::lowerBlock(MachineBasicBlock &
MBB, BlockInfo &BI) {
1074 if (!BI.NeedsLowering)
1079 SmallVector<MachineInstr *, 4> SplitPoints;
1081 char State = BI.InitialState;
1085 auto MIState = StateTransition.find(&
MI);
1086 if (MIState != StateTransition.end())
1087 State = MIState->second;
1089 MachineInstr *SplitPoint =
nullptr;
1090 switch (
MI.getOpcode()) {
1091 case AMDGPU::SI_DEMOTE_I1:
1092 case AMDGPU::SI_KILL_I1_TERMINATOR:
1093 SplitPoint = lowerKillI1(
MI, State == StateWQM);
1095 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1096 SplitPoint = lowerKillF32(
MI);
1098 case AMDGPU::ENTER_STRICT_WWM:
1099 ActiveLanesReg =
MI.getOperand(0).getReg();
1101 case AMDGPU::EXIT_STRICT_WWM:
1104 case AMDGPU::V_SET_INACTIVE_B32:
1105 if (ActiveLanesReg) {
1106 LiveInterval &LI = LIS->
getInterval(
MI.getOperand(5).getReg());
1108 MI.getOperand(5).setReg(ActiveLanesReg);
1111 assert(State == StateExact || State == StateWQM);
1122 for (MachineInstr *
MI : SplitPoints)
1142 SlotIndex FirstIdx = FirstNonDbg != MBBE
1147 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1148 const LiveRange::Segment *S;
1157 if (
Next < FirstIdx)
1162 assert(EndMI &&
"Segment does not end on valid instruction");
1186 bool IsExecDef =
false;
1187 for (
const MachineOperand &MO :
MBBI->all_defs()) {
1189 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1203void SIWholeQuadMode::toExact(MachineBasicBlock &
MBB,
1208 bool IsTerminator = Before ==
MBB.
end();
1209 if (!IsTerminator) {
1211 if (FirstTerm !=
MBB.
end()) {
1214 IsTerminator = BeforeIdx > FirstTermIdx;
1234 StateTransition[
MI] = StateExact;
1237void SIWholeQuadMode::toWQM(MachineBasicBlock &
MBB,
1252 StateTransition[
MI] = StateWQM;
1255void SIWholeQuadMode::toStrictMode(MachineBasicBlock &
MBB,
1257 Register SaveOrig,
char StrictStateNeeded) {
1260 assert(StrictStateNeeded == StateStrictWWM ||
1261 StrictStateNeeded == StateStrictWQM);
1265 if (StrictStateNeeded == StateStrictWWM) {
1273 StateTransition[
MI] = StrictStateNeeded;
1276void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &
MBB,
1278 Register SavedOrig,
char NonStrictState,
1279 char CurrentStrictState) {
1283 assert(CurrentStrictState == StateStrictWWM ||
1284 CurrentStrictState == StateStrictWQM);
1288 if (CurrentStrictState == StateStrictWWM) {
1298 StateTransition[
MI] = NonStrictState;
1301void SIWholeQuadMode::processBlock(MachineBasicBlock &
MBB, BlockInfo &BI,
1305 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1306 BI.InitialState = StateWQM;
1315 bool WQMFromExec = IsEntry;
1316 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1317 char NonStrictState = 0;
1318 const TargetRegisterClass *BoolRC =
TRI->getBoolRC();
1323 if (
II != IE &&
II->getOpcode() == AMDGPU::COPY &&
1324 II->getOperand(1).getReg() == LMC.
ExecReg)
1339 BI.InitialState = State;
1341 for (
unsigned Idx = 0;; ++Idx) {
1343 char Needs = StateExact | StateWQM;
1349 if (FirstStrict == IE)
1353 if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1359 MachineInstr &
MI = *
II;
1361 if (
MI.isTerminator() ||
TII->mayReadEXEC(*MRI,
MI)) {
1362 auto III = Instructions.find(&
MI);
1363 if (III != Instructions.end()) {
1364 if (III->second.Needs & StateStrictWWM)
1365 Needs = StateStrictWWM;
1366 else if (III->second.Needs & StateStrictWQM)
1367 Needs = StateStrictWQM;
1368 else if (III->second.Needs & StateWQM)
1371 Needs &= ~III->second.Disabled;
1372 OutNeeds = III->second.OutNeeds;
1377 Needs = StateExact | StateWQM | StateStrict;
1381 if (
MI.isBranch() && OutNeeds == StateExact)
1387 if (BI.OutNeeds & StateWQM)
1389 else if (BI.OutNeeds == StateExact)
1392 Needs = StateWQM | StateExact;
1396 if (!(Needs & State)) {
1398 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1399 State == StateStrictWQM || Needs == StateStrictWQM) {
1401 First = FirstStrict;
1408 bool SaveSCC =
false;
1411 case StateStrictWWM:
1412 case StateStrictWQM:
1416 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1420 SaveSCC = !(Needs & StateWQM);
1426 char StartState = State & StateStrict ? NonStrictState : State;
1428 StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
1429 bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
1430 !(Needs & StateExact);
1431 bool PreferLast = Needs == StateWQM;
1436 if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
1438 if (
TII->hasUnwantedEffectsWhenEXECEmpty(*
I)) {
1439 PreferLast = WQMToExact;
1445 prepareInsertion(
MBB,
First,
II, PreferLast, SaveSCC);
1447 if (State & StateStrict) {
1448 assert(State == StateStrictWWM || State == StateStrictWQM);
1449 assert(SavedNonStrictReg);
1450 fromStrictMode(
MBB, Before, SavedNonStrictReg, NonStrictState, State);
1453 SavedNonStrictReg = 0;
1454 State = NonStrictState;
1457 if (Needs & StateStrict) {
1458 NonStrictState = State;
1459 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1460 assert(!SavedNonStrictReg);
1463 toStrictMode(
MBB, Before, SavedNonStrictReg, Needs);
1467 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1472 toExact(
MBB, Before, SavedWQMReg);
1474 }
else if (ExactToWQM) {
1475 assert(WQMFromExec == (SavedWQMReg == 0));
1477 toWQM(
MBB, Before, SavedWQMReg);
1493 if (Needs != (StateExact | StateWQM | StateStrict)) {
1494 if (Needs != (StateExact | StateWQM))
1505 assert(!SavedNonStrictReg);
1508bool SIWholeQuadMode::lowerLiveMaskQueries() {
1509 for (MachineInstr *
MI : LiveMaskQueries) {
1513 MachineInstr *
Copy =
1518 MI->eraseFromParent();
1520 return !LiveMaskQueries.empty();
1523bool SIWholeQuadMode::lowerCopyInstrs() {
1524 for (MachineInstr *
MI : LowerToMovInstrs) {
1525 assert(
MI->getNumExplicitOperands() == 2);
1529 const TargetRegisterClass *regClass =
1530 TRI->getRegClassForOperandReg(*MRI,
MI->getOperand(0));
1531 if (
TRI->isVGPRClass(regClass)) {
1532 const unsigned MovOp =
TII->getMovOpcode(regClass);
1533 MI->setDesc(
TII->get(MovOp));
1537 assert(
any_of(
MI->implicit_operands(), [](
const MachineOperand &MO) {
1538 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1544 if (
MI->getOperand(0).isEarlyClobber()) {
1546 MI->getOperand(0).setIsEarlyClobber(
false);
1549 int Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1550 while (Index >= 0) {
1551 MI->removeOperand(Index);
1552 Index =
MI->findRegisterUseOperandIdx(AMDGPU::EXEC,
nullptr);
1554 MI->setDesc(
TII->get(AMDGPU::COPY));
1558 for (MachineInstr *
MI : LowerToCopyInstrs) {
1561 if (
MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
1562 assert(
MI->getNumExplicitOperands() == 6);
1564 LiveInterval *RecomputeLI =
nullptr;
1565 if (
MI->getOperand(4).isReg())
1566 RecomputeLI = &LIS->
getInterval(
MI->getOperand(4).getReg());
1568 MI->removeOperand(5);
1569 MI->removeOperand(4);
1570 MI->removeOperand(3);
1571 MI->removeOperand(1);
1576 assert(
MI->getNumExplicitOperands() == 2);
1579 unsigned CopyOp =
MI->getOperand(1).isReg()
1580 ? (unsigned)AMDGPU::COPY
1581 :
TII->getMovOpcode(
TRI->getRegClassForOperandReg(
1582 *MRI,
MI->getOperand(0)));
1583 MI->setDesc(
TII->get(CopyOp));
1586 return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
1589bool SIWholeQuadMode::lowerKillInstrs(
bool IsWQM) {
1590 for (MachineInstr *
MI : KillInstrs) {
1591 MachineInstr *SplitPoint =
nullptr;
1592 switch (
MI->getOpcode()) {
1593 case AMDGPU::SI_DEMOTE_I1:
1594 case AMDGPU::SI_KILL_I1_TERMINATOR:
1595 SplitPoint = lowerKillI1(*
MI, IsWQM);
1597 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1598 SplitPoint = lowerKillF32(*
MI);
1604 return !KillInstrs.empty();
1607void SIWholeQuadMode::lowerInitExec(MachineInstr &
MI) {
1608 MachineBasicBlock *
MBB =
MI.getParent();
1610 if (
MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1612 "init whole wave not in entry block");
1625 MI.eraseFromParent();
1634 if (
MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
1638 .
addImm(
MI.getOperand(0).getImm());
1643 MI.eraseFromParent();
1654 Register InputReg =
MI.getOperand(0).getReg();
1655 MachineInstr *FirstMI = &*
MBB->
begin();
1657 MachineInstr *DefInstr = MRI->
getVRegDef(InputReg);
1660 if (DefInstr != FirstMI) {
1679 auto BfeMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_BFE_U32), CountReg)
1681 .
addImm((
MI.getOperand(1).getImm() & Mask) | 0x70000);
1685 auto CmpMI =
BuildMI(*
MBB, FirstMI,
DL,
TII->get(AMDGPU::S_CMP_EQ_U32))
1686 .
addReg(CountReg, RegState::Kill)
1692 MI.eraseFromParent();
1697 MI.eraseFromParent();
1712SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry,
bool &
Changed) {
1715 for (MachineInstr *
MI : InitExecInstrs) {
1719 if (
MI->getParent() == &Entry)
1720 InsertPt = std::next(
MI->getIterator());
1729bool SIWholeQuadMode::run(MachineFunction &MF) {
1731 <<
" ------------- \n");
1734 Instructions.clear();
1736 LiveMaskQueries.clear();
1737 LowerToCopyInstrs.clear();
1738 LowerToMovInstrs.clear();
1740 InitExecInstrs.clear();
1741 SetInactiveInstrs.
clear();
1742 StateTransition.clear();
1753 const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
1754 const bool HasWaveModes = GlobalFlags & ~StateExact;
1755 const bool HasKills = !KillInstrs.empty();
1756 const bool UsesWQM = GlobalFlags & StateWQM;
1757 if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
1768 for (MachineInstr *
MI : SetInactiveInstrs) {
1769 if (LowerToCopyInstrs.contains(
MI))
1771 auto &
Info = Instructions[
MI];
1772 if (
Info.MarkedStates & StateStrict) {
1773 Info.Needs |= StateStrictWWM;
1774 Info.Disabled &= ~StateStrictWWM;
1775 Blocks[
MI->getParent()].Needs |= StateStrictWWM;
1778 LowerToCopyInstrs.insert(
MI);
1784 Changed |= lowerLiveMaskQueries();
1787 if (!HasWaveModes) {
1789 Changed |= lowerKillInstrs(
false);
1790 }
else if (GlobalFlags == StateWQM) {
1796 lowerKillInstrs(
true);
1800 if (GlobalFlags & StateWQM)
1801 Blocks[&
Entry].InNeeds |= StateWQM;
1803 for (
auto &BII : Blocks)
1804 processBlock(*BII.first, BII.second, BII.first == &Entry);
1806 for (
auto &BII : Blocks)
1807 lowerBlock(*BII.first, BII.second);
1812 if (LiveMaskReg != LMC.
ExecReg)
1821 if (!KillInstrs.empty() || !InitExecInstrs.empty())
1827bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
1828 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
1829 auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
1830 MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() :
nullptr;
1832 getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
1833 MachinePostDominatorTree *PDT =
1834 PDTWrapper ? &PDTWrapper->getPostDomTree() :
nullptr;
1835 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
1836 return Impl.run(MF);
1849 SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static void analyzeFunction(Function &Fn, const DataLayout &Layout, FunctionVarLocsBuilder *FnVarLocs)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT, MachineLoopInfo *MLI)
SI Optimize VGPR LiveRange
unsigned getWavefrontSize() const
const unsigned AndSaveExecTermOpc
const unsigned AndTermOpc
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned OrSaveExecOpc
const unsigned AndSaveExecOpc
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
FunctionPass class - This class is used to implement most global optimizations.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
LLVM_ABI void handleMove(MachineInstr &MI, bool UpdateFlags=false)
Call this method to notify LiveIntervals that instruction MI has been moved within a basic block.
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
LLVM_ABI bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
This class represents the liveness of a register, stack slot, etc.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
VNInfo * getVNInfoBefore(SlotIndex Idx) const
getVNInfoBefore - Return the VNInfo that is live up to but not necessarily including Idx,...
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
An RAII based helper class to modify MachineFunctionProperties when running pass.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
succ_iterator succ_begin()
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned succ_size() const
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
pred_iterator pred_begin()
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
instr_iterator instr_end()
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Properties which a MachineFunction may have at a given point in time.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void dump() const
dump - Print the current MachineFunction to cerr, useful for debugger use.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineBasicBlock & front() const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI MachineInstr * removeFromParent()
Unlink 'this' from the containing basic block, and return it without deleting it.
const MachineBasicBlock * getParent() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
This class implements a map that also provides access to all stored values in a deterministic order.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Wrapper class representing virtual and physical registers.
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
A SetVector that performs no allocations if smaller than a certain size.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Wrapper class representing a virtual register or register unit.
constexpr bool isVirtualReg() const
constexpr Register asVirtualReg() const
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char WavefrontSize[]
Key for Kernel::CodeProps::Metadata::mWavefrontSize.
LLVM_READONLY int32_t getVOPe32(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
FunctionAddr VTableAddr Value
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
DominatorTreeBase< T, false > DomTreeBase
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionPass * createSIWholeQuadModeLegacyPass()
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
@ Disabled
Don't do any conversion of .debug_str_offsets tables.
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
WorkItem(const BasicBlock *BB, int St)
static constexpr LaneBitmask getAll()
constexpr bool any() const
static constexpr LaneBitmask getNone()