#define DEBUG_TYPE "si-wqm"

enum {
  StateWQM = 0x1,
  StateStrictWWM = 0x2,
  StateStrictWQM = 0x4,
  StateExact = 0x8,
  StateStrict = StateStrictWWM | StateStrictWQM,
};

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};
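
// Pretty-printer so debug output can render a state mask as, for example,
// "WQM|StrictWWM" rather than a raw integer.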
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
  for (auto M : Mapping) {
    if (State & M.first) {
      OS << M.second;
      State &= ~M.first;
      if (State)
        OS << '|';
    }
  }
  assert(State == 0);
  return OS;
}
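
// Dataflow state tracked per instruction and per block: which execution
// modes are required (Needs), which are forbidden (Disabled), and what must
// hold on entry/exit (InNeeds/OutNeeds). WorkItem is the tagged union used
// by the propagation worklist.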
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
  char MarkedStates = 0;
};

struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
  char InitialState = 0;
  bool NeedsLowering = false;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode {
  // ... (analysis members such as TII, TRI, MRI, LIS, the dominator trees,
  // and the per-function maps used below)

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR,
                VirtRegOrUnit VRegOrUnit, unsigned SubReg, char Flag,
                std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);

  // ...

  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  // ...

  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);

  // ...
};

class SIWholeQuadModeLegacy : public MachineFunctionPass {
public:
  static char ID;

  // ...

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  // ...
};

char SIWholeQuadModeLegacy::ID = 0;

FunctionPass *llvm::createSIWholeQuadModeLegacyPass() {
  return new SIWholeQuadModeLegacy;
}

LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
      }
    }
  }
}
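
// Record that MI needs the modes in Flag; if this changes the instruction's
// requirements, queue it so the demand propagates to its inputs.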
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  // Remember every state that was ever requested, even if disabled below.
  II.MarkedStates |= Flag;

  // Remove any disabled states from the flag: a user that required a state
  // this instruction disables simply gets undefined values in helper lanes.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
  II.Needs |= Flag;
  Worklist.emplace_back(&MI);
}
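
// Mark all instructions defining the uses in UseMI with Flag, walking the
// live-range value graph backwards through phis and partial (subregister)
// definitions until every used lane is accounted for.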
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               VirtRegOrUnit VRegOrUnit, unsigned SubReg,
                               char Flag, std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely cover
  // registers.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (VRegOrUnit.isVirtualReg()
                    ? MRI->getMaxLaneMaskForVReg(VRegOrUnit.asVirtualReg())
                    : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;
    unsigned PredIdx;
    LaneBitmask DefinedLanes;

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (Visited.insert(Key).second) {
      // On first visit to a phi, start processing at the first predecessor.
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph.
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find the next predecessor to process.
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process, push the phi back on the
      // stack so the walk resumes here later.
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (VRegOrUnit.isVirtualReg()) {
        // Iterate over all operands to find relevant definitions.
        bool HasDef = false;
        for (const MachineOperand &Op : MI->all_defs()) {
          if (Op.getReg() != VRegOrUnit.asVirtualReg())
            continue;

          // Compute lanes defined and overlap with use.
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any part of the use.
          HasDef |= Overlap.any();

          // Mark any lanes defined.
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of the use have been defined.
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process the input value.
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use.
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction.
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
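
// Mark the instructions defining operand Op of MI. Virtual registers are
// traced through their live interval; tracked physical registers (notably
// VCC feeding a uniform branch) are traced per register unit.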
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers.
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, VirtRegOrUnit(Reg), Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track, mostly VCC, which can
    // appear as the (implicit) input of a uniform branch, e.g. when a loop
    // counter is stored in a VGPR.
    for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      if (Value)
        markDefs(MI, LR, VirtRegOrUnit(Unit), AMDGPU::NoSubRegister, Flag,
                 Worklist);
    }
  }
}
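
// Mark the defining instructions of every register use of MI.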
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.all_uses())
    markOperand(MI, Use, Flag, Worklist);
}
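
// Single scan over all instructions that seeds the worklist: WQM/WWM
// sources, kills, live-mask queries, and exec-initialization pseudos are
// recorded, and the set of globally required states is accumulated.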
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // Visit the basic blocks in reverse post-order so that defs are seen before
  // uses; otherwise an instruction could be marked as needing e.g. WQM before
  // we visit it and discover it needs WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported, WQM is not needed. Only generate implicit
        // WQM if implicit derivatives are required; this avoids inserting
        // unintended WQM if a shader type without implicit derivatives uses
        // an image sampling instruction.
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          // Sampling instructions don't need to produce results for all
          // pixels in a quad; they just require all inputs of a quad to have
          // been computed for derivatives.
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
        }
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
        SoftWQMInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // STRICT_WWM needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave it enables all threads in quads that have at
        // least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;

        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        } else {
          // Dual source blend export acts as an implicit strict-wqm: its
          // sources need to be shuffled in strict WQM, but the export itself
          // needs to run in exact mode.
          BBI.Needs |= StateExact;
          if (!(BBI.InNeeds & StateExact)) {
            BBI.InNeeds |= StateExact;
            Worklist.emplace_back(MBB);
          }
          GlobalFlags |= StateExact;
          III.Disabled = StateWQM | StateStrict;
        }
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        // Mark these StrictWQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
        // Disable strict states; StrictWQM will be added as required later.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(4);
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        // The function is in machine SSA form, which means that physical
        // VGPRs correspond to shader inputs and outputs. Inputs are only
        // used; physical VGPR defs are treated as outputs.
        for (const MachineOperand &MO : MI.defs()) {
          Register Reg = MO.getReg();
          if (Reg.isPhysical() &&
              TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
            Flags = StateWQM;
            break;
          }
        }
      }

      if (Flags) {
        markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM
  // is ever used anywhere in the function, and similarly for SOFT_WQM.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
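
// Propagate one instruction's requirements backwards to the previous
// instruction, up to its block, and into the instructions feeding its uses.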
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // Copy: the map may be reallocated below.
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    LLVM_DEBUG(dbgs() << "propagate WQM " << MI);
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level.
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);
    }
  }

  // Propagate backwards within the block.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);
      }
    }
  }

  // Propagate the WQM flag to instruction inputs.
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
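
// Propagate block-level needs across CFG edges: predecessors must produce
// what this block needs on entry, successors must accept what it emits.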
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Copy to prevent dangling references.

  // Propagate through instructions.
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.emplace_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  }
}
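
// Fixed-point driver: scan once, then drain the worklist until instruction
// and block states stop changing.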
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}
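
// SCC is clobbered by the exec-mask arithmetic inserted by this pass, so
// copy it into a scalar register before the insertion point and back after.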
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}
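
// Split a block after a newly created terminator and rewrite the mask
// operation to its *_term pseudo so later passes keep it in terminator
// position.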
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
  MachineBasicBlock *BB = TermMI->getParent();
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert the last instruction of the block to a terminator.
  // Note: this only covers the expected patterns.
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  case AMDGPU::S_ANDN2_B32:
    NewOpcode = AMDGPU::S_ANDN2_B32_term;
    break;
  case AMDGPU::S_ANDN2_B64:
    NewOpcode = AMDGPU::S_ANDN2_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees.
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->applyUpdates(DTUpdates);
    if (PDT)
      PDT->applyUpdates(DTUpdates);
  }
}
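
// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit the inverted compare (which
// yields the killed lanes in VCC), clear those lanes from the live mask and
// from EXEC, and terminate the wave early if no lanes remain.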
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
  assert(LiveMaskReg.isVirtual());

  unsigned Opcode = 0;

  // The condition selects live lanes, but the inserted V_CMP computes the
  // inverse (killed lanes): V_CMP writes 0 for inactive lanes, so a mask of
  // live lanes would be wrong inside control flow. Swap the operands and
  // invert the condition code accordingly.
  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ: Opcode = AMDGPU::V_CMP_LG_F32_e64; break;
  case ISD::SETUGT: Opcode = AMDGPU::V_CMP_GE_F32_e64; break;
  case ISD::SETUGE: Opcode = AMDGPU::V_CMP_GT_F32_e64; break;
  case ISD::SETULT: Opcode = AMDGPU::V_CMP_LE_F32_e64; break;
  case ISD::SETULE: Opcode = AMDGPU::V_CMP_LT_F32_e64; break;
  case ISD::SETUNE: Opcode = AMDGPU::V_CMP_EQ_F32_e64; break;
  case ISD::SETO:   Opcode = AMDGPU::V_CMP_O_F32_e64; break;
  case ISD::SETUO:  Opcode = AMDGPU::V_CMP_U_F32_e64; break;
  case ISD::SETOEQ:
  case ISD::SETEQ:  Opcode = AMDGPU::V_CMP_NEQ_F32_e64; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Opcode = AMDGPU::V_CMP_NLT_F32_e64; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Opcode = AMDGPU::V_CMP_NLE_F32_e64; break;
  case ISD::SETOLT:
  case ISD::SETLT:  Opcode = AMDGPU::V_CMP_NGT_F32_e64; break;
  case ISD::SETOLE:
  case ISD::SETLE:  Opcode = AMDGPU::V_CMP_NGE_F32_e64; break;
  case ISD::SETONE:
  case ISD::SETNE:  Opcode = AMDGPU::V_CMP_NLG_F32_e64; break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
  const unsigned AndN2Opc =
      ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

  // VCC receives the mask of killed lanes.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // Clear the killed lanes from the live mask; s_andn2 also sets SCC, so the
  // early-terminate pseudo below can test whether any lanes remain.
  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // If SCC is zero then no lanes remain live.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LMC.ExecReg)
          .addReg(LMC.ExecReg)
          .addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals.
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
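
// Lower SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1. Static kills either kill all
// active lanes or fold away entirely; dynamic kills update the live mask and
// EXEC, with demotes keeping whole quads alive while any lane in them lives.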
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
  assert(LiveMaskReg.isVirtual());

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  const unsigned AndOpc =
      ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2Opc =
      ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed.
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(LMC.ExecReg);
    } else {
      // Static: the kill does nothing. Remove it, preserving the terminator
      // structure of the block.
      bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
      if (!IsLastTerminator) {
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        assert(MBB.succ_size() == 1);
        MachineInstr *NewTerm =
            BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return nullptr;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after the kill, so the exec mask needs to
      // be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg)
                                .addReg(LMC.ExecReg)
                                .add(Op);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents the lanes to kill directly.
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // The state of SCC represents whether any lanes are live in the mask; if
  // SCC is 0 then no lanes will remain alive.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // Compute the new exec mask.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demote: deactivate quads with only helper lanes.
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI = BuildMI(MBB, MI, DL,
                        TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                                : AMDGPU::S_WQM_B64),
                        LiveMaskWQM)
                    .addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), LMC.ExecReg)
                  .addReg(LMC.ExecReg)
                  .addReg(LiveMaskWQM);
  } else {
    // Kill: deactivate lanes no longer in the live mask.
    if (Op.isImm()) {
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), LMC.ExecReg).addImm(0);
    } else if (!IsWQM) {
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), LMC.ExecReg)
                    .addReg(LMC.ExecReg)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(Opcode), LMC.ExecReg)
                    .addReg(LMC.ExecReg)
                    .add(Op);
    }
  }

  // Update live intervals.
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
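
// Second lowering pass over a block, run once the final state transitions
// are known; kills are expanded here and blocks split after new terminators.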
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
  if (!BI.NeedsLowering)
    return;

  LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");

  SmallVector<MachineInstr *, 4> SplitPoints;
  Register ActiveLanesReg = 0;
  char State = BI.InitialState;

  for (MachineInstr &MI : llvm::make_early_inc_range(
           llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
    auto MIState = StateTransition.find(&MI);
    if (MIState != StateTransition.end())
      State = MIState->second;

    MachineInstr *SplitPoint = nullptr;
    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:
      ActiveLanesReg = 0;
      break;
    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        LIS->shrinkToUses(&LI);
      } else {
        assert(State == StateExact || State == StateWQM);
      }
      break;
    default:
      break;
    }
    if (SplitPoint)
      SplitPoints.push_back(SplitPoint);
  }

  // Perform splitting after the scan to simplify iteration.
  for (MachineInstr *MI : SplitPoints)
    splitBlock(MI);
}
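
// Return an iterator in the (inclusive) range [First, Last] at which state
// transition instructions can be inserted, saving and restoring SCC around
// the point if it is live there.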
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI =
          LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move the insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->all_defs()) {
      IsExecDef |=
          MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
    }
    if (!IsExecDef)
      break;
    ++MBBI;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
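
// Enter exact mode, optionally saving the current (WQM) exec mask in SaveWQM
// so it can be re-entered later.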
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              Register SaveWQM) {
  bool IsTerminator = Before == MBB.end();
  if (!IsTerminator) {
    auto FirstTerm = MBB.getFirstTerminator();
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
      IsTerminator = BeforeIdx > FirstTermIdx;
    }
  }

  MachineInstr *MI;
  if (SaveWQM) {
    unsigned Opcode =
        IsTerminator ? LMC.AndSaveExecTermOpc : LMC.AndSaveExecOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Opcode = IsTerminator ? LMC.AndTermOpc : LMC.AndOpc;
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), LMC.ExecReg)
             .addReg(LMC.ExecReg)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;
}

void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            Register SavedWQM) {
  MachineInstr *MI;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), LMC.ExecReg)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                         : AMDGPU::S_WQM_B64),
                 LMC.ExecReg)
             .addReg(LMC.ExecReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
}

void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator Before,
                                   Register SaveOrig, char StrictStateNeeded) {
  MachineInstr *MI;
  assert(SaveOrig);
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM)
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
                 SaveOrig)
             .addImm(-1);
  else
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
                 SaveOrig)
             .addImm(-1);

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;
}

void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Before,
                                     Register SavedOrig, char NonStrictState,
                                     char CurrentStrictState) {
  MachineInstr *MI;
  assert(SavedOrig);
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM)
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
                 LMC.ExecReg)
             .addReg(SavedOrig);
  else
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
                 LMC.ExecReg)
             .addReg(SavedOrig);

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
}
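
// Walk one block and insert the minimal EXEC transitions between Exact, WQM,
// and the strict modes so that every instruction runs in a state it allows.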
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
                                   bool IsEntry) {
  // This is a non-entry block that is WQM throughout and does not feed an
  // exact-mode successor, so nothing needs to change.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask.
    if (II != IE && II->getOpcode() == AMDGPU::COPY &&
        II->getOperand(1).getReg() == LMC.ExecReg)
      ++II;
  }

  // The first instruction where it is safe to switch from WQM to Exact or
  // vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // The first instruction where it is safe to switch to or from Strict mode;
  // always the same as, or after, FirstWQM.
  MachineBasicBlock::iterator FirstStrict = IE;

  BI.InitialState = State;

  for (unsigned Idx = 0;; ++Idx) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // Adjust needs if this is the first instruction of the entry block.
    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
      Needs = StateWQM;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we
        // can safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Exact mode exit can occur in terminators, but must be before branches.
      if (MI.isBranch() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block.
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM comes from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
      }

      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;
      // Exact regions in divergent control flow may run with EXEC=0, so try
      // to exclude instructions with unwanted effects from them.
      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
        for (MachineBasicBlock::iterator I = First; I != II; ++I) {
          if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
            PreferLast = WQMToExact;
            break;
          }
        }
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);

      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;
      } else {
        if (WQMToExact) {
          // Save WQM if needed to re-enter it later.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (ExactToWQM) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, in which
          // case there is nothing to do.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }

  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
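
// Replace SI_PS_LIVE / SI_LIVE_MASK queries with a copy of the live mask.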
bool SIWholeQuadMode::lowerLiveMaskQueries() {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
  return !LiveMaskQueries.empty();
}
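
// Rewrite WQM/SOFT_WQM/STRICT_* markers and simplified V_SET_INACTIVE_B32
// instructions into plain moves or copies now that placement is decided.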
bool SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();

    const TargetRegisterClass *regClass =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // Check that it already implicitly depends on exec, like all VALU movs
      // should do.
      assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
        return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
      }));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }

  for (MachineInstr *MI : LowerToCopyInstrs) {
    LLVM_DEBUG(dbgs() << "simplify: " << *MI);

    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      LiveInterval *RecomputeLI = nullptr;
      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

      if (RecomputeLI)
        LIS->shrinkToUses(RecomputeLI);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    unsigned CopyOp = MI->getOperand(1).isReg()
                          ? (unsigned)AMDGPU::COPY
                          : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                *MRI, MI->getOperand(0)));
    MI->setDesc(TII->get(CopyOp));
    LLVM_DEBUG(dbgs() << " -> " << *MI);
  }

  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
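
// Expand all recorded kill pseudos; used on the fast paths where no full
// per-block state lowering is required.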
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
  for (MachineInstr *MI : KillInstrs) {
    MachineInstr *SplitPoint = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MI);
      break;
    default:
      break;
    }
    if (SplitPoint)
      splitBlock(SplitPoint);
  }
  return !KillInstrs.empty();
}
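
// Lower the exec-initialization pseudos SI_INIT_WHOLE_WAVE, SI_INIT_EXEC,
// and SI_INIT_EXEC_FROM_INPUT in the entry block.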
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
  MachineBasicBlock *MBB = MI.getParent();

  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
    assert(MBB == &MBB->getParent()->front() &&
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *SaveExec =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(LMC.OrSaveExecOpc), EntryExec)
            .addImm(-1);

    // Replace all uses of MI's destination reg with EntryExec.
    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);

    MI.eraseFromParent();

    if (LIS) {
      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
    }
    return;
  }

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI =
        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
                TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32
                                        : AMDGPU::S_MOV_B64),
                LMC.ExecReg)
            .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // SI_INIT_EXEC_FROM_INPUT: extract the thread count from an SGPR input and
  // set EXEC accordingly. Since BFM can't shift its way out of the register,
  // the all-lanes case is handled with a compare and conditional move.
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the copy is not the first instruction of the block, move it to
        // the beginning so the sequence below can be inserted after it.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      }
      // ...
    }
  }

  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST->getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  // ... s_bfm of CountReg into EXEC ...
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  // ... conditional move of all-ones into EXEC when count == wavefront size ...

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  // ... register the remaining new instructions and recompute the intervals
  // of InputReg and CountReg ...
}

MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
  MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();

  for (MachineInstr *MI : InitExecInstrs) {
    // Try to handle undefined cases gracefully:
    // - multiple INIT_EXEC instructions
    // - INIT_EXEC instructions placed outside the entry block
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());

    lowerInitExec(*MI);
    Changed = true;
  }

  return InsertPt;
}
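
// Main entry point: analyze the function, then pick the cheapest strategy:
// no wave modes at all, WQM everywhere, or full per-block processing.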
bool SIWholeQuadMode::run(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();

  const char GlobalFlags = analyzeFunction(MF);
  bool Changed = false;

  LiveMaskReg = LMC.ExecReg;

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);

  // Store a copy of the original live mask when required.
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY),
                LiveMaskReg)
            .addReg(LMC.ExecReg);
    LIS->InsertMachineInstrInMaps(*MI);
    Changed = true;
  }

  // Check if V_SET_INACTIVE was touched by a strict state mode: if so,
  // promote it to StrictWWM; otherwise lower it to a simple copy.
  for (MachineInstr *MI : SetInactiveInstrs) {
    if (LowerToCopyInstrs.contains(MI))
      continue;
    auto &Info = Instructions[MI];
    if (Info.MarkedStates & StateStrict) {
      Info.Needs |= StateStrictWWM;
      Info.Disabled &= ~StateStrictWWM;
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
      LowerToCopyInstrs.insert(MI);
    }
  }

  LLVM_DEBUG(printInfo());

  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();

  if (!HasWaveModes) {
    // No wave mode execution.
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
    // Shader only needs WQM.
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
                      TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                              : AMDGPU::S_WQM_B64),
                      LMC.ExecReg)
                  .addReg(LMC.ExecReg);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    Changed = true;
  } else {
    // Mark the entry block for WQM if required.
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    // Wave mode switching requires the full lowering pass.
    for (auto &BII : Blocks)
      processBlock(*BII.first, BII.second, BII.first == &Entry);
    // Lowering blocks causes block splitting, so perform as a second pass.
    for (auto &BII : Blocks)
      lowerBlock(*BII.first, BII.second);
    Changed = true;
  }

  // Compute the live range for the live mask.
  if (LiveMaskReg != LMC.ExecReg)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  // If we performed any kills then recompute EXEC.
  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);

  return Changed;
}

bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  MachinePostDominatorTree *PDT =
      PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  return Impl.run(MF);
}

PreservedAnalyses
SIWholeQuadModePass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
  MachineDominatorTree *MDT =
      MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree *PDT =
      MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
  SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
  if (!Impl.run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserve<SlotIndexesAnalysis>();
  PA.preserve<LiveIntervalsAnalysis>();
  return PA;
}