#define DEBUG_TYPE "si-wqm"
  StateStrict = StateStrictWWM | StateStrictWQM,
  explicit PrintState(int State) : State(State) {}
  static const std::pair<char, const char *> Mapping[] = {
      std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
      std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
  char State = PS.State;
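  // Each exec-mask state has a printable name; an instruction may carry
  // several state bits at once, so the loop below walks the mapping and
  // emits one name per bit that is set.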
  for (auto M : Mapping) {
    if (State & M.first) {
  char MarkedStates = 0;

  char InitialState = 0;
  bool NeedsLowering = false;
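  // MarkedStates (per instruction) records every state ever requested for
  // it, including states later masked off as disabled. InitialState caches
  // the exec state on entry to a block, and NeedsLowering flags blocks that
  // contain pseudos (kills, V_SET_INACTIVE) needing a second lowering pass.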
  unsigned AndSaveExecOpc;
  unsigned AndSaveExecTermOpc;
                       std::vector<WorkItem> &Worklist);
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
                   std::vector<WorkItem> &Worklist);
                   std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
                    Register SaveOrig, char StrictStateNeeded);
                      char NonStrictState, char CurrentStrictState);
  bool lowerLiveMaskQueries();
  bool lowerCopyInstrs();
  bool lowerKillInstrs(bool IsWQM);
        MachineFunctionProperties::Property::IsSSA);
char SIWholeQuadMode::ID = 0;
  return new SIWholeQuadMode;
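// This is the object handed back by createSIWholeQuadModePass() when the
// AMDGPU target registers the pass in its pipeline.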
  for (const auto &BII : Blocks) {
    dbgs() << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

      auto III = Instructions.find(&MI);
      if (III != Instructions.end()) {
        dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
               << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
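// markInstruction: request the exec-mask states in Flag for MI. Exact is
// never requested explicitly, disabled states are masked off, and MI is
// requeued only when the request adds a state it did not already have.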
                                      std::vector<WorkItem> &Worklist) {
  assert(!(Flag & StateExact) && Flag != 0);

  Flag &= ~II.Disabled;

  if ((II.Needs & Flag) == Flag)
    return;

  Worklist.emplace_back(&MI);
                              std::vector<WorkItem> &Worklist) {
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)

        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}

  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;

  unsigned NextPredIdx = 0;
  const VNInfo *NextValue = nullptr;
  const VisitKey Key(Value, DefinedLanes);

  if (Visited.insert(Key).second) {
    if (Value->isPHIDef()) {
      assert(MBB && "Phi-def has no defining MBB");
      unsigned Idx = NextPredIdx;
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (!Visited.count(VisitKey(VN, DefinedLanes)))

      assert(MI && "Def has no defining instruction");
      if (Reg.isVirtual()) {
        if (Op.getReg() != Reg)
               : TRI->getSubRegIndexLaneMask(Op.getSubReg());
        HasDef |= Overlap.any();
        DefinedLanes |= OpLanes;

      if ((DefinedLanes & UseLanes) != UseLanes) {
        if (!Visited.count(VisitKey(VN, DefinedLanes)))

      markInstruction(*MI, Flag, Worklist);
    markInstruction(*MI, Flag, Worklist);

  if (!NextValue && !PhiStack.empty()) {
    NextValue = Entry.Phi;
    NextPredIdx = Entry.PredIdx;
    DefinedLanes = Entry.DefinedLanes;
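// markDefs walks the SSA def web for the used lanes of Reg -- following PHI
// values iteratively via an explicit PhiStack rather than recursion, and
// deduplicating (VNInfo, lane-mask) pairs in Visited -- marking every
// contributing def instruction with Flag until all used lanes are covered.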
                                  std::vector<WorkItem> &Worklist) {
  case AMDGPU::EXEC_LO:

  if (Reg.isVirtual()) {
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);

    markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "

    markOperand(MI, Use, Flag, Worklist);
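// scanInstructions: one forward pass over the whole function that seeds the
// worklist. Each instruction is classified as needing WQM, StrictWWM,
// StrictWQM, or Exact, and the pseudos that need dedicated lowering (kills,
// live-mask queries, V_SET_INACTIVE, exec initializers) are collected into
// their respective lists.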
                                          std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool HasImplicitDerivatives =

      unsigned Opcode = MI.getOpcode();

      if (TII->isWQM(Opcode)) {
        if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
          markInstructionUses(MI, StateWQM, Worklist);
          GlobalFlags |= StateWQM;
      } else if (Opcode == AMDGPU::WQM) {
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.insert(&MI);
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::STRICT_WQM ||
                 TII->isDualSourceBlendEXP(MI)) {
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        if (Opcode == AMDGPU::STRICT_WQM) {
          LowerToMovInstrs.push_back(&MI);
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
                 Opcode == AMDGPU::DS_PARAM_LOAD ||
                 Opcode == AMDGPU::LDS_DIRECT_LOAD ||
                 Opcode == AMDGPU::DS_DIRECT_LOAD) {
        III.Needs |= StateStrictWQM;
        GlobalFlags |= StateStrictWQM;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
        III.Disabled = StateStrict;
        if (Inactive.isReg()) {
          if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
            LowerToCopyInstrs.insert(&MI);
          else
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
        }
        SetInactiveInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
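      // V_SET_INACTIVE defines inactive lanes, so Strict mode is disabled
      // for the instruction itself; its inactive-lane input, however, must
      // be computed in StrictWWM, hence the markOperand with StateStrictWWM
      // above.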
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.emplace_back(MBB);
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
      } else if (Opcode == AMDGPU::SI_PS_LIVE ||
                 Opcode == AMDGPU::SI_LIVE_MASK) {
        LiveMaskQueries.push_back(&MI);
      } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                 Opcode == AMDGPU::SI_DEMOTE_I1) {
        KillInstrs.push_back(&MI);
        BBI.NeedsLowering = true;
      } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
        InitExecInstrs.push_back(&MI);
      } else if (WQMOutputs) {
        if (Reg.isPhysical() &&
            TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;

  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : KillInstrs)
      markInstruction(*MI, StateWQM, Worklist);
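// propagateInstruction: backward dataflow step for a single instruction.
// Requirements flow into the previous instruction's OutNeeds and, through
// markInstructionUses, to the defs of this instruction's operands; Strict
// states are deliberately not propagated past the instruction itself.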
                                           std::vector<WorkItem> &Worklist) {
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {

  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.emplace_back(MBB);

    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.emplace_back(PrevMI);

  markInstructionUses(MI, II.Needs, Worklist);

  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
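// propagateBlock: block-level counterpart. A block's OutNeeds feed its last
// instruction and its successors' InNeeds; its InNeeds feed every
// predecessor's OutNeeds, requeueing whatever changed.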
                                     std::vector<WorkItem> &Worklist) {
  if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
    LastII.OutNeeds |= BI.OutNeeds;
    Worklist.emplace_back(LastMI);

    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;
    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.emplace_back(Pred);

    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;
    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.emplace_back(Succ);
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
      propagateInstruction(*WI.MI, Worklist);
      propagateBlock(*WI.MBB, Worklist);
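// analyzeFunction thus runs scanInstructions once and then drains the
// worklist to a fixed point, after which every instruction and block knows
// exactly which exec-mask states it needs.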
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);
      BB->splitAt(*TermMI, /*UpdateLiveIns=*/true, LIS);
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
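  // After the split, the exec-mask update ends the original block, so the
  // plain S_AND/S_MOV is rewritten to its *_term twin, which is allowed to
  // appear in a block's terminator sequence.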
  DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
  DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
  DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
  if (MDT)
    MDT->applyUpdates(DTUpdates);
  if (PDT)
    PDT->applyUpdates(DTUpdates);
  LIS->InsertMachineInstrInMaps(*MI);
  assert(LiveMaskReg.isVirtual());
  switch (MI.getOperand(2).getImm()) {
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
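  // One assignment per ISD floating-point predicate (the case labels were
  // lost in this extraction); each predicate is mapped to the V_CMP_*_F32
  // opcode whose result drives the exec-mask update for the kill.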
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);
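// lowerKillI1 handles boolean kills (SI_KILL_I1_TERMINATOR / SI_DEMOTE_I1).
// The live mask is narrowed with AND or ANDN2 depending on KillVal; a
// demote executed in WQM additionally computes a WQM'd copy of the new live
// mask so helper lanes keep running.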
  assert(LiveMaskReg.isVirtual());

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  int64_t KillVal = MI.getOperand(1).getImm();

    if (Op.getImm() == KillVal) {

  if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
    LIS->RemoveMachineInstrFromMaps(MI);

  LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);

    TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
    ComputeKilledMaskMI =

    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());

  unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned Opcode = KillVal ? AndN2Opc : AndOpc;

  LIS->RemoveMachineInstrFromMaps(MI);

  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  LIS->removeInterval(CndReg);
  LIS->createAndComputeVirtRegInterval(CndReg);
  LIS->createAndComputeVirtRegInterval(TmpReg);
  LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
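// lowerBlock replays a block that was flagged NeedsLowering: starting from
// the recorded InitialState and updating at each recorded StateTransition,
// it lowers kills/demotes in the correct exec state and patches
// V_SET_INACTIVE with the strict-WWM active-lanes mask when one is live.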
  const BlockInfo &BI = BII->second;
  if (!BI.NeedsLowering)
    return;

  char State = BI.InitialState;

    if (StateTransition.count(&MI))
      State = StateTransition[&MI];

    switch (MI.getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(MBB, MI);
      break;
    case AMDGPU::ENTER_STRICT_WWM:
      ActiveLanesReg = MI.getOperand(0).getReg();
      break;
    case AMDGPU::EXIT_STRICT_WWM:
      ActiveLanesReg = 0;
      break;
    case AMDGPU::V_SET_INACTIVE_B32:
      if (ActiveLanesReg) {
        MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
        MI.getOperand(5).setReg(ActiveLanesReg);
        LIS->shrinkToUses(&LI);
      } else {
        assert(State == StateExact || State == StateWQM);

  if (!SplitPoints.empty()) {
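// prepareInsertion scans [First, Last) for a slot where the exec-mask
// manipulation can be placed: SlotIndexes bound the search, and the walk
// avoids points where SCC is live or EXEC is already being redefined.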
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);

    if (Next < FirstIdx)

  assert(EndMI && "Segment does not end on valid instruction");
  SlotIndex Next = LIS->getInstructionIndex(*NextI);
  bool IsExecDef = false;
      MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
  assert(LiveMaskReg.isVirtual());

  if (!IsTerminator) {
    if (FirstTerm != MBB.end()) {
      SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
      IsTerminator = BeforeIdx > FirstTermIdx;

    unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;

    unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateExact;

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StateWQM;
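// toStrictMode and fromStrictMode bracket strict regions: entering saves
// the current exec into SaveOrig and switches to StrictWWM/StrictWQM;
// leaving restores exec and records the transition back to the surrounding
// non-strict state.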
                                    Register SaveOrig, char StrictStateNeeded) {
  assert(StrictStateNeeded == StateStrictWWM ||
         StrictStateNeeded == StateStrictWQM);

  if (StrictStateNeeded == StateStrictWWM) {

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = StrictStateNeeded;

                                      Register SavedOrig, char NonStrictState,
                                      char CurrentStrictState) {
  assert(CurrentStrictState == StateStrictWWM ||
         CurrentStrictState == StateStrictWQM);

  if (CurrentStrictState == StateStrictWWM) {

  LIS->InsertMachineInstrInMaps(*MI);
  StateTransition[MI] = NonStrictState;
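// processBlock walks one block forward, tracking the current exec state and
// the set of states each instruction tolerates, and emits the minimal
// Exact/WQM/Strict transitions at insertion points chosen by
// prepareInsertion.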
  BlockInfo &BI = BII->second;

  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;

  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;

  if (II != IE && II->getOpcode() == AMDGPU::COPY &&
      II->getOperand(1).getReg() == TRI->getExec())

  BI.InitialState = State;
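  // Main scan: 'Needs' starts permissive for each instruction and is
  // narrowed by the per-instruction analysis; a transition is emitted only
  // when the running 'State' is no longer in the accepted set.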
  for (unsigned Idx = 0;; ++Idx) {
    char Needs = StateExact | StateWQM;

    if (FirstStrict == IE)

    if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;

        Needs = StateExact | StateWQM | StateStrict;

      if (MI.isBranch() && OutNeeds == StateExact)

      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    if (!(Needs & State)) {
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        First = FirstStrict;

      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        SaveSCC = !(Needs & StateWQM);
        break;

      char StartState = State & StateStrict ? NonStrictState : State;
      bool WQMToExact =
          StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
      bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
                        !(Needs & StateExact);
      bool PreferLast = Needs == StateWQM;

      if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
        if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
          PreferLast = WQMToExact;

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;

      if (Needs & StateStrict) {
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);

        if (!WQMFromExec && (OutNeeds & StateWQM)) {
          SavedWQMReg = MRI->createVirtualRegister(BoolRC);

      } else if (ExactToWQM) {
        assert(WQMFromExec == (SavedWQMReg == 0));

          LIS->createAndComputeVirtRegInterval(SavedWQMReg);

    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
  assert(!SavedNonStrictReg);
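// lowerLiveMaskQueries rewrites SI_PS_LIVE / SI_LIVE_MASK into a plain COPY
// from the live-mask register computed at function entry.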
bool SIWholeQuadMode::lowerLiveMaskQueries() {
    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  return !LiveMaskQueries.empty();
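// lowerCopyInstrs turns the WQM/SOFT_WQM/STRICT_* wrapper pseudos collected
// during scanning into plain moves or COPYs, dropping their implicit EXEC
// uses so they no longer look exec-dependent.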
bool SIWholeQuadMode::lowerCopyInstrs() {
    assert(MI->getNumExplicitOperands() == 2);

        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
    if (TRI->isVGPRClass(regClass)) {
      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

          return MO.isUse() && MO.getReg() == AMDGPU::EXEC;

      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      while (Index >= 0) {
        MI->removeOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, nullptr);
      MI->setDesc(TII->get(AMDGPU::COPY));
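    // A V_SET_INACTIVE_B32 that needs no WWM handling degenerates to a copy
    // of its active-lane input: the inactive-lane and mask operands are
    // stripped and the opcode rewritten to COPY (or the matching mov when
    // the source is an immediate).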
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
      assert(MI->getNumExplicitOperands() == 6);

      if (MI->getOperand(4).isReg())
        RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());

      MI->removeOperand(5);
      MI->removeOperand(4);
      MI->removeOperand(3);
      MI->removeOperand(1);

      if (RecomputeLI)
        LIS->shrinkToUses(RecomputeLI);

      assert(MI->getNumExplicitOperands() == 2);

      unsigned CopyOp = MI->getOperand(1).isReg()
                            ? (unsigned)AMDGPU::COPY
                            : TII->getMovOpcode(TRI->getRegClassForOperandReg(
                                  *MRI, MI->getOperand(0)));
      MI->setDesc(TII->get(CopyOp));

  return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
    switch (MI->getOpcode()) {
    case AMDGPU::SI_DEMOTE_I1:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
      SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
      break;
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      SplitPoint = lowerKillF32(*MBB, *MI);
      break;
  return !KillInstrs.empty();
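// lowerInitExec expands the SI_INIT_EXEC family in the entry block:
// SI_INIT_WHOLE_WAVE saves the incoming exec while enabling all lanes
// (S_OR_SAVEEXEC), SI_INIT_EXEC moves an immediate mask into exec, and
// SI_INIT_EXEC_FROM_INPUT builds the mask from a packed SGPR value via
// S_BFE/S_BFM.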
  bool IsWave32 = ST->isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
           "init whole wave not in entry block");
    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());

                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                  : AMDGPU::S_OR_SAVEEXEC_B64),

    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

      LIS->RemoveMachineInstrFromMaps(MI);

    MI.eraseFromParent();

      LIS->InsertMachineInstrInMaps(*SaveExec);
      LIS->createAndComputeVirtRegInterval(EntryExec);
  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
                TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
            .addImm(MI.getOperand(0).getImm());

      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);

    MI.eraseFromParent();
  Register InputReg = MI.getOperand(0).getReg();

    if (DefInstr != FirstMI) {

      LIS->handleMove(*DefInstr);

  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);

              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)

  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))

              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),

    MI.eraseFromParent();

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
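// runOnMachineFunction drives the pass: run the analysis, materialize the
// live-mask register when kills/WQM/live-mask queries require it, then
// perform the lowering passes (queries, copies, kills, per-block state
// transitions).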
    if (MI->getParent() == &Entry)
      InsertPt = std::next(MI->getIterator());
                    << " ------------- \n");
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  InitExecInstrs.clear();
  SetInactiveInstrs.clear();
  StateTransition.clear();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
  auto *PDTWrapper =
      getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
  PDT = PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndTermOpc = AMDGPU::S_AND_B32_term;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndTermOpc = AMDGPU::S_AND_B64_term;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }
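  // Every exec-mask update below goes through these cached opcodes, keeping
  // the rest of the pass agnostic to wave32 vs wave64.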
  bool Changed = false;
  const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
  const bool HasWaveModes = GlobalFlags & ~StateExact;
  const bool HasKills = !KillInstrs.empty();
  const bool UsesWQM = GlobalFlags & StateWQM;
  if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());

      LIS->InsertMachineInstrInMaps(*MI);
    if (LowerToCopyInstrs.contains(MI))
      continue;

    if (Instructions[MI].MarkedStates & StateStrict) {
      Blocks[MI->getParent()].Needs |= StateStrictWWM;
    } else {
      LowerToCopyInstrs.insert(MI);
  Changed |= lowerLiveMaskQueries();
  Changed |= lowerCopyInstrs();
  if (!HasWaveModes) {
    Changed |= lowerKillInstrs(false);
  } else if (GlobalFlags == StateWQM) {
      LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
    if (GlobalFlags & StateWQM)
      Blocks[&Entry].InNeeds |= StateWQM;
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);

  if (!KillInstrs.empty() || !InitExecInstrs.empty())
    LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);