#define DEBUG_TYPE "si-pre-emit-peephole"

class SIPreEmitPeephole {

  // SIPreEmitPeepholeLegacy::runOnMachineFunction forwards to the shared
  // implementation:
  return SIPreEmitPeephole().run(MF);

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;
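// Folds the common pattern where VCC is set to a constant (0 or -1) ANDed
// with EXEC right before an S_CBRANCH_VCCZ/VCCNZ, rewriting the branch to an
// unconditional branch, deleting it, or turning it into S_CBRANCH_EXECZ/EXECNZ.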
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  bool ReadsCond = false;
  unsigned Threshold = 5;
  // Walk backwards from the branch (iterator A) to find the S_AND/S_ANDN2
  // that defines the condition register.
  if (A->modifiesRegister(ExecReg, TRI))
    return false;
  if (A->modifiesRegister(CondReg, TRI)) {
    if (!A->definesRegister(CondReg, TRI) ||
        (A->getOpcode() != And && A->getOpcode() != AndN2))
      return false;
    break;
  }
  ReadsCond |= A->readsRegister(CondReg, TRI);

  // Commute the AND if EXEC is its second operand so that EXEC ends up first.
  if (/* ... && */ Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
  // Op2 is a register (SReg): scan forward for its definition and see how it
  // is used before the AND.
  int64_t MaskValue = 0;
  auto M = std::next(A);
  bool ReadsSreg = false;
  bool ModifiesExec = false;
  for (; M != E; ++M) {
    if (M->definesRegister(SReg, TRI))
      break;
    if (M->modifiesRegister(SReg, TRI))
      return false;
    ReadsSreg |= M->readsRegister(SReg, TRI);
    ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
  }

  if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
      /* ... */) {
    A->eraseFromParent();

  if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
      (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
    return false;
  MaskValue = M->getOperand(1).getImm();

  // Fold the known immediate into the AND and drop the move.
  A->getOperand(2).ChangeToImmediate(MaskValue);
  M->eraseFromParent();
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;
  // If nothing else reads the condition and SCC from the AND is dead, replace
  // the AND with a plain move of the known mask into the condition register.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    A->eraseFromParent();
  }
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // The mask is EXEC itself, so the branch depends only on EXEC.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // The branch is always taken: drop the branch terminators it shadows and
    // rewrite it as an unconditional branch.
    Found = Term.isIdenticalTo(MI);
    assert(Found && "conditional branch is not terminator");
    assert(Dst.isMBB() && "destination is not basic block");
    BranchMI->eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // The branch is never taken: delete it and the edge to its target.
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // The condition reduces to EXEC: branch on EXECZ/EXECNZ instead.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false));
  MI.addImplicitDefUseOperands(*MBB.getParent());
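// Removes a redundant S_SET_GPR_IDX_ON when an earlier, identical index setup
// (First) is still in effect and nothing in between disturbs M0 or the index
// register.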
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  SmallVector<MachineInstr *, 4> ToRemove;

  // Scan the instructions between First and MI.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle() || I->isDebugInstr())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:

    if (I->modifiesRegister(AMDGPU::M0, TRI))
      return false;
    if (IdxReg && I->modifiesRegister(IdxReg, TRI))
      return false;
    // Operand check used inside an llvm::any_of lambda:
    return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
    if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                    I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
      return false;
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
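// Uses analyzeBranch to recover the taken/fall-through successors and the
// branch condition of SrcMBB.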
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
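// Weighs the cost of executing the "then" block with EXEC possibly empty
// against the cost of keeping the S_CBRANCH_EXECZ, using the scheduling model
// and the branch probability.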
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    // BranchProb is taken from Head's edge probability to Succ (elided).
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return true;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};
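// Returns true if the execz branch must be kept: some instruction between
// From and To is unsafe to run with an empty EXEC mask, or skipping the region
// is cheaper than falling through it.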
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // Branches inside the region force us to keep the execz branch, as do
      // instructions that are unsafe to execute with EXEC == 0.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          /* ... */)
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      // Keep the branch as soon as skipping becomes cheaper than executing.
      if (!CostModel.isProfitable(MI))
        return true;
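// Removes an S_CBRANCH_EXECZ over a region that is both safe and cheap enough
// to execute unconditionally.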
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  MI.eraseFromParent();
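// Checks whether writing the low half of the unpacked destination would
// clobber a high source half that the second (high) unpacked instruction
// still needs to read.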
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  const MachineOperand *Src2MO = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  if (Src2MO && Src2MO->isReg()) {
    Register SrcReg2 = Src2MO->getReg();
    unsigned Src2Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
    Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
      return true;
  }
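// Maps a packed F32 opcode to its unpacked e64 equivalent, or uint16_t max
// if the instruction cannot be unpacked.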
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
}
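// Translates a packed operand and its neg/op_sel modifiers into the
// corresponding 32-bit source operand (sub0 or sub1) and modifiers for the
// unpacked instruction being built.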
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;

  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;

  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =

  bool KillState = true;
  if ((OpSel == OpSelHi) && !IsHiBits)
    KillState = false;

  NewMI.add(UnpackedSrcMO);
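// Starting at an MFMA (BeginMI), collects packed F32 instructions that can be
// unpacked and co-issued while the MFMA is still executing, stopping once the
// accumulated latency covers the MFMA's issue window (NumMFMACycles).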
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();

    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;

    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }

    if (canUnpackingClobberRegister(Instr))
      return;

    // Replace the packed instruction's latency with the cost of the two
    // unpacked instructions.
    TotalCyclesBetweenCandidates -= Latency;
    TotalCyclesBetweenCandidates += 2;

    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
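// Replaces one packed F32 instruction with two 32-bit instructions writing the
// low and high halves of the destination.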
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  const MachineOperand &DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand LoDstOp = Op0LOp1L->getOperand(0);

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand HiDstOp = Op0HOp1H->getOperand(0);

  uint32_t IFlags = I.getFlags();
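// Builds one of the two unpacked instructions, selecting the sub0 or sub1
// halves of the destination and sources depending on IsHiBits.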
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  // NewMI is created with BuildMI using UnpackedOpcode (elided).
  NewMI.addDef(UnpackedDstReg);
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);

  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src2);
  unsigned Src2Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
  addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
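// Pass driver: the new pass manager wrapper forwards to SIPreEmitPeephole::run
// and refreshes any cached (post)dominator-tree block numbers, while run()
// itself walks each block's terminators (VCC/EXECZ branches), cleans up
// redundant S_SET_GPR_IDX_ON sequences, and on gfx940+ unpacks packed F32
// instructions around MFMAs.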
  if (SIPreEmitPeephole().run(MF))

  MDT->updateBlockNumbers();
  MPDT->updateBlockNumbers();

  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

    // Handle the first terminator of each block.
    if (TermI != MBB.end()) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    // Look for a second S_SET_GPR_IDX_ON made redundant by an earlier one,
    // giving up after Threshold instructions.
    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
      if (Count == Threshold)
        SetGPRMI = nullptr;
      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;

  // gfx940+: unpack packed F32 ALU ops that sit in an MFMA's shadow.
  if (!ST.hasGFX940Insts())
    return Changed;
  for (MachineBasicBlock &MBB : MF) {
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    for (MachineInstr *MI : InstrsToUnpack) {
      performF32Unpacking(*MI);