#define DEBUG_TYPE "si-pre-emit-peephole"
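// SIPreEmitPeephole performs late peephole optimizations just before code
// emission: folding VCC branches into EXEC branches, removing redundant
// S_SET_GPR_IDX_ON sequences, dropping unprofitable s_cbranch_execz skip
// branches, and unpacking packed F32 math in MFMA shadows on GFX940+.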
class SIPreEmitPeephole {
  // ...
};

class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
  // ...
  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
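  // Match:
  //   sreg = -1 or 0
  //   vcc = S_AND_B64 exec, sreg  (or S_ANDN2_B64 exec, sreg)
  //   S_CBRANCH_VCC[N]Z
  // =>
  //   S_CBRANCH_EXEC[N]Z
  // This pattern shows up after block placement when a block that assigns -1
  // or 0 to a saved mask is merged with a block that consumes the saved mask
  // and then branches on it.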
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
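  // Walk backwards over at most Threshold instructions looking for the
  // S_AND/S_ANDN2 that defines the condition register, giving up if EXEC is
  // clobbered first.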
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;
  // Make sure EXEC is the first operand of the AND.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    // Keep scanning backwards for the instruction that defines SReg.
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the AND and return.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND, fold the immediate into the AND and
    // delete the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for s_andn2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace the AND with a MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // The branch is always taken: remove every successor branch shadowed by
    // the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }
    if (MachineBasicBlock *Succ = Parent->getFallThrough())
      Parent->removeSuccessor(Succ);
    // Rewrite as an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // The branch is never taken.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // The condition now depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(
      MI.findRegisterUseOperandIdx(CondReg, TRI, /*isKill=*/false));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
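// Drop a repeated S_SET_GPR_IDX_ON (MI) that is identical to an earlier one
// (First) when nothing in between clobbers M0 or the index register; any
// S_SET_GPR_IDX_OFF between the two is redundant and is removed as well.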
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan back to find an identical S_SET_GPR_IDX_ON.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}
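// In essence (illustrative sketch, not taken from a test):
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)
//   v_mov_b32 v1, v5
//   s_set_gpr_idx_off                    ; removed
//   s_set_gpr_idx_on s0, gpr_idx(SRC0)   ; removed (identical to the first)
//   v_mov_b32 v2, v6
//   s_set_gpr_idx_off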
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;
  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();
  return true;
}
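// Cost model deciding whether an s_cbranch_execz skip branch is worth
// keeping: it accumulates the latency of the would-be-skipped 'then' block
// and weighs it against the expected cost of the branch itself.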
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;
public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }
  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
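    // Consider `P = N/D` to be the probability that the 'then' block is
    // executed. Removing the branch is profitable if always executing the
    // 'then' block costs no more than sometimes executing it plus always
    // paying for the branch:
    //   ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    //   (1-P)*ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // which, with P = N/D, is the integer comparison evaluated below.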
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  BranchWeightCostModel CostModel{*TII, Branch, From};
  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To); MBBI != ToI;
       ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop sits inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0; retain the
      // cbranch lest the loop become infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }
  return false;
}
// Returns true if the execz skip branch was removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Remove the branch only when doing so is both legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);
  return true;
}
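// The routines below implement the F32 "unpacking" peephole for GFX940+:
// packed V_PK_{ADD,MUL,FMA}_F32 instructions found in the latency shadow of
// an MFMA are split into two unpacked VOP3 instructions, which can co-issue
// with the MFMA. canUnpackingClobberRegister() rejects candidates whose lo
// half would overwrite a register still needed by the hi half.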
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  unsigned OpCode = MI.getOpcode();
  Register DstReg = MI.getOperand(0).getReg();
  // Only the lo half of the destination needs checking: the unpacked lo
  // instruction is issued first.
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    // If the hi half's source overlaps the lo half's destination, the lo
    // write would clobber an input of the hi instruction.
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  // FMA also carries a src2 operand.
  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *Src2MO =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    if (Src2MO && Src2MO->isReg()) {
      Register SrcReg2 = Src2MO->getReg();
      unsigned Src2Mods =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
        return true;
    }
  }
  return false;
}
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit VOP3 encodings, which accept source modifiers; the e32
  // forms do not.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
}
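// In essence (illustrative): v_pk_add_f32 v[0:1], v[2:3], v[4:5] becomes
//   v_add_f32_e64 v0, v2, v4
//   v_add_f32_e64 v1, v3, v5
// subject to the op_sel/op_sel_hi source selects handled in
// addOperandAndMods() below.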
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;
  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;

  // Packed instructions (VOP3P) do not support ABS, so only the NEG modifier
  // needs to be carried over to the unpacked instruction.
  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;
  NewMI.addImm(NewSrcMods);

  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }
  // op_sel == 0 selects the low half of the 64-bit source pair.
  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  if (SrcMO.isKill()) {
    bool KillState = true;
    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
    // If op_sel and op_sel_hi pick the same register, only the later (hi)
    // use may carry the kill flag.
    if ((OpSel == OpSelHi) && !IsHiBits)
      KillState = false;
    UnpackedSrcMO.setIsKill(KillState);
  }
  NewMI.add(UnpackedSrcMO);
}
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != BB->end(); ++I) {
    MachineInstr &Instr = *I;
    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) ||
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;

    // Stop if any operand overlaps the MFMA destination: such instructions
    // depend on the MFMA result and cannot issue in its shadow.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!IsUnpackable)
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;

    // For an unpackable instruction, replace its packed latency with the two
    // cycles assumed for the unpacked pair.
    TotalCyclesBetweenCandidates -= Latency;
    TotalCyclesBetweenCandidates += 2;
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}
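// Note the cycle accounting above: candidates are collected only while the
// unpacked copies still fit inside the MFMA's issue shadow
// (NumMFMACycles - 1), so unpacking never lengthens the critical path.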
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  MachineOperand DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
  LoDstOp.setIsRenamable(DstOp.isRenamable());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  uint32_t IFlags = I.getFlags();
  Op0LOp1L->setFlags(IFlags);
  Op0HOp1H->setFlags(IFlags);

  I.eraseFromParent();
}
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);

  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *SrcMO2 =
        TII->getNamedOperand(I, AMDGPU::OpName::src2);
    unsigned Src2Mods =
        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers; 0 is safe here.
  NewMI.addImm(0); // omod
  return NewMI;
}
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses();
}
bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    // Check the first terminator for branches to optimize.
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant; limit the distance to Threshold instructions
    // for compile time. This must iterate over bundles since
    // S_SET_GPR_IDX_* may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // Perform the extra MF scan only for supported targets.
  if (!ST.hasGFX940Insts())
    return Changed;
  for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs so that they can
    // co-issue with the MFMA.
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    for (MachineInstr *MI : InstrsToUnpack)
      performF32Unpacking(*MI);
  }

  return Changed;
}