49#define DEBUG_TYPE "gcn-dpp-combine"
51STATISTIC(NumDPPMovsCombined,
"Number of DPP moves combined.");
67 bool IsShrinkable)
const;
71 bool IsShrinkable)
const;
76 int64_t Mask = -1)
const;
80 int getDPPOp(
unsigned Op,
bool IsShrinkable)
const;
104 .
set(MachineFunctionProperties::Property::IsSSA);
113char GCNDPPCombineLegacy::
ID = 0;
118 return new GCNDPPCombineLegacy();
122 unsigned Op =
MI.getOpcode();
123 if (!
TII->isVOP3(
Op)) {
126 if (!
TII->hasVALU32BitEncoding(
Op)) {
134 if (
const auto *SDst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst)) {
139 if (!
MRI->use_nodbg_empty(SDst->getReg()))
144 if (!hasNoImmOrEqual(
MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
145 !hasNoImmOrEqual(
MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
146 !hasNoImmOrEqual(
MI, AMDGPU::OpName::clamp, 0) ||
147 !hasNoImmOrEqual(
MI, AMDGPU::OpName::omod, 0) ||
148 !hasNoImmOrEqual(
MI, AMDGPU::OpName::byte_sel, 0)) {
155int GCNDPPCombine::getDPPOp(
unsigned Op,
bool IsShrinkable)
const {
162 if (DPP32 != -1 &&
TII->pseudoToMCOpcode(DPP32) != -1)
165 if (
ST->hasVOP3DPP())
167 if (DPP64 != -1 &&
TII->pseudoToMCOpcode(DPP64) != -1)
181 switch(
Def->getOpcode()) {
183 case AMDGPU::IMPLICIT_DEF:
186 case AMDGPU::V_MOV_B32_e32:
187 case AMDGPU::V_MOV_B64_PSEUDO:
188 case AMDGPU::V_MOV_B64_e32:
189 case AMDGPU::V_MOV_B64_e64: {
190 auto &Op1 =
Def->getOperand(1);
201 int16_t RegClass =
MI.getDesc().operands()[
Idx].RegClass;
206 return TRI->getRegSizeInBits(*
TRI->getRegClass(RegClass));
213 bool IsShrinkable)
const {
215 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp ||
216 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
218 bool HasVOP3DPP =
ST->hasVOP3DPP();
220 auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
227 auto *RowMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
228 assert(RowMaskOpnd && RowMaskOpnd->isImm());
229 auto *BankMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
230 assert(BankMaskOpnd && BankMaskOpnd->isImm());
231 const bool MaskAllLanes =
232 RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
235 !(
TII->isVOPC(DPPOp) || (
TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
236 TII->isVOPC(OrigOpE32)))) &&
237 "VOPC cannot form DPP unless mask is full");
246 if (
auto *Dst =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
250 if (
auto *SDst =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
251 if (
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
260 assert(OldIdx == NumOperands);
264 TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
270 }
else if (
TII->isVOPC(DPPOp) || (
TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
271 TII->isVOPC(OrigOpE32))) {
276 LLVM_DEBUG(
dbgs() <<
" failed: no old operand in DPP instruction,"
282 auto *Mod0 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
285 AMDGPU::OpName::src0_modifiers));
288 DPPInst.addImm(Mod0->getImm());
294 auto *Src0 =
TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
296 int Src0Idx = NumOperands;
297 if (!
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
303 DPPInst->getOperand(NumOperands).setIsKill(
false);
306 auto *Mod1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
309 AMDGPU::OpName::src1_modifiers));
312 DPPInst.addImm(Mod1->getImm());
318 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
320 int OpNum = NumOperands;
324 if (!
ST->hasDPPSrc1SGPR()) {
327 "Src0 and Src1 operands should have the same size");
330 if (!
TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
339 auto *Mod2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
345 DPPInst.addImm(Mod2->getImm());
348 auto *Src2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
350 if (!
TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
351 !
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
361 auto *ClampOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
363 DPPInst.addImm(ClampOpr->getImm());
365 auto *VdstInOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
368 DPPInst.add(*VdstInOpr);
370 auto *OmodOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
372 DPPInst.addImm(OmodOpr->getImm());
376 if (
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
381 if (Mod0 &&
TII->isVOP3(OrigMI) && !
TII->isVOP3P(OrigMI))
390 DPPInst.addImm(OpSel);
392 if (
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
400 assert(Src2 &&
"Expected vop3p with 3 operands");
402 LLVM_DEBUG(
dbgs() <<
" failed: op_sel_hi must be all set to one\n");
407 DPPInst.addImm(OpSelHi);
409 auto *NegOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
411 DPPInst.addImm(NegOpr->getImm());
413 auto *NegHiOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
415 DPPInst.addImm(NegHiOpr->getImm());
417 auto *ByteSelOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
420 DPPInst.addImm(ByteSelOpr->getImm());
423 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
424 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
425 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
426 DPPInst.addImm(CombBCZ ? 1 : 0);
430 DPPInst.getInstr()->eraseFromParent();
434 return DPPInst.getInstr();
441 case AMDGPU::V_ADD_U32_e32:
442 case AMDGPU::V_ADD_U32_e64:
443 case AMDGPU::V_ADD_CO_U32_e32:
444 case AMDGPU::V_ADD_CO_U32_e64:
445 case AMDGPU::V_OR_B32_e32:
446 case AMDGPU::V_OR_B32_e64:
447 case AMDGPU::V_SUBREV_U32_e32:
448 case AMDGPU::V_SUBREV_U32_e64:
449 case AMDGPU::V_SUBREV_CO_U32_e32:
450 case AMDGPU::V_SUBREV_CO_U32_e64:
451 case AMDGPU::V_MAX_U32_e32:
452 case AMDGPU::V_MAX_U32_e64:
453 case AMDGPU::V_XOR_B32_e32:
454 case AMDGPU::V_XOR_B32_e64:
455 if (OldOpnd->
getImm() == 0)
458 case AMDGPU::V_AND_B32_e32:
459 case AMDGPU::V_AND_B32_e64:
460 case AMDGPU::V_MIN_U32_e32:
461 case AMDGPU::V_MIN_U32_e64:
463 std::numeric_limits<uint32_t>::max())
466 case AMDGPU::V_MIN_I32_e32:
467 case AMDGPU::V_MIN_I32_e64:
468 if (
static_cast<int32_t
>(OldOpnd->
getImm()) ==
469 std::numeric_limits<int32_t>::max())
472 case AMDGPU::V_MAX_I32_e32:
473 case AMDGPU::V_MAX_I32_e64:
474 if (
static_cast<int32_t
>(OldOpnd->
getImm()) ==
475 std::numeric_limits<int32_t>::min())
478 case AMDGPU::V_MUL_I32_I24_e32:
479 case AMDGPU::V_MUL_I32_I24_e64:
480 case AMDGPU::V_MUL_U32_U24_e32:
481 case AMDGPU::V_MUL_U32_U24_e64:
482 if (OldOpnd->
getImm() == 1)
491 MachineOperand *OldOpndValue,
bool CombBCZ,
bool IsShrinkable)
const {
493 if (!CombBCZ && OldOpndValue && OldOpndValue->
isImm()) {
494 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
495 if (!Src1 || !Src1->isReg()) {
496 LLVM_DEBUG(
dbgs() <<
" failed: no src1 or it isn't a register\n");
500 LLVM_DEBUG(
dbgs() <<
" failed: old immediate isn't an identity\n");
504 auto *MovDst =
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
511 return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
516bool GCNDPPCombine::hasNoImmOrEqual(
MachineInstr &
MI,
unsigned OpndName,
517 int64_t
Value, int64_t Mask)
const {
518 auto *
Imm =
TII->getNamedOperand(
MI, OpndName);
523 return (
Imm->getImm() & Mask) ==
Value;
526bool GCNDPPCombine::combineDPPMov(
MachineInstr &MovMI)
const {
528 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp ||
529 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
532 auto *DstOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
533 assert(DstOpnd && DstOpnd->isReg());
534 auto DPPMovReg = DstOpnd->getReg();
535 if (DPPMovReg.isPhysical()) {
545 if (MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
546 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp) {
547 auto *
DppCtrl =
TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
557 auto *RowMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
558 assert(RowMaskOpnd && RowMaskOpnd->isImm());
559 auto *BankMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
560 assert(BankMaskOpnd && BankMaskOpnd->isImm());
561 const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
562 BankMaskOpnd->getImm() == 0xF;
564 auto *BCZOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
565 assert(BCZOpnd && BCZOpnd->isImm());
566 bool BoundCtrlZero = BCZOpnd->getImm();
568 auto *OldOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
569 auto *SrcOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
571 assert(SrcOpnd && SrcOpnd->isReg());
577 auto *
const OldOpndValue = getOldOpndValue(*OldOpnd);
582 assert(!OldOpndValue || OldOpndValue->
isImm() || OldOpndValue == OldOpnd);
584 bool CombBCZ =
false;
586 if (MaskAllLanes && BoundCtrlZero) {
589 if (!OldOpndValue || !OldOpndValue->
isImm()) {
594 if (OldOpndValue->
getImm() == 0) {
599 }
else if (BoundCtrlZero) {
602 " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
611 dbgs() << *OldOpndValue;
612 dbgs() <<
", bound_ctrl=" << CombBCZ <<
'\n');
618 if (CombBCZ && OldOpndValue) {
621 MRI->createVirtualRegister(RC));
623 TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.
Reg);
624 DPPMIs.push_back(UndefInst.getInstr());
627 OrigMIs.push_back(&MovMI);
628 bool Rollback =
true;
631 for (
auto &
Use :
MRI->use_nodbg_operands(DPPMovReg)) {
635 while (!
Uses.empty()) {
639 auto &OrigMI = *
Use->getParent();
644 "There should not be e32 True16 instructions pre-RA");
645 if (OrigOp == AMDGPU::REG_SEQUENCE) {
647 unsigned FwdSubReg = 0;
656 for (OpNo = 1; OpNo < E; OpNo += 2) {
666 for (
auto &
Op :
MRI->use_nodbg_operands(FwdReg)) {
667 if (
Op.getSubReg() == FwdSubReg)
670 RegSeqWithOpNos[&OrigMI].push_back(OpNo);
674 bool IsShrinkable = isShrinkable(OrigMI);
675 if (!(IsShrinkable ||
676 ((
TII->isVOP3P(OrigOp) ||
TII->isVOPC(OrigOp) ||
677 TII->isVOP3(OrigOp)) &&
679 TII->isVOP1(OrigOp) ||
TII->isVOP2(OrigOp))) {
688 auto *Src0 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
689 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
695 auto *Src2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
696 assert(Src0 &&
"Src1 without Src0?");
697 if ((
Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
698 (Src2 && Src2->isIdenticalTo(*Src0)))) ||
699 (
Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
700 (Src2 && Src2->isIdenticalTo(*Src1))))) {
704 <<
" failed: DPP register is used more than once per instruction\n");
710 if (
auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
711 OldOpndValue, CombBCZ, IsShrinkable)) {
712 DPPMIs.push_back(DPPInst);
719 BB->
insert(OrigMI, NewMI);
720 if (
TII->commuteInstruction(*NewMI)) {
723 createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
725 DPPMIs.push_back(DPPInst);
730 NewMI->eraseFromParent();
734 OrigMIs.push_back(&OrigMI);
737 Rollback |= !
Uses.empty();
739 for (
auto *
MI : *(Rollback? &DPPMIs : &OrigMIs))
740 MI->eraseFromParent();
743 for (
auto &S : RegSeqWithOpNos) {
744 if (
MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
745 S.first->eraseFromParent();
748 while (!S.second.empty())
749 S.first->getOperand(S.second.pop_back_val()).setIsUndef();
760 return GCNDPPCombine().run(MF);
769 TII =
ST->getInstrInfo();
771 bool Changed =
false;
772 for (
auto &
MBB : MF) {
774 if (
MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(
MI)) {
776 ++NumDPPMovsCombined;
777 }
else if (
MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
778 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
779 if (
ST->hasDPALU_DPP() && combineDPPMov(
MI)) {
781 ++NumDPPMovsCombined;
785 if (M && combineDPPMov(*M))
786 ++NumDPPMovsCombined;
803 bool Changed = GCNDPPCombine().run(MF);
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd)
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, MachineRegisterInfo &MRI)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
TargetInstrInfo::RegSubRegPair RegSubRegPair
Remove Loads Into Fake Uses
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
A container for analyses that lazily runs them and caches their results.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
This class represents an Operation in the Expression.
FunctionPass class - This class is used to implement most global optimizations.
bool hasOptNone() const
Do not optimize this function (-O0).
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MAM)
unsigned getSize(const MachineInstr &MI) const
An RAII based helper class to modify MachineFunctionProperties when running pass.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
virtual MachineFunctionProperties getRequiredProperties() const
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & set(Property P)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
unsigned getNumOperands() const
Returns the total number of operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z,...
void insert(mop_iterator InsertBefore, ArrayRef< MachineOperand > Ops)
Inserts Ops BEFORE It. Can untie/retie tied operands.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A Use represents the edge between a Value definition and its users.
LLVM Value Representation.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
LLVM_READONLY int getDPPOp32(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READONLY int getDPPOp64(uint16_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
auto reverse(ContainerTy &&C)
char & GCNDPPCombineLegacyID
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, const TargetRegisterClass &TRC, MachineRegisterInfo &MRI)
Returns true if a reg:subreg pair P has a TRC class.
FunctionPass * createGCNDPPCombinePass()
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
A pair composed of a register and a sub-register index.