#define DEBUG_TYPE "gcn-dpp-combine"
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpndValue, bool CombBCZ,
                              bool IsShrinkable) const;

  MachineInstr *createDPPInst(MachineInstr &OrigMI, MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR, bool CombBCZ,
                              bool IsShrinkable) const;

  bool hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, int64_t Value,
                       int64_t Mask = -1) const;
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
  int getDPPOp(unsigned Op, bool IsShrinkable) const;
char GCNDPPCombine::ID = 0;
FunctionPass *llvm::createGCNDPPCombinePass() { return new GCNDPPCombine(); }
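// GCNDPPCombine::isShrinkable: a VOP3 instruction can shrink to an e32 (and
// hence DPP) form only if a 32-bit encoding exists, any sdst result is
// unused, and no modifiers beyond abs/neg are set.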
bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  if (!TII->isVOP3(Op)) {
  if (!TII->hasVALU32BitEncoding(Op)) {
  if (const auto *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    if (!MRI->use_nodbg_empty(SDst->getReg()))
  if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
      !hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
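// GCNDPPCombine::getDPPOp: map an opcode to its DPP variant, preferring the
// 32-bit DPP form and falling back to the 64-bit (VOP3) DPP form when the
// subtarget supports VOP3 DPP.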
int GCNDPPCombine::getDPPOp(unsigned Op, bool IsShrinkable) const {
  if (DPP32 != -1 && TII->pseudoToMCOpcode(DPP32) != -1)
  if (ST->hasVOP3DPP())
  if (DPP64 != -1 && TII->pseudoToMCOpcode(DPP64) != -1)
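// getOldOpndValue: look through the def of the 'old' operand. An
// IMPLICIT_DEF means the old value is irrelevant; a mov of an immediate
// yields that immediate operand.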
  switch (Def->getOpcode()) {
  case AMDGPU::IMPLICIT_DEF:
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::V_MOV_B64_PSEUDO:
  case AMDGPU::V_MOV_B64_e32:
  case AMDGPU::V_MOV_B64_e64: {
    auto &Op1 = Def->getOperand(1);
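// getOperandSize: width in bits of operand Idx, derived from its register
// class in the instruction descriptor.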
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                               MachineRegisterInfo &MRI) {
  int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
  return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
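// createDPPInst: build the combined DPP instruction, copying operands from
// the original VALU instruction and the DPP control operands from the mov.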
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    bool CombBCZ, bool IsShrinkable) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  bool HasVOP3DPP = ST->hasVOP3DPP();
  auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  assert((MaskAllLanes ||
          !(TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                   TII->isVOPC(OrigOpE32)))) &&
         "VOPC cannot form DPP unless mask is full");
  if (auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
  if (auto *SDst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
    if (TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
    assert(OldIdx == NumOperands);
    assert(isOfRegClass(CombOldVGPR, *MRI->getRegClass(TII->getNamedOperand(
                            MovMI, AMDGPU::OpName::vdst)->getReg()), *MRI));
  } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
                                    TII->isVOPC(OrigOpE32))) {
    LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                         " TBD\n");
  auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
  if (Mod0) {
    assert(NumOperands ==
           AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers));
    DPPInst.addImm(Mod0->getImm());
  auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  int Src0Idx = NumOperands;
  if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
  DPPInst->getOperand(NumOperands).setIsKill(false);
  auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
  if (Mod1) {
    assert(NumOperands ==
           AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers));
    DPPInst.addImm(Mod1->getImm());
  auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
  int OpNum = NumOperands;
  if (!ST->hasDPPSrc1SGPR()) {
    assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
               getOperandSize(*DPPInst, OpNum, *MRI) &&
           "Src0 and Src1 operands should have the same size");
  if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
  auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
  if (Mod2)
    DPPInst.addImm(Mod2->getImm());
  auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
  if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
      !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
  auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
  if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp))
    DPPInst.addImm(ClampOpr->getImm());
  auto *VdstInOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
  if (VdstInOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::vdst_in))
    DPPInst.add(*VdstInOpr);
  auto *OmodOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
  if (OmodOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::omod))
    DPPInst.addImm(OmodOpr->getImm());
  if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
    if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
    DPPInst.addImm(OpSel);
  if (TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
    assert(Src2 && "Expected vop3p with 3 operands");
    LLVM_DEBUG(dbgs() << "  failed: op_sel_hi must be all set to one\n");
    DPPInst.addImm(OpSelHi);
  auto *NegOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
  if (NegOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_lo))
    DPPInst.addImm(NegOpr->getImm());
  auto *NegHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
  if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi))
    DPPInst.addImm(NegHiOpr->getImm());
  auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
  if (ByteSelOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel))
    DPPInst.addImm(ByteSelOpr->getImm());
  DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
  DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
  DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
  DPPInst.addImm(CombBCZ ? 1 : 0);
    DPPInst.getInstr()->eraseFromParent();
  return DPPInst.getInstr();
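// isIdentityValue: true when the 'old' immediate is the identity element for
// the original opcode (0 for add/or/xor/subrev/umax, ~0 for and/umin,
// INT_MAX for smin, INT_MIN for smax, 1 for the 24-bit multiplies).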
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
  switch (OrigMIOp) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_SUBREV_U32_e32:
  case AMDGPU::V_SUBREV_U32_e64:
  case AMDGPU::V_SUBREV_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e64:
  case AMDGPU::V_MAX_U32_e32:
  case AMDGPU::V_MAX_U32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::V_XOR_B32_e64:
    if (OldOpnd->getImm() == 0)
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_MIN_U32_e32:
  case AMDGPU::V_MIN_U32_e64:
    if (static_cast<uint32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<uint32_t>::max())
  case AMDGPU::V_MIN_I32_e32:
  case AMDGPU::V_MIN_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::max())
  case AMDGPU::V_MAX_I32_e32:
  case AMDGPU::V_MAX_I32_e64:
    if (static_cast<int32_t>(OldOpnd->getImm()) ==
        std::numeric_limits<int32_t>::min())
  case AMDGPU::V_MUL_I32_I24_e32:
  case AMDGPU::V_MUL_I32_I24_e64:
  case AMDGPU::V_MUL_U32_U24_e32:
  case AMDGPU::V_MUL_U32_U24_e64:
    if (OldOpnd->getImm() == 1)
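// createDPPInst overload that first validates the 'old' immediate: unless
// bound_ctrl is being combined in, the immediate must be an identity value
// for the original opcode and src1 must be a register.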
MachineInstr *GCNDPPCombine::createDPPInst(
    MachineInstr &OrigMI, MachineInstr &MovMI, RegSubRegPair CombOldVGPR,
    MachineOperand *OldOpndValue, bool CombBCZ, bool IsShrinkable) const {
  if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    if (!Src1 || !Src1->isReg()) {
      LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
      LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
  auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
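// hasNoImmOrEqual: true if the named operand is absent, or if its immediate,
// masked by Mask, equals Value.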
bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
                                    int64_t Value, int64_t Mask) const {
  auto *Imm = TII->getNamedOperand(MI, OpndName);
  return (Imm->getImm() & Mask) == Value;
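// combineDPPMov: try to fold MovMI's DPP controls into every VALU user of
// its result; if any use cannot be combined, roll back and keep the mov.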
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp ||
         MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  if (DPPMovReg.isPhysical()) {
  if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
      MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
    auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  const bool MaskAllLanes =
      RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();
  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
  assert(SrcOpnd && SrcOpnd->isReg());
  auto *const OldOpndValue = getOldOpndValue(*OldOpnd);
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
  bool CombBCZ = false;

  if (MaskAllLanes && BoundCtrlZero) {
    if (!OldOpndValue || !OldOpndValue->isImm()) {
    if (OldOpndValue->getImm() == 0) {
    } else if (BoundCtrlZero) {
      LLVM_DEBUG(dbgs() <<
          "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
             dbgs() << *OldOpndValue;
             dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
  if (CombBCZ && OldOpndValue) {
    const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg);
    CombOldVGPR = RegSubRegPair(MRI->createVirtualRegister(RC));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
  while (!Uses.empty()) {
    auto &OrigMI = *Use->getParent();
640 "There should not be e32 True16 instructions pre-RA");
    if (OrigOp == AMDGPU::REG_SEQUENCE) {
      unsigned FwdSubReg = 0;
      for (OpNo = 1; OpNo < E; OpNo += 2) {
      for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
        if (Op.getSubReg() == FwdSubReg)
      RegSeqWithOpNos[&OrigMI].push_back(OpNo);
    bool IsShrinkable = isShrinkable(OrigMI);
    if (!(IsShrinkable ||
          ((TII->isVOP3P(OrigOp) || TII->isVOPC(OrigOp) ||
            TII->isVOP3(OrigOp)) &&
           ST->hasVOP3DPP()) ||
          TII->isVOP1(OrigOp) || TII->isVOP2(OrigOp))) {
    auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
    auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
    auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
    assert(Src0 && "Src1 without Src0?");
    if ((Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
                         (Src2 && Src2->isIdenticalTo(*Src0)))) ||
        (Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
                         (Src2 && Src2->isIdenticalTo(*Src1))))) {
      LLVM_DEBUG(dbgs()
          << "  failed: DPP register is used more than once per instruction\n");
    if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                      OldOpndValue, CombBCZ, IsShrinkable)) {
      DPPMIs.push_back(DPPInst);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ,
                                          IsShrinkable)) {
          DPPMIs.push_back(DPPInst);
      NewMI->eraseFromParent();
    OrigMIs.push_back(&OrigMI);
  Rollback |= !Uses.empty();
  for (auto *MI : *(Rollback ? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();
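  // Clean up REG_SEQUENCE users of the combined movs: erase them when dead,
  // otherwise mark the operands whose defs were removed as undef.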
  for (auto &S : RegSeqWithOpNos) {
    if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
      S.first->eraseFromParent();
    while (!S.second.empty())
      S.first->getOperand(S.second.pop_back_val()).setIsUndef();
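// runOnMachineFunction: scan every block for DPP movs and try to combine
// them; 64-bit DPP movs are only attempted on subtargets with DPALU DPP.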
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TII = ST->getInstrInfo();
  bool Changed = false;
  for (auto &MBB : MF) {
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
                 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
        if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
          Changed = true;
          ++NumDPPMovsCombined;
          if (M && combineDPPMov(*M))
            ++NumDPPMovsCombined;