48#define DEBUG_TYPE "gcn-dpp-combine"
50STATISTIC(NumDPPMovsCombined,
"Number of DPP moves combined.");
66 bool IsShrinkable)
const;
70 bool IsShrinkable)
const;
75 int64_t Mask = -1)
const;
97 .
set(MachineFunctionProperties::Property::IsSSA);
101 int getDPPOp(
unsigned Op,
bool IsShrinkable)
const;
109char GCNDPPCombine::
ID = 0;
114 return new GCNDPPCombine();
118 unsigned Op =
MI.getOpcode();
119 if (!
TII->isVOP3(
Op)) {
122 if (!
TII->hasVALU32BitEncoding(
Op)) {
130 if (
const auto *SDst =
TII->getNamedOperand(
MI, AMDGPU::OpName::sdst)) {
135 if (!
MRI->use_nodbg_empty(SDst->getReg()))
140 if (!hasNoImmOrEqual(
MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
141 !hasNoImmOrEqual(
MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
142 !hasNoImmOrEqual(
MI, AMDGPU::OpName::clamp, 0) ||
143 !hasNoImmOrEqual(
MI, AMDGPU::OpName::omod, 0)) {
150int GCNDPPCombine::getDPPOp(
unsigned Op,
bool IsShrinkable)
const {
157 if (DPP32 != -1 &&
TII->pseudoToMCOpcode(DPP32) != -1)
160 if (
ST->hasVOP3DPP())
162 if (DPP64 != -1 &&
TII->pseudoToMCOpcode(DPP64) != -1)
176 switch(
Def->getOpcode()) {
178 case AMDGPU::IMPLICIT_DEF:
181 case AMDGPU::V_MOV_B32_e32:
182 case AMDGPU::V_MOV_B64_PSEUDO:
183 case AMDGPU::V_MOV_B64_e32:
184 case AMDGPU::V_MOV_B64_e64: {
185 auto &Op1 =
Def->getOperand(1);
196 int16_t RegClass =
MI.getDesc().operands()[
Idx].RegClass;
201 return TRI->getRegSizeInBits(*
TRI->getRegClass(RegClass));
208 bool IsShrinkable)
const {
210 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp ||
211 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
213 bool HasVOP3DPP =
ST->hasVOP3DPP();
215 auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
222 auto *RowMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
223 assert(RowMaskOpnd && RowMaskOpnd->isImm());
224 auto *BankMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
225 assert(BankMaskOpnd && BankMaskOpnd->isImm());
226 const bool MaskAllLanes =
227 RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
230 !(
TII->isVOPC(DPPOp) || (
TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
231 TII->isVOPC(OrigOpE32)))) &&
232 "VOPC cannot form DPP unless mask is full");
241 if (
auto *Dst =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst)) {
245 if (
auto *SDst =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::sdst)) {
246 if (
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, SDst)) {
255 assert(OldIdx == NumOperands);
259 TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()),
265 }
else if (
TII->isVOPC(DPPOp) || (
TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
266 TII->isVOPC(OrigOpE32))) {
271 LLVM_DEBUG(
dbgs() <<
" failed: no old operand in DPP instruction,"
277 auto *Mod0 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
280 AMDGPU::OpName::src0_modifiers));
283 DPPInst.addImm(Mod0->getImm());
289 auto *Src0 =
TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
291 int Src0Idx = NumOperands;
292 if (!
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
298 DPPInst->getOperand(NumOperands).setIsKill(
false);
301 auto *Mod1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
304 AMDGPU::OpName::src1_modifiers));
307 DPPInst.addImm(Mod1->getImm());
313 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
315 int OpNum = NumOperands;
319 if (!
ST->hasDPPSrc1SGPR()) {
322 "Src0 and Src1 operands should have the same size");
325 if (!
TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
334 auto *Mod2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
340 DPPInst.addImm(Mod2->getImm());
343 auto *Src2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
345 if (!
TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
346 !
TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
356 auto *ClampOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
358 DPPInst.addImm(ClampOpr->getImm());
360 auto *VdstInOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst_in);
363 DPPInst.add(*VdstInOpr);
365 auto *OmodOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::omod);
367 DPPInst.addImm(OmodOpr->getImm());
371 if (
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
376 if (Mod0 &&
TII->isVOP3(OrigMI) && !
TII->isVOP3P(OrigMI))
385 DPPInst.addImm(OpSel);
387 if (
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
395 assert(Src2 &&
"Expected vop3p with 3 operands");
397 LLVM_DEBUG(
dbgs() <<
" failed: op_sel_hi must be all set to one\n");
402 DPPInst.addImm(OpSelHi);
404 auto *NegOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_lo);
406 DPPInst.addImm(NegOpr->getImm());
408 auto *NegHiOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::neg_hi);
410 DPPInst.addImm(NegHiOpr->getImm());
412 auto *ByteSelOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
415 DPPInst.addImm(ByteSelOpr->getImm());
418 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
419 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
420 DPPInst.add(*
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
421 DPPInst.addImm(CombBCZ ? 1 : 0);
425 DPPInst.getInstr()->eraseFromParent();
429 return DPPInst.getInstr();
436 case AMDGPU::V_ADD_U32_e32:
437 case AMDGPU::V_ADD_U32_e64:
438 case AMDGPU::V_ADD_CO_U32_e32:
439 case AMDGPU::V_ADD_CO_U32_e64:
440 case AMDGPU::V_OR_B32_e32:
441 case AMDGPU::V_OR_B32_e64:
442 case AMDGPU::V_SUBREV_U32_e32:
443 case AMDGPU::V_SUBREV_U32_e64:
444 case AMDGPU::V_SUBREV_CO_U32_e32:
445 case AMDGPU::V_SUBREV_CO_U32_e64:
446 case AMDGPU::V_MAX_U32_e32:
447 case AMDGPU::V_MAX_U32_e64:
448 case AMDGPU::V_XOR_B32_e32:
449 case AMDGPU::V_XOR_B32_e64:
450 if (OldOpnd->
getImm() == 0)
453 case AMDGPU::V_AND_B32_e32:
454 case AMDGPU::V_AND_B32_e64:
455 case AMDGPU::V_MIN_U32_e32:
456 case AMDGPU::V_MIN_U32_e64:
458 std::numeric_limits<uint32_t>::max())
461 case AMDGPU::V_MIN_I32_e32:
462 case AMDGPU::V_MIN_I32_e64:
463 if (
static_cast<int32_t
>(OldOpnd->
getImm()) ==
464 std::numeric_limits<int32_t>::max())
467 case AMDGPU::V_MAX_I32_e32:
468 case AMDGPU::V_MAX_I32_e64:
469 if (
static_cast<int32_t
>(OldOpnd->
getImm()) ==
470 std::numeric_limits<int32_t>::min())
473 case AMDGPU::V_MUL_I32_I24_e32:
474 case AMDGPU::V_MUL_I32_I24_e64:
475 case AMDGPU::V_MUL_U32_U24_e32:
476 case AMDGPU::V_MUL_U32_U24_e64:
477 if (OldOpnd->
getImm() == 1)
486 MachineOperand *OldOpndValue,
bool CombBCZ,
bool IsShrinkable)
const {
488 if (!CombBCZ && OldOpndValue && OldOpndValue->
isImm()) {
489 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
490 if (!Src1 || !Src1->isReg()) {
491 LLVM_DEBUG(
dbgs() <<
" failed: no src1 or it isn't a register\n");
495 LLVM_DEBUG(
dbgs() <<
" failed: old immediate isn't an identity\n");
499 auto MovDst =
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
506 return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ, IsShrinkable);
511bool GCNDPPCombine::hasNoImmOrEqual(
MachineInstr &
MI,
unsigned OpndName,
512 int64_t
Value, int64_t Mask)
const {
513 auto *
Imm =
TII->getNamedOperand(
MI, OpndName);
518 return (
Imm->getImm() & Mask) ==
Value;
521bool GCNDPPCombine::combineDPPMov(
MachineInstr &MovMI)
const {
523 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp ||
524 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
527 auto *DstOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
528 assert(DstOpnd && DstOpnd->isReg());
529 auto DPPMovReg = DstOpnd->getReg();
530 if (DPPMovReg.isPhysical()) {
540 if (MovMI.
getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
541 MovMI.
getOpcode() == AMDGPU::V_MOV_B64_dpp) {
542 auto *
DppCtrl =
TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
552 auto *RowMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
553 assert(RowMaskOpnd && RowMaskOpnd->isImm());
554 auto *BankMaskOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
555 assert(BankMaskOpnd && BankMaskOpnd->isImm());
556 const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
557 BankMaskOpnd->getImm() == 0xF;
559 auto *BCZOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
560 assert(BCZOpnd && BCZOpnd->isImm());
561 bool BoundCtrlZero = BCZOpnd->getImm();
563 auto *OldOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
564 auto *SrcOpnd =
TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
566 assert(SrcOpnd && SrcOpnd->isReg());
572 auto *
const OldOpndValue = getOldOpndValue(*OldOpnd);
577 assert(!OldOpndValue || OldOpndValue->
isImm() || OldOpndValue == OldOpnd);
579 bool CombBCZ =
false;
581 if (MaskAllLanes && BoundCtrlZero) {
584 if (!OldOpndValue || !OldOpndValue->
isImm()) {
589 if (OldOpndValue->
getImm() == 0) {
594 }
else if (BoundCtrlZero) {
597 " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
606 dbgs() << *OldOpndValue;
607 dbgs() <<
", bound_ctrl=" << CombBCZ <<
'\n');
613 if (CombBCZ && OldOpndValue) {
616 MRI->createVirtualRegister(RC));
618 TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.
Reg);
619 DPPMIs.push_back(UndefInst.getInstr());
622 OrigMIs.push_back(&MovMI);
623 bool Rollback =
true;
626 for (
auto &
Use :
MRI->use_nodbg_operands(DPPMovReg)) {
630 while (!
Uses.empty()) {
634 auto &OrigMI = *
Use->getParent();
639 "There should not be e32 True16 instructions pre-RA");
640 if (OrigOp == AMDGPU::REG_SEQUENCE) {
642 unsigned FwdSubReg = 0;
651 for (OpNo = 1; OpNo < E; OpNo += 2) {
661 for (
auto &
Op :
MRI->use_nodbg_operands(FwdReg)) {
662 if (
Op.getSubReg() == FwdSubReg)
665 RegSeqWithOpNos[&OrigMI].push_back(OpNo);
669 bool IsShrinkable = isShrinkable(OrigMI);
670 if (!(IsShrinkable ||
671 ((
TII->isVOP3P(OrigOp) ||
TII->isVOPC(OrigOp) ||
672 TII->isVOP3(OrigOp)) &&
674 TII->isVOP1(OrigOp) ||
TII->isVOP2(OrigOp))) {
683 auto *Src0 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
684 auto *Src1 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
690 auto *Src2 =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2);
691 assert(Src0 &&
"Src1 without Src0?");
692 if ((
Use == Src0 && ((Src1 && Src1->isIdenticalTo(*Src0)) ||
693 (Src2 && Src2->isIdenticalTo(*Src0)))) ||
694 (
Use == Src1 && (Src1->isIdenticalTo(*Src0) ||
695 (Src2 && Src2->isIdenticalTo(*Src1))))) {
699 <<
" failed: DPP register is used more than once per instruction\n");
705 if (
auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
706 OldOpndValue, CombBCZ, IsShrinkable)) {
707 DPPMIs.push_back(DPPInst);
714 BB->
insert(OrigMI, NewMI);
715 if (
TII->commuteInstruction(*NewMI)) {
718 createDPPInst(*NewMI, MovMI, CombOldVGPR, OldOpndValue, CombBCZ,
720 DPPMIs.push_back(DPPInst);
725 NewMI->eraseFromParent();
729 OrigMIs.push_back(&OrigMI);
732 Rollback |= !
Uses.empty();
734 for (
auto *
MI : *(Rollback? &DPPMIs : &OrigMIs))
735 MI->eraseFromParent();
738 for (
auto &S : RegSeqWithOpNos) {
739 if (
MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
740 S.first->eraseFromParent();
743 while (!S.second.empty())
744 S.first->getOperand(S.second.pop_back_val()).setIsUndef();
757 TII =
ST->getInstrInfo();
759 bool Changed =
false;
760 for (
auto &
MBB : MF) {
762 if (
MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(
MI)) {
764 ++NumDPPMovsCombined;
765 }
else if (
MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
766 MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
767 if (
ST->hasDPALU_DPP() && combineDPPMov(
MI)) {
769 ++NumDPPMovsCombined;
773 if (M && combineDPPMov(*M))
774 ++NumDPPMovsCombined;
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one Idx.
unsigned Idx
static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd)
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx, MachineRegisterInfo &MRI)
Rewrite Partial Register Uses
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
TargetInstrInfo::RegSubRegPair RegSubRegPair
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
This class represents an Operation in the Expression.
FunctionPass class - This class is used to implement most global optimizations.
unsigned getSize(const MachineInstr &MI) const
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
virtual MachineFunctionProperties getRequiredProperties() const
Properties which a MachineFunction may have at a given point in time.
MachineFunctionProperties & set(Property P)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
unsigned getNumOperands() const
Returns the total number of operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z,...
void insert(mop_iterator InsertBefore, ArrayRef< MachineOperand > Ops)
Inserts Ops BEFORE It. Can untie/retie tied operands.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
uint32_t getFlags() const
Return the MI flags bitvector.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A Use represents the edge between a Value definition and its users.
LLVM Value Representation.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
LLVM_READONLY int getDPPOp32(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READONLY int getDPPOp64(uint16_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ Undef
Value of the register doesn't matter.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
auto reverse(ContainerTy &&C)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, const TargetRegisterClass &TRC, MachineRegisterInfo &MRI)
Returns true if a reg:subreg pair P has a TRC class.
void initializeGCNDPPCombinePass(PassRegistry &)
FunctionPass * createGCNDPPCombinePass()
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
A pair composed of a register and a sub-register index.