LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIFoldOperands.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-10-20 13:21:21
Coverage:   Lines: 407 / 487 (83.6 %)   Functions: 20 / 30 (66.7 %)
Legend: Lines: hit / not hit

          Line data    Source code
       1             : //===-- SIFoldOperands.cpp - Fold operands -------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : /// \file
       9             : //===----------------------------------------------------------------------===//
      10             : //
      11             : 
      12             : #include "AMDGPU.h"
      13             : #include "AMDGPUSubtarget.h"
      14             : #include "SIInstrInfo.h"
      15             : #include "SIMachineFunctionInfo.h"
      16             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      17             : #include "llvm/ADT/DepthFirstIterator.h"
      18             : #include "llvm/CodeGen/LiveIntervals.h"
      19             : #include "llvm/CodeGen/MachineFunctionPass.h"
      20             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      21             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      22             : #include "llvm/Support/Debug.h"
      23             : #include "llvm/Support/raw_ostream.h"
      24             : #include "llvm/Target/TargetMachine.h"
      25             : 
      26             : #define DEBUG_TYPE "si-fold-operands"
      27             : using namespace llvm;
      28             : 
      29             : namespace {
      30             : 
      31             : struct FoldCandidate {
      32             :   MachineInstr *UseMI;
      33             :   union {
      34             :     MachineOperand *OpToFold;
      35             :     uint64_t ImmToFold;
      36             :     int FrameIndexToFold;
      37             :   };
      38             :   int ShrinkOpcode;
      39             :   unsigned char UseOpNo;
      40             :   MachineOperand::MachineOperandType Kind;
      41             :   bool Commuted;
      42             : 
      43             :   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
      44             :                 bool Commuted_ = false,
      45      139621 :                 int ShrinkOp = -1) :
      46             :     UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
      47             :     Kind(FoldOp->getType()),
      48       10826 :     Commuted(Commuted_) {
      49      139599 :     if (FoldOp->isImm()) {
      50       51185 :       ImmToFold = FoldOp->getImm();
      51       88414 :     } else if (FoldOp->isFI()) {
      52          73 :       FrameIndexToFold = FoldOp->getIndex();
      53             :     } else {
      54             :       assert(FoldOp->isReg());
      55       88341 :       OpToFold = FoldOp;
      56             :     }
      57             :   }
      58             : 
      59           0 :   bool isFI() const {
      60           0 :     return Kind == MachineOperand::MO_FrameIndex;
      61             :   }
      62             : 
      63           0 :   bool isImm() const {
      64           0 :     return Kind == MachineOperand::MO_Immediate;
      65             :   }
      66             : 
      67           0 :   bool isReg() const {
      68           0 :     return Kind == MachineOperand::MO_Register;
      69             :   }
      70             : 
      71           0 :   bool isCommuted() const {
      72           0 :     return Commuted;
      73             :   }
      74             : 
      75           0 :   bool needsShrink() const {
      76           0 :     return ShrinkOpcode != -1;
      77             :   }
      78             : 
      79           0 :   int getShrinkOpcode() const {
      80           0 :     return ShrinkOpcode;
      81             :   }
      82             : };
      83             : 
      84             : class SIFoldOperands : public MachineFunctionPass {
      85             : public:
      86             :   static char ID;
      87             :   MachineRegisterInfo *MRI;
      88             :   const SIInstrInfo *TII;
      89             :   const SIRegisterInfo *TRI;
      90             :   const GCNSubtarget *ST;
      91             : 
      92             :   void foldOperand(MachineOperand &OpToFold,
      93             :                    MachineInstr *UseMI,
      94             :                    unsigned UseOpIdx,
      95             :                    SmallVectorImpl<FoldCandidate> &FoldList,
      96             :                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
      97             : 
      98             :   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
      99             : 
     100             :   const MachineOperand *isClamp(const MachineInstr &MI) const;
     101             :   bool tryFoldClamp(MachineInstr &MI);
     102             : 
     103             :   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
     104             :   bool tryFoldOMod(MachineInstr &MI);
     105             : 
     106             : public:
     107        3847 :   SIFoldOperands() : MachineFunctionPass(ID) {
     108        3847 :     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
     109        3847 :   }
     110             : 
     111             :   bool runOnMachineFunction(MachineFunction &MF) override;
     112             : 
     113        3823 :   StringRef getPassName() const override { return "SI Fold Operands"; }
     114             : 
     115        3823 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     116        3823 :     AU.setPreservesCFG();
     117        3823 :     MachineFunctionPass::getAnalysisUsage(AU);
     118        3823 :   }
     119             : };
     120             : 
     121             : } // End anonymous namespace.
     122             : 
     123      202871 : INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
     124             :                 "SI Fold Operands", false, false)
     125             : 
     126             : char SIFoldOperands::ID = 0;
     127             : 
     128             : char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
     129             : 
     130             : // Wrapper around isInlineConstant that understands special cases when
     131             : // instruction types are replaced during operand folding.
     132      149933 : static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
     133             :                                      const MachineInstr &UseMI,
     134             :                                      unsigned OpNo,
     135             :                                      const MachineOperand &OpToFold) {
     136      149933 :   if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
     137             :     return true;
     138             : 
     139      101793 :   unsigned Opc = UseMI.getOpcode();
     140      101793 :   switch (Opc) {
     141         348 :   case AMDGPU::V_MAC_F32_e64:
     142             :   case AMDGPU::V_MAC_F16_e64:
     143             :   case AMDGPU::V_FMAC_F32_e64: {
     144             :     // Special case for mac. Since this is replaced with mad when folded into
     145             :     // src2, we need to check the legality for the final instruction.
     146         348 :     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     147         348 :     if (static_cast<int>(OpNo) == Src2Idx) {
     148             :       bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
     149             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     150             : 
     151         120 :       unsigned Opc = IsFMA ?
     152             :         AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     153         120 :       const MCInstrDesc &MadDesc = TII->get(Opc);
     154         120 :       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     155             :     }
     156             :     return false;
     157             :   }
     158             :   default:
     159             :     return false;
     160             :   }
     161             : }
     162             : 
     163           0 : FunctionPass *llvm::createSIFoldOperandsPass() {
     164           0 :   return new SIFoldOperands();
     165             : }
     166             : 
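                      : // Applies one collected FoldCandidate: rewrites the recorded use operand to an
                      : // immediate, frame index, or register, shrinking the instruction when required.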
     167      128517 : static bool updateOperand(FoldCandidate &Fold,
     168             :                           const SIInstrInfo &TII,
     169             :                           const TargetRegisterInfo &TRI) {
     170      128517 :   MachineInstr *MI = Fold.UseMI;
     171      128517 :   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
     172             :   assert(Old.isReg());
     173             : 
     174      128517 :   if (Fold.isImm()) {
     175       50935 :     if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
     176             :       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
     177             :       // already set.
     178             :       unsigned Opcode = MI->getOpcode();
     179         155 :       int OpNo = MI->getOperandNo(&Old);
     180             :       int ModIdx = -1;
     181         155 :       if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
     182             :         ModIdx = AMDGPU::OpName::src0_modifiers;
     183          92 :       else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
     184             :         ModIdx = AMDGPU::OpName::src1_modifiers;
     185           0 :       else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
     186             :         ModIdx = AMDGPU::OpName::src2_modifiers;
     187             :       assert(ModIdx != -1);
     188         155 :       ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
     189         155 :       MachineOperand &Mod = MI->getOperand(ModIdx);
     190         155 :       unsigned Val = Mod.getImm();
     191         155 :       if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
     192             :         return false;
     193             :       // If upper part is all zero we do not need op_sel_hi.
     194         155 :       if (!isUInt<16>(Fold.ImmToFold)) {
     195         139 :         if (!(Fold.ImmToFold & 0xffff)) {
     196           3 :           Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
     197           3 :           Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
     198           3 :           Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
     199           3 :           return true;
     200             :         }
     201         136 :         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
     202             :       }
     203             :     }
     204             : 
     205       50932 :     if (Fold.needsShrink()) {
     206         206 :       MachineBasicBlock *MBB = MI->getParent();
     207         206 :       auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
     208         206 :       if (Liveness != MachineBasicBlock::LQR_Dead)
     209             :         return false;
     210             : 
     211          60 :       MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     212          60 :       int Op32 = Fold.getShrinkOpcode();
     213          60 :       MachineOperand &Dst0 = MI->getOperand(0);
     214             :       MachineOperand &Dst1 = MI->getOperand(1);
     215             :       assert(Dst0.isDef() && Dst1.isDef());
     216             : 
     217          60 :       bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
     218             : 
     219          60 :       const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
     220          60 :       unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
     221          60 :       const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
     222          60 :       unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
     223             : 
     224          60 :       MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
     225             : 
     226          60 :       if (HaveNonDbgCarryUse) {
     227          62 :         BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
     228          31 :           .addReg(AMDGPU::VCC, RegState::Kill);
     229             :       }
     230             : 
     231             :       // Keep the old instruction around to avoid breaking iterators, but
     232             :       // replace the outputs with dummy registers.
     233          60 :       Dst0.setReg(NewReg0);
     234          60 :       Dst1.setReg(NewReg1);
     235             : 
     236          60 :       if (Fold.isCommuted())
     237          60 :         TII.commuteInstruction(*Inst32, false);
     238          60 :       return true;
     239             :     }
     240             : 
     241       50726 :     Old.ChangeToImmediate(Fold.ImmToFold);
     242       50726 :     return true;
     243             :   }
     244             : 
     245             :   assert(!Fold.needsShrink() && "not handled");
     246             : 
     247       77582 :   if (Fold.isFI()) {
     248          73 :     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     249          73 :     return true;
     250             :   }
     251             : 
     252       77509 :   MachineOperand *New = Fold.OpToFold;
     253      155018 :   if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
     254       77509 :       TargetRegisterInfo::isVirtualRegister(New->getReg())) {
     255       77509 :     Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
     256             : 
     257             :     Old.setIsUndef(New->isUndef());
     258       77509 :     return true;
     259             :   }
     260             : 
     261             :   // FIXME: Handle physical registers.
     262             : 
     263             :   return false;
     264             : }
     265             : 
     266             : static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
     267             :                               const MachineInstr *MI) {
     268       74673 :   for (auto Candidate : FoldList) {
     269        4629 :     if (Candidate.UseMI == MI)
     270             :       return true;
     271             :   }
     272             :   return false;
     273             : }
     274             : 
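                      : // Records MI's operand OpNo as a fold candidate for OpToFold, switching to the
                      : // three-address mad/fma form or commuting MI when that is what makes the fold legal.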
     275      197245 : static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     276             :                              MachineInstr *MI, unsigned OpNo,
     277             :                              MachineOperand *OpToFold,
     278             :                              const SIInstrInfo *TII) {
     279      197245 :   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     280             : 
     281             :     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     282       70130 :     unsigned Opc = MI->getOpcode();
     283       70130 :     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
     284         250 :          Opc == AMDGPU::V_FMAC_F32_e64) &&
     285         250 :         (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
     286             :       bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
     287             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     288         132 :       unsigned NewOpc = IsFMA ?
     289             :         AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     290             : 
     291             :       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     292             :       // to fold the operand.
     293         132 :       MI->setDesc(TII->get(NewOpc));
     294         132 :       bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
     295         132 :       if (FoldAsMAD) {
     296          64 :         MI->untieRegOperand(OpNo);
     297          64 :         return true;
     298             :       }
     299          68 :       MI->setDesc(TII->get(Opc));
     300             :     }
     301             : 
     302             :     // Special case for s_setreg_b32
     303       70066 :     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
     304          22 :       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
     305          22 :       FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     306          22 :       return true;
     307             :     }
     308             : 
     309             :     // If we are already folding into another operand of MI, then
     310             :     // we can't commute the instruction, otherwise we risk making the
     311             :     // other fold illegal.
     312       70044 :     if (isUseMIInFoldList(FoldList, MI))
     313             :       return false;
     314             : 
     315             :     unsigned CommuteOpNo = OpNo;
     316             : 
     317             :     // Operand is not legal, so try to commute the instruction to
     318             :     // see if this makes it possible to fold.
     319       70044 :     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
     320       70044 :     unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
     321       70044 :     bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
     322             : 
     323       70044 :     if (CanCommute) {
     324       18852 :       if (CommuteIdx0 == OpNo)
     325        8831 :         CommuteOpNo = CommuteIdx1;
     326       10021 :       else if (CommuteIdx1 == OpNo)
     327             :         CommuteOpNo = CommuteIdx0;
     328             :     }
     329             : 
     330             : 
     331             :     // One of the operands might be an Imm operand, and OpNo may refer to it after
     332             :     // the call of commuteInstruction() below. Such situations are avoided
     333             :     // here explicitly as OpNo must be a register operand to be a candidate
     334             :     // for memory folding.
     335       70044 :     if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
     336       18804 :                        !MI->getOperand(CommuteIdx1).isReg()))
     337             :       return false;
     338             : 
     339       79070 :     if (!CanCommute ||
     340       13939 :         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
     341       51579 :       return false;
     342             : 
     343       13552 :     if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
     344       24200 :       if ((Opc == AMDGPU::V_ADD_I32_e64 ||
     345       11862 :            Opc == AMDGPU::V_SUB_I32_e64 ||
     346       12102 :            Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
     347             :           OpToFold->isImm()) {
     348         208 :         MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
     349             : 
     350             :         // Verify the other operand is a VGPR, otherwise we would violate the
     351             :         // constant bus restriction.
     352         208 :         unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
     353         208 :         MachineOperand &OtherOp = MI->getOperand(OtherIdx);
     354         416 :         if (!OtherOp.isReg() ||
     355         208 :             !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
     356           2 :           return false;
     357             : 
     358             :         assert(MI->getOperand(1).isDef());
     359             : 
     360         206 :         int Op32 =  AMDGPU::getVOPe32(Opc);
     361         412 :         FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
     362             :                                          Op32));
     363         206 :         return true;
     364             :       }
     365             : 
     366       11892 :       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
     367       11892 :       return false;
     368             :     }
     369             : 
     370        2904 :     FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
     371        1452 :     return true;
     372             :   }
     373             : 
     374      254230 :   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     375      127115 :   return true;
     376             : }
     377             : 
     378             : // If the use operand doesn't care about the value, this may be an operand only
     379             : // used for register indexing, in which case it is unsafe to fold.
     380           0 : static bool isUseSafeToFold(const SIInstrInfo *TII,
     381             :                             const MachineInstr &MI,
     382             :                             const MachineOperand &UseMO) {
     383      379449 :   return !UseMO.isUndef() && !TII->isSDWA(MI);
     384             :   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
     385             : }
     386             : 
     387      379449 : void SIFoldOperands::foldOperand(
     388             :   MachineOperand &OpToFold,
     389             :   MachineInstr *UseMI,
     390             :   unsigned UseOpIdx,
     391             :   SmallVectorImpl<FoldCandidate> &FoldList,
     392             :   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
     393      379449 :   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
     394             : 
     395             :   if (!isUseSafeToFold(TII, *UseMI, UseOp))
     396             :     return;
     397             : 
     398             :   // FIXME: Fold operands with subregs.
     399      378388 :   if (UseOp.isReg() && OpToFold.isReg()) {
     400      284787 :     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
     401             :       return;
     402             : 
     403             :     // Don't fold subregister extracts into tied operands unless it is a full
     404             :     // copy, since a subregister use tied to a full register def doesn't really
     405             :     // make sense. e.g. don't fold:
     406             :     //
     407             :     // %1 = COPY %0:sub1
     408             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
     409             :     //
     410             :     //  into
     411             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
     412      279233 :     if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
     413             :       return;
     414             :   }
     415             : 
     416             :   // Special case for REG_SEQUENCE: We can't fold literals into
     417             :   // REG_SEQUENCE instructions, so we have to fold them into the
     418             :   // uses of REG_SEQUENCE.
     419      372430 :   if (UseMI->isRegSequence()) {
     420      114178 :     unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
     421      114178 :     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
     422             : 
     423             :     for (MachineRegisterInfo::use_iterator
     424      114178 :            RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
     425      266356 :          RSUse != RSE; ++RSUse) {
     426             : 
     427      152178 :       MachineInstr *RSUseMI = RSUse->getParent();
     428      152178 :       if (RSUse->getSubReg() != RegSeqDstSubReg)
     429             :         continue;
     430             : 
     431         993 :       foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
     432             :                   CopiesToReplace);
     433             :     }
     434             : 
     435             :     return;
     436             :   }
     437             : 
     438             : 
     439             :   bool FoldingImm = OpToFold.isImm();
     440             : 
     441      258252 :   if (FoldingImm && UseMI->isCopy()) {
     442       11269 :     unsigned DestReg = UseMI->getOperand(0).getReg();
     443             :     const TargetRegisterClass *DestRC
     444       11269 :       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
     445       11051 :       MRI->getRegClass(DestReg) :
     446         218 :       TRI->getPhysRegClass(DestReg);
     447             : 
     448       11269 :     unsigned SrcReg  = UseMI->getOperand(1).getReg();
     449       11269 :     if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
     450             :       TargetRegisterInfo::isVirtualRegister(SrcReg)) {
     451       11051 :       const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
     452       11051 :       if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
     453             :         MachineRegisterInfo::use_iterator NextUse;
     454        3893 :         SmallVector<FoldCandidate, 4> CopyUses;
     455       10826 :         for (MachineRegisterInfo::use_iterator
     456        3893 :           Use = MRI->use_begin(DestReg), E = MRI->use_end();
     457       14719 :           Use != E; Use = NextUse) {
     458             :           NextUse = std::next(Use);
     459             :           FoldCandidate FC = FoldCandidate(Use->getParent(),
     460       10826 :            Use.getOperandNo(), &UseMI->getOperand(1));
     461       10826 :           CopyUses.push_back(FC);
     462             :        }
     463       14719 :         for (auto & F : CopyUses) {
     464       10826 :           foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
     465             :            FoldList, CopiesToReplace);
     466             :         }
     467             :       }
     468             :     }
     469             : 
     470             :     // In order to fold immediates into copies, we need to change the
     471             :     // copy to a MOV.
     472             : 
     473       11269 :     unsigned MovOp = TII->getMovOpcode(DestRC);
     474       11269 :     if (MovOp == AMDGPU::COPY)
     475             :       return;
     476             : 
     477       11269 :     UseMI->setDesc(TII->get(MovOp));
     478       11269 :     CopiesToReplace.push_back(UseMI);
     479             :   } else {
     480       58363 :     if (UseMI->isCopy() && OpToFold.isReg() &&
     481       58327 :         TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
     482      100544 :         TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
     483       87585 :         TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
     484      287834 :         TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
     485        3538 :         !UseMI->getOperand(1).getSubReg()) {
     486        3538 :       UseMI->getOperand(1).setReg(OpToFold.getReg());
     487        3538 :       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
     488        3538 :       UseMI->getOperand(1).setIsKill(false);
     489        3538 :       CopiesToReplace.push_back(UseMI);
     490             :       OpToFold.setIsKill(false);
     491        3538 :       return;
     492             :     }
     493             : 
     494      243445 :     const MCInstrDesc &UseDesc = UseMI->getDesc();
     495             : 
     496             :     // Don't fold into target independent nodes.  Target independent opcodes
     497             :     // don't have defined register classes.
     498      243445 :     if (UseDesc.isVariadic() ||
     499      243445 :         UseOp.isImplicit() ||
     500      240697 :         UseDesc.OpInfo[UseOpIdx].RegClass == -1)
     501             :       return;
     502             :   }
     503             : 
     504      197113 :   if (!FoldingImm) {
     505      136889 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     506             : 
     507             :     // FIXME: We could try to change the instruction from 64-bit to 32-bit
     508             :     // to enable more folding opportunities.  The shrink operands pass
     509             :     // already does this.
     510      136889 :     return;
     511             :   }
     512             : 
     513             : 
     514       60224 :   const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
     515             :   const TargetRegisterClass *FoldRC =
     516       60224 :     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
     517             : 
     518             : 
     519             :   // Split 64-bit constants into 32-bits for folding.
     520       60224 :   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     521        2810 :     unsigned UseReg = UseOp.getReg();
     522             :     const TargetRegisterClass *UseRC
     523        2810 :       = TargetRegisterInfo::isVirtualRegister(UseReg) ?
     524        2810 :       MRI->getRegClass(UseReg) :
     525           0 :       TRI->getPhysRegClass(UseReg);
     526             : 
     527        5620 :     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
     528             :       return;
     529             : 
     530        2810 :     APInt Imm(64, OpToFold.getImm());
     531        2810 :     if (UseOp.getSubReg() == AMDGPU::sub0) {
     532        2638 :       Imm = Imm.getLoBits(32);
     533             :     } else {
     534             :       assert(UseOp.getSubReg() == AMDGPU::sub1);
     535        2982 :       Imm = Imm.getHiBits(32);
     536             :     }
     537             : 
     538             :     MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
     539        2810 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
     540             :     return;
     541             :   }
     542             : 
     543             : 
     544             : 
     545       57414 :   tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     546             : }
     547             : 
     548        2926 : static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
     549             :                                   uint32_t LHS, uint32_t RHS) {
     550        2926 :   switch (Opcode) {
     551          13 :   case AMDGPU::V_AND_B32_e64:
     552             :   case AMDGPU::V_AND_B32_e32:
     553             :   case AMDGPU::S_AND_B32:
     554          13 :     Result = LHS & RHS;
     555          13 :     return true;
     556          10 :   case AMDGPU::V_OR_B32_e64:
     557             :   case AMDGPU::V_OR_B32_e32:
     558             :   case AMDGPU::S_OR_B32:
     559          10 :     Result = LHS | RHS;
     560          10 :     return true;
     561           0 :   case AMDGPU::V_XOR_B32_e64:
     562             :   case AMDGPU::V_XOR_B32_e32:
     563             :   case AMDGPU::S_XOR_B32:
     564           0 :     Result = LHS ^ RHS;
     565           0 :     return true;
     566          11 :   case AMDGPU::V_LSHL_B32_e64:
     567             :   case AMDGPU::V_LSHL_B32_e32:
     568             :   case AMDGPU::S_LSHL_B32:
     569             :     // The instruction ignores the high bits for out of bounds shifts.
     570          11 :     Result = LHS << (RHS & 31);
     571          11 :     return true;
     572           3 :   case AMDGPU::V_LSHLREV_B32_e64:
     573             :   case AMDGPU::V_LSHLREV_B32_e32:
     574           3 :     Result = RHS << (LHS & 31);
     575           3 :     return true;
     576           8 :   case AMDGPU::V_LSHR_B32_e64:
     577             :   case AMDGPU::V_LSHR_B32_e32:
     578             :   case AMDGPU::S_LSHR_B32:
     579           8 :     Result = LHS >> (RHS & 31);
     580           8 :     return true;
     581           3 :   case AMDGPU::V_LSHRREV_B32_e64:
     582             :   case AMDGPU::V_LSHRREV_B32_e32:
     583           3 :     Result = RHS >> (LHS & 31);
     584           3 :     return true;
     585           8 :   case AMDGPU::V_ASHR_I32_e64:
     586             :   case AMDGPU::V_ASHR_I32_e32:
     587             :   case AMDGPU::S_ASHR_I32:
     588           8 :     Result = static_cast<int32_t>(LHS) >> (RHS & 31);
     589           8 :     return true;
     590           3 :   case AMDGPU::V_ASHRREV_I32_e64:
     591             :   case AMDGPU::V_ASHRREV_I32_e32:
     592           3 :     Result = static_cast<int32_t>(RHS) >> (LHS & 31);
     593           3 :     return true;
     594             :   default:
     595             :     return false;
     596             :   }
     597             : }
     598             : 
     599             : static unsigned getMovOpc(bool IsScalar) {
     600          75 :   return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     601             : }
     602             : 
     603             : /// Remove any leftover implicit operands from mutating the instruction. e.g.
     604             : /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
     605             : /// anymore.
     606         202 : static void stripExtraCopyOperands(MachineInstr &MI) {
     607         202 :   const MCInstrDesc &Desc = MI.getDesc();
     608         202 :   unsigned NumOps = Desc.getNumOperands() +
     609         202 :                     Desc.getNumImplicitUses() +
     610         202 :                     Desc.getNumImplicitDefs();
     611             : 
     612         336 :   for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
     613         134 :     MI.RemoveOperand(I);
     614         202 : }
     615             : 
     616             : static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
     617             :   MI.setDesc(NewDesc);
     618         200 :   stripExtraCopyOperands(MI);
     619             : }
     620             : 
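                      : // If Op is a virtual register defined by a move-immediate, returns that
                      : // immediate operand; otherwise returns Op itself.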
     621      112286 : static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
     622             :                                                MachineOperand &Op) {
     623      112286 :   if (Op.isReg()) {
     624             :     // If this has a subregister, it obviously is a register source.
     625      111321 :     if (Op.getSubReg() != AMDGPU::NoSubRegister ||
     626      102061 :         !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
     627             :       return &Op;
     628             : 
     629      102048 :     MachineInstr *Def = MRI.getVRegDef(Op.getReg());
     630      204094 :     if (Def && Def->isMoveImmediate()) {
     631       57288 :       MachineOperand &ImmSrc = Def->getOperand(1);
     632       57288 :       if (ImmSrc.isImm())
     633       57181 :         return &ImmSrc;
     634             :     }
     635             :   }
     636             : 
     637             :   return &Op;
     638             : }
     639             : 
     640             : // Try to simplify operations with a constant that may appear after instruction
     641             : // selection.
     642             : // TODO: See if a frame index with a fixed offset can fold.
     643      149474 : static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
     644             :                               const SIInstrInfo *TII,
     645             :                               MachineInstr *MI,
     646             :                               MachineOperand *ImmOp) {
     647      149474 :   unsigned Opc = MI->getOpcode();
     648      298948 :   if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
     649      149474 :       Opc == AMDGPU::S_NOT_B32) {
     650          20 :     MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
     651          10 :     mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
     652          10 :     return true;
     653             :   }
     654             : 
     655      149464 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     656      149464 :   if (Src1Idx == -1)
     657             :     return false;
     658             : 
     659       56143 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     660      112286 :   MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
     661      112286 :   MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
     662             : 
     663       56143 :   if (!Src0->isImm() && !Src1->isImm())
     664             :     return false;
     665             : 
     666      110428 :   if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
     667          34 :     if (Src0->isImm() && Src0->getImm() == 0) {
     668             :       // v_lshl_or_b32 0, X, Y -> copy Y
     669             :       // v_lshl_or_b32 0, X, K -> v_mov_b32 K
     670           6 :       bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
     671           6 :       MI->RemoveOperand(Src1Idx);
     672           6 :       MI->RemoveOperand(Src0Idx);
     673             : 
     674           7 :       MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32));
     675           6 :       return true;
     676             :     }
     677             :   }
     678             : 
     679             :   // and k0, k1 -> v_mov_b32 (k0 & k1)
     680             :   // or k0, k1 -> v_mov_b32 (k0 | k1)
     681             :   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
     682       55208 :   if (Src0->isImm() && Src1->isImm()) {
     683             :     int32_t NewImm;
     684        2926 :     if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
     685             :       return false;
     686             : 
     687             :     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     688          59 :     bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
     689             : 
     690             :     // Be careful to change the right operand, src0 may belong to a different
     691             :     // instruction.
     692         118 :     MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
     693          59 :     MI->RemoveOperand(Src1Idx);
     694          59 :     mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
     695          59 :     return true;
     696             :   }
     697             : 
     698       52282 :   if (!MI->isCommutable())
     699             :     return false;
     700             : 
     701       41732 :   if (Src0->isImm() && !Src1->isImm()) {
     702             :     std::swap(Src0, Src1);
     703             :     std::swap(Src0Idx, Src1Idx);
     704             :   }
     705             : 
     706       41732 :   int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
     707       83464 :   if (Opc == AMDGPU::V_OR_B32_e64 ||
     708       41732 :       Opc == AMDGPU::V_OR_B32_e32 ||
     709       41732 :       Opc == AMDGPU::S_OR_B32) {
     710         981 :     if (Src1Val == 0) {
     711             :       // y = or x, 0 => y = copy x
     712         110 :       MI->RemoveOperand(Src1Idx);
     713         110 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     714         871 :     } else if (Src1Val == -1) {
     715             :       // y = or x, -1 => y = v_mov_b32 -1
     716           2 :       MI->RemoveOperand(Src1Idx);
     717           2 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
     718             :     } else
     719             :       return false;
     720             : 
     721         112 :     return true;
     722             :   }
     723             : 
     724       40751 :   if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
     725       73320 :       MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
     726             :       MI->getOpcode() == AMDGPU::S_AND_B32) {
     727       11445 :     if (Src1Val == 0) {
     728             :       // y = and x, 0 => y = v_mov_b32 0
     729           4 :       MI->RemoveOperand(Src0Idx);
     730           4 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
     731       11441 :     } else if (Src1Val == -1) {
     732             :       // y = and x, -1 => y = copy x
     733           2 :       MI->RemoveOperand(Src1Idx);
     734           2 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     735           2 :       stripExtraCopyOperands(*MI);
     736             :     } else
     737             :       return false;
     738             : 
     739           6 :     return true;
     740             :   }
     741             : 
     742       29219 :   if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
     743       58200 :       MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
     744             :       MI->getOpcode() == AMDGPU::S_XOR_B32) {
     745         442 :     if (Src1Val == 0) {
     746             :       // y = xor x, 0 => y = copy x
     747           6 :       MI->RemoveOperand(Src1Idx);
     748           6 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     749           6 :       return true;
     750             :     }
     751             :   }
     752             : 
     753             :   return false;
     754             : }
     755             : 
     756             : // Try to fold an instruction into a simpler one
     757     1087736 : static bool tryFoldInst(const SIInstrInfo *TII,
     758             :                         MachineInstr *MI) {
     759     1087736 :   unsigned Opc = MI->getOpcode();
     760             : 
     761     2175472 :   if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
     762     1087736 :       Opc == AMDGPU::V_CNDMASK_B32_e64    ||
     763     1087736 :       Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
     764       11437 :     const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
     765       11437 :     const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
     766       11437 :     if (Src1->isIdenticalTo(*Src0)) {
     767             :       LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
     768           7 :       int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     769           7 :       if (Src2Idx != -1)
     770           6 :         MI->RemoveOperand(Src2Idx);
     771           7 :       MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
     772          12 :       mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
     773             :                                                : getMovOpc(false)));
     774             :       LLVM_DEBUG(dbgs() << *MI << '\n');
     775           7 :       return true;
     776             :     }
     777             :   }
     778             : 
     779             :   return false;
     780             : }
     781             : 
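                      : // Folds OpToFold (the source of MI, a move or copy) into every use of MI's
                      : // destination register, collecting candidates first and applying them afterwards.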
     782      323467 : void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     783             :                                      MachineOperand &OpToFold) const {
     784             :   // We need to mutate the operands of new mov instructions to add implicit
     785             :   // uses of EXEC, but adding them invalidates the use_iterator, so defer
     786             :   // this.
     787             :   SmallVector<MachineInstr *, 4> CopiesToReplace;
     788      323467 :   SmallVector<FoldCandidate, 4> FoldList;
     789      323467 :   MachineOperand &Dst = MI.getOperand(0);
     790             : 
     791      323467 :   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     792             :   if (FoldingImm) {
     793             :     unsigned NumLiteralUses = 0;
     794             :     MachineOperand *NonInlineUse = nullptr;
     795             :     int NonInlineUseOpNo = -1;
     796             : 
     797             :     MachineRegisterInfo::use_iterator NextUse;
     798             :     for (MachineRegisterInfo::use_iterator
     799       90354 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     800      240486 :          Use != E; Use = NextUse) {
     801             :       NextUse = std::next(Use);
     802      150132 :       MachineInstr *UseMI = Use->getParent();
     803             :       unsigned OpNo = Use.getOperandNo();
     804             : 
     805             :       // Folding the immediate may reveal operations that can be constant
     806             :       // folded or replaced with a copy. This can happen for example after
     807             :       // frame indices are lowered to constants or from splitting 64-bit
     808             :       // constants.
     809             :       //
     810             :       // We may also encounter cases where one or both operands are
     811             :       // immediates materialized into a register, which would ordinarily not
     812             :       // be folded due to multiple uses or operand constraints.
     813             : 
     814      150132 :       if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
     815             :         LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
     816             : 
     817             :         // Some constant folding cases change the same immediate's use to a new
     818             :         // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
     819             :         // again. The same constant folded instruction could also have a second
     820             :         // use operand.
     821         199 :         NextUse = MRI->use_begin(Dst.getReg());
     822             :         FoldList.clear();
     823         199 :         continue;
     824             :       }
     825             : 
     826             :       // Try to fold any inline immediate uses, and then only fold other
     827             :       // constants if they have one use.
     828             :       //
     829             :       // The legality of the inline immediate must be checked based on the use
     830             :       // operand, not the defining instruction, because 32-bit instructions
     831             :       // with 32-bit inline immediate sources may be used to materialize
     832             :       // constants used in 16-bit operands.
     833             :       //
     834             :       // e.g. it is unsafe to fold:
     835             :       //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
     836             :       //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
     837             : 
     838             :       // Folding immediates with more than one use will increase program size.
     839             :       // FIXME: This will also reduce register usage, which may be better
     840             :       // in some cases. A better heuristic is needed.
     841      149933 :       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
     842       48192 :         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
     843             :       } else {
     844      101741 :         if (++NumLiteralUses == 1) {
     845             :           NonInlineUse = &*Use;
     846       62701 :           NonInlineUseOpNo = OpNo;
     847             :         }
     848             :       }
     849             :     }
     850             : 
     851       90354 :     if (NumLiteralUses == 1) {
     852       45991 :       MachineInstr *UseMI = NonInlineUse->getParent();
     853       45991 :       foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
     854             :     }
     855             :   } else {
     856             :     // Folding register.
     857             :     for (MachineRegisterInfo::use_iterator
     858      233113 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     859      506560 :          Use != E; ++Use) {
     860      273447 :       MachineInstr *UseMI = Use->getParent();
     861             : 
     862      273447 :       foldOperand(OpToFold, UseMI, Use.getOperandNo(),
     863             :                   FoldList, CopiesToReplace);
     864             :     }
     865             :   }
     866             : 
     867      323467 :   MachineFunction *MF = MI.getParent()->getParent();
     868             :   // Make sure we add EXEC uses to any new v_mov instructions created.
     869      338274 :   for (MachineInstr *Copy : CopiesToReplace)
     870       14807 :     Copy->addImplicitDefUseOperands(*MF);
     871             : 
     872      451984 :   for (FoldCandidate &Fold : FoldList) {
     873      128517 :     if (updateOperand(Fold, *TII, *TRI)) {
     874             :       // Clear kill flags.
     875      128371 :       if (Fold.isReg()) {
     876             :         assert(Fold.OpToFold && Fold.OpToFold->isReg());
     877             :         // FIXME: Probably shouldn't bother trying to fold if not an
     878             :         // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
     879             :         // copies.
     880       77509 :         MRI->clearKillFlags(Fold.OpToFold->getReg());
     881             :       }
     882             :       LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
     883             :                         << static_cast<int>(Fold.UseOpNo) << " of "
     884             :                         << *Fold.UseMI << '\n');
     885      128371 :       tryFoldInst(TII, Fold.UseMI);
     886         146 :     } else if (Fold.isCommuted()) {
     887             :       // Restoring instruction's original operand order if fold has failed.
     888         146 :       TII->commuteInstruction(*Fold.UseMI, false);
     889             :     }
     890             :   }
     891      323467 : }
     892             : 
     893             : // Clamp patterns are canonically selected to v_max_* instructions, so only
     894             : // handle them.
     895           0 : const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
     896           0 :   unsigned Op = MI.getOpcode();
     897           0 :   switch (Op) {
     898           0 :   case AMDGPU::V_MAX_F32_e64:
     899             :   case AMDGPU::V_MAX_F16_e64:
     900             :   case AMDGPU::V_MAX_F64:
     901             :   case AMDGPU::V_PK_MAX_F16: {
     902           0 :     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
     903           0 :       return nullptr;
     904             : 
     905             :     // Make sure sources are identical.
     906             :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     907             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     908           0 :     if (!Src0->isReg() || !Src1->isReg() ||
     909           0 :         Src0->getReg() != Src1->getReg() ||
     910           0 :         Src0->getSubReg() != Src1->getSubReg() ||
     911             :         Src0->getSubReg() != AMDGPU::NoSubRegister)
     912           0 :       return nullptr;
     913             : 
     914             :     // Can't fold up if we have modifiers.
     915           0 :     if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     916           0 :       return nullptr;
     917             : 
     918             :     unsigned Src0Mods
     919           0 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
     920             :     unsigned Src1Mods
     921           0 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
     922             : 
     923             :     // Having a 0 op_sel_hi would require swizzling the output in the source
     924             :     // instruction, which we can't do.
     925           0 :     unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
     926           0 :     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
     927           0 :       return nullptr;
     928             :     return Src0;
     929             :   }
     930             :   default:
     931             :     return nullptr;
     932             :   }
     933             : }
     934             : 
     935             : // We obviously have multiple uses in a clamp since the register is used twice
     936             : // in the same instruction.
     937         322 : static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
     938             :   int Count = 0;
     939         644 :   for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
     940         966 :        I != E; ++I) {
     941         342 :     if (++Count > 1)
     942          20 :       return false;
     943             :   }
     944             : 
     945         302 :   return true;
     946             : }
     947             : 
     948             : // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
     949      531836 : bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
     950      531836 :   const MachineOperand *ClampSrc = isClamp(MI);
     951      531836 :   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
     952      531568 :     return false;
     953             : 
     954         268 :   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
     955             : 
     956             :   // The type of clamp must be compatible.
     957         536 :   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
     958             :     return false;
     959             : 
     960         171 :   MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
     961         171 :   if (!DefClamp)
     962             :     return false;
     963             : 
     964             :   LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
     965             :                     << '\n');
     966             : 
     967             :   // Clamp is applied after omod, so it is OK if omod is set.
     968             :   DefClamp->setImm(1);
     969         171 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     970         171 :   MI.eraseFromParent();
     971         171 :   return true;
     972             : }
     973             : 
     974          31 : static int getOModValue(unsigned Opc, int64_t Val) {
     975          31 :   switch (Opc) {
     976          30 :   case AMDGPU::V_MUL_F32_e64: {
     977          30 :     switch (static_cast<uint32_t>(Val)) {
     978             :     case 0x3f000000: // 0.5
     979             :       return SIOutMods::DIV2;
     980           2 :     case 0x40000000: // 2.0
     981           2 :       return SIOutMods::MUL2;
     982           8 :     case 0x40800000: // 4.0
     983           8 :       return SIOutMods::MUL4;
     984           0 :     default:
     985           0 :       return SIOutMods::NONE;
     986             :     }
     987             :   }
     988           1 :   case AMDGPU::V_MUL_F16_e64: {
     989           1 :     switch (static_cast<uint16_t>(Val)) {
     990             :     case 0x3800: // 0.5
     991             :       return SIOutMods::DIV2;
     992           0 :     case 0x4000: // 2.0
     993           0 :       return SIOutMods::MUL2;
     994           0 :     case 0x4400: // 4.0
     995           0 :       return SIOutMods::MUL4;
     996           0 :     default:
     997           0 :       return SIOutMods::NONE;
     998             :     }
     999             :   }
    1000           0 :   default:
    1001           0 :     llvm_unreachable("invalid mul opcode");
    1002             :   }
    1003             : }
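
The constants in the f32 case above are the raw IEEE-754 single-precision bit patterns of 0.5, 2.0 and 4.0 (the f16 case uses the corresponding half-precision encodings). A small standalone check of the f32 patterns, included purely as an illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret a float's bytes as a 32-bit integer (the usual bit_cast idiom).
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  assert(bitsOf(0.5f) == 0x3f000000u); // mapped to SIOutMods::DIV2 above
  assert(bitsOf(2.0f) == 0x40000000u); // mapped to SIOutMods::MUL2 above
  assert(bitsOf(4.0f) == 0x40800000u); // mapped to SIOutMods::MUL4 above
  return 0;
}
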
    1004             : 
    1005             : // FIXME: Does this really not support denormals with f16?
    1006             : // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
    1007             : // handled, so will anything other than that break?
    1008             : std::pair<const MachineOperand *, int>
    1009           0 : SIFoldOperands::isOMod(const MachineInstr &MI) const {
    1010           0 :   unsigned Op = MI.getOpcode();
    1011           0 :   switch (Op) {
    1012           0 :   case AMDGPU::V_MUL_F32_e64:
    1013             :   case AMDGPU::V_MUL_F16_e64: {
    1014             :     // If output denormals are enabled, omod is ignored.
    1015           0 :     if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
    1016           0 :         (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
    1017           0 :       return std::make_pair(nullptr, SIOutMods::NONE);
    1018             : 
    1019             :     const MachineOperand *RegOp = nullptr;
    1020             :     const MachineOperand *ImmOp = nullptr;
    1021           0 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    1022             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    1023           0 :     if (Src0->isImm()) {
    1024             :       ImmOp = Src0;
    1025             :       RegOp = Src1;
    1026           0 :     } else if (Src1->isImm()) {
    1027             :       ImmOp = Src1;
    1028             :       RegOp = Src0;
    1029             :     } else
    1030           0 :       return std::make_pair(nullptr, SIOutMods::NONE);
    1031             : 
    1032           0 :     int OMod = getOModValue(Op, ImmOp->getImm());
    1033           0 :     if (OMod == SIOutMods::NONE ||
    1034           0 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
    1035           0 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
    1036           0 :         TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
    1037           0 :         TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
    1038           0 :       return std::make_pair(nullptr, SIOutMods::NONE);
    1039             : 
    1040             :     return std::make_pair(RegOp, OMod);
    1041             :   }
    1042           0 :   case AMDGPU::V_ADD_F32_e64:
    1043             :   case AMDGPU::V_ADD_F16_e64: {
    1044             :     // If output denormals are enabled, omod is ignored.
    1045           0 :     if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
    1046           0 :         (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
    1047           0 :       return std::make_pair(nullptr, SIOutMods::NONE);
    1048             : 
    1049             :     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
    1050           0 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    1051             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    1052             : 
    1053           0 :     if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
    1054           0 :         Src0->getSubReg() == Src1->getSubReg() &&
    1055           0 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
    1056           0 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
    1057           0 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
    1058           0 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    1059           0 :       return std::make_pair(Src0, SIOutMods::MUL2);
    1060             : 
    1061           0 :     return std::make_pair(nullptr, SIOutMods::NONE);
    1062             :   }
    1063           0 :   default:
    1064           0 :     return std::make_pair(nullptr, SIOutMods::NONE);
    1065             :   }
    1066             : }
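
The V_ADD case above looks through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x, which is sound because for any non-NaN float x, x + x and 2 * x yield the same value; the pass additionally requires denormal outputs to be disabled and no source modifiers, clamp or omod to be present. A standalone sanity check of that identity, shown only as an illustration:

#include <cassert>
#include <initializer_list>

int main() {
  // x + x and 2 * x round identically, so an fadd of a value with itself can
  // be replaced by an omod MUL2 on the instruction that produced the value.
  for (float X : {0.0f, -0.0f, 1.5f, 1.0e-39f /* subnormal */, 1.0e30f})
    assert(X + X == 2.0f * X);
  return 0;
}
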
    1067             : 
    1068             : // FIXME: Does this need to check IEEE bit on function?
    1069         532 : bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
    1070             :   const MachineOperand *RegOp;
    1071             :   int OMod;
    1072         532 :   std::tie(RegOp, OMod) = isOMod(MI);
    1073          38 :   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
    1074         570 :       RegOp->getSubReg() != AMDGPU::NoSubRegister ||
    1075          38 :       !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
    1076         498 :     return false;
    1077             : 
    1078          34 :   MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
    1079          34 :   MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
    1080          34 :   if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
    1081             :     return false;
    1082             : 
    1083             :   // Clamp is applied after omod. If the source already has clamp set, don't
    1084             :   // fold it.
    1085          28 :   if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
    1086             :     return false;
    1087             : 
    1088             :   LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
    1089             : 
    1090          20 :   DefOMod->setImm(OMod);
    1091          20 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
    1092          20 :   MI.eraseFromParent();
    1093          20 :   return true;
    1094             : }
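
In effect the fold above absorbs a multiply by a supported constant (or an add of a value with itself) into the output modifier of the instruction producing that value. A hypothetical scalar model of the rewrite, using a stand-in OutMod enum and applyOMod helper rather than LLVM's SIOutMods:

#include <cassert>

enum OutMod { NONE, DIV2, MUL2, MUL4 }; // stand-in, not LLVM's SIOutMods

static float applyOMod(float V, OutMod M) {
  switch (M) {
  case DIV2: return V * 0.5f;
  case MUL2: return V * 2.0f;
  case MUL4: return V * 4.0f;
  default:   return V;
  }
}

int main() {
  float A = 1.25f, B = 2.5f;
  // Before the fold: %t = fadd a, b ; %r = fmul %t, 2.0
  float Before = (A + B) * 2.0f;
  // After the fold:  %r = fadd a, b omod:MUL2
  float After = applyOMod(A + B, MUL2);
  assert(Before == After);
  return 0;
}
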
    1095             : 
    1096       39046 : bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
    1097       39046 :   if (skipFunction(MF.getFunction()))
    1098             :     return false;
    1099             : 
    1100       39040 :   MRI = &MF.getRegInfo();
    1101       39040 :   ST = &MF.getSubtarget<GCNSubtarget>();
    1102       39040 :   TII = ST->getInstrInfo();
    1103       39040 :   TRI = &TII->getRegisterInfo();
    1104             : 
    1105       39040 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1106             : 
    1107             :   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
    1108             :   // correctly handle signed zeros.
    1109             :   //
    1110             :   bool IsIEEEMode = ST->enableIEEEBit(MF);
    1111       39040 :   bool HasNSZ = MFI->hasNoSignedZerosFPMath();
    1112             : 
    1113      121976 :   for (MachineBasicBlock *MBB : depth_first(&MF)) {
    1114             :     MachineBasicBlock::iterator I, Next;
    1115     1003261 :     for (I = MBB->begin(); I != MBB->end(); I = Next) {
    1116      959365 :       Next = std::next(I);
    1117             :       MachineInstr &MI = *I;
    1118             : 
    1119      959365 :       tryFoldInst(TII, &MI);
    1120             : 
    1121      959365 :       if (!TII->isFoldableCopy(MI)) {
    1122             :         // TODO: Omod might be OK if there is NSZ only on the source
    1123             :         // instruction, and not the omod multiply.
    1124      532388 :         if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
    1125         532 :             !tryFoldOMod(MI))
    1126      531836 :           tryFoldClamp(MI);
    1127      531856 :         continue;
    1128             :       }
    1129             : 
    1130      427509 :       MachineOperand &OpToFold = MI.getOperand(1);
    1131      427509 :       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
    1132             : 
    1133             :       // FIXME: We could also be folding things like TargetIndexes.
    1134      332500 :       if (!FoldingImm && !OpToFold.isReg())
    1135             :         continue;
    1136             : 
    1137      427509 :       if (OpToFold.isReg() &&
    1138      332500 :           !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
    1139             :         continue;
    1140             : 
    1141             :       // Prevent folding operands backwards in the function. For example,
    1142             :       // the COPY below must not be replaced with the immediate 1 defined later:
    1143             :       //
    1144             :       //    %3 = COPY %vgpr0; VGPR_32:%3
    1145             :       //    ...
    1146             :       //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
    1147             :       MachineOperand &Dst = MI.getOperand(0);
    1148      348123 :       if (Dst.isReg() &&
    1149      348123 :           !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
    1150             :         continue;
    1151             : 
    1152      323467 :       foldInstOperand(MI, OpToFold);
    1153             :     }
    1154             :   }
    1155       39040 :   return false;
    1156             : }

Generated by: LCOV version 1.13