LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIFoldOperands.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-07-13 00:08:38
Coverage: Lines: 380 of 402 (94.5 %)    Functions: 23 of 25 (92.0 %)

          Line data    Source code
       1             : //===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : /// \file
       9             : //===----------------------------------------------------------------------===//
      10             : //
      11             : 
      12             : #include "AMDGPU.h"
      13             : #include "AMDGPUSubtarget.h"
      14             : #include "SIInstrInfo.h"
      15             : #include "SIMachineFunctionInfo.h"
      16             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      17             : #include "llvm/ADT/DepthFirstIterator.h"
      18             : #include "llvm/CodeGen/LiveIntervals.h"
      19             : #include "llvm/CodeGen/MachineFunctionPass.h"
      20             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      21             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      22             : #include "llvm/Support/Debug.h"
      23             : #include "llvm/Support/raw_ostream.h"
      24             : #include "llvm/Target/TargetMachine.h"
      25             : 
      26             : #define DEBUG_TYPE "si-fold-operands"
      27             : using namespace llvm;
      28             : 
      29             : namespace {
      30             : 
      31             : struct FoldCandidate {
      32             :   MachineInstr *UseMI;
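                      :   // Exactly one member of the union below is meaningful at a time; Kind
                      :   // (initialized from the folded operand's type in the constructor) records
                      :   // which one, and the is*() accessors below test it.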
      33             :   union {
      34             :     MachineOperand *OpToFold;
      35             :     uint64_t ImmToFold;
      36             :     int FrameIndexToFold;
      37             :   };
      38             :   unsigned char UseOpNo;
      39             :   MachineOperand::MachineOperandType Kind;
      40             :   bool Commuted;
      41             : 
      42             :   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
      43      125170 :                 bool Commuted_ = false) :
      44             :     UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
      45      250324 :     Commuted(Commuted_) {
      46      125154 :     if (FoldOp->isImm()) {
      47       46562 :       ImmToFold = FoldOp->getImm();
      48       78608 :     } else if (FoldOp->isFI()) {
      49          30 :       FrameIndexToFold = FoldOp->getIndex();
      50             :     } else {
      51             :       assert(FoldOp->isReg());
      52       78578 :       OpToFold = FoldOp;
      53             :     }
      54             :   }
      55             : 
      56             :   bool isFI() const {
      57             :     return Kind == MachineOperand::MO_FrameIndex;
      58             :   }
      59             : 
      60             :   bool isImm() const {
      61             :     return Kind == MachineOperand::MO_Immediate;
      62             :   }
      63             : 
      64             :   bool isReg() const {
      65             :     return Kind == MachineOperand::MO_Register;
      66             :   }
      67             : 
      68             :   bool isCommuted() const {
      69             :     return Commuted;
      70             :   }
      71             : };
      72             : 
      73        3490 : class SIFoldOperands : public MachineFunctionPass {
      74             : public:
      75             :   static char ID;
      76             :   MachineRegisterInfo *MRI;
      77             :   const SIInstrInfo *TII;
      78             :   const SIRegisterInfo *TRI;
      79             :   const AMDGPUSubtarget *ST;
      80             : 
      81             :   void foldOperand(MachineOperand &OpToFold,
      82             :                    MachineInstr *UseMI,
      83             :                    unsigned UseOpIdx,
      84             :                    SmallVectorImpl<FoldCandidate> &FoldList,
      85             :                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
      86             : 
      87             :   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
      88             : 
      89             :   const MachineOperand *isClamp(const MachineInstr &MI) const;
      90             :   bool tryFoldClamp(MachineInstr &MI);
      91             : 
      92             :   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
      93             :   bool tryFoldOMod(MachineInstr &MI);
      94             : 
      95             : public:
      96        3506 :   SIFoldOperands() : MachineFunctionPass(ID) {
      97        3506 :     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
      98        3506 :   }
      99             : 
     100             :   bool runOnMachineFunction(MachineFunction &MF) override;
     101             : 
     102        3482 :   StringRef getPassName() const override { return "SI Fold Operands"; }
     103             : 
     104        3482 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     105        3482 :     AU.setPreservesCFG();
     106        3482 :     MachineFunctionPass::getAnalysisUsage(AU);
     107        3482 :   }
     108             : };
     109             : 
     110             : } // End anonymous namespace.
     111             : 
     112      349582 : INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
     113             :                 "SI Fold Operands", false, false)
     114             : 
     115             : char SIFoldOperands::ID = 0;
     116             : 
     117             : char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
     118             : 
     119             : // Wrapper around isInlineConstant that understands special cases when
     120             : // instruction types are replaced during operand folding.
     121      143615 : static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
     122             :                                      const MachineInstr &UseMI,
     123             :                                      unsigned OpNo,
     124             :                                      const MachineOperand &OpToFold) {
     125      143615 :   if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
     126             :     return true;
     127             : 
     128       99131 :   unsigned Opc = UseMI.getOpcode();
     129       99131 :   switch (Opc) {
     130         339 :   case AMDGPU::V_MAC_F32_e64:
     131             :   case AMDGPU::V_MAC_F16_e64:
     132             :   case AMDGPU::V_FMAC_F32_e64: {
     133             :     // Special case for mac. Since this is replaced with mad when folded into
     134             :     // src2, we need to check the legality for the final instruction.
     135         339 :     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     136         339 :     if (static_cast<int>(OpNo) == Src2Idx) {
     137             :       bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
     138             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     139             : 
     140         116 :       unsigned Opc = IsFMA ?
     141             :         AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     142         116 :       const MCInstrDesc &MadDesc = TII->get(Opc);
     143         116 :       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     144             :     }
     145             :     return false;
     146             :   }
     147             :   default:
     148             :     return false;
     149             :   }
     150             : }
     151             : 
     152           0 : FunctionPass *llvm::createSIFoldOperandsPass() {
     153           0 :   return new SIFoldOperands();
     154             : }
     155             : 
     156      125169 : static bool updateOperand(FoldCandidate &Fold,
     157             :                           const TargetRegisterInfo &TRI) {
     158      125169 :   MachineInstr *MI = Fold.UseMI;
     159      125169 :   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
     160             :   assert(Old.isReg());
     161             : 
     162      125169 :   if (Fold.isImm()) {
     163       46561 :     if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
     164             :       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
     165             :       // already set.
     166             :       unsigned Opcode = MI->getOpcode();
     167         117 :       int OpNo = MI->getOperandNo(&Old);
     168             :       int ModIdx = -1;
     169         117 :       if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
     170             :         ModIdx = AMDGPU::OpName::src0_modifiers;
     171          86 :       else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
     172             :         ModIdx = AMDGPU::OpName::src1_modifiers;
     173           0 :       else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
     174             :         ModIdx = AMDGPU::OpName::src2_modifiers;
     175             :       assert(ModIdx != -1);
     176         117 :       ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
     177         117 :       MachineOperand &Mod = MI->getOperand(ModIdx);
     178         117 :       unsigned Val = Mod.getImm();
     179         117 :       if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
     180             :         return false;
     181             :       // If upper part is all zero we do not need op_sel_hi.
     182         117 :       if (!isUInt<16>(Fold.ImmToFold)) {
     183         101 :         if (!(Fold.ImmToFold & 0xffff)) {
     184           3 :           Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
     185           3 :           Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
     186           3 :           Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
     187           3 :           return true;
     188             :         }
     189          98 :         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
     190             :       }
     191             :     }
     192       46558 :     Old.ChangeToImmediate(Fold.ImmToFold);
     193       46558 :     return true;
     194             :   }
     195             : 
     196       78608 :   if (Fold.isFI()) {
     197          30 :     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     198          30 :     return true;
     199             :   }
     200             : 
     201       78578 :   MachineOperand *New = Fold.OpToFold;
     202      235734 :   if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
     203       78578 :       TargetRegisterInfo::isVirtualRegister(New->getReg())) {
     204       78578 :     Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
     205             : 
     206             :     Old.setIsUndef(New->isUndef());
     207       78578 :     return true;
     208             :   }
     209             : 
     210             :   // FIXME: Handle physical registers.
     211             : 
     212             :   return false;
     213             : }
     214             : 
     215             : static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
     216             :                               const MachineInstr *MI) {
     217       56804 :   for (auto Candidate : FoldList) {
     218         400 :     if (Candidate.UseMI == MI)
     219             :       return true;
     220             :   }
     221             :   return false;
     222             : }
     223             : 
     224      181135 : static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     225             :                              MachineInstr *MI, unsigned OpNo,
     226             :                              MachineOperand *OpToFold,
     227             :                              const SIInstrInfo *TII) {
     228      181135 :   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     229             : 
     230             :     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     231       56080 :     unsigned Opc = MI->getOpcode();
     232       56080 :     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
     233         229 :          Opc == AMDGPU::V_FMAC_F32_e64) &&
     234         229 :         (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
     235             :       bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
     236             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     237         122 :       unsigned NewOpc = IsFMA ?
     238             :         AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     239             : 
     240             :       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     241             :       // to fold the operand.
     242         122 :       MI->setDesc(TII->get(NewOpc));
     243         122 :       bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
     244         122 :       if (FoldAsMAD) {
     245          60 :         MI->untieRegOperand(OpNo);
     246          60 :         return true;
     247             :       }
     248          62 :       MI->setDesc(TII->get(Opc));
     249             :     }
     250             : 
     251             :     // Special case for s_setreg_b32
     252       56036 :     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
     253          16 :       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
     254          32 :       FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     255          16 :       return true;
     256             :     }
     257             : 
     258             :     // If we are already folding into another operand of MI, then
     259             :     // we can't commute the instruction, otherwise we risk making the
     260             :     // other fold illegal.
     261       56004 :     if (isUseMIInFoldList(FoldList, MI))
     262             :       return false;
     263             : 
     264             :     // Operand is not legal, so try to commute the instruction to
     265             :     // see if this makes it possible to fold.
     266       56004 :     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
     267       56004 :     unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
     268       56004 :     bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
     269             : 
     270       56004 :     if (CanCommute) {
     271       10999 :       if (CommuteIdx0 == OpNo)
     272        6804 :         OpNo = CommuteIdx1;
     273        4195 :       else if (CommuteIdx1 == OpNo)
     274             :         OpNo = CommuteIdx0;
     275             :     }
     276             : 
      277             :     // One of the operands might be an Imm operand, and OpNo may refer to it after
     278             :     // the call of commuteInstruction() below. Such situations are avoided
     279             :     // here explicitly as OpNo must be a register operand to be a candidate
     280             :     // for memory folding.
     281       77956 :     if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
     282       10953 :                        !MI->getOperand(CommuteIdx1).isReg()))
     283             :       return false;
     284             : 
     285       55915 :     if (!CanCommute ||
     286        5455 :         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
     287             :       return false;
     288             : 
     289        5162 :     if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     290        5063 :       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
     291        5063 :       return false;
     292             :     }
     293             : 
     294         198 :     FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
     295          99 :     return true;
     296             :   }
     297             : 
     298      250110 :   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     299      125055 :   return true;
     300             : }
     301             : 
     302             : // If the use operand doesn't care about the value, this may be an operand only
     303             : // used for register indexing, in which case it is unsafe to fold.
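                      : // Folding into SDWA instructions is also rejected below, presumably because
                      : // their sub-dword source selects are not modelled by this folding.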
     304             : static bool isUseSafeToFold(const SIInstrInfo *TII,
     305             :                             const MachineInstr &MI,
     306             :                             const MachineOperand &UseMO) {
     307      703460 :   return !UseMO.isUndef() && !TII->isSDWA(MI);
     308             :   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
     309             : }
     310             : 
     311      351791 : void SIFoldOperands::foldOperand(
     312             :   MachineOperand &OpToFold,
     313             :   MachineInstr *UseMI,
     314             :   unsigned UseOpIdx,
     315             :   SmallVectorImpl<FoldCandidate> &FoldList,
     316             :   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
     317      351791 :   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
     318             : 
     319             :   if (!isUseSafeToFold(TII, *UseMI, UseOp))
     320             :     return;
     321             : 
     322             :   // FIXME: Fold operands with subregs.
     323      701462 :   if (UseOp.isReg() && OpToFold.isReg()) {
     324      523832 :     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
     325             :       return;
     326             : 
      327             :     // Don't fold subregister extracts into tied operands; only fold a full
      328             :     // copy, since a subregister use tied to a full register def doesn't really
     329             :     // make sense. e.g. don't fold:
     330             :     //
     331             :     // %1 = COPY %0:sub1
     332             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
     333             :     //
     334             :     //  into
     335             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
     336      257354 :     if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
     337             :       return;
     338             :   }
     339             : 
     340             :   // Special case for REG_SEQUENCE: We can't fold literals into
     341             :   // REG_SEQUENCE instructions, so we have to fold them into the
     342             :   // uses of REG_SEQUENCE.
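                      :   // e.g. a literal that would replace %1 in
                      :   //   %2 = REG_SEQUENCE %1, sub0, %3, sub1
                      :   // is instead tried on the users of %2 that read %2.sub0.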
     343      345021 :   if (UseMI->isRegSequence()) {
     344      110525 :     unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
     345      221050 :     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
     346             : 
     347             :     for (MachineRegisterInfo::use_iterator
     348      110525 :            RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
     349      259989 :          RSUse != RSE; ++RSUse) {
     350             : 
     351      149464 :       MachineInstr *RSUseMI = RSUse->getParent();
     352      149464 :       if (RSUse->getSubReg() != RegSeqDstSubReg)
     353      148471 :         continue;
     354             : 
     355         993 :       foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
     356             :                   CopiesToReplace);
     357             :     }
     358             : 
     359             :     return;
     360             :   }
     361             : 
     362             : 
     363             :   bool FoldingImm = OpToFold.isImm();
     364             : 
     365             :   // In order to fold immediates into copies, we need to change the
     366             :   // copy to a MOV.
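                      :   // e.g. %1 = COPY %0 becomes %1 = V_MOV_B32_e32 (or the mov matching the
                      :   // destination register class), and the new mov is queued in
                      :   // CopiesToReplace so its implicit operands are added afterwards.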
     367      234496 :   if (FoldingImm && UseMI->isCopy()) {
     368       10747 :     unsigned DestReg = UseMI->getOperand(0).getReg();
     369             :     const TargetRegisterClass *DestRC
     370       10938 :       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
     371       10556 :       MRI->getRegClass(DestReg) :
     372         191 :       TRI->getPhysRegClass(DestReg);
     373             : 
     374       10747 :     unsigned MovOp = TII->getMovOpcode(DestRC);
     375       10747 :     if (MovOp == AMDGPU::COPY)
     376             :       return;
     377             : 
     378       10747 :     UseMI->setDesc(TII->get(MovOp));
     379       10747 :     CopiesToReplace.push_back(UseMI);
     380             :   } else {
     381             :     const MCInstrDesc &UseDesc = UseMI->getDesc();
     382             : 
     383             :     // Don't fold into target independent nodes.  Target independent opcodes
     384             :     // don't have defined register classes.
     385      445016 :     if (UseDesc.isVariadic() ||
     386      445015 :         UseOp.isImplicit() ||
     387      221266 :         UseDesc.OpInfo[UseOpIdx].RegClass == -1)
     388             :       return;
     389             :   }
     390             : 
     391      181013 :   if (!FoldingImm) {
     392      125546 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     393             : 
     394             :     // FIXME: We could try to change the instruction from 64-bit to 32-bit
      395             :     // to enable more folding opportunities. The shrink operands pass
     396             :     // already does this.
     397      125546 :     return;
     398             :   }
     399             : 
     400             : 
     401       55467 :   const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
     402             :   const TargetRegisterClass *FoldRC =
     403       55467 :     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
     404             : 
     405             : 
     406             :   // Split 64-bit constants into 32-bits for folding.
     407       58248 :   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     408        2764 :     unsigned UseReg = UseOp.getReg();
     409             :     const TargetRegisterClass *UseRC
     410        2764 :       = TargetRegisterInfo::isVirtualRegister(UseReg) ?
     411        2764 :       MRI->getRegClass(UseReg) :
     412           0 :       TRI->getPhysRegClass(UseReg);
     413             : 
     414        5528 :     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
     415             :       return;
     416             : 
     417        2764 :     APInt Imm(64, OpToFold.getImm());
     418        2764 :     if (UseOp.getSubReg() == AMDGPU::sub0) {
     419        2596 :       Imm = Imm.getLoBits(32);
     420             :     } else {
     421             :       assert(UseOp.getSubReg() == AMDGPU::sub1);
     422        2932 :       Imm = Imm.getHiBits(32);
     423             :     }
     424             : 
     425             :     MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
     426        2764 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
     427             :     return;
     428             :   }
     429             : 
     430             : 
     431             : 
     432       52703 :   tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     433             : }
     434             : 
     435        3079 : static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
     436             :                                   uint32_t LHS, uint32_t RHS) {
     437        3079 :   switch (Opcode) {
     438          12 :   case AMDGPU::V_AND_B32_e64:
     439             :   case AMDGPU::V_AND_B32_e32:
     440             :   case AMDGPU::S_AND_B32:
     441          12 :     Result = LHS & RHS;
     442          12 :     return true;
     443          69 :   case AMDGPU::V_OR_B32_e64:
     444             :   case AMDGPU::V_OR_B32_e32:
     445             :   case AMDGPU::S_OR_B32:
     446          69 :     Result = LHS | RHS;
     447          69 :     return true;
     448           0 :   case AMDGPU::V_XOR_B32_e64:
     449             :   case AMDGPU::V_XOR_B32_e32:
     450             :   case AMDGPU::S_XOR_B32:
     451           0 :     Result = LHS ^ RHS;
     452           0 :     return true;
     453           9 :   case AMDGPU::V_LSHL_B32_e64:
     454             :   case AMDGPU::V_LSHL_B32_e32:
     455             :   case AMDGPU::S_LSHL_B32:
     456             :     // The instruction ignores the high bits for out of bounds shifts.
     457           9 :     Result = LHS << (RHS & 31);
     458           9 :     return true;
     459           3 :   case AMDGPU::V_LSHLREV_B32_e64:
     460             :   case AMDGPU::V_LSHLREV_B32_e32:
     461           3 :     Result = RHS << (LHS & 31);
     462           3 :     return true;
     463           8 :   case AMDGPU::V_LSHR_B32_e64:
     464             :   case AMDGPU::V_LSHR_B32_e32:
     465             :   case AMDGPU::S_LSHR_B32:
     466           8 :     Result = LHS >> (RHS & 31);
     467           8 :     return true;
     468           3 :   case AMDGPU::V_LSHRREV_B32_e64:
     469             :   case AMDGPU::V_LSHRREV_B32_e32:
     470           3 :     Result = RHS >> (LHS & 31);
     471           3 :     return true;
     472           8 :   case AMDGPU::V_ASHR_I32_e64:
     473             :   case AMDGPU::V_ASHR_I32_e32:
     474             :   case AMDGPU::S_ASHR_I32:
     475           8 :     Result = static_cast<int32_t>(LHS) >> (RHS & 31);
     476           8 :     return true;
     477           3 :   case AMDGPU::V_ASHRREV_I32_e64:
     478             :   case AMDGPU::V_ASHRREV_I32_e32:
     479           3 :     Result = static_cast<int32_t>(RHS) >> (LHS & 31);
     480           3 :     return true;
     481             :   default:
     482             :     return false;
     483             :   }
     484             : }
     485             : 
     486             : static unsigned getMovOpc(bool IsScalar) {
     487         131 :   return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     488             : }
     489             : 
     490             : /// Remove any leftover implicit operands from mutating the instruction. e.g.
     491             : /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
     492             : /// anymore.
     493         204 : static void stripExtraCopyOperands(MachineInstr &MI) {
     494         204 :   const MCInstrDesc &Desc = MI.getDesc();
     495         408 :   unsigned NumOps = Desc.getNumOperands() +
     496         204 :                     Desc.getNumImplicitUses() +
     497         408 :                     Desc.getNumImplicitDefs();
     498             : 
     499         282 :   for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
     500          78 :     MI.RemoveOperand(I);
     501         204 : }
     502             : 
     503             : static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
     504             :   MI.setDesc(NewDesc);
     505         202 :   stripExtraCopyOperands(MI);
     506             : }
     507             : 
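                      : // If \p Op is a plain virtual register defined by a move-immediate, return
                      : // the immediate operand of that def; otherwise return \p Op itself.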
     508      105906 : static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
     509             :                                                MachineOperand &Op) {
     510      105906 :   if (Op.isReg()) {
     511             :     // If this has a subregister, it obviously is a register source.
     512      201932 :     if (Op.getSubReg() != AMDGPU::NoSubRegister ||
     513       96877 :         !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
     514             :       return &Op;
     515             : 
     516       96864 :     MachineInstr *Def = MRI.getVRegDef(Op.getReg());
     517      193726 :     if (Def && Def->isMoveImmediate()) {
     518       54392 :       MachineOperand &ImmSrc = Def->getOperand(1);
     519       54392 :       if (ImmSrc.isImm())
     520             :         return &ImmSrc;
     521             :     }
     522             :   }
     523             : 
     524             :   return &Op;
     525             : }
     526             : 
     527             : // Try to simplify operations with a constant that may appear after instruction
     528             : // selection.
     529             : // TODO: See if a frame index with a fixed offset can fold.
     530      143107 : static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
     531             :                               const SIInstrInfo *TII,
     532             :                               MachineInstr *MI,
     533             :                               MachineOperand *ImmOp) {
     534      143107 :   unsigned Opc = MI->getOpcode();
     535      286214 :   if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
     536      143107 :       Opc == AMDGPU::S_NOT_B32) {
     537          20 :     MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
     538          10 :     mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
     539          10 :     return true;
     540             :   }
     541             : 
     542      143097 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     543      143097 :   if (Src1Idx == -1)
     544             :     return false;
     545             : 
     546       52953 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     547      105906 :   MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
     548      105906 :   MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
     549             : 
     550       84724 :   if (!Src0->isImm() && !Src1->isImm())
     551             :     return false;
     552             : 
     553             :   // and k0, k1 -> v_mov_b32 (k0 & k1)
     554             :   // or k0, k1 -> v_mov_b32 (k0 | k1)
     555             :   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
     556       73276 :   if (Src0->isImm() && Src1->isImm()) {
     557             :     int32_t NewImm;
     558        3079 :     if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
     559             :       return false;
     560             : 
     561             :     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     562         115 :     bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
     563             : 
     564             :     // Be careful to change the right operand, src0 may belong to a different
     565             :     // instruction.
     566         230 :     MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
     567         115 :     MI->RemoveOperand(Src1Idx);
     568         115 :     mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
     569         115 :     return true;
     570             :   }
     571             : 
     572       49015 :   if (!MI->isCommutable())
     573             :     return false;
     574             : 
     575       53809 :   if (Src0->isImm() && !Src1->isImm()) {
     576             :     std::swap(Src0, Src1);
     577             :     std::swap(Src0Idx, Src1Idx);
     578             :   }
     579             : 
     580       40122 :   int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
     581       80244 :   if (Opc == AMDGPU::V_OR_B32_e64 ||
     582       40122 :       Opc == AMDGPU::V_OR_B32_e32 ||
     583       40122 :       Opc == AMDGPU::S_OR_B32) {
     584         939 :     if (Src1Val == 0) {
     585             :       // y = or x, 0 => y = copy x
     586          56 :       MI->RemoveOperand(Src1Idx);
     587          56 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     588         883 :     } else if (Src1Val == -1) {
     589             :       // y = or x, -1 => y = v_mov_b32 -1
     590           2 :       MI->RemoveOperand(Src1Idx);
     591           2 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
     592             :     } else
     593             :       return false;
     594             : 
     595             :     return true;
     596             :   }
     597             : 
     598       70157 :   if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
     599       70157 :       MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
     600             :       MI->getOpcode() == AMDGPU::S_AND_B32) {
     601       10856 :     if (Src1Val == 0) {
     602             :       // y = and x, 0 => y = v_mov_b32 0
     603           4 :       MI->RemoveOperand(Src0Idx);
     604           4 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
     605       10852 :     } else if (Src1Val == -1) {
     606             :       // y = and x, -1 => y = copy x
     607           2 :       MI->RemoveOperand(Src1Idx);
     608           2 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     609           2 :       stripExtraCopyOperands(*MI);
     610             :     } else
     611             :       return false;
     612             : 
     613             :     return true;
     614             :   }
     615             : 
     616       28210 :   if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
     617       56301 :       MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
     618             :       MI->getOpcode() == AMDGPU::S_XOR_B32) {
     619         383 :     if (Src1Val == 0) {
     620             :       // y = xor x, 0 => y = copy x
     621           6 :       MI->RemoveOperand(Src1Idx);
     622           6 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     623           6 :       return true;
     624             :     }
     625             :   }
     626             : 
     627             :   return false;
     628             : }
     629             : 
     630             : // Try to fold an instruction into a simpler one
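                      : // Currently this only recognizes V_CNDMASK_* whose two value sources are
                      : // identical, which degenerates into a copy (or mov) of src0.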
     631     1001004 : static bool tryFoldInst(const SIInstrInfo *TII,
     632             :                         MachineInstr *MI) {
     633     1001004 :   unsigned Opc = MI->getOpcode();
     634             : 
     635     2002008 :   if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
     636     1001004 :       Opc == AMDGPU::V_CNDMASK_B32_e64    ||
     637     1001004 :       Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
     638       10750 :     const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
     639       10750 :     const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
     640       10750 :     if (Src1->isIdenticalTo(*Src0)) {
     641             :       LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
     642           7 :       int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     643           7 :       if (Src2Idx != -1)
     644           6 :         MI->RemoveOperand(Src2Idx);
     645           7 :       MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
     646           7 :       mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
     647             :                                                : getMovOpc(false)));
     648             :       LLVM_DEBUG(dbgs() << *MI << '\n');
     649           7 :       return true;
     650             :     }
     651             :   }
     652             : 
     653             :   return false;
     654             : }
     655             : 
     656      306116 : void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     657             :                                      MachineOperand &OpToFold) const {
      658             :   // We need to mutate the operands of new mov instructions to add implicit
     659             :   // uses of EXEC, but adding them invalidates the use_iterator, so defer
     660             :   // this.
     661             :   SmallVector<MachineInstr *, 4> CopiesToReplace;
     662             :   SmallVector<FoldCandidate, 4> FoldList;
     663      306116 :   MachineOperand &Dst = MI.getOperand(0);
     664             : 
     665      306116 :   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     666             :   if (FoldingImm) {
     667             :     unsigned NumLiteralUses = 0;
     668             :     MachineOperand *NonInlineUse = nullptr;
     669             :     int NonInlineUseOpNo = -1;
     670             : 
     671             :     MachineRegisterInfo::use_iterator NextUse;
     672             :     for (MachineRegisterInfo::use_iterator
     673       86853 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     674      230663 :          Use != E; Use = NextUse) {
     675             :       NextUse = std::next(Use);
     676      143810 :       MachineInstr *UseMI = Use->getParent();
     677             :       unsigned OpNo = Use.getOperandNo();
     678             : 
     679             :       // Folding the immediate may reveal operations that can be constant
     680             :       // folded or replaced with a copy. This can happen for example after
     681             :       // frame indices are lowered to constants or from splitting 64-bit
     682             :       // constants.
     683             :       //
     684             :       // We may also encounter cases where one or both operands are
     685             :       // immediates materialized into a register, which would ordinarily not
     686             :       // be folded due to multiple uses or operand constraints.
     687             : 
     688      144005 :       if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
     689             :         LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');
     690             : 
     691             :         // Some constant folding cases change the same immediate's use to a new
     692             :         // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
     693             :         // again. The same constant folded instruction could also have a second
     694             :         // use operand.
     695         195 :         NextUse = MRI->use_begin(Dst.getReg());
     696             :         FoldList.clear();
     697         195 :         continue;
     698             :       }
     699             : 
     700             :       // Try to fold any inline immediate uses, and then only fold other
     701             :       // constants if they have one use.
     702             :       //
     703             :       // The legality of the inline immediate must be checked based on the use
     704             :       // operand, not the defining instruction, because 32-bit instructions
     705             :       // with 32-bit inline immediate sources may be used to materialize
     706             :       // constants used in 16-bit operands.
     707             :       //
     708             :       // e.g. it is unsafe to fold:
     709             :       //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
     710             :       //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
     711             : 
     712             :       // Folding immediates with more than one use will increase program size.
     713             :       // FIXME: This will also reduce register usage, which may be better
     714             :       // in some cases. A better heuristic is needed.
     715      143615 :       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
     716       44536 :         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
     717             :       } else {
     718       99079 :         if (++NumLiteralUses == 1) {
     719             :           NonInlineUse = &*Use;
     720       61135 :           NonInlineUseOpNo = OpNo;
     721             :         }
     722             :       }
     723             :     }
     724             : 
     725       86853 :     if (NumLiteralUses == 1) {
     726       44819 :       MachineInstr *UseMI = NonInlineUse->getParent();
     727       44819 :       foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
     728             :     }
     729             :   } else {
     730             :     // Folding register.
     731             :     for (MachineRegisterInfo::use_iterator
     732      219263 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     733      480706 :          Use != E; ++Use) {
     734      261443 :       MachineInstr *UseMI = Use->getParent();
     735             : 
     736      261443 :       foldOperand(OpToFold, UseMI, Use.getOperandNo(),
     737             :                   FoldList, CopiesToReplace);
     738             :     }
     739             :   }
     740             : 
     741      306116 :   MachineFunction *MF = MI.getParent()->getParent();
     742             :   // Make sure we add EXEC uses to any new v_mov instructions created.
     743      327610 :   for (MachineInstr *Copy : CopiesToReplace)
     744       10747 :     Copy->addImplicitDefUseOperands(*MF);
     745             : 
     746      556454 :   for (FoldCandidate &Fold : FoldList) {
     747      125169 :     if (updateOperand(Fold, *TRI)) {
     748             :       // Clear kill flags.
     749      125169 :       if (Fold.isReg()) {
     750             :         assert(Fold.OpToFold && Fold.OpToFold->isReg());
     751             :         // FIXME: Probably shouldn't bother trying to fold if not an
     752             :         // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
     753             :         // copies.
     754       78578 :         MRI->clearKillFlags(Fold.OpToFold->getReg());
     755             :       }
     756             :       LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
     757             :                         << static_cast<int>(Fold.UseOpNo) << " of "
     758             :                         << *Fold.UseMI << '\n');
     759      125169 :       tryFoldInst(TII, Fold.UseMI);
     760           0 :     } else if (Fold.isCommuted()) {
     761             :       // Restoring instruction's original operand order if fold has failed.
     762           0 :       TII->commuteInstruction(*Fold.UseMI, false);
     763             :     }
     764             :   }
     765      306116 : }
     766             : 
     767             : // Clamp patterns are canonically selected to v_max_* instructions, so only
     768             : // handle them.
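                      : // i.e. clamp(x) is expected to appear as "v_max_* x, x" with the clamp bit
                      : // set; the common source operand is returned so the clamp can be folded into
                      : // its defining instruction.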
     769      488800 : const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
     770      488800 :   unsigned Op = MI.getOpcode();
     771      488800 :   switch (Op) {
     772        1272 :   case AMDGPU::V_MAX_F32_e64:
     773             :   case AMDGPU::V_MAX_F16_e64:
     774             :   case AMDGPU::V_MAX_F64:
     775             :   case AMDGPU::V_PK_MAX_F16: {
     776        2544 :     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
     777             :       return nullptr;
     778             : 
     779             :     // Make sure sources are identical.
     780             :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     781             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     782         713 :     if (!Src0->isReg() || !Src1->isReg() ||
     783         709 :         Src0->getReg() != Src1->getReg() ||
     784         710 :         Src0->getSubReg() != Src1->getSubReg() ||
     785             :         Src0->getSubReg() != AMDGPU::NoSubRegister)
     786             :       return nullptr;
     787             : 
     788             :     // Can't fold up if we have modifiers.
     789         353 :     if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     790             :       return nullptr;
     791             : 
     792             :     unsigned Src0Mods
     793         704 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
     794             :     unsigned Src1Mods
     795         352 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
     796             : 
     797             :     // Having a 0 op_sel_hi would require swizzling the output in the source
     798             :     // instruction, which we can't do.
     799         352 :     unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
     800         352 :     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
     801             :       return nullptr;
     802             :     return Src0;
     803             :   }
     804             :   default:
     805             :     return nullptr;
     806             :   }
     807             : }
     808             : 
     809             : // We obviously have multiple uses in a clamp since the register is used twice
     810             : // in the same instruction.
     811         313 : static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
     812             :   int Count = 0;
     813         626 :   for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
     814         626 :        I != E; ++I) {
     815         331 :     if (++Count > 1)
     816          18 :       return false;
     817             :   }
     818             : 
     819         295 :   return true;
     820             : }
     821             : 
     822             : // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
     823      488800 : bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
     824      488800 :   const MachineOperand *ClampSrc = isClamp(MI);
     825      488800 :   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
     826             :     return false;
     827             : 
     828         263 :   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
     829             : 
     830             :   // The type of clamp must be compatible.
     831         526 :   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
     832             :     return false;
     833             : 
     834         165 :   MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
     835         165 :   if (!DefClamp)
     836             :     return false;
     837             : 
     838             :   LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def
     839             :                     << '\n');
     840             : 
     841             :   // Clamp is applied after omod, so it is OK if omod is set.
     842             :   DefClamp->setImm(1);
     843         165 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     844         165 :   MI.eraseFromParent();
     845         165 :   return true;
     846             : }
     847             : 
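                      : // Map a multiplication constant to the matching output-modifier encoding
                      : // (omod scales the result by 0.5, 2.0 or 4.0).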
     848          29 : static int getOModValue(unsigned Opc, int64_t Val) {
     849          29 :   switch (Opc) {
     850          28 :   case AMDGPU::V_MUL_F32_e64: {
     851          28 :     switch (static_cast<uint32_t>(Val)) {
     852             :     case 0x3f000000: // 0.5
     853             :       return SIOutMods::DIV2;
     854           0 :     case 0x40000000: // 2.0
     855           0 :       return SIOutMods::MUL2;
     856           8 :     case 0x40800000: // 4.0
     857           8 :       return SIOutMods::MUL4;
     858           0 :     default:
     859           0 :       return SIOutMods::NONE;
     860             :     }
     861             :   }
     862           1 :   case AMDGPU::V_MUL_F16_e64: {
     863           1 :     switch (static_cast<uint16_t>(Val)) {
     864             :     case 0x3800: // 0.5
     865             :       return SIOutMods::DIV2;
     866           0 :     case 0x4000: // 2.0
     867           0 :       return SIOutMods::MUL2;
     868           0 :     case 0x4400: // 4.0
     869           0 :       return SIOutMods::MUL4;
     870           0 :     default:
     871           0 :       return SIOutMods::NONE;
     872             :     }
     873             :   }
     874           0 :   default:
     875           0 :     llvm_unreachable("invalid mul opcode");
     876             :   }
     877             : }
     878             : 
     879             : // FIXME: Does this really not support denormals with f16?
     880             : // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
     881             : // handled, so will anything other than that break?
     882             : std::pair<const MachineOperand *, int>
     883         500 : SIFoldOperands::isOMod(const MachineInstr &MI) const {
     884         500 :   unsigned Op = MI.getOpcode();
     885         500 :   switch (Op) {
     886          39 :   case AMDGPU::V_MUL_F32_e64:
     887             :   case AMDGPU::V_MUL_F16_e64: {
     888             :     // If output denormals are enabled, omod is ignored.
     889          39 :     if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
     890           3 :         (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
     891           6 :       return std::make_pair(nullptr, SIOutMods::NONE);
     892             : 
     893             :     const MachineOperand *RegOp = nullptr;
     894             :     const MachineOperand *ImmOp = nullptr;
     895          33 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     896             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     897          33 :     if (Src0->isImm()) {
     898             :       ImmOp = Src0;
     899             :       RegOp = Src1;
     900          33 :     } else if (Src1->isImm()) {
     901             :       ImmOp = Src1;
     902             :       RegOp = Src0;
     903             :     } else
     904           4 :       return std::make_pair(nullptr, SIOutMods::NONE);
     905             : 
     906          29 :     int OMod = getOModValue(Op, ImmOp->getImm());
     907          29 :     if (OMod == SIOutMods::NONE ||
     908          54 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
     909          50 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
     910          79 :         TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
     911          25 :         TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
     912           4 :       return std::make_pair(nullptr, SIOutMods::NONE);
     913             : 
     914             :     return std::make_pair(RegOp, OMod);
     915             :   }
     916          99 :   case AMDGPU::V_ADD_F32_e64:
     917             :   case AMDGPU::V_ADD_F16_e64: {
     918             :     // If output denormals are enabled, omod is ignored.
     919          99 :     if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
     920           8 :         (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
     921          18 :       return std::make_pair(nullptr, SIOutMods::NONE);
     922             : 
     923             :     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
     924          81 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     925             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     926             : 
     927         106 :     if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
     928          25 :         Src0->getSubReg() == Src1->getSubReg() &&
     929          42 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
     930          30 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
     931         105 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
     932          11 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     933          11 :       return std::make_pair(Src0, SIOutMods::MUL2);
     934             : 
     935          70 :     return std::make_pair(nullptr, SIOutMods::NONE);
     936             :   }
     937         362 :   default:
     938         362 :     return std::make_pair(nullptr, SIOutMods::NONE);
     939             :   }
     940             : }
     941             : 
     942             : // FIXME: Does this need to check IEEE bit on function?
     943         500 : bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
     944             :   const MachineOperand *RegOp;
     945             :   int OMod;
     946        1000 :   std::tie(RegOp, OMod) = isOMod(MI);
     947          72 :   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
     948         536 :       RegOp->getSubReg() != AMDGPU::NoSubRegister ||
     949          36 :       !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
     950             :     return false;
     951             : 
     952          32 :   MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
     953          32 :   MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
     954          32 :   if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
     955             :     return false;
     956             : 
     957             :   // Clamp is applied after omod. If the source already has clamp set, don't
     958             :   // fold it.
     959          26 :   if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
     960             :     return false;
     961             : 
     962             :   LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
     963             : 
     964          18 :   DefOMod->setImm(OMod);
     965          18 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     966          18 :   MI.eraseFromParent();
     967          18 :   return true;
     968             : }
     969             : 
     970       35316 : bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
     971       35316 :   if (skipFunction(MF.getFunction()))
     972             :     return false;
     973             : 
     974       35312 :   MRI = &MF.getRegInfo();
     975       35312 :   ST = &MF.getSubtarget<AMDGPUSubtarget>();
     976       35312 :   TII = ST->getInstrInfo();
     977       35312 :   TRI = &TII->getRegisterInfo();
     978             : 
     979       35312 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     980             : 
     981             :   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
     982             :   // correctly handle signed zeros.
     983             :   //
     984             :   // TODO: Check nsz on instructions when fast math flags are preserved to MI
     985             :   // level.
     986       35312 :   bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
     987             : 
     988      185142 :   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     989             :     MachineBasicBlock::iterator I, Next;
     990      915438 :     for (I = MBB->begin(); I != MBB->end(); I = Next) {
     991             :       Next = std::next(I);
     992             :       MachineInstr &MI = *I;
     993             : 
     994      875835 :       tryFoldInst(TII, &MI);
     995             : 
     996     1364653 :       if (!TII->isFoldableCopy(MI)) {
     997      488818 :         if (IsIEEEMode || !tryFoldOMod(MI))
     998      488800 :           tryFoldClamp(MI);
     999      488818 :         continue;
    1000             :       }
    1001             : 
    1002      387017 :       MachineOperand &OpToFold = MI.getOperand(1);
    1003      387017 :       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
    1004             : 
    1005             :       // FIXME: We could also be folding things like TargetIndexes.
    1006      296105 :       if (!FoldingImm && !OpToFold.isReg())
    1007           0 :         continue;
    1008             : 
    1009      746671 :       if (OpToFold.isReg() &&
    1010      296105 :           !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
    1011       63549 :         continue;
    1012             : 
    1013             :       // Prevent folding operands backwards in the function. For example,
    1014             :       // the COPY opcode must not be replaced by 1 in this example:
    1015             :       //
    1016             :       //    %3 = COPY %vgpr0; VGPR_32:%3
    1017             :       //    ...
    1018             :       //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
    1019             :       MachineOperand &Dst = MI.getOperand(0);
    1020      664288 :       if (Dst.isReg() &&
    1021      323468 :           !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
    1022       17352 :         continue;
    1023             : 
    1024      306116 :       foldInstOperand(MI, OpToFold);
    1025             :     }
    1026             :   }
    1027       35312 :   return false;
    1028             : }

Generated by: LCOV version 1.13