LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIFoldOperands.cpp (source / functions)
Test:         llvm-toolchain.info
Date:         2017-09-14 15:23:50

                 Hit    Total    Coverage
Lines:           394    414      95.2 %
Functions:       23     25       92.0 %

Legend: Lines: hit | not hit

          Line data    Source code
        1             : //===-- SIFoldOperands.cpp - Fold operands -------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : /// \file
       9             : //===----------------------------------------------------------------------===//
      10             : //
      11             : 
      12             : #include "AMDGPU.h"
      13             : #include "AMDGPUSubtarget.h"
      14             : #include "SIInstrInfo.h"
      15             : #include "SIMachineFunctionInfo.h"
      16             : #include "llvm/ADT/DepthFirstIterator.h"
      17             : #include "llvm/CodeGen/LiveIntervalAnalysis.h"
      18             : #include "llvm/CodeGen/MachineFunctionPass.h"
      19             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      20             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      21             : #include "llvm/Support/Debug.h"
      22             : #include "llvm/Support/raw_ostream.h"
      23             : #include "llvm/Target/TargetMachine.h"
      24             : 
      25             : #define DEBUG_TYPE "si-fold-operands"
      26             : using namespace llvm;
      27             : 
      28             : namespace {
      29             : 
      30             : struct FoldCandidate {
      31             :   MachineInstr *UseMI;
      32             :   union {
      33             :     MachineOperand *OpToFold;
      34             :     uint64_t ImmToFold;
      35             :     int FrameIndexToFold;
      36             :   };
      37             :   unsigned char UseOpNo;
      38             :   MachineOperand::MachineOperandType Kind;
      39             :   bool Commuted;
      40             : 
      41             :   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
      42      106605 :                 bool Commuted_ = false) :
      43      106605 :     UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
      44      213210 :     Commuted(Commuted_) {
      45      106605 :     if (FoldOp->isImm()) {
      46       38160 :       ImmToFold = FoldOp->getImm();
      47       68445 :     } else if (FoldOp->isFI()) {
      48          24 :       FrameIndexToFold = FoldOp->getIndex();
      49             :     } else {
      50             :       assert(FoldOp->isReg());
      51       68421 :       OpToFold = FoldOp;
      52             :     }
      53             :   }
      54             : 
      55             :   bool isFI() const {
      56             :     return Kind == MachineOperand::MO_FrameIndex;
      57             :   }
      58             : 
      59             :   bool isImm() const {
      60             :     return Kind == MachineOperand::MO_Immediate;
      61             :   }
      62             : 
      63             :   bool isReg() const {
      64             :     return Kind == MachineOperand::MO_Register;
      65             :   }
      66             : 
      67             :   bool isCommuted() const {
      68             :     return Commuted;
      69             :   }
      70             : };
      71             : 
      72        2835 : class SIFoldOperands : public MachineFunctionPass {
      73             : public:
      74             :   static char ID;
      75             :   MachineRegisterInfo *MRI;
      76             :   const SIInstrInfo *TII;
      77             :   const SIRegisterInfo *TRI;
      78             :   const SISubtarget *ST;
      79             : 
      80             :   void foldOperand(MachineOperand &OpToFold,
      81             :                    MachineInstr *UseMI,
      82             :                    unsigned UseOpIdx,
      83             :                    SmallVectorImpl<FoldCandidate> &FoldList,
      84             :                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
      85             : 
      86             :   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
      87             : 
      88             :   const MachineOperand *isClamp(const MachineInstr &MI) const;
      89             :   bool tryFoldClamp(MachineInstr &MI);
      90             : 
      91             :   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
      92             :   bool tryFoldOMod(MachineInstr &MI);
      93             : 
      94             : public:
      95        2851 :   SIFoldOperands() : MachineFunctionPass(ID) {
      96        2851 :     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
      97        2851 :   }
      98             : 
      99             :   bool runOnMachineFunction(MachineFunction &MF) override;
     100             : 
     101        2837 :   StringRef getPassName() const override { return "SI Fold Operands"; }
     102             : 
     103        2837 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     104        2837 :     AU.setPreservesCFG();
     105        2837 :     MachineFunctionPass::getAnalysisUsage(AU);
     106        2837 :   }
     107             : };
     108             : 
     109             : } // End anonymous namespace.
     110             : 
     111      321091 : INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
     112             :                 "SI Fold Operands", false, false)
     113             : 
     114             : char SIFoldOperands::ID = 0;
     115             : 
     116             : char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
     117             : 
     118             : // Wrapper around isInlineConstant that understands special cases when
     119             : // instruction types are replaced during operand folding.
     120      136204 : static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
     121             :                                      const MachineInstr &UseMI,
     122             :                                      unsigned OpNo,
     123             :                                      const MachineOperand &OpToFold) {
     124      136204 :   if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
     125             :     return true;
     126             : 
     127      203240 :   unsigned Opc = UseMI.getOpcode();
     128      101620 :   switch (Opc) {
     129         153 :   case AMDGPU::V_MAC_F32_e64:
     130             :   case AMDGPU::V_MAC_F16_e64: {
     131             :     // Special case for mac. Since this is replaced with mad when folded into
     132             :     // src2, we need to check the legality for the final instruction.
     133         153 :     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     134         153 :     if (static_cast<int>(OpNo) == Src2Idx) {
     135          86 :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     136             :       const MCInstrDesc &MadDesc
     137         172 :         = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     138          86 :       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     139             :     }
     140             :     return false;
     141             :   }
     142             :   default:
     143             :     return false;
     144             :   }
     145             : }
     146             : 
     147           0 : FunctionPass *llvm::createSIFoldOperandsPass() {
     148           0 :   return new SIFoldOperands();
     149             : }
     150             : 
     151      106604 : static bool updateOperand(FoldCandidate &Fold,
     152             :                           const TargetRegisterInfo &TRI) {
     153      106604 :   MachineInstr *MI = Fold.UseMI;
     154      213208 :   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
     155             :   assert(Old.isReg());
     156             : 
     157      106604 :   if (Fold.isImm()) {
     158       38159 :     Old.ChangeToImmediate(Fold.ImmToFold);
     159       38159 :     return true;
     160             :   }
     161             : 
     162       68445 :   if (Fold.isFI()) {
     163          24 :     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     164          24 :     return true;
     165             :   }
     166             : 
     167       68421 :   MachineOperand *New = Fold.OpToFold;
     168      205263 :   if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
     169      136842 :       TargetRegisterInfo::isVirtualRegister(New->getReg())) {
     170       68421 :     Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
     171             : 
     172      136842 :     Old.setIsUndef(New->isUndef());
     173       68421 :     return true;
     174             :   }
     175             : 
     176             :   // FIXME: Handle physical registers.
     177             : 
     178             :   return false;
     179             : }
     180             : 
     181             : static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
     182             :                               const MachineInstr *MI) {
     183       86744 :   for (auto Candidate : FoldList) {
     184         260 :     if (Candidate.UseMI == MI)
     185             :       return true;
     186             :   }
     187             :   return false;
     188             : }
     189             : 
     190      148767 : static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     191             :                              MachineInstr *MI, unsigned OpNo,
     192             :                              MachineOperand *OpToFold,
     193             :                              const SIInstrInfo *TII) {
     194      148767 :   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     195             : 
     196             :     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     197       86608 :     unsigned Opc = MI->getOpcode();
     198       43443 :     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
     199         139 :         (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
     200          96 :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     201             : 
     202             :       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     203             :       // to fold the operand.
     204         288 :       MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
     205          96 :       bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
     206          96 :       if (FoldAsMAD) {
     207          46 :         MI->untieRegOperand(OpNo);
     208          46 :         return true;
     209             :       }
     210         100 :       MI->setDesc(TII->get(Opc));
     211             :     }
     212             : 
     213             :     // Special case for s_setreg_b32
     214       43274 :     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
     215          48 :       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
     216          32 :       FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     217          16 :       return true;
     218             :     }
     219             : 
     220             :     // If we are already folding into another operand of MI, then
       221             :     // we can't commute the instruction; otherwise we risk making the
     222             :     // other fold illegal.
     223      129726 :     if (isUseMIInFoldList(FoldList, MI))
     224             :       return false;
     225             : 
     226             :     // Operand is not legal, so try to commute the instruction to
     227             :     // see if this makes it possible to fold.
     228       43242 :     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
     229       43242 :     unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
     230       43242 :     bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
     231             : 
     232       43242 :     if (CanCommute) {
     233        9830 :       if (CommuteIdx0 == OpNo)
     234        4118 :         OpNo = CommuteIdx1;
     235        5712 :       else if (CommuteIdx1 == OpNo)
     236        5506 :         OpNo = CommuteIdx0;
     237             :     }
     238             : 
       239             :     // One of the operands might be an Imm operand, and OpNo may refer to it
       240             :     // after the call to commuteInstruction() below. Such situations are avoided
     241             :     // here explicitly as OpNo must be a register operand to be a candidate
     242             :     // for memory folding.
     243       71380 :     if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
     244       25434 :                        !MI->getOperand(CommuteIdx1).isReg()))
     245             :       return false;
     246             : 
     247       50272 :     if (!CanCommute ||
     248        8430 :         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
     249             :       return false;
     250             : 
     251        8172 :     if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     252        7046 :       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
     253        7046 :       return false;
     254             :     }
     255             : 
     256        2252 :     FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
     257        1126 :     return true;
     258             :   }
     259             : 
     260      210926 :   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     261      105463 :   return true;
     262             : }
     263             : 
     264             : // If the use operand doesn't care about the value, this may be an operand only
     265             : // used for register indexing, in which case it is unsafe to fold.
     266             : static bool isUseSafeToFold(const SIInstrInfo *TII,
     267             :                             const MachineInstr &MI,
     268             :                             const MachineOperand &UseMO) {
     269      573448 :   return !UseMO.isUndef() && !TII->isSDWA(MI);
     270             :   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
     271             : }
     272             : 
     273      286786 : void SIFoldOperands::foldOperand(
     274             :   MachineOperand &OpToFold,
     275             :   MachineInstr *UseMI,
     276             :   unsigned UseOpIdx,
     277             :   SmallVectorImpl<FoldCandidate> &FoldList,
     278             :   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
     279      573572 :   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
     280             : 
     281      573177 :   if (!isUseSafeToFold(TII, *UseMI, UseOp))
     282             :     return;
     283             : 
     284             :   // FIXME: Fold operands with subregs.
     285      572782 :   if (UseOp.isReg() && OpToFold.isReg()) {
     286      420060 :     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
     287             :       return;
     288             : 
       289             :     // Don't fold subregister extracts into tied operands unless it is a full
       290             :     // copy, since a subregister use tied to a full register def doesn't really
       291             :     // make sense. e.g. don't fold:
     292             :     //
     293             :     // %vreg1 = COPY %vreg0:sub1
     294             :     // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
     295             :     //
     296             :     //  into
     297             :     // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
     298      206043 :     if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
     299             :       return;
     300             :   }
     301             : 
     302             :   // Special case for REG_SEQUENCE: We can't fold literals into
     303             :   // REG_SEQUENCE instructions, so we have to fold them into the
     304             :   // uses of REG_SEQUENCE.
     305      563244 :   if (UseMI->isRegSequence()) {
     306      111236 :     unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
     307      222472 :     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
     308             : 
     309             :     for (MachineRegisterInfo::use_iterator
     310      222472 :            RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
     311      277366 :          RSUse != RSE; ++RSUse) {
     312             : 
     313      166130 :       MachineInstr *RSUseMI = RSUse->getParent();
     314      332260 :       if (RSUse->getSubReg() != RegSeqDstSubReg)
     315      165373 :         continue;
     316             : 
     317         757 :       foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
     318             :                   CopiesToReplace);
     319             :     }
     320             : 
     321             :     return;
     322             :   }
     323             : 
     324             : 
     325      170386 :   bool FoldingImm = OpToFold.isImm();
     326             : 
     327             :   // In order to fold immediates into copies, we need to change the
     328             :   // copy to a MOV.
     329      216499 :   if (FoldingImm && UseMI->isCopy()) {
     330        8536 :     unsigned DestReg = UseMI->getOperand(0).getReg();
     331             :     const TargetRegisterClass *DestRC
     332       17072 :       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
     333        8344 :       MRI->getRegClass(DestReg) :
     334        8728 :       TRI->getPhysRegClass(DestReg);
     335             : 
     336        8536 :     unsigned MovOp = TII->getMovOpcode(DestRC);
     337        8536 :     if (MovOp == AMDGPU::COPY)
     338             :       return;
     339             : 
     340       25608 :     UseMI->setDesc(TII->get(MovOp));
     341        8536 :     CopiesToReplace.push_back(UseMI);
     342             :   } else {
     343      161850 :     const MCInstrDesc &UseDesc = UseMI->getDesc();
     344             : 
     345             :     // Don't fold into target independent nodes.  Target independent opcodes
     346             :     // don't have defined register classes.
     347      321466 :     if (UseDesc.isVariadic() ||
     348      159616 :         UseDesc.OpInfo[UseOpIdx].RegClass == -1)
     349             :       return;
     350             :   }
     351             : 
     352      148671 :   if (!FoldingImm) {
     353      103368 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     354             : 
     355             :     // FIXME: We could try to change the instruction from 64-bit to 32-bit
       356             :     // to enable more folding opportunities.  The shrink operands pass
     357             :     // already does this.
     358      103368 :     return;
     359             :   }
     360             : 
     361             : 
     362       45303 :   const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
     363             :   const TargetRegisterClass *FoldRC =
     364       90606 :     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
     365             : 
     366             : 
     367             :   // Split 64-bit constants into 32-bits for folding.
     368       48033 :   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     369        2718 :     unsigned UseReg = UseOp.getReg();
     370             :     const TargetRegisterClass *UseRC
     371        5436 :       = TargetRegisterInfo::isVirtualRegister(UseReg) ?
     372        2718 :       MRI->getRegClass(UseReg) :
     373        2718 :       TRI->getPhysRegClass(UseReg);
     374             : 
     375        5436 :     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
     376             :       return;
     377             : 
     378        5436 :     APInt Imm(64, OpToFold.getImm());
     379        2718 :     if (UseOp.getSubReg() == AMDGPU::sub0) {
     380        4077 :       Imm = Imm.getLoBits(32);
     381             :     } else {
     382             :       assert(UseOp.getSubReg() == AMDGPU::sub1);
     383        4077 :       Imm = Imm.getHiBits(32);
     384             :     }
     385             : 
     386        5436 :     MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
     387        2718 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
     388             :     return;
     389             :   }
     390             : 
     391             : 
     392             : 
     393       42585 :   tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     394             : }
     395             : 
     396        3671 : static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
     397             :                                   uint32_t LHS, uint32_t RHS) {
     398        3671 :   switch (Opcode) {
     399          12 :   case AMDGPU::V_AND_B32_e64:
     400             :   case AMDGPU::V_AND_B32_e32:
     401             :   case AMDGPU::S_AND_B32:
     402          12 :     Result = LHS & RHS;
     403          12 :     return true;
     404          69 :   case AMDGPU::V_OR_B32_e64:
     405             :   case AMDGPU::V_OR_B32_e32:
     406             :   case AMDGPU::S_OR_B32:
     407          69 :     Result = LHS | RHS;
     408          69 :     return true;
     409           0 :   case AMDGPU::V_XOR_B32_e64:
     410             :   case AMDGPU::V_XOR_B32_e32:
     411             :   case AMDGPU::S_XOR_B32:
     412           0 :     Result = LHS ^ RHS;
     413           0 :     return true;
     414           9 :   case AMDGPU::V_LSHL_B32_e64:
     415             :   case AMDGPU::V_LSHL_B32_e32:
     416             :   case AMDGPU::S_LSHL_B32:
     417             :     // The instruction ignores the high bits for out of bounds shifts.
     418           9 :     Result = LHS << (RHS & 31);
     419           9 :     return true;
     420           3 :   case AMDGPU::V_LSHLREV_B32_e64:
     421             :   case AMDGPU::V_LSHLREV_B32_e32:
     422           3 :     Result = RHS << (LHS & 31);
     423           3 :     return true;
     424           8 :   case AMDGPU::V_LSHR_B32_e64:
     425             :   case AMDGPU::V_LSHR_B32_e32:
     426             :   case AMDGPU::S_LSHR_B32:
     427           8 :     Result = LHS >> (RHS & 31);
     428           8 :     return true;
     429           3 :   case AMDGPU::V_LSHRREV_B32_e64:
     430             :   case AMDGPU::V_LSHRREV_B32_e32:
     431           3 :     Result = RHS >> (LHS & 31);
     432           3 :     return true;
     433           8 :   case AMDGPU::V_ASHR_I32_e64:
     434             :   case AMDGPU::V_ASHR_I32_e32:
     435             :   case AMDGPU::S_ASHR_I32:
     436           8 :     Result = static_cast<int32_t>(LHS) >> (RHS & 31);
     437           8 :     return true;
     438           3 :   case AMDGPU::V_ASHRREV_I32_e64:
     439             :   case AMDGPU::V_ASHRREV_I32_e32:
     440           3 :     Result = static_cast<int32_t>(RHS) >> (LHS & 31);
     441           3 :     return true;
     442             :   default:
     443             :     return false;
     444             :   }
     445             : }
     446             : 
     447             : static unsigned getMovOpc(bool IsScalar) {
     448         131 :   return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     449             : }
     450             : 
     451             : /// Remove any leftover implicit operands from mutating the instruction. e.g.
     452             : /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
     453             : /// anymore.
     454         194 : static void stripExtraCopyOperands(MachineInstr &MI) {
     455         194 :   const MCInstrDesc &Desc = MI.getDesc();
     456         194 :   unsigned NumOps = Desc.getNumOperands() +
     457         194 :                     Desc.getNumImplicitUses() +
     458         194 :                     Desc.getNumImplicitDefs();
     459             : 
     460         264 :   for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
     461          70 :     MI.RemoveOperand(I);
     462         194 : }
     463             : 
     464             : static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
     465         384 :   MI.setDesc(NewDesc);
     466         192 :   stripExtraCopyOperands(MI);
     467             : }
     468             : 
     469       98234 : static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
     470             :                                                MachineOperand &Op) {
     471       98234 :   if (Op.isReg()) {
     472             :     // If this has a subregister, it obviously is a register source.
     473       97029 :     if (Op.getSubReg() != AMDGPU::NoSubRegister)
     474             :       return &Op;
     475             : 
     476       90505 :     MachineInstr *Def = MRI.getVRegDef(Op.getReg());
     477      181008 :     if (Def && Def->isMoveImmediate()) {
     478      101824 :       MachineOperand &ImmSrc = Def->getOperand(1);
     479       50912 :       if (ImmSrc.isImm())
     480             :         return &ImmSrc;
     481             :     }
     482             :   }
     483             : 
     484             :   return &Op;
     485             : }
     486             : 
     487             : // Try to simplify operations with a constant that may appear after instruction
     488             : // selection.
     489             : // TODO: See if a frame index with a fixed offset can fold.
     490      135686 : static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
     491             :                               const SIInstrInfo *TII,
     492             :                               MachineInstr *MI,
     493             :                               MachineOperand *ImmOp) {
     494      271372 :   unsigned Opc = MI->getOpcode();
     495      135686 :   if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
     496             :       Opc == AMDGPU::S_NOT_B32) {
     497          20 :     MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
     498          40 :     mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
     499          10 :     return true;
     500             :   }
     501             : 
     502      135676 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     503      135676 :   if (Src1Idx == -1)
     504             :     return false;
     505             : 
     506       49117 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     507       98234 :   MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
     508       98234 :   MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
     509             : 
     510      127165 :   if (!Src0->isImm() && !Src1->isImm())
     511             :     return false;
     512             : 
     513             :   // and k0, k1 -> v_mov_b32 (k0 & k1)
     514             :   // or k0, k1 -> v_mov_b32 (k0 | k1)
     515             :   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
     516       68608 :   if (Src0->isImm() && Src1->isImm()) {
     517             :     int32_t NewImm;
     518        3671 :     if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
     519             :       return false;
     520             : 
     521         115 :     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     522         115 :     bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
     523             : 
       524             :     // Be careful to change the right operand; src0 may belong to a different
     525             :     // instruction.
     526         230 :     MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
     527         115 :     MI->RemoveOperand(Src1Idx);
     528         460 :     mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
     529         115 :     return true;
     530             :   }
     531             : 
     532       44751 :   if (!MI->isCommutable())
     533             :     return false;
     534             : 
     535       85172 :   if (Src0->isImm() && !Src1->isImm()) {
     536             :     std::swap(Src0, Src1);
     537             :     std::swap(Src0Idx, Src1Idx);
     538             :   }
     539             : 
     540       36306 :   int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
     541       72612 :   if (Opc == AMDGPU::V_OR_B32_e64 ||
     542       72083 :       Opc == AMDGPU::V_OR_B32_e32 ||
     543             :       Opc == AMDGPU::S_OR_B32) {
     544         608 :     if (Src1Val == 0) {
     545             :       // y = or x, 0 => y = copy x
     546          48 :       MI->RemoveOperand(Src1Idx);
     547          96 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     548         560 :     } else if (Src1Val == -1) {
     549             :       // y = or x, -1 => y = v_mov_b32 -1
     550           2 :       MI->RemoveOperand(Src1Idx);
     551           6 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
     552             :     } else
     553             :       return false;
     554             : 
     555             :     return true;
     556             :   }
     557             : 
     558       64496 :   if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
     559       64496 :       MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
     560       28798 :       MI->getOpcode() == AMDGPU::S_AND_B32) {
     561        9282 :     if (Src1Val == 0) {
     562             :       // y = and x, 0 => y = v_mov_b32 0
     563           4 :       MI->RemoveOperand(Src0Idx);
     564          12 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
     565        9278 :     } else if (Src1Val == -1) {
     566             :       // y = and x, -1 => y = copy x
     567           2 :       MI->RemoveOperand(Src1Idx);
     568           6 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     569           2 :       stripExtraCopyOperands(*MI);
     570             :     } else
     571             :       return false;
     572             : 
     573             :     return true;
     574             :   }
     575             : 
     576       52717 :   if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
     577       78772 :       MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
     578       26055 :       MI->getOpcode() == AMDGPU::S_XOR_B32) {
     579         377 :     if (Src1Val == 0) {
     580             :       // y = xor x, 0 => y = copy x
     581           6 :       MI->RemoveOperand(Src1Idx);
     582          18 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     583           6 :       return true;
     584             :     }
     585             :   }
     586             : 
     587             :   return false;
     588             : }
     589             : 
     590             : // Try to fold an instruction into a simpler one
     591      858183 : static bool tryFoldInst(const SIInstrInfo *TII,
     592             :                         MachineInstr *MI) {
     593     1716366 :   unsigned Opc = MI->getOpcode();
     594             : 
     595     1716366 :   if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
     596     1697088 :       Opc == AMDGPU::V_CNDMASK_B32_e64    ||
     597             :       Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
     598       19278 :     const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
     599       19278 :     const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
     600       19278 :     if (Src1->isIdenticalTo(*Src0)) {
     601             :       DEBUG(dbgs() << "Folded " << *MI << " into ");
     602           5 :       int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     603           5 :       if (Src2Idx != -1)
     604           4 :         MI->RemoveOperand(Src2Idx);
     605           5 :       MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
     606          20 :       mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
     607             :                                                : getMovOpc(false)));
     608             :       DEBUG(dbgs() << *MI << '\n');
     609           5 :       return true;
     610             :     }
     611             :   }
     612             : 
     613             :   return false;
     614             : }
     615             : 
     616      252312 : void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     617             :                                      MachineOperand &OpToFold) const {
       618             :   // We need to mutate the operands of new mov instructions to add implicit
     619             :   // uses of EXEC, but adding them invalidates the use_iterator, so defer
     620             :   // this.
     621      504624 :   SmallVector<MachineInstr *, 4> CopiesToReplace;
     622      504624 :   SmallVector<FoldCandidate, 4> FoldList;
     623      252312 :   MachineOperand &Dst = MI.getOperand(0);
     624             : 
     625      425622 :   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     626             :   if (FoldingImm) {
     627       79002 :     unsigned NumLiteralUses = 0;
     628       79002 :     MachineOperand *NonInlineUse = nullptr;
     629       79002 :     int NonInlineUseOpNo = -1;
     630             : 
     631       79002 :     MachineRegisterInfo::use_iterator NextUse, NextInstUse;
     632             :     for (MachineRegisterInfo::use_iterator
     633      158004 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     634      215393 :          Use != E; Use = NextUse) {
     635      136391 :       NextUse = std::next(Use);
     636      136391 :       MachineInstr *UseMI = Use->getParent();
     637      136391 :       unsigned OpNo = Use.getOperandNo();
     638             : 
     639             :       // Folding the immediate may reveal operations that can be constant
     640             :       // folded or replaced with a copy. This can happen for example after
     641             :       // frame indices are lowered to constants or from splitting 64-bit
     642             :       // constants.
     643             :       //
     644             :       // We may also encounter cases where one or both operands are
     645             :       // immediates materialized into a register, which would ordinarily not
     646             :       // be folded due to multiple uses or operand constraints.
     647             : 
     648      136578 :       if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
     649             :         DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
     650             : 
     651             :         // Some constant folding cases change the same immediate's use to a new
     652             :         // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
     653             :         // again. The same constant folded instruction could also have a second
     654             :         // use operand.
     655         187 :         NextUse = MRI->use_begin(Dst.getReg());
     656         187 :         FoldList.clear();
     657         187 :         continue;
     658             :       }
     659             : 
     660             :       // Try to fold any inline immediate uses, and then only fold other
     661             :       // constants if they have one use.
     662             :       //
     663             :       // The legality of the inline immediate must be checked based on the use
     664             :       // operand, not the defining instruction, because 32-bit instructions
     665             :       // with 32-bit inline immediate sources may be used to materialize
     666             :       // constants used in 16-bit operands.
     667             :       //
     668             :       // e.g. it is unsafe to fold:
     669             :       //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
     670             :       //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
     671             : 
     672             :       // Folding immediates with more than one use will increase program size.
     673             :       // FIXME: This will also reduce register usage, which may be better
     674             :       // in some cases. A better heuristic is needed.
     675      136204 :       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
     676       34626 :         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
     677             :       } else {
     678      101578 :         if (++NumLiteralUses == 1) {
     679       60217 :           NonInlineUse = &*Use;
     680       60217 :           NonInlineUseOpNo = OpNo;
     681             :         }
     682             :       }
     683             :     }
     684             : 
     685       79002 :     if (NumLiteralUses == 1) {
     686       41795 :       MachineInstr *UseMI = NonInlineUse->getParent();
     687       41795 :       foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
     688             :     }
     689             :   } else {
     690             :     // Folding register.
     691             :     for (MachineRegisterInfo::use_iterator
     692      346620 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     693      382918 :          Use != E; ++Use) {
     694      209608 :       MachineInstr *UseMI = Use->getParent();
     695             : 
     696      209608 :       foldOperand(OpToFold, UseMI, Use.getOperandNo(),
     697             :                   FoldList, CopiesToReplace);
     698             :     }
     699             :   }
     700             : 
     701      252312 :   MachineFunction *MF = MI.getParent()->getParent();
     702             :   // Make sure we add EXEC uses to any new v_mov instructions created.
     703      765472 :   for (MachineInstr *Copy : CopiesToReplace)
     704        8536 :     Copy->addImplicitDefUseOperands(*MF);
     705             : 
     706      863540 :   for (FoldCandidate &Fold : FoldList) {
     707      106604 :     if (updateOperand(Fold, *TRI)) {
     708             :       // Clear kill flags.
     709      106604 :       if (Fold.isReg()) {
     710             :         assert(Fold.OpToFold && Fold.OpToFold->isReg());
     711             :         // FIXME: Probably shouldn't bother trying to fold if not an
     712             :         // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
     713             :         // copies.
     714       68421 :         MRI->clearKillFlags(Fold.OpToFold->getReg());
     715             :       }
     716             :       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
     717             :             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
     718      106604 :       tryFoldInst(TII, Fold.UseMI);
     719           0 :     } else if (Fold.isCommuted()) {
     720             :       // Restoring instruction's original operand order if fold has failed.
     721           0 :       TII->commuteInstruction(*Fold.UseMI, false);
     722             :     }
     723             :   }
     724      252312 : }
     725             : 
     726      446985 : const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
     727      893970 :   unsigned Op = MI.getOpcode();
     728      446985 :   switch (Op) {
     729        1095 :   case AMDGPU::V_MAX_F32_e64:
     730             :   case AMDGPU::V_MAX_F16_e64:
     731             :   case AMDGPU::V_MAX_F64:
     732             :   case AMDGPU::V_PK_MAX_F16: {
     733        2190 :     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
     734             :       return nullptr;
     735             : 
     736             :     // Make sure sources are identical.
     737         578 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     738         578 :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     739         866 :     if (!Src0->isReg() || !Src1->isReg() ||
     740        1153 :         Src0->getSubReg() != Src1->getSubReg() ||
     741         288 :         Src0->getSubReg() != AMDGPU::NoSubRegister)
     742             :       return nullptr;
     743             : 
     744             :     // Can't fold up if we have modifiers.
     745         288 :     if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     746             :       return nullptr;
     747             : 
     748             :     unsigned Src0Mods
     749         574 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
     750             :     unsigned Src1Mods
     751         574 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
     752             : 
     753             :     // Having a 0 op_sel_hi would require swizzling the output in the source
     754             :     // instruction, which we can't do.
     755         287 :     unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
     756         287 :     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
     757             :       return nullptr;
     758             :     return Src0;
     759             :   }
     760             :   default:
     761             :     return nullptr;
     762             :   }
     763             : }
     764             : 
     765             : // We obviously have multiple uses in a clamp since the register is used twice
     766             : // in the same instruction.
     767         250 : static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
     768         250 :   int Count = 0;
     769         750 :   for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
     770         500 :        I != E; ++I) {
     771         271 :     if (++Count > 1)
     772          21 :       return false;
     773             :   }
     774             : 
     775         229 :   return true;
     776             : }
     777             : 
     778      446985 : bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
     779      446985 :   const MachineOperand *ClampSrc = isClamp(MI);
     780      446985 :   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
     781             :     return false;
     782             : 
     783         197 :   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
     784             : 
     785             :   // The type of clamp must be compatible.
     786         591 :   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
     787             :     return false;
     788             : 
     789          97 :   MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
     790          97 :   if (!DefClamp)
     791             :     return false;
     792             : 
     793             :   DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
     794             : 
     795             :   // Clamp is applied after omod, so it is OK if omod is set.
     796         194 :   DefClamp->setImm(1);
     797          97 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     798          97 :   MI.eraseFromParent();
     799          97 :   return true;
     800             : }
     801             : 
     802          29 : static int getOModValue(unsigned Opc, int64_t Val) {
     803          29 :   switch (Opc) {
     804          28 :   case AMDGPU::V_MUL_F32_e64: {
     805          28 :     switch (static_cast<uint32_t>(Val)) {
     806             :     case 0x3f000000: // 0.5
     807             :       return SIOutMods::DIV2;
     808           0 :     case 0x40000000: // 2.0
     809           0 :       return SIOutMods::MUL2;
     810           8 :     case 0x40800000: // 4.0
     811           8 :       return SIOutMods::MUL4;
     812           0 :     default:
     813           0 :       return SIOutMods::NONE;
     814             :     }
     815             :   }
     816           1 :   case AMDGPU::V_MUL_F16_e64: {
     817           1 :     switch (static_cast<uint16_t>(Val)) {
     818             :     case 0x3800: // 0.5
     819             :       return SIOutMods::DIV2;
     820           0 :     case 0x4000: // 2.0
     821           0 :       return SIOutMods::MUL2;
     822           0 :     case 0x4400: // 4.0
     823           0 :       return SIOutMods::MUL4;
     824           0 :     default:
     825           0 :       return SIOutMods::NONE;
     826             :     }
     827             :   }
     828           0 :   default:
     829           0 :     llvm_unreachable("invalid mul opcode");
     830             :   }
     831             : }
     832             : 
     833             : // FIXME: Does this really not support denormals with f16?
     834             : // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
     835             : // handled, so will anything other than that break?
     836             : std::pair<const MachineOperand *, int>
     837         500 : SIFoldOperands::isOMod(const MachineInstr &MI) const {
     838        1000 :   unsigned Op = MI.getOpcode();
     839         500 :   switch (Op) {
     840          39 :   case AMDGPU::V_MUL_F32_e64:
     841             :   case AMDGPU::V_MUL_F16_e64: {
     842             :     // If output denormals are enabled, omod is ignored.
     843          39 :     if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
     844           3 :         (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
     845          12 :       return std::make_pair(nullptr, SIOutMods::NONE);
     846             : 
     847          33 :     const MachineOperand *RegOp = nullptr;
     848          33 :     const MachineOperand *ImmOp = nullptr;
     849          66 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     850          66 :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     851          33 :     if (Src0->isImm()) {
     852             :       ImmOp = Src0;
     853             :       RegOp = Src1;
     854          33 :     } else if (Src1->isImm()) {
     855             :       ImmOp = Src1;
     856             :       RegOp = Src0;
     857             :     } else
     858           8 :       return std::make_pair(nullptr, SIOutMods::NONE);
     859             : 
     860          29 :     int OMod = getOModValue(Op, ImmOp->getImm());
     861          29 :     if (OMod == SIOutMods::NONE ||
     862          54 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
     863          50 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
     864          79 :         TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
     865          25 :         TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
     866           8 :       return std::make_pair(nullptr, SIOutMods::NONE);
     867             : 
     868          25 :     return std::make_pair(RegOp, OMod);
     869             :   }
     870          99 :   case AMDGPU::V_ADD_F32_e64:
     871             :   case AMDGPU::V_ADD_F16_e64: {
     872             :     // If output denormals are enabled, omod is ignored.
     873          99 :     if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
     874           8 :         (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
     875          36 :       return std::make_pair(nullptr, SIOutMods::NONE);
     876             : 
     877             :     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
     878         162 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     879         162 :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     880             : 
     881         268 :     if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
     882         100 :         Src0->getSubReg() == Src1->getSubReg() &&
     883          42 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
     884          30 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
     885         105 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
     886          11 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     887          33 :       return std::make_pair(Src0, SIOutMods::MUL2);
     888             : 
     889         140 :     return std::make_pair(nullptr, SIOutMods::NONE);
     890             :   }
     891         362 :   default:
     892         724 :     return std::make_pair(nullptr, SIOutMods::NONE);
     893             :   }
     894             : }
     895             : 
     896             : // FIXME: Does this need to check IEEE bit on function?
     897         500 : bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
     898             :   const MachineOperand *RegOp;
     899             :   int OMod;
     900        1500 :   std::tie(RegOp, OMod) = isOMod(MI);
     901         108 :   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
     902         608 :       RegOp->getSubReg() != AMDGPU::NoSubRegister ||
     903          36 :       !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
     904             :     return false;
     905             : 
     906          32 :   MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
     907          32 :   MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
     908          32 :   if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
     909             :     return false;
     910             : 
     911             :   // Clamp is applied after omod. If the source already has clamp set, don't
     912             :   // fold it.
     913          26 :   if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
     914             :     return false;
     915             : 
     916             :   DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
     917             : 
     918          36 :   DefOMod->setImm(OMod);
     919          18 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     920          18 :   MI.eraseFromParent();
     921          18 :   return true;
     922             : }
     923             : 
     924       29247 : bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
     925       29247 :   if (skipFunction(*MF.getFunction()))
     926             :     return false;
     927             : 
     928       29245 :   MRI = &MF.getRegInfo();
     929       29245 :   ST = &MF.getSubtarget<SISubtarget>();
     930       58490 :   TII = ST->getInstrInfo();
     931       58490 :   TRI = &TII->getRegisterInfo();
     932             : 
     933       29245 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     934             : 
     935             :   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
     936             :   // correctly handle signed zeros.
     937             :   //
     938             :   // TODO: Check nsz on instructions when fast math flags are preserved to MI
     939             :   // level.
     940       58490 :   bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
     941             : 
     942      212315 :   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     943       66090 :     MachineBasicBlock::iterator I, Next;
     944     1602293 :     for (I = MBB->begin(); I != MBB->end(); I = Next) {
     945      751579 :       Next = std::next(I);
     946      751579 :       MachineInstr &MI = *I;
     947             : 
     948      751579 :       tryFoldInst(TII, &MI);
     949             : 
     950     1198582 :       if (!TII->isFoldableCopy(MI)) {
     951      447003 :         if (IsIEEEMode || !tryFoldOMod(MI))
     952      446985 :           tryFoldClamp(MI);
     953      447003 :         continue;
     954             :       }
     955             : 
     956      609152 :       MachineOperand &OpToFold = MI.getOperand(1);
     957      304576 :       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     958             : 
     959             :       // FIXME: We could also be folding things like TargetIndexes.
     960      221933 :       if (!FoldingImm && !OpToFold.isReg())
     961           0 :         continue;
     962             : 
     963      567868 :       if (OpToFold.isReg() &&
     964      443866 :           !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
     965       41359 :         continue;
     966             : 
     967             :       // Prevent folding operands backwards in the function. For example,
       968             :       // the COPY opcode must not be replaced by 1 in this case:
     969             :       //
     970             :       //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
     971             :       //    ...
     972             :       //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
     973      263217 :       MachineOperand &Dst = MI.getOperand(0);
     974      537339 :       if (Dst.isReg() &&
     975      526434 :           !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
     976       10905 :         continue;
     977             : 
     978      252312 :       foldInstOperand(MI, OpToFold);
     979             :     }
     980             :   }
     981       29245 :   return false;
     982             : }

Generated by: LCOV version 1.13