LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIFoldOperands.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-02-23 05:02:05
                   Hit    Total    Coverage
Lines:             358      379      94.5 %
Functions:          23       25      92.0 %
Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===-- SIFoldOperands.cpp - Fold operands --------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : /// \file
       9             : //===----------------------------------------------------------------------===//
      10             : //
      11             : 
      12             : #include "AMDGPU.h"
      13             : #include "AMDGPUSubtarget.h"
      14             : #include "SIInstrInfo.h"
      15             : #include "SIMachineFunctionInfo.h"
      16             : #include "llvm/ADT/DepthFirstIterator.h"
      17             : #include "llvm/CodeGen/LiveIntervals.h"
      18             : #include "llvm/CodeGen/MachineFunctionPass.h"
      19             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      20             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      21             : #include "llvm/Support/Debug.h"
      22             : #include "llvm/Support/raw_ostream.h"
      23             : #include "llvm/Target/TargetMachine.h"
      24             : 
      25             : #define DEBUG_TYPE "si-fold-operands"
      26             : using namespace llvm;
      27             : 
      28             : namespace {
      29             : 
      30             : struct FoldCandidate {
      31             :   MachineInstr *UseMI;
      32             :   union {
      33             :     MachineOperand *OpToFold;
      34             :     uint64_t ImmToFold;
      35             :     int FrameIndexToFold;
      36             :   };
      37             :   unsigned char UseOpNo;
      38             :   MachineOperand::MachineOperandType Kind;
      39             :   bool Commuted;
      40             : 
      41             :   FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
      42      110995 :                 bool Commuted_ = false) :
      43             :     UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
      44      221974 :     Commuted(Commuted_) {
      45      110979 :     if (FoldOp->isImm()) {
      46       45046 :       ImmToFold = FoldOp->getImm();
      47       65949 :     } else if (FoldOp->isFI()) {
      48          26 :       FrameIndexToFold = FoldOp->getIndex();
      49             :     } else {
      50             :       assert(FoldOp->isReg());
      51       65923 :       OpToFold = FoldOp;
      52             :     }
      53             :   }
      54             : 
      55             :   bool isFI() const {
      56             :     return Kind == MachineOperand::MO_FrameIndex;
      57             :   }
      58             : 
      59             :   bool isImm() const {
      60             :     return Kind == MachineOperand::MO_Immediate;
      61             :   }
      62             : 
      63             :   bool isReg() const {
      64             :     return Kind == MachineOperand::MO_Register;
      65             :   }
      66             : 
      67             :   bool isCommuted() const {
      68             :     return Commuted;
      69             :   }
      70             : };
      71             : 
      72        3272 : class SIFoldOperands : public MachineFunctionPass {
      73             : public:
      74             :   static char ID;
      75             :   MachineRegisterInfo *MRI;
      76             :   const SIInstrInfo *TII;
      77             :   const SIRegisterInfo *TRI;
      78             :   const SISubtarget *ST;
      79             : 
      80             :   void foldOperand(MachineOperand &OpToFold,
      81             :                    MachineInstr *UseMI,
      82             :                    unsigned UseOpIdx,
      83             :                    SmallVectorImpl<FoldCandidate> &FoldList,
      84             :                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
      85             : 
      86             :   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
      87             : 
      88             :   const MachineOperand *isClamp(const MachineInstr &MI) const;
      89             :   bool tryFoldClamp(MachineInstr &MI);
      90             : 
      91             :   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
      92             :   bool tryFoldOMod(MachineInstr &MI);
      93             : 
      94             : public:
      95        3288 :   SIFoldOperands() : MachineFunctionPass(ID) {
      96        3288 :     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
      97        3288 :   }
      98             : 
      99             :   bool runOnMachineFunction(MachineFunction &MF) override;
     100             : 
     101        3272 :   StringRef getPassName() const override { return "SI Fold Operands"; }
     102             : 
     103        3272 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     104        3272 :     AU.setPreservesCFG();
     105        3272 :     MachineFunctionPass::getAnalysisUsage(AU);
     106        3272 :   }
     107             : };
     108             : 
     109             : } // End anonymous namespace.
     110             : 
     111      286316 : INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
     112             :                 "SI Fold Operands", false, false)
     113             : 
     114             : char SIFoldOperands::ID = 0;
     115             : 
     116             : char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
     117             : 
     118             : // Wrapper around isInlineConstant that understands special cases when
     119             : // instruction types are replaced during operand folding.
     120      144177 : static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
     121             :                                      const MachineInstr &UseMI,
     122             :                                      unsigned OpNo,
     123             :                                      const MachineOperand &OpToFold) {
     124      144177 :   if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
     125             :     return true;
     126             : 
     127      101199 :   unsigned Opc = UseMI.getOpcode();
     128      101199 :   switch (Opc) {
     129         310 :   case AMDGPU::V_MAC_F32_e64:
     130             :   case AMDGPU::V_MAC_F16_e64: {
     131             :     // Special case for mac. Since this is replaced with mad when folded into
     132             :     // src2, we need to check the legality for the final instruction.
     133         310 :     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     134         310 :     if (static_cast<int>(OpNo) == Src2Idx) {
     135             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     136             :       const MCInstrDesc &MadDesc
     137          91 :         = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
     138          91 :       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     139             :     }
     140             :     return false;
     141             :   }
     142             :   default:
     143             :     return false;
     144             :   }
     145             : }
     146             : 
     147           0 : FunctionPass *llvm::createSIFoldOperandsPass() {
     148           0 :   return new SIFoldOperands();
     149             : }
     150             : 
     151      110994 : static bool updateOperand(FoldCandidate &Fold,
     152             :                           const TargetRegisterInfo &TRI) {
     153      110994 :   MachineInstr *MI = Fold.UseMI;
     154      110994 :   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
     155             :   assert(Old.isReg());
     156             : 
     157      110994 :   if (Fold.isImm()) {
     158       45045 :     Old.ChangeToImmediate(Fold.ImmToFold);
     159       45045 :     return true;
     160             :   }
     161             : 
     162       65949 :   if (Fold.isFI()) {
     163          26 :     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     164          26 :     return true;
     165             :   }
     166             : 
     167       65923 :   MachineOperand *New = Fold.OpToFold;
     168      197769 :   if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
     169       65923 :       TargetRegisterInfo::isVirtualRegister(New->getReg())) {
     170       65923 :     Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
     171             : 
     172             :     Old.setIsUndef(New->isUndef());
     173       65923 :     return true;
     174             :   }
     175             : 
     176             :   // FIXME: Handle physical registers.
     177             : 
     178             :   return false;
     179             : }
     180             : 
     181             : static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
     182             :                               const MachineInstr *MI) {
     183       54755 :   for (auto Candidate : FoldList) {
     184         336 :     if (Candidate.UseMI == MI)
     185             :       return true;
     186             :   }
     187             :   return false;
     188             : }
     189             : 
     190      165029 : static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     191             :                              MachineInstr *MI, unsigned OpNo,
     192             :                              MachineOperand *OpToFold,
     193             :                              const SIInstrInfo *TII) {
     194      165029 :   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     195             : 
     196             :     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     197       54154 :     unsigned Opc = MI->getOpcode();
     198       54358 :     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
     199         204 :         (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
     200             :       bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
     201             : 
     202             :       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     203             :       // to fold the operand.
     204         105 :       MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
     205         105 :       bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
     206         105 :       if (FoldAsMAD) {
     207          55 :         MI->untieRegOperand(OpNo);
     208          55 :         return true;
     209             :       }
     210          50 :       MI->setDesc(TII->get(Opc));
     211             :     }
     212             : 
     213             :     // Special case for s_setreg_b32
     214       54115 :     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
     215          16 :       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
     216          32 :       FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     217          16 :       return true;
     218             :     }
     219             : 
     220             :     // If we are already folding into another operand of MI, then
     221             :     // we can't commute the instruction, otherwise we risk making the
     222             :     // other fold illegal.
     223       54083 :     if (isUseMIInFoldList(FoldList, MI))
     224             :       return false;
     225             : 
     226             :     // Operand is not legal, so try to commute the instruction to
     227             :     // see if this makes it possible to fold.
     228       54083 :     unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
     229       54083 :     unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
     230       54083 :     bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
     231             : 
     232       54083 :     if (CanCommute) {
     233       10084 :       if (CommuteIdx0 == OpNo)
     234        6274 :         OpNo = CommuteIdx1;
     235        3810 :       else if (CommuteIdx1 == OpNo)
     236             :         OpNo = CommuteIdx0;
     237             :     }
     238             : 
     239             :     // One of operands might be an Imm operand, and OpNo may refer to it after
     240             :     // the call of commuteInstruction() below. Such situations are avoided
     241             :     // here explicitly as OpNo must be a register operand to be a candidate
     242             :     // for memory folding.
     243       74209 :     if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
     244       10042 :                        !MI->getOperand(CommuteIdx1).isReg()))
     245             :       return false;
     246             : 
     247       54063 :     if (!CanCommute ||
     248        5032 :         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
     249             :       return false;
     250             : 
     251        4741 :     if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
     252        4637 :       TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
     253        4637 :       return false;
     254             :     }
     255             : 
     256         208 :     FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
     257         104 :     return true;
     258             :   }
     259             : 
     260      221750 :   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
     261      110875 :   return true;
     262             : }
     263             : 
     264             : // If the use operand doesn't care about the value, this may be an operand only
     265             : // used for register indexing, in which case it is unsafe to fold.
     266             : static bool isUseSafeToFold(const SIInstrInfo *TII,
     267             :                             const MachineInstr &MI,
     268             :                             const MachineOperand &UseMO) {
     269      600528 :   return !UseMO.isUndef() && !TII->isSDWA(MI);
     270             :   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
     271             : }
     272             : 
     273      300325 : void SIFoldOperands::foldOperand(
     274             :   MachineOperand &OpToFold,
     275             :   MachineInstr *UseMI,
     276             :   unsigned UseOpIdx,
     277             :   SmallVectorImpl<FoldCandidate> &FoldList,
     278             :   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
     279      300325 :   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
     280             : 
     281             :   if (!isUseSafeToFold(TII, *UseMI, UseOp))
     282             :     return;
     283             : 
     284             :   // FIXME: Fold operands with subregs.
     285      599292 :   if (UseOp.isReg() && OpToFold.isReg()) {
     286      428266 :     if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
     287             :       return;
     288             : 
      289             :     // Don't fold subregister extracts into tied operands unless it is a full
      290             :     // copy, since a subregister use tied to a full register def doesn't really
     291             :     // make sense. e.g. don't fold:
     292             :     //
     293             :     // %1 = COPY %0:sub1
     294             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
     295             :     //
     296             :     //  into
     297             :     // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
     298      209048 :     if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
     299             :       return;
     300             :   }
     301             : 
     302             :   // Special case for REG_SEQUENCE: We can't fold literals into
     303             :   // REG_SEQUENCE instructions, so we have to fold them into the
     304             :   // uses of REG_SEQUENCE.
     305      293583 :   if (UseMI->isRegSequence()) {
     306      105830 :     unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
     307      211660 :     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
     308             : 
     309             :     for (MachineRegisterInfo::use_iterator
     310      105830 :            RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
     311      265945 :          RSUse != RSE; ++RSUse) {
     312             : 
     313      160115 :       MachineInstr *RSUseMI = RSUse->getParent();
     314      160115 :       if (RSUse->getSubReg() != RegSeqDstSubReg)
     315      159348 :         continue;
     316             : 
     317         767 :       foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
     318             :                   CopiesToReplace);
     319             :     }
     320             : 
     321             :     return;
     322             :   }
     323             : 
     324             : 
     325             :   bool FoldingImm = OpToFold.isImm();
     326             : 
     327             :   // In order to fold immediates into copies, we need to change the
     328             :   // copy to a MOV.
     329      187753 :   if (FoldingImm && UseMI->isCopy()) {
     330       10789 :     unsigned DestReg = UseMI->getOperand(0).getReg();
     331             :     const TargetRegisterClass *DestRC
     332       10981 :       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
     333       10597 :       MRI->getRegClass(DestReg) :
     334         192 :       TRI->getPhysRegClass(DestReg);
     335             : 
     336       10789 :     unsigned MovOp = TII->getMovOpcode(DestRC);
     337       10789 :     if (MovOp == AMDGPU::COPY)
     338             :       return;
     339             : 
     340       10789 :     UseMI->setDesc(TII->get(MovOp));
     341       10789 :     CopiesToReplace.push_back(UseMI);
     342             :   } else {
     343             :     const MCInstrDesc &UseDesc = UseMI->getDesc();
     344             : 
     345             :     // Don't fold into target independent nodes.  Target independent opcodes
     346             :     // don't have defined register classes.
     347      351687 :     if (UseDesc.isVariadic() ||
     348      351686 :         UseOp.isImplicit() ||
     349      174722 :         UseDesc.OpInfo[UseOpIdx].RegClass == -1)
     350             :       return;
     351             :   }
     352             : 
     353      164924 :   if (!FoldingImm) {
     354      111661 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     355             : 
     356             :     // FIXME: We could try to change the instruction from 64-bit to 32-bit
     357             :     // to enable more folding opportunites.  The shrink operands pass
     358             :     // already does this.
     359      111661 :     return;
     360             :   }
     361             : 
     362             : 
     363       53263 :   const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
     364             :   const TargetRegisterClass *FoldRC =
     365       53263 :     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
     366             : 
     367             : 
     368             :   // Split 64-bit constants into 32-bits for folding.
     369       56247 :   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     370        2960 :     unsigned UseReg = UseOp.getReg();
     371             :     const TargetRegisterClass *UseRC
     372        2960 :       = TargetRegisterInfo::isVirtualRegister(UseReg) ?
     373        2960 :       MRI->getRegClass(UseReg) :
     374           0 :       TRI->getPhysRegClass(UseReg);
     375             : 
     376        5920 :     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
     377             :       return;
     378             : 
     379        2960 :     APInt Imm(64, OpToFold.getImm());
     380        2960 :     if (UseOp.getSubReg() == AMDGPU::sub0) {
     381        2798 :       Imm = Imm.getLoBits(32);
     382             :     } else {
     383             :       assert(UseOp.getSubReg() == AMDGPU::sub1);
     384        3122 :       Imm = Imm.getHiBits(32);
     385             :     }
     386             : 
     387             :     MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
     388        2960 :     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
     389             :     return;
     390             :   }
     391             : 
     392             : 
     393             : 
     394       50303 :   tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
     395             : }
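
// A minimal standalone sketch (not part of SIFoldOperands.cpp) of the sub0/sub1
// split performed in foldOperand() above: when a 64-bit constant is folded into a
// 32-bit sub-register use, the sub0 use receives the low 32 bits and the sub1 use
// receives the high 32 bits. The constant below is only an illustrative value.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Imm = 0x123456789abcdef0ull;                 // 64-bit constant being folded
  uint32_t Sub0Imm = static_cast<uint32_t>(Imm);        // low half, folded into a sub0 use
  uint32_t Sub1Imm = static_cast<uint32_t>(Imm >> 32);  // high half, folded into a sub1 use
  assert(Sub0Imm == 0x9abcdef0u);
  assert(Sub1Imm == 0x12345678u);
  return 0;
}
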
     396             : 
     397        3543 : static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
     398             :                                   uint32_t LHS, uint32_t RHS) {
     399        3543 :   switch (Opcode) {
     400          12 :   case AMDGPU::V_AND_B32_e64:
     401             :   case AMDGPU::V_AND_B32_e32:
     402             :   case AMDGPU::S_AND_B32:
     403          12 :     Result = LHS & RHS;
     404          12 :     return true;
     405          69 :   case AMDGPU::V_OR_B32_e64:
     406             :   case AMDGPU::V_OR_B32_e32:
     407             :   case AMDGPU::S_OR_B32:
     408          69 :     Result = LHS | RHS;
     409          69 :     return true;
     410           0 :   case AMDGPU::V_XOR_B32_e64:
     411             :   case AMDGPU::V_XOR_B32_e32:
     412             :   case AMDGPU::S_XOR_B32:
     413           0 :     Result = LHS ^ RHS;
     414           0 :     return true;
     415           9 :   case AMDGPU::V_LSHL_B32_e64:
     416             :   case AMDGPU::V_LSHL_B32_e32:
     417             :   case AMDGPU::S_LSHL_B32:
     418             :     // The instruction ignores the high bits for out of bounds shifts.
     419           9 :     Result = LHS << (RHS & 31);
     420           9 :     return true;
     421           3 :   case AMDGPU::V_LSHLREV_B32_e64:
     422             :   case AMDGPU::V_LSHLREV_B32_e32:
     423           3 :     Result = RHS << (LHS & 31);
     424           3 :     return true;
     425           8 :   case AMDGPU::V_LSHR_B32_e64:
     426             :   case AMDGPU::V_LSHR_B32_e32:
     427             :   case AMDGPU::S_LSHR_B32:
     428           8 :     Result = LHS >> (RHS & 31);
     429           8 :     return true;
     430           3 :   case AMDGPU::V_LSHRREV_B32_e64:
     431             :   case AMDGPU::V_LSHRREV_B32_e32:
     432           3 :     Result = RHS >> (LHS & 31);
     433           3 :     return true;
     434           8 :   case AMDGPU::V_ASHR_I32_e64:
     435             :   case AMDGPU::V_ASHR_I32_e32:
     436             :   case AMDGPU::S_ASHR_I32:
     437           8 :     Result = static_cast<int32_t>(LHS) >> (RHS & 31);
     438           8 :     return true;
     439           3 :   case AMDGPU::V_ASHRREV_I32_e64:
     440             :   case AMDGPU::V_ASHRREV_I32_e32:
     441           3 :     Result = static_cast<int32_t>(RHS) >> (LHS & 31);
     442           3 :     return true;
     443             :   default:
     444             :     return false;
     445             :   }
     446             : }
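
// A minimal standalone sketch (not part of SIFoldOperands.cpp) of the shift-amount
// masking used in evalBinaryInstruction() above: the shift instructions ignore the
// high bits of the shift operand, so the constant folding masks with 31. The values
// below are only illustrative.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t LHS = 1;
  uint32_t RHS = 33;                     // out-of-bounds shift amount
  uint32_t Result = LHS << (RHS & 31);   // mirrors the S_LSHL_B32 folding case
  assert(Result == 2u);                  // behaves like a shift by 33 & 31 == 1
  return 0;
}
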
     447             : 
     448             : static unsigned getMovOpc(bool IsScalar) {
     449         131 :   return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
     450             : }
     451             : 
     452             : /// Remove any leftover implicit operands from mutating the instruction. e.g.
     453             : /// if we replace an s_and_b32 with a copy, we don't need the implicit scc def
     454             : /// anymore.
     455         200 : static void stripExtraCopyOperands(MachineInstr &MI) {
     456         200 :   const MCInstrDesc &Desc = MI.getDesc();
     457         400 :   unsigned NumOps = Desc.getNumOperands() +
     458         200 :                     Desc.getNumImplicitUses() +
     459         400 :                     Desc.getNumImplicitDefs();
     460             : 
     461         274 :   for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
     462          74 :     MI.RemoveOperand(I);
     463         200 : }
     464             : 
     465             : static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
     466             :   MI.setDesc(NewDesc);
     467         198 :   stripExtraCopyOperands(MI);
     468             : }
     469             : 
     470      100458 : static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
     471             :                                                MachineOperand &Op) {
     472      100458 :   if (Op.isReg()) {
     473             :     // If this has a subregister, it obviously is a register source.
     474       99359 :     if (Op.getSubReg() != AMDGPU::NoSubRegister)
     475             :       return &Op;
     476             : 
     477       90548 :     MachineInstr *Def = MRI.getVRegDef(Op.getReg());
     478      181093 :     if (Def && Def->isMoveImmediate()) {
     479       51980 :       MachineOperand &ImmSrc = Def->getOperand(1);
     480       51980 :       if (ImmSrc.isImm())
     481             :         return &ImmSrc;
     482             :     }
     483             :   }
     484             : 
     485             :   return &Op;
     486             : }
     487             : 
     488             : // Try to simplify operations with a constant that may appear after instruction
     489             : // selection.
     490             : // TODO: See if a frame index with a fixed offset can fold.
     491      143613 : static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
     492             :                               const SIInstrInfo *TII,
     493             :                               MachineInstr *MI,
     494             :                               MachineOperand *ImmOp) {
     495      143613 :   unsigned Opc = MI->getOpcode();
     496      143613 :   if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
     497             :       Opc == AMDGPU::S_NOT_B32) {
     498          20 :     MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
     499          10 :     mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
     500          10 :     return true;
     501             :   }
     502             : 
     503      143603 :   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     504      143603 :   if (Src1Idx == -1)
     505             :     return false;
     506             : 
     507       50229 :   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     508      100458 :   MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
     509      100458 :   MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
     510             : 
     511       79651 :   if (!Src0->isImm() && !Src1->isImm())
     512             :     return false;
     513             : 
     514             :   // and k0, k1 -> v_mov_b32 (k0 & k1)
     515             :   // or k0, k1 -> v_mov_b32 (k0 | k1)
     516             :   // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
     517       70304 :   if (Src0->isImm() && Src1->isImm()) {
     518             :     int32_t NewImm;
     519        3543 :     if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
     520             :       return false;
     521             : 
     522             :     const SIRegisterInfo &TRI = TII->getRegisterInfo();
     523         115 :     bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
     524             : 
     525             :     // Be careful to change the right operand, src0 may belong to a different
     526             :     // instruction.
     527         230 :     MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
     528         115 :     MI->RemoveOperand(Src1Idx);
     529         115 :     mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
     530         115 :     return true;
     531             :   }
     532             : 
     533       45954 :   if (!MI->isCommutable())
     534             :     return false;
     535             : 
     536       50767 :   if (Src0->isImm() && !Src1->isImm()) {
     537             :     std::swap(Src0, Src1);
     538             :     std::swap(Src0Idx, Src1Idx);
     539             :   }
     540             : 
     541       37760 :   int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
     542       75520 :   if (Opc == AMDGPU::V_OR_B32_e64 ||
     543       74735 :       Opc == AMDGPU::V_OR_B32_e32 ||
     544             :       Opc == AMDGPU::S_OR_B32) {
     545         864 :     if (Src1Val == 0) {
     546             :       // y = or x, 0 => y = copy x
     547          52 :       MI->RemoveOperand(Src1Idx);
     548          52 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     549         812 :     } else if (Src1Val == -1) {
     550             :       // y = or x, -1 => y = v_mov_b32 -1
     551           2 :       MI->RemoveOperand(Src1Idx);
     552           2 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
     553             :     } else
     554             :       return false;
     555             : 
     556             :     return true;
     557             :   }
     558             : 
     559       65490 :   if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
     560       65490 :       MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
     561             :       MI->getOpcode() == AMDGPU::S_AND_B32) {
     562       10680 :     if (Src1Val == 0) {
     563             :       // y = and x, 0 => y = v_mov_b32 0
     564           4 :       MI->RemoveOperand(Src0Idx);
     565           4 :       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
     566       10676 :     } else if (Src1Val == -1) {
     567             :       // y = and x, -1 => y = copy x
     568           2 :       MI->RemoveOperand(Src1Idx);
     569           2 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     570           2 :       stripExtraCopyOperands(*MI);
     571             :     } else
     572             :       return false;
     573             : 
     574             :     return true;
     575             :   }
     576             : 
     577       26100 :   if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
     578       52069 :       MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
     579             :       MI->getOpcode() == AMDGPU::S_XOR_B32) {
     580         379 :     if (Src1Val == 0) {
     581             :       // y = xor x, 0 => y = copy x
     582           6 :       MI->RemoveOperand(Src1Idx);
     583           6 :       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     584           6 :       return true;
     585             :     }
     586             :   }
     587             : 
     588             :   return false;
     589             : }
     590             : 
     591             : // Try to fold an instruction into a simpler one
     592      906587 : static bool tryFoldInst(const SIInstrInfo *TII,
     593             :                         MachineInstr *MI) {
     594      906587 :   unsigned Opc = MI->getOpcode();
     595             : 
     596     1813174 :   if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
     597     1801481 :       Opc == AMDGPU::V_CNDMASK_B32_e64    ||
     598             :       Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
     599       11693 :     const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
     600       11693 :     const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
     601       11693 :     if (Src1->isIdenticalTo(*Src0)) {
     602             :       DEBUG(dbgs() << "Folded " << *MI << " into ");
     603           7 :       int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     604           7 :       if (Src2Idx != -1)
     605           6 :         MI->RemoveOperand(Src2Idx);
     606           7 :       MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
     607           7 :       mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
     608             :                                                : getMovOpc(false)));
     609             :       DEBUG(dbgs() << *MI << '\n');
     610           7 :       return true;
     611             :     }
     612             :   }
     613             : 
     614             :   return false;
     615             : }
     616             : 
     617      265316 : void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     618             :                                      MachineOperand &OpToFold) const {
     619             :   // We need mutate the operands of new mov instructions to add implicit
     620             :   // uses of EXEC, but adding them invalidates the use_iterator, so defer
     621             :   // this.
     622             :   SmallVector<MachineInstr *, 4> CopiesToReplace;
     623             :   SmallVector<FoldCandidate, 4> FoldList;
     624      265316 :   MachineOperand &Dst = MI.getOperand(0);
     625             : 
     626      265316 :   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     627             :   if (FoldingImm) {
     628             :     unsigned NumLiteralUses = 0;
     629             :     MachineOperand *NonInlineUse = nullptr;
     630             :     int NonInlineUseOpNo = -1;
     631             : 
     632             :     MachineRegisterInfo::use_iterator NextUse;
     633             :     for (MachineRegisterInfo::use_iterator
     634       85099 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     635      229467 :          Use != E; Use = NextUse) {
     636             :       NextUse = std::next(Use);
     637      144368 :       MachineInstr *UseMI = Use->getParent();
     638             :       unsigned OpNo = Use.getOperandNo();
     639             : 
     640             :       // Folding the immediate may reveal operations that can be constant
     641             :       // folded or replaced with a copy. This can happen for example after
     642             :       // frame indices are lowered to constants or from splitting 64-bit
     643             :       // constants.
     644             :       //
     645             :       // We may also encounter cases where one or both operands are
     646             :       // immediates materialized into a register, which would ordinarily not
     647             :       // be folded due to multiple uses or operand constraints.
     648             : 
     649      144559 :       if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
     650             :         DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
     651             : 
     652             :         // Some constant folding cases change the same immediate's use to a new
     653             :         // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
     654             :         // again. The same constant folded instruction could also have a second
     655             :         // use operand.
     656         191 :         NextUse = MRI->use_begin(Dst.getReg());
     657             :         FoldList.clear();
     658         191 :         continue;
     659             :       }
     660             : 
     661             :       // Try to fold any inline immediate uses, and then only fold other
     662             :       // constants if they have one use.
     663             :       //
     664             :       // The legality of the inline immediate must be checked based on the use
     665             :       // operand, not the defining instruction, because 32-bit instructions
     666             :       // with 32-bit inline immediate sources may be used to materialize
     667             :       // constants used in 16-bit operands.
     668             :       //
     669             :       // e.g. it is unsafe to fold:
     670             :       //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
     671             :       //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
     672             : 
     673             :       // Folding immediates with more than one use will increase program size.
     674             :       // FIXME: This will also reduce register usage, which may be better
     675             :       // in some cases. A better heuristic is needed.
     676      144177 :       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
     677       43025 :         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
     678             :       } else {
     679      101152 :         if (++NumLiteralUses == 1) {
     680             :           NonInlineUse = &*Use;
     681       60181 :           NonInlineUseOpNo = OpNo;
     682             :         }
     683             :       }
     684             :     }
     685             : 
     686       85099 :     if (NumLiteralUses == 1) {
     687       42802 :       MachineInstr *UseMI = NonInlineUse->getParent();
     688       42802 :       foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
     689             :     }
     690             :   } else {
     691             :     // Folding register.
     692             :     for (MachineRegisterInfo::use_iterator
     693      180217 :            Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
     694      393948 :          Use != E; ++Use) {
     695      213731 :       MachineInstr *UseMI = Use->getParent();
     696             : 
     697      213731 :       foldOperand(OpToFold, UseMI, Use.getOperandNo(),
     698             :                   FoldList, CopiesToReplace);
     699             :     }
     700             :   }
     701             : 
     702      265316 :   MachineFunction *MF = MI.getParent()->getParent();
     703             :   // Make sure we add EXEC uses to any new v_mov instructions created.
     704      286894 :   for (MachineInstr *Copy : CopiesToReplace)
     705       10789 :     Copy->addImplicitDefUseOperands(*MF);
     706             : 
     707      487304 :   for (FoldCandidate &Fold : FoldList) {
     708      110994 :     if (updateOperand(Fold, *TRI)) {
     709             :       // Clear kill flags.
     710      110994 :       if (Fold.isReg()) {
     711             :         assert(Fold.OpToFold && Fold.OpToFold->isReg());
     712             :         // FIXME: Probably shouldn't bother trying to fold if not an
     713             :         // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
     714             :         // copies.
     715       65923 :         MRI->clearKillFlags(Fold.OpToFold->getReg());
     716             :       }
     717             :       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
     718             :             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
     719      110994 :       tryFoldInst(TII, Fold.UseMI);
     720           0 :     } else if (Fold.isCommuted()) {
     721             :       // Restoring instruction's original operand order if fold has failed.
     722           0 :       TII->commuteInstruction(*Fold.UseMI, false);
     723             :     }
     724             :   }
     725      265316 : }
     726             : 
     727             : // Clamp patterns are canonically selected to v_max_* instructions, so only
     728             : // handle them.
     729      468116 : const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
     730      468116 :   unsigned Op = MI.getOpcode();
     731      468116 :   switch (Op) {
     732        1168 :   case AMDGPU::V_MAX_F32_e64:
     733             :   case AMDGPU::V_MAX_F16_e64:
     734             :   case AMDGPU::V_MAX_F64:
     735             :   case AMDGPU::V_PK_MAX_F16: {
     736        2336 :     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
     737             :       return nullptr;
     738             : 
     739             :     // Make sure sources are identical.
     740             :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     741             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     742         717 :     if (!Src0->isReg() || !Src1->isReg() ||
     743         713 :         Src0->getReg() != Src1->getReg() ||
     744         714 :         Src0->getSubReg() != Src1->getSubReg() ||
     745             :         Src0->getSubReg() != AMDGPU::NoSubRegister)
     746             :       return nullptr;
     747             : 
     748             :     // Can't fold up if we have modifiers.
     749         355 :     if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     750             :       return nullptr;
     751             : 
     752             :     unsigned Src0Mods
     753         708 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
     754             :     unsigned Src1Mods
     755         354 :       = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
     756             : 
     757             :     // Having a 0 op_sel_hi would require swizzling the output in the source
     758             :     // instruction, which we can't do.
     759         354 :     unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
     760         354 :     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
     761             :       return nullptr;
     762             :     return Src0;
     763             :   }
     764             :   default:
     765             :     return nullptr;
     766             :   }
     767             : }
     768             : 
     769             : // We obviously have multiple uses in a clamp since the register is used twice
     770             : // in the same instruction.
     771         317 : static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
     772             :   int Count = 0;
     773         634 :   for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
     774         634 :        I != E; ++I) {
     775         340 :     if (++Count > 1)
     776          23 :       return false;
     777             :   }
     778             : 
     779         294 :   return true;
     780             : }
     781             : 
     782             : // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
     783      468116 : bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
     784      468116 :   const MachineOperand *ClampSrc = isClamp(MI);
     785      468116 :   if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
     786             :     return false;
     787             : 
     788         262 :   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
     789             : 
     790             :   // The type of clamp must be compatible.
     791         524 :   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
     792             :     return false;
     793             : 
     794         162 :   MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
     795         162 :   if (!DefClamp)
     796             :     return false;
     797             : 
     798             :   DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
     799             : 
     800             :   // Clamp is applied after omod, so it is OK if omod is set.
     801             :   DefClamp->setImm(1);
     802         162 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     803         162 :   MI.eraseFromParent();
     804         162 :   return true;
     805             : }
     806             : 
     807          29 : static int getOModValue(unsigned Opc, int64_t Val) {
     808          29 :   switch (Opc) {
     809          28 :   case AMDGPU::V_MUL_F32_e64: {
     810          28 :     switch (static_cast<uint32_t>(Val)) {
     811             :     case 0x3f000000: // 0.5
     812             :       return SIOutMods::DIV2;
     813           0 :     case 0x40000000: // 2.0
     814           0 :       return SIOutMods::MUL2;
     815           8 :     case 0x40800000: // 4.0
     816           8 :       return SIOutMods::MUL4;
     817           0 :     default:
     818           0 :       return SIOutMods::NONE;
     819             :     }
     820             :   }
     821           1 :   case AMDGPU::V_MUL_F16_e64: {
     822           1 :     switch (static_cast<uint16_t>(Val)) {
     823             :     case 0x3800: // 0.5
     824             :       return SIOutMods::DIV2;
     825           0 :     case 0x4000: // 2.0
     826           0 :       return SIOutMods::MUL2;
     827           0 :     case 0x4400: // 4.0
     828           0 :       return SIOutMods::MUL4;
     829           0 :     default:
     830           0 :       return SIOutMods::NONE;
     831             :     }
     832             :   }
     833           0 :   default:
     834           0 :     llvm_unreachable("invalid mul opcode");
     835             :   }
     836             : }
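
// A minimal standalone sketch (not part of SIFoldOperands.cpp) confirming the magic
// numbers matched in getOModValue() above: 0x3f000000, 0x40000000 and 0x40800000 are
// the IEEE-754 single-precision bit patterns of 0.5, 2.0 and 4.0, the only multipliers
// the output modifier (omod) can encode.
#include <cassert>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));  // bit-for-bit reinterpretation
  return F;
}

int main() {
  assert(bitsToFloat(0x3f000000u) == 0.5f);  // SIOutMods::DIV2
  assert(bitsToFloat(0x40000000u) == 2.0f);  // SIOutMods::MUL2
  assert(bitsToFloat(0x40800000u) == 4.0f);  // SIOutMods::MUL4
  return 0;
}
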
     837             : 
     838             : // FIXME: Does this really not support denormals with f16?
     839             : // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
     840             : // handled, so will anything other than that break?
     841             : std::pair<const MachineOperand *, int>
     842         500 : SIFoldOperands::isOMod(const MachineInstr &MI) const {
     843         500 :   unsigned Op = MI.getOpcode();
     844         500 :   switch (Op) {
     845          39 :   case AMDGPU::V_MUL_F32_e64:
     846             :   case AMDGPU::V_MUL_F16_e64: {
     847             :     // If output denormals are enabled, omod is ignored.
     848          39 :     if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
     849           3 :         (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
     850           6 :       return std::make_pair(nullptr, SIOutMods::NONE);
     851             : 
     852             :     const MachineOperand *RegOp = nullptr;
     853             :     const MachineOperand *ImmOp = nullptr;
     854          33 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     855             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     856          33 :     if (Src0->isImm()) {
     857             :       ImmOp = Src0;
     858             :       RegOp = Src1;
     859          33 :     } else if (Src1->isImm()) {
     860             :       ImmOp = Src1;
     861             :       RegOp = Src0;
     862             :     } else
     863           4 :       return std::make_pair(nullptr, SIOutMods::NONE);
     864             : 
     865          29 :     int OMod = getOModValue(Op, ImmOp->getImm());
     866          29 :     if (OMod == SIOutMods::NONE ||
     867          54 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
     868          50 :         TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
     869          79 :         TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
     870          25 :         TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
     871           4 :       return std::make_pair(nullptr, SIOutMods::NONE);
     872             : 
     873             :     return std::make_pair(RegOp, OMod);
     874             :   }
     875          99 :   case AMDGPU::V_ADD_F32_e64:
     876             :   case AMDGPU::V_ADD_F16_e64: {
     877             :     // If output denormals are enabled, omod is ignored.
     878          99 :     if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
     879           8 :         (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
     880          18 :       return std::make_pair(nullptr, SIOutMods::NONE);
     881             : 
     882             :     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
     883          81 :     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
     884             :     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
     885             : 
     886         106 :     if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
     887          25 :         Src0->getSubReg() == Src1->getSubReg() &&
     888          42 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
     889          30 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
     890         105 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
     891          11 :         !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
     892          11 :       return std::make_pair(Src0, SIOutMods::MUL2);
     893             : 
     894          70 :     return std::make_pair(nullptr, SIOutMods::NONE);
     895             :   }
     896         362 :   default:
     897         362 :     return std::make_pair(nullptr, SIOutMods::NONE);
     898             :   }
     899             : }
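
// A minimal standalone sketch (not part of SIFoldOperands.cpp) of the identity that
// lets isOMod() treat v_add_f32 x, x as a multiply by 2.0: for IEEE single precision,
// x + x and 2.0 * x round identically, so the add can be absorbed into an omod MUL2
// on the defining instruction. The input value below is only illustrative.
#include <cassert>

int main() {
  float X = 1.25f;
  assert(X + X == 2.0f * X);  // fadd x, x is equivalent to fmul x, 2.0
  return 0;
}
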
     900             : 
     901             : // FIXME: Does this need to check IEEE bit on function?
     902         500 : bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
     903             :   const MachineOperand *RegOp;
     904             :   int OMod;
     905        1000 :   std::tie(RegOp, OMod) = isOMod(MI);
     906          72 :   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
     907         536 :       RegOp->getSubReg() != AMDGPU::NoSubRegister ||
     908          36 :       !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
     909             :     return false;
     910             : 
     911          32 :   MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
     912          32 :   MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
     913          32 :   if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
     914             :     return false;
     915             : 
     916             :   // Clamp is applied after omod. If the source already has clamp set, don't
     917             :   // fold it.
     918          26 :   if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
     919             :     return false;
     920             : 
     921             :   DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
     922             : 
     923          18 :   DefOMod->setImm(OMod);
     924          18 :   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
     925          18 :   MI.eraseFromParent();
     926          18 :   return true;
     927             : }
     928             : 
     929       32733 : bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
     930       32733 :   if (skipFunction(MF.getFunction()))
     931             :     return false;
     932             : 
     933       32729 :   MRI = &MF.getRegInfo();
     934       32729 :   ST = &MF.getSubtarget<SISubtarget>();
     935       32729 :   TII = ST->getInstrInfo();
     936       32729 :   TRI = &TII->getRegisterInfo();
     937             : 
     938       32729 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     939             : 
     940             :   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
     941             :   // correctly handle signed zeros.
     942             :   //
     943             :   // TODO: Check nsz on instructions when fast math flags are preserved to MI
     944             :   // level.
     945       32729 :   bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
     946             : 
     947      171843 :   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     948             :     MachineBasicBlock::iterator I, Next;
     949      832421 :     for (I = MBB->begin(); I != MBB->end(); I = Next) {
     950             :       Next = std::next(I);
     951             :       MachineInstr &MI = *I;
     952             : 
     953      795593 :       tryFoldInst(TII, &MI);
     954             : 
     955     1263727 :       if (!TII->isFoldableCopy(MI)) {
     956      468134 :         if (IsIEEEMode || !tryFoldOMod(MI))
     957      468116 :           tryFoldClamp(MI);
     958      468134 :         continue;
     959             :       }
     960             : 
     961      327459 :       MachineOperand &OpToFold = MI.getOperand(1);
     962      327459 :       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
     963             : 
     964             :       // FIXME: We could also be folding things like TargetIndexes.
     965      238694 :       if (!FoldingImm && !OpToFold.isReg())
     966           0 :         continue;
     967             : 
     968      614720 :       if (OpToFold.isReg() &&
     969      238694 :           !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
     970       48567 :         continue;
     971             : 
     972             :       // Prevent folding operands backwards in the function. For example,
     973             :       // the COPY opcode must not be replaced by 1 in this example:
     974             :       //
     975             :       //    %3 = COPY %vgpr0; VGPR_32:%3
     976             :       //    ...
     977             :       //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
     978             :       MachineOperand &Dst = MI.getOperand(0);
     979      571360 :       if (Dst.isReg() &&
     980      278892 :           !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
     981       13576 :         continue;
     982             : 
     983      265316 :       foldInstOperand(MI, OpToFold);
     984             :     }
     985             :   }
     986       32729 :   return false;
     987             : }

Generated by: LCOV version 1.13