LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIShrinkInstructions.cpp
Test: llvm-toolchain.info      Lines: 155 of 159 hit (97.5 %)
Date: 2018-10-20 13:21:21      Functions: 13 of 13 hit (100.0 %)

          Line data    Source code
       1             : //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : /// The pass tries to use the 32-bit encoding for instructions when possible.
       9             : //===----------------------------------------------------------------------===//
      10             : //
      11             : 
      12             : #include "AMDGPU.h"
      13             : #include "AMDGPUSubtarget.h"
      14             : #include "SIInstrInfo.h"
      15             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      16             : #include "llvm/ADT/Statistic.h"
      17             : #include "llvm/CodeGen/MachineFunctionPass.h"
      18             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      19             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      20             : #include "llvm/IR/Constants.h"
      21             : #include "llvm/IR/Function.h"
      22             : #include "llvm/IR/LLVMContext.h"
      23             : #include "llvm/Support/Debug.h"
      24             : #include "llvm/Support/raw_ostream.h"
      25             : #include "llvm/Target/TargetMachine.h"
      26             : 
      27             : #define DEBUG_TYPE "si-shrink-instructions"
      28             : 
      29             : STATISTIC(NumInstructionsShrunk,
       30             :           "Number of 64-bit instructions reduced to 32-bit.");
      31             : STATISTIC(NumLiteralConstantsFolded,
      32             :           "Number of literal constants folded into 32-bit instructions.");
      33             : 
      34             : using namespace llvm;
      35             : 
      36             : namespace {
      37             : 
      38             : class SIShrinkInstructions : public MachineFunctionPass {
      39             : public:
      40             :   static char ID;
      41             : 
      42             : public:
      43        3888 :   SIShrinkInstructions() : MachineFunctionPass(ID) {
      44             :   }
      45             : 
      46             :   bool runOnMachineFunction(MachineFunction &MF) override;
      47             : 
      48        3867 :   StringRef getPassName() const override { return "SI Shrink Instructions"; }
      49             : 
      50        3867 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      51        3867 :     AU.setPreservesCFG();
      52        3867 :     MachineFunctionPass::getAnalysisUsage(AU);
      53        3867 :   }
      54             : };
      55             : 
      56             : } // End anonymous namespace.
      57             : 
      58      199024 : INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
      59             :                 "SI Shrink Instructions", false, false)
      60             : 
      61             : char SIShrinkInstructions::ID = 0;
      62             : 
      63        3883 : FunctionPass *llvm::createSIShrinkInstructionsPass() {
      64        3883 :   return new SIShrinkInstructions();
      65             : }
      66             : 
      67             : /// This function checks \p MI for operands defined by a move immediate
      68             : /// instruction and then folds the literal constant into the instruction if it
       69             : /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
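                      : ///
                      : /// A hypothetical before/after sketch (illustrative operands, not from the
                      : /// test suite):
                      : ///   %1 = V_MOV_B32_e32 0x12345678
                      : ///   %2 = V_ADD_F32_e32 %1, %0
                      : /// becomes
                      : ///   %2 = V_ADD_F32_e32 0x12345678, %0
                      : /// and the now-dead V_MOV_B32 is erased.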
      70       50585 : static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
      71             :                            MachineRegisterInfo &MRI, bool TryToCommute = true) {
      72             :   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
      73             : 
      74       50585 :   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
      75             : 
      76             :   // Try to fold Src0
      77       50585 :   MachineOperand &Src0 = MI.getOperand(Src0Idx);
      78       50585 :   if (Src0.isReg()) {
      79       32522 :     unsigned Reg = Src0.getReg();
      80       32522 :     if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      81       11379 :       MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      82       22758 :       if (Def && Def->isMoveImmediate()) {
      83         868 :         MachineOperand &MovSrc = Def->getOperand(1);
      84             :         bool ConstantFolded = false;
      85             : 
      86         868 :         if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
      87           0 :                                isUInt<32>(MovSrc.getImm()))) {
      88             :           // It's possible to have only one component of a super-reg defined by
      89             :           // a single mov, so we need to clear any subregister flag.
      90             :           Src0.setSubReg(0);
      91         856 :           Src0.ChangeToImmediate(MovSrc.getImm());
      92             :           ConstantFolded = true;
      93          12 :         } else if (MovSrc.isFI()) {
      94             :           Src0.setSubReg(0);
      95          12 :           Src0.ChangeToFrameIndex(MovSrc.getIndex());
      96             :           ConstantFolded = true;
      97             :         }
      98             : 
      99             :         if (ConstantFolded) {
     100             :           assert(MRI.use_empty(Reg));
     101         868 :           Def->eraseFromParent();
     102             :           ++NumLiteralConstantsFolded;
     103         868 :           return true;
     104             :         }
     105             :       }
     106             :     }
     107             :   }
     108             : 
     109             :   // We have failed to fold src0, so commute the instruction and try again.
     110       91753 :   if (TryToCommute && MI.isCommutable()) {
     111       31718 :     if (TII->commuteInstruction(MI)) {
     112        7686 :       if (foldImmediates(MI, TII, MRI, false))
     113             :         return true;
     114             : 
     115             :       // Commute back.
     116        7681 :       TII->commuteInstruction(MI);
     117             :     }
     118             :   }
     119             : 
     120             :   return false;
     121             : }
     122             : 
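                      : /// \returns true if \p Src is a signed 16-bit immediate that is not already
                      : /// free as an inline constant (and so would benefit from a simm16 encoding).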
     123       26529 : static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
     124       68224 :   return isInt<16>(Src.getImm()) &&
     125       15166 :     !TII->isInlineConstant(*Src.getParent(),
     126       26529 :                            Src.getParent()->getOperandNo(&Src));
     127             : }
     128             : 
     129          36 : static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
     130          56 :   return isUInt<16>(Src.getImm()) &&
     131          20 :     !TII->isInlineConstant(*Src.getParent(),
     132          36 :                            Src.getParent()->getOperandNo(&Src));
     133             : }
     134             : 
     135         329 : static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
     136             :                                  const MachineOperand &Src,
     137             :                                  bool &IsUnsigned) {
     138         658 :   if (isInt<16>(Src.getImm())) {
     139         313 :     IsUnsigned = false;
     140         313 :     return !TII->isInlineConstant(Src);
     141             :   }
     142             : 
     143          16 :   if (isUInt<16>(Src.getImm())) {
     144           8 :     IsUnsigned = true;
     145           8 :     return !TII->isInlineConstant(Src);
     146             :   }
     147             : 
     148             :   return false;
     149             : }
     150             : 
     151             : /// \returns true if the constant in \p Src should be replaced with a bitreverse
     152             : /// of an inline immediate.
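                      : ///
                      : /// For example, the literal 0x80000000 equals reverseBits(1), and 1 is an
                      : /// inline immediate, so the value can be produced by bit-reversing an inline
                      : /// operand instead of encoding a 32-bit literal.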
     153       34956 : static bool isReverseInlineImm(const SIInstrInfo *TII,
     154             :                                const MachineOperand &Src,
     155             :                                int32_t &ReverseImm) {
     156       69912 :   if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
     157       21682 :     return false;
     158             : 
     159       13274 :   ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
     160       13274 :   return ReverseImm >= -16 && ReverseImm <= 64;
     161             : }
     162             : 
      163             : /// Copy the implicit register operands of \p MI that are not part of its
      164             : /// instruction definition over to \p NewMI.
     165       42899 : static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
     166             :                                  const MachineInstr &MI) {
     167       42899 :   for (unsigned i = MI.getDesc().getNumOperands() +
     168       42899 :          MI.getDesc().getNumImplicitUses() +
     169       42899 :          MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
     170       42899 :        i != e; ++i) {
     171           0 :     const MachineOperand &MO = MI.getOperand(i);
     172           0 :     if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
     173           0 :       NewMI.addOperand(MF, MO);
     174             :   }
     175       42899 : }
     176             : 
     177         505 : static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
     178             :   // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
     179             :   // get constants on the RHS.
     180        1010 :   if (!MI.getOperand(0).isReg())
     181          20 :     TII->commuteInstruction(MI, false, 0, 1);
     182             : 
     183         505 :   const MachineOperand &Src1 = MI.getOperand(1);
     184         505 :   if (!Src1.isImm())
     185             :     return;
     186             : 
     187         459 :   int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
     188         459 :   if (SOPKOpc == -1)
     189             :     return;
     190             : 
     191             :   // eq/ne is special because the imm16 can be treated as signed or unsigned,
      192             :   // and are initially selected as the unsigned versions.
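                      :   // For example (illustrative): an imm of 0xFFFF fits isUInt<16> but not
                      :   // isInt<16>, so the compare must stay in its _U32 form, while -5 fits
                      :   // isInt<16> and can use the _I32 form.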
     193         443 :   if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
     194             :     bool HasUImm;
     195         329 :     if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
     196          34 :       if (!HasUImm) {
     197          26 :         SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
     198             :           AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
     199             :       }
     200             : 
     201          34 :       MI.setDesc(TII->get(SOPKOpc));
     202             :     }
     203             : 
     204             :     return;
     205             :   }
     206             : 
     207         114 :   const MCInstrDesc &NewDesc = TII->get(SOPKOpc);
     208             : 
     209         228 :   if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
     210          78 :       (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
     211             :     MI.setDesc(NewDesc);
     212             :   }
     213             : }
     214             : 
     215       39296 : bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
     216       39296 :   if (skipFunction(MF.getFunction()))
     217             :     return false;
     218             : 
     219       39289 :   MachineRegisterInfo &MRI = MF.getRegInfo();
     220       39289 :   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     221       39289 :   const SIInstrInfo *TII = ST.getInstrInfo();
     222             : 
     223             :   std::vector<unsigned> I1Defs;
     224             : 
     225             :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
     226       83407 :                                                   BI != BE; ++BI) {
     227             : 
     228             :     MachineBasicBlock &MBB = *BI;
     229             :     MachineBasicBlock::iterator I, Next;
     230      830348 :     for (I = MBB.begin(); I != MBB.end(); I = Next) {
     231      786230 :       Next = std::next(I);
     232             :       MachineInstr &MI = *I;
     233             : 
     234     1572460 :       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
     235             :         // If this has a literal constant source that is the same as the
     236             :         // reversed bits of an inline immediate, replace with a bitreverse of
     237             :         // that constant. This saves 4 bytes in the common case of materializing
     238             :         // sign bits.
     239             : 
     240             :         // Test if we are after regalloc. We only want to do this after any
     241             :         // optimizations happen because this will confuse them.
     242             :         // XXX - not exactly a check for post-regalloc run.
     243       60444 :         MachineOperand &Src = MI.getOperand(1);
     244       60444 :         if (Src.isImm() &&
     245       17192 :             TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
     246             :           int32_t ReverseImm;
     247       10248 :           if (isReverseInlineImm(TII, Src, ReverseImm)) {
     248          79 :             MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
     249          79 :             Src.setImm(ReverseImm);
     250          79 :             continue;
     251             :           }
     252             :         }
     253             :       }
     254             : 
     255             :       // Combine adjacent s_nops to use the immediate operand encoding how long
     256             :       // to wait.
     257             :       //
     258             :       // s_nop N
     259             :       // s_nop M
     260             :       //  =>
     261             :       // s_nop (N + M)
     262      786151 :       if (MI.getOpcode() == AMDGPU::S_NOP &&
     263      787254 :           Next != MBB.end() &&
     264        1103 :           (*Next).getOpcode() == AMDGPU::S_NOP) {
     265             : 
     266             :         MachineInstr &NextMI = *Next;
     267             :         // The instruction encodes the amount to wait with an offset of 1,
     268             :         // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
     269             :         // after adding.
     270         213 :         uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
     271         213 :         uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;
     272             : 
     273             :         // Make sure we don't overflow the bounds.
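                      :         // (An s_nop can wait at most 8 cycles, i.e. imm values 0-7.)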
     274         213 :         if (Nop0 + Nop1 <= 8) {
     275         213 :           NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
     276         213 :           MI.eraseFromParent();
     277             :         }
     278             : 
     279         213 :         continue;
     280             :       }
     281             : 
     282             :       // FIXME: We also need to consider movs of constant operands since
     283             :       // immediate operands are not folded if they have more than one use, and
     284             :       // the operand folding pass is unaware if the immediate will be free since
     285             :       // it won't know if the src == dest constraint will end up being
     286             :       // satisfied.
     287      785938 :       if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
     288             :           MI.getOpcode() == AMDGPU::S_MUL_I32) {
     289        3124 :         const MachineOperand *Dest = &MI.getOperand(0);
     290             :         MachineOperand *Src0 = &MI.getOperand(1);
     291             :         MachineOperand *Src1 = &MI.getOperand(2);
     292             : 
     293        3124 :         if (!Src0->isReg() && Src1->isReg()) {
     294           4 :           if (TII->commuteInstruction(MI, false, 1, 2))
     295             :             std::swap(Src0, Src1);
     296             :         }
     297             : 
     298             :         // FIXME: This could work better if hints worked with subregisters. If
     299             :         // we have a vector add of a constant, we usually don't get the correct
     300             :         // allocation due to the subregister usage.
     301        6248 :         if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
     302             :             Src0->isReg()) {
     303        1529 :           MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
     304        3058 :           MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
     305             :           continue;
     306             :         }
     307             : 
     308        1595 :         if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
     309         628 :           if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
     310         170 :             unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
     311             :               AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
     312             : 
     313          85 :             MI.setDesc(TII->get(Opc));
     314          85 :             MI.tieOperands(0, 1);
     315             :           }
     316             :         }
     317             :       }
     318             : 
     319             :       // Try to use s_cmpk_*
     320      784409 :       if (MI.isCompare() && TII->isSOPC(MI)) {
     321         505 :         shrinkScalarCompare(TII, MI);
     322         505 :         continue;
     323             :       }
     324             : 
     325             :       // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
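                      :       // For example (illustrative): s_mov_b32 s0, 0x1234 -> s_movk_i32 s0, 0x1234,
                      :       // dropping the extra 32-bit literal dword from the encoding.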
     326     1567808 :       if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
     327       69065 :         const MachineOperand &Dst = MI.getOperand(0);
     328             :         MachineOperand &Src = MI.getOperand(1);
     329             : 
     330       69065 :         if (Src.isImm() &&
     331       48218 :             TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
     332             :           int32_t ReverseImm;
     333       26090 :           if (isKImmOperand(TII, Src))
     334        1382 :             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
     335       24708 :           else if (isReverseInlineImm(TII, Src, ReverseImm)) {
     336         104 :             MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
     337         104 :             Src.setImm(ReverseImm);
     338             :           }
     339             :         }
     340             : 
     341       69065 :         continue;
     342             :       }
     343             : 
     344      714839 :       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
     345             :         continue;
     346             : 
     347       71404 :       if (!TII->canShrink(MI, MRI)) {
     348             :         // Try commuting the instruction and see if that enables us to shrink
     349             :         // it.
     350       38450 :         if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
     351       16657 :             !TII->canShrink(MI, MRI))
     352        9140 :           continue;
     353             :       }
     354             : 
     355             :       // getVOPe32 could be -1 here if we started with an instruction that had
     356             :       // a 32-bit encoding and then commuted it to an instruction that did not.
     357      124528 :       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
     358             :         continue;
     359             : 
     360       62264 :       int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
     361             : 
     362      124528 :       if (TII->isVOPC(Op32)) {
     363        6483 :         unsigned DstReg = MI.getOperand(0).getReg();
     364        6483 :         if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
     365             :           // VOPC instructions can only write to the VCC register. We can't
     366             :           // force them to use VCC here, because this is only one register and
     367             :           // cannot deal with sequences which would require multiple copies of
     368             :           // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
     369             :           //
     370             :           // So, instead of forcing the instruction to write to VCC, we provide
     371             :           // a hint to the register allocator to use VCC and then we will run
     372             :           // this pass again after RA and shrink it if it outputs to VCC.
     373        3212 :           MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
     374        3212 :           continue;
     375             :         }
     376        3271 :         if (DstReg != AMDGPU::VCC)
     377             :           continue;
     378             :       }
     379             : 
     380       58010 :       if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
     381             :         // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
     382             :         // instructions.
     383             :         const MachineOperand *Src2 =
     384        6250 :             TII->getNamedOperand(MI, AMDGPU::OpName::src2);
     385        6250 :         if (!Src2->isReg())
     386             :           continue;
     387        6250 :         unsigned SReg = Src2->getReg();
     388        6250 :         if (TargetRegisterInfo::isVirtualRegister(SReg)) {
     389        3103 :           MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
     390        3103 :           continue;
     391             :         }
     392        3147 :         if (SReg != AMDGPU::VCC)
     393             :           continue;
     394             :       }
     395             : 
     396             :       // Check for the bool flag output for instructions like V_ADD_I32_e64.
     397       54018 :       const MachineOperand *SDst = TII->getNamedOperand(MI,
     398             :                                                         AMDGPU::OpName::sdst);
     399             : 
     400             :       // Check the carry-in operand for v_addc_u32_e64.
     401       54018 :       const MachineOperand *Src2 = TII->getNamedOperand(MI,
     402             :                                                         AMDGPU::OpName::src2);
     403             : 
     404       54018 :       if (SDst) {
     405       23632 :         if (SDst->getReg() != AMDGPU::VCC) {
     406       11008 :           if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
     407       10673 :             MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
     408       11008 :           continue;
     409             :         }
     410             : 
     411             :         // All of the instructions with carry outs also have an SGPR input in
     412             :         // src2.
     413       12624 :         if (Src2 && Src2->getReg() != AMDGPU::VCC) {
     414         111 :           if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
     415           1 :             MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
     416             : 
     417         111 :           continue;
     418             :         }
     419             :       }
     420             : 
     421             :       // We can shrink this instruction
     422             :       LLVM_DEBUG(dbgs() << "Shrinking " << MI);
     423             : 
     424       42899 :       MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
     425             :       ++NumInstructionsShrunk;
     426             : 
     427             :       // Copy extra operands not present in the instruction definition.
     428       42899 :       copyExtraImplicitOps(*Inst32, MF, MI);
     429             : 
     430       42899 :       MI.eraseFromParent();
     431       42899 :       foldImmediates(*Inst32, TII, MRI);
     432             : 
     433             :       LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
     434             :     }
     435             :   }
     436             :   return false;
     437             : }

Generated by: LCOV version 1.13