LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertSkips.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 126 132 95.5 %
Date: 2018-02-23 15:42:53 Functions: 13 14 92.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
       11             : /// \brief This pass inserts branches on the 0 exec mask over divergent
       12             : /// branches when it's expected that jumping over the untaken control flow will
      13             : /// be cheaper than having every workitem no-op through it.
      14             : //
      15             : //===----------------------------------------------------------------------===//
      16             : 
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "SIInstrInfo.h"
      20             : #include "SIMachineFunctionInfo.h"
      21             : #include "llvm/ADT/SmallVector.h"
      22             : #include "llvm/ADT/StringRef.h"
      23             : #include "llvm/CodeGen/MachineBasicBlock.h"
      24             : #include "llvm/CodeGen/MachineFunction.h"
      25             : #include "llvm/CodeGen/MachineFunctionPass.h"
      26             : #include "llvm/CodeGen/MachineInstr.h"
      27             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      28             : #include "llvm/CodeGen/MachineOperand.h"
      29             : #include "llvm/IR/CallingConv.h"
      30             : #include "llvm/IR/DebugLoc.h"
      31             : #include "llvm/MC/MCAsmInfo.h"
      32             : #include "llvm/Pass.h"
      33             : #include "llvm/Support/CommandLine.h"
      34             : #include "llvm/Target/TargetMachine.h"
      35             : #include <cassert>
      36             : #include <cstdint>
      37             : #include <iterator>
      38             : 
      39             : using namespace llvm;
      40             : 
      41             : #define DEBUG_TYPE "si-insert-skips"
      42             : 
// Command-line override for the profitability threshold: regions of divergent
// control flow shorter than this many instructions are assumed cheaper to
// no-op through than to branch over. Copied into SkipThreshold per function.
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
      47             : 
      48             : namespace {
      49             : 
/// Machine function pass that inserts s_cbranch_execz skips over divergent
/// regions when the skipped region exceeds SkipThreshold, and lowers the
/// SI_KILL_* terminator pseudos.
class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr; // Set from the subtarget each run.
  const SIInstrInfo *TII = nullptr;    // Set from the subtarget each run.
  unsigned SkipThreshold = 0;          // Instruction-count profitability limit.

  // Estimate whether the code between \p From and \p To is long (or unsafe)
  // enough that branching over it beats predicated execution.
  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  // Insert an early-exit (null export + endpgm) path after a kill; returns
  // true if any code was inserted.
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  // Lower a SI_KILL_*_TERMINATOR pseudo into real EXEC-mask updates.
  void kill(MachineInstr &MI);

  // Create and insert a new block immediately after \p MBB.
  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  // Insert an S_CBRANCH_EXECZ over the region guarded by a SI_MASK_BRANCH;
  // returns true if the branch was inserted.
  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
      83             : 
      84             : } // end anonymous namespace
      85             : 
// Pass identification: the *address* of ID uniquely identifies the pass.
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

// Exported handle so target pass setup can reference this pass by ID.
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
      92             : 
      93             : static bool opcodeEmitsNoInsts(unsigned Opc) {
      94             :   switch (Opc) {
      95             :   case TargetOpcode::IMPLICIT_DEF:
      96             :   case TargetOpcode::KILL:
      97             :   case TargetOpcode::BUNDLE:
      98             :   case TargetOpcode::CFI_INSTRUCTION:
      99             :   case TargetOpcode::EH_LABEL:
     100             :   case TargetOpcode::GC_LABEL:
     101             :   case TargetOpcode::DBG_VALUE:
     102             :     return true;
     103             :   default:
     104             :     return false;
     105             :   }
     106             : }
     107             : 
// Walk the blocks in layout order from \p From until \p To (exclusive) and
// decide whether a skip branch over this region is warranted. Returns true
// when either (a) the estimated instruction count reaches SkipThreshold, or
// (b) the region contains an instruction that must not be executed with
// EXEC = 0 (VCC branches, readlane/readfirstlane). Returns false for blocks
// with no successors and for regions that stay under the threshold.
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  // Iterate blocks in function layout order; stop at the end of the function
  // or when the destination block is reached.
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      // Meta instructions emit no code and are free.
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      // V_READFIRSTLANE/V_READLANE destination register may be used as operand
      // by some SALU instruction. If exec mask is zero vector instruction
      // defining the register that is used by the scalar one is not executed
      // and scalar instruction will operate on undefined data. For
      // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
      if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
          (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
        return true;
      }

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // inlineasm length estimate is number of bytes assuming the longest
        // instruction.
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
     164             : 
// After a kill in a pixel shader, all lanes may be dead (EXEC == 0). If the
// remainder of the function is long enough to be worth skipping, insert a new
// block that exports to the NULL target and terminates the wavefront, guarded
// by an EXECNZ branch to \p NextBB for the live case. Returns true if the
// early-exit path was inserted.
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  // Only pixel shaders need the null export; bail out if the rest of the
  // function is too short to bother skipping.
  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
     199             : 
// Lower a SI_KILL_*_TERMINATOR pseudo in place (just before \p MI, which the
// caller erases afterwards):
//  - SI_KILL_F32_COND_IMM_TERMINATOR becomes a V_CMPX compare that writes the
//    EXEC mask directly, disabling lanes that fail the condition.
//  - SI_KILL_I1_TERMINATOR masks EXEC with (or against) an i1 operand, or
//    clears EXEC entirely for a matching immediate.
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

    MachineInstr *NewMI;
    // A VGPR operand can use the shorter VOP32 encoding; otherwise fall back
    // to the e64 form with explicit modifier/omod operands.
    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      NewMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      NewMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0);  // omod
    }
    // Clear isRenamable bit if new opcode requires it to be 0.
    if (NewMI->hasExtraSrcRegAllocReq())
      for (MachineOperand &NewMO : NewMI->uses())
        if (NewMO.isReg() && NewMO.isUse())
          NewMO.setIsRenamable(false);
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      // EXEC = 0 kills every lane; otherwise the kill is a no-op.
      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    // Non-immediate: keep lanes where the i1 operand disagrees with KillVal.
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}
     314             : 
     315           4 : MachineBasicBlock *SIInsertSkips::insertSkipBlock(
     316             :   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
     317           4 :   MachineFunction *MF = MBB.getParent();
     318             : 
     319           4 :   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
     320             :   MachineFunction::iterator MBBI(MBB);
     321             :   ++MBBI;
     322             : 
     323             :   MF->insert(MBBI, SkipBB);
     324           4 :   MBB.addSuccessor(SkipBB);
     325             : 
     326           4 :   return SkipBB;
     327             : }
     328             : 
     329             : // Returns true if a branch over the block was inserted.
     330         392 : bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
     331             :                                    MachineBasicBlock &SrcMBB) {
     332         392 :   MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
     333             : 
     334         392 :   if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
     335             :     return false;
     336             : 
     337             :   const DebugLoc &DL = MI.getDebugLoc();
     338          81 :   MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
     339             : 
     340         243 :   BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
     341             :     .addMBB(DestBB);
     342             : 
     343          81 :   return true;
     344             : }
     345             : 
// Pass entry point: walk every block, tracking divergent-branch nesting via
// ExecBranchStack, and lower SI_MASK_BRANCH, SI_KILL_* and
// SI_RETURN_TO_EPILOG pseudos, deleting redundant S_BRANCHes along the way.
// Returns true if the function was modified.
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  // NextBB is recomputed inside the loop because skipIfDead may insert a
  // block after the current one.
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    // Next is captured before any erase so iteration survives removal of MI.
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the given unconditional branch when a skip block has been
          // inserted after the current one and let skip the two instructions
          // performing the kill if the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          // skipIfDead may have created a block after this one; refresh the
          // block iterators so it is visited.
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}

Generated by: LCOV version 1.13