LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertSkips.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 128 128 100.0 %
Date: 2017-09-14 15:23:50 Functions: 13 14 92.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief This pass inserts branches on the 0 exec mask over divergent branches
      12             : /// when it's expected that jumping over the untaken control flow will
      13             : /// be cheaper than having every workitem no-op through it.
      14             : //
      15             : //===----------------------------------------------------------------------===//
      16             : 
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "SIInstrInfo.h"
      20             : #include "SIMachineFunctionInfo.h"
      21             : #include "llvm/ADT/SmallVector.h"
      22             : #include "llvm/ADT/StringRef.h"
      23             : #include "llvm/CodeGen/MachineBasicBlock.h"
      24             : #include "llvm/CodeGen/MachineFunction.h"
      25             : #include "llvm/CodeGen/MachineFunctionPass.h"
      26             : #include "llvm/CodeGen/MachineInstr.h"
      27             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      28             : #include "llvm/CodeGen/MachineOperand.h"
      29             : #include "llvm/IR/CallingConv.h"
      30             : #include "llvm/IR/DebugLoc.h"
      31             : #include "llvm/MC/MCAsmInfo.h"
      32             : #include "llvm/Pass.h"
      33             : #include "llvm/Support/CommandLine.h"
      34             : #include "llvm/Target/TargetMachine.h"
      35             : #include <cassert>
      36             : #include <cstdint>
      37             : #include <iterator>
      38             : 
      39             : using namespace llvm;
      40             : 
      41             : #define DEBUG_TYPE "si-insert-skips"
                       // DEBUG_TYPE is also handed to INITIALIZE_PASS as the pass argument string.
      42             : 
      43       72306 : static cl::opt<unsigned> SkipThresholdFlag(
                         // Hidden option "amdgpu-skip-threshold", default 12. runOnMachineFunction
                         // copies its value into SkipThreshold, the instruction-count limit that
                         // shouldSkip tests against.
      44             :   "amdgpu-skip-threshold",
      45      216918 :   cl::desc("Number of instructions before jumping over divergent control flow"),
      46      289224 :   cl::init(12), cl::Hidden);
      47             : 
      48             : namespace {
      49             : 
      50        1461 : class SIInsertSkips : public MachineFunctionPass {
                       // Machine function pass that walks every instruction of a function and acts
                       // on SI_MASK_BRANCH, S_BRANCH, SI_KILL_TERMINATOR and SI_RETURN_TO_EPILOG
                       // (see runOnMachineFunction).
      51             : private:
                         // Both pointers stay null until runOnMachineFunction assigns them.
      52             :   const SIRegisterInfo *TRI = nullptr;
      53             :   const SIInstrInfo *TII = nullptr;
                         // Copy of SkipThresholdFlag, refreshed at the start of every run.
      54             :   unsigned SkipThreshold = 0;
      55             : 
                         // True if the blocks laid out from From up to (not including) To contain
                         // an S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ or reach SkipThreshold counted
                         // instructions.
      56             :   bool shouldSkip(const MachineBasicBlock &From,
      57             :                   const MachineBasicBlock &To) const;
      58             : 
                         // For a kill MI: may add a block that exports to the null target and ends
                         // the program, plus an S_CBRANCH_EXECNZ to NextBB. Returns true if it did.
      59             :   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
      60             : 
                         // Emits the exec-mask update for kill instruction MI in front of MI.
      61             :   void kill(MachineInstr &MI);
      62             : 
                         // Creates an empty block directly after MBB in layout order, registers it
                         // as a successor of MBB and returns it.
      63             :   MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
      64             :                                      MachineBasicBlock::iterator I) const;
      65             : 
                         // Inserts an S_CBRANCH_EXECZ after the mask branch MI when shouldSkip
                         // agrees; returns true if the branch was inserted.
      66             :   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
      67             : 
      68             : public:
      69             :   static char ID;
      70             : 
      71        1469 :   SIInsertSkips() : MachineFunctionPass(ID) {}
      72             : 
                         // Entry point of the pass; returns the MadeChange flag it accumulates.
      73             :   bool runOnMachineFunction(MachineFunction &MF) override;
      74             : 
                         // Same description string that is passed to INITIALIZE_PASS.
      75        1461 :   StringRef getPassName() const override {
      76        1461 :     return "SI insert s_cbranch_execz instructions";
      77             :   }
      78             : 
                         // Adds nothing of its own; only forwards to the base class.
      79        1461 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      80        1461 :     MachineFunctionPass::getAnalysisUsage(AU);
      81        1461 :   }
      82             : };
      83             : 
      84             : } // end anonymous namespace
      85             : 
      86             : char SIInsertSkips::ID = 0;
                       // Definition of the static ID declared in the class; the constructor passes
                       // it to MachineFunctionPass and it is re-exported as SIInsertSkipsPassID.
      87             : 
                       // Pass registration: DEBUG_TYPE is the argument string, followed by the same
                       // description that getPassName returns.
                       // NOTE(review): the two trailing `false` values are presumably the macro's
                       // CFG-only and is-analysis flags -- confirm against llvm/PassSupport.h.
      88      312538 : INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
      89             :                 "SI insert s_cbranch_execz instructions", false, false)
      90             : 
                       // Reference to the pass ID, published in namespace llvm.
      91             : char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
      92             : 
      93             : static bool opcodeEmitsNoInsts(unsigned Opc) {
                         // Returns true for the target-independent opcodes listed below and false
                         // for every other opcode. shouldSkip ignores matching instructions when it
                         // counts toward SkipThreshold.
      94             :   switch (Opc) {
      95             :   case TargetOpcode::IMPLICIT_DEF:
      96             :   case TargetOpcode::KILL:
      97             :   case TargetOpcode::BUNDLE:
      98             :   case TargetOpcode::CFI_INSTRUCTION:
      99             :   case TargetOpcode::EH_LABEL:
     100             :   case TargetOpcode::GC_LABEL:
     101             :   case TargetOpcode::DBG_VALUE:
     102             :     return true;
     103             :   default:
     104             :     return false;
     105             :   }
     106             : }
     107             : 
     108         390 : bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
     109             :                                const MachineBasicBlock &To) const {
                         // Decides whether the code laid out between From and To should be jumped
                         // over. Blocks are visited in function layout order, starting at From and
                         // stopping at To or at the end of the function, whichever comes first.
                         // Returns true as soon as an S_CBRANCH_VCCNZ / S_CBRANCH_VCCZ is seen or
                         // the running instruction count reaches SkipThreshold; false otherwise.

                         // A block without successors never requests a skip.
     110         390 :   if (From.succ_empty())
     111             :     return false;
     112             : 
     113         383 :   unsigned NumInstr = 0;
     114         383 :   const MachineFunction *MF = From.getParent();
     115             : 
     116        1149 :   for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
     117         745 :        MBBI != End && MBBI != ToI; ++MBBI) {
     118         437 :     const MachineBasicBlock &MBB = *MBBI;
     119             : 
     120         874 :     for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
     121        5588 :          NumInstr < SkipThreshold && I != E; ++I) {
                             // Opcodes accepted by opcodeEmitsNoInsts do not count.
     122        4911 :       if (opcodeEmitsNoInsts(I->getOpcode()))
     123             :         continue;
     124             : 
     125             :       // FIXME: Since this is required for correctness, this should be inserted
     126             :       // during SILowerControlFlow.
     127             : 
     128             :       // When a uniform loop is inside non-uniform control flow, the branch
     129             :       // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
     130             :       // when EXEC = 0. We should skip the loop lest it becomes infinite.
     131        4768 :       if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
     132        2383 :           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
     133          75 :         return true;
     134             : 
     135        2382 :       if (I->isInlineAsm()) {
     136          13 :         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
     137          13 :         const char *AsmStr = I->getOperand(0).getSymbolName();
     138             : 
     139             :         // inlineasm length estimate is number of bytes assuming the longest
     140             :         // instruction.
                               // Dividing by the maximum instruction length turns that byte
                               // estimate into an instruction count.
     141          13 :         uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
     142          13 :         NumInstr += MaxAsmSize / MAI->getMaxInstLength();
     143             :       } else {
     144        2369 :         ++NumInstr;
     145             :       }
     146             : 
     147        2382 :       if (NumInstr >= SkipThreshold)
     148             :         return true;
     149             :     }
     150             :   }
     151             : 
     152             :   return false;
     153             : }
     154             : 
     155          29 : bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
                         // Called for a kill instruction MI. Returns false without changing
                         // anything unless the function uses the AMDGPU_PS calling convention and
                         // shouldSkip, measured from MI's block to the function's last block,
                         // returns true. Otherwise it creates a block after MI's block that exports
                         // to the null target and ends the program, emits an S_CBRANCH_EXECNZ to
                         // NextBB in MI's block, and returns true.
     156          29 :   MachineBasicBlock &MBB = *MI.getParent();
     157          29 :   MachineFunction *MF = MBB.getParent();
     158             : 
     159          80 :   if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
     160          44 :       !shouldSkip(MBB, MBB.getParent()->back()))
     161             :     return false;
     162             : 
     163          12 :   MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
     164             : 
     165           4 :   const DebugLoc &DL = MI.getDebugLoc();
     166             : 
     167             :   // If the exec mask is non-zero, skip the next two instructions
                         // NOTE(review): this BuildMI overload takes the block by pointer and no
                         // insertion point; presumably it appends at the end of MBB -- confirm.
     168          16 :   BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
     169           4 :     .addMBB(&NextBB);
     170             : 
                         // SkipBB is still empty here, so both instructions below are inserted
                         // before its end, in the order they are built.
     171           4 :   MachineBasicBlock::iterator Insert = SkipBB->begin();
     172             : 
     173             :   // Exec mask is zero: Export to NULL target...
     174          12 :   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
     175           4 :     .addImm(0x09) // V_008DFC_SQ_EXP_NULL
     176           4 :     .addReg(AMDGPU::VGPR0, RegState::Undef)
     177           4 :     .addReg(AMDGPU::VGPR0, RegState::Undef)
     178           4 :     .addReg(AMDGPU::VGPR0, RegState::Undef)
     179           4 :     .addReg(AMDGPU::VGPR0, RegState::Undef)
     180           4 :     .addImm(1)  // vm
     181           4 :     .addImm(0)  // compr
     182           4 :     .addImm(0); // en
     183             : 
     184             :   // ... and terminate wavefront.
     185           8 :   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
     186             : 
     187           4 :   return true;
     188             : }
     189             : 
     190          34 : void SIInsertSkips::kill(MachineInstr &MI) {
                         // Emits the exec-mask update for kill instruction MI immediately before
                         // MI. MI itself is left in place; runOnMachineFunction erases it after
                         // this call. Operand 0 of MI is the value being tested.
     191          34 :   MachineBasicBlock &MBB = *MI.getParent();
     192         102 :   DebugLoc DL = MI.getDebugLoc();
     193          34 :   const MachineOperand &Op = MI.getOperand(0);
     194             : 
                       // The calling-convention check below exists only in builds with assertions.
     195             : #ifndef NDEBUG
     196             :   CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
     197             :   // Kill is only allowed in pixel / geometry shaders.
     198             :   assert(CallConv == CallingConv::AMDGPU_PS ||
     199             :          CallConv == CallingConv::AMDGPU_GS);
     200             : #endif
     201             :   // Clear this thread from the exec mask if the operand is negative.
     202          34 :   if (Op.isImm()) {
     203             :     // Constant operand: Set exec mask to 0 or do nothing
                           // Bit 31 of the immediate is tested; when it is set, EXEC is
                           // overwritten with 0 via S_MOV_B64.
     204          14 :     if (Op.getImm() & 0x80000000) {
     205          42 :       BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
     206          14 :         .addImm(0);
     207             :     }
     208             :   } else {
                           // Non-constant operand: emit V_CMPX_LE_F32_e32 with immediate 0 and the
                           // operand as sources.
     209          60 :     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
     210          20 :         .addImm(0)
     211          20 :         .add(Op);
     212             :   }
     213          34 : }
     214             : 
     215           4 : MachineBasicBlock *SIInsertSkips::insertSkipBlock(
     216             :   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
                         // Creates a new, empty basic block, inserts it directly after MBB in the
                         // function's block order, records it as a successor of MBB and returns it.
                         // NOTE(review): parameter I is not used anywhere in this body.
     217           4 :   MachineFunction *MF = MBB.getParent();
     218             : 
     219           4 :   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
                         // Advance to the block that currently follows MBB so the new block is
                         // inserted in front of it.
     220           4 :   MachineFunction::iterator MBBI(MBB);
     221           4 :   ++MBBI;
     222             : 
     223           4 :   MF->insert(MBBI, SkipBB);
     224           4 :   MBB.addSuccessor(SkipBB);
     225             : 
     226           4 :   return SkipBB;
     227             : }
     228             : 
     229             : // Returns true if a branch over the block was inserted.
     230         368 : bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
     231             :                                    MachineBasicBlock &SrcMBB) {
                         // MI is the mask branch found in SrcMBB; its operand 0 names the
                         // destination block. The region examined by shouldSkip starts at SrcMBB's
                         // first successor and ends at that destination.
     232         368 :   MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
     233             : 
     234         368 :   if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
     235             :     return false;
     236             : 
     237          71 :   const DebugLoc &DL = MI.getDebugLoc();
                         // The S_CBRANCH_EXECZ goes directly after MI.
     238         284 :   MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
     239             : 
     240         213 :   BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
     241          71 :     .addMBB(DestBB);
     242             : 
     243          71 :   return true;
     244             : }
     245             : 
     246       14829 : bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
                         // Walks every instruction of every block in layout order and handles four
                         // opcodes: SI_MASK_BRANCH (optional S_CBRANCH_EXECZ skip), S_BRANCH
                         // (removal of redundant branches), SI_KILL_TERMINATOR (expansion via
                         // kill / skipIfDead) and SI_RETURN_TO_EPILOG (redirect to a trailing
                         // empty block). Returns the accumulated MadeChange flag.
     247       14829 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     248       14829 :   TII = ST.getInstrInfo();
                         // NOTE(review): TRI is assigned here but not read anywhere in the code
                         // shown in this file.
     249       29658 :   TRI = &TII->getRegisterInfo();
     250       14829 :   SkipThreshold = SkipThresholdFlag;
     251             : 
     252       14829 :   bool HaveKill = false;
     253       14829 :   bool MadeChange = false;
     254             : 
     255             :   // Track depth of exec mask, divergent branches.
                         // Each entry is the destination block of an SI_MASK_BRANCH that has been
                         // seen but whose destination has not been reached yet.
     256       29658 :   SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
     257             : 
     258       14829 :   MachineFunction::iterator NextBB;
     259             : 
                         // Created at most once, on demand, by the SI_RETURN_TO_EPILOG case.
     260       14829 :   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     261             : 
     262       29658 :   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
     263       31558 :        BI != BE; BI = NextBB) {
                           // The successor iterator is taken up front; the kill case recomputes
                           // it when a skip block is inserted after the current block.
     264       16729 :     NextBB = std::next(BI);
     265       16729 :     MachineBasicBlock &MBB = *BI;
     266       16729 :     bool HaveSkipBlock = false;
     267             : 
     268       17483 :     if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
     269             :       // Reached convergence point for last divergent branch.
     270             :       ExecBranchStack.pop_back();
     271             :     }
     272             : 
                           // Currently this only resets the flag; nothing is inserted.
     273             :     if (HaveKill && ExecBranchStack.empty()) {
     274             :       HaveKill = false;
     275             : 
     276             :       // TODO: Insert skip if exec is 0?
     277             :     }
     278             : 
                           // Next is saved before MI is handled because several cases erase MI.
     279       33458 :     MachineBasicBlock::iterator I, Next;
     280      653053 :     for (I = MBB.begin(); I != MBB.end(); I = Next) {
     281      301433 :       Next = std::next(I);
     282             : 
     283      301433 :       MachineInstr &MI = *I;
     284             : 
     285      301433 :       switch (MI.getOpcode()) {
     286         368 :       case AMDGPU::SI_MASK_BRANCH:
                               // Remember the destination, then let skipMaskBranch decide whether
                               // to add an S_CBRANCH_EXECZ after MI.
     287         368 :         ExecBranchStack.push_back(MI.getOperand(0).getMBB());
     288         368 :         MadeChange |= skipMaskBranch(MI, MBB);
     289         368 :         break;
     290             : 
     291         602 :       case AMDGPU::S_BRANCH:
     292             :         // Optimize out branches to the next block.
     293             :         // FIXME: Shouldn't this be handled by BranchFolding?
                               // NOTE(review): neither erase below sets MadeChange, so the pass
                               // can return false after deleting an instruction -- confirm intent.
     294         602 :         if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
     295         435 :           MI.eraseFromParent();
     296         167 :         } else if (HaveSkipBlock) {
     297             :           // Remove the given unconditional branch when a skip block has been
     298             :           // inserted after the current one and let skip the two instructions
     299             :           // performing the kill if the exec mask is non-zero.
     300           1 :           MI.eraseFromParent();
     301             :         }
     302             :         break;
     303             : 
     304          34 :       case AMDGPU::SI_KILL_TERMINATOR:
     305          34 :         MadeChange = true;
                               // kill() emits the replacement code in front of MI; MI itself is
                               // erased at the end of this case.
     306          34 :         kill(MI);
     307             : 
                               // The skip block is attempted only outside any pending mask branch;
                               // otherwise the kill is just recorded in HaveKill.
     308          34 :         if (ExecBranchStack.empty()) {
     309          29 :           if (skipIfDead(MI, *NextBB)) {
                                   // A block now follows BI: visit it next and refresh the end
                                   // iterator.
     310             :             HaveSkipBlock = true;
     311             :             NextBB = std::next(BI);
     312             :             BE = MF.end();
     313             :           }
     314             :         } else {
     315             :           HaveKill = true;
     316             :         }
     317             : 
     318          34 :         MI.eraseFromParent();
     319          34 :         break;
     320             : 
     321         263 :       case AMDGPU::SI_RETURN_TO_EPILOG:
     322             :         // FIXME: Should move somewhere else
     323             :         assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
     324             : 
     325             :         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
     326             :         // because external bytecode will be appended at the end.
                               // NOTE(review): this path rewrites the block without setting
                               // MadeChange -- confirm intent.
     327         785 :         if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
     328             :           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
     329             :           // the end and jump there.
     330           4 :           if (!EmptyMBBAtEnd) {
     331           4 :             EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
     332           4 :             MF.insert(MF.end(), EmptyMBBAtEnd);
     333             :           }
     334             : 
     335           4 :           MBB.addSuccessor(EmptyMBBAtEnd);
     336          20 :           BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
     337           4 :             .addMBB(EmptyMBBAtEnd);
     338           4 :           I->eraseFromParent();
     339             :         }
     340             :         break;
     341             : 
     342             :       default:
     343             :         break;
     344             :       }
     345             :     }
     346             :   }
     347             : 
     348       29658 :   return MadeChange;
     349      216918 : }

Generated by: LCOV version 1.13