LCOV - code coverage report (generated by LCOV version 1.13)
Current view: top level - lib/Target/AMDGPU - SIInsertSkips.cpp (source / functions)
Test: llvm-toolchain.info    Date: 2018-07-13 00:08:38

              Hit    Total   Coverage
Lines:        122    128     95.3 %
Functions:    13     14      92.9 %

//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches when
/// it's expected that jumping over the untaken control flow will be cheaper
/// than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//
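//
// Illustrative sketch of the transformation (made-up registers and labels,
// not actual pass output): given a divergent region guarded by a
// SI_MASK_BRANCH pseudo,
//
//   s_and_saveexec_b64 s[0:1], vcc   ; enter divergent region
//   ; ... long run of VALU instructions ...
//   s_or_b64 exec, exec, s[0:1]      ; reconverge
//
// the pass inserts a skip once the body exceeds the threshold:
//
//   s_and_saveexec_b64 s[0:1], vcc
//   s_cbranch_execz BB1              ; all lanes inactive: jump over the body
//   ; ... long run of VALU instructions ...
// BB1:
//   s_or_b64 exec, exec, s[0:1]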

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
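// Being an ordinary cl::opt, the threshold can be overridden per invocation,
// e.g. (illustrative): llc -march=amdgcn -amdgpu-skip-threshold=4 kernel.ll
// The flag is cl::Hidden, so it is only listed under -help-hidden.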

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}
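// The opcodes above are bookkeeping pseudo-instructions that expand to no
// machine code, so shouldSkip() below does not count them toward
// SkipThreshold.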

bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it become infinite.
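      // Illustrative failure mode (made-up code): with EXEC = 0 a V_CMP
      // writes 0 for every lane of VCC, so in
      //   v_cmp_lt_u32 vcc, ...     ; VCC == 0 whenever EXEC == 0
      //   s_cbranch_vccnz .EXIT     ; loop-exit branch is never taken
      // the otherwise-uniform loop would spin forever unless we branch over
      // it entirely.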
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      // The destination register of V_READFIRSTLANE/V_READLANE may be used as
      // an operand by some SALU instruction. If the exec mask is zero, the
      // vector instruction defining that register is not executed, so the
      // scalar instruction would operate on undefined data. For
      // V_READFIRSTLANE/V_READLANE we should therefore avoid predicated
      // execution.
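      // Illustrative hazard (made-up code):
      //   v_readfirstlane_b32 s0, v0   ; VALU: not executed when EXEC == 0
      //   s_add_i32 s1, s0, 1          ; SALU: always runs, reads undefined s0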
      if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
          (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
        return true;
      }

      if (I->isInlineAsm()) {
        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
        const char *AsmStr = I->getOperand(0).getSymbolName();

        // The inline asm length estimate is a byte count that assumes every
        // instruction has the maximum encoded length.
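        // For example (illustrative numbers only): a 24-byte estimate with a
        // maximum instruction length of 8 bytes counts as 24 / 8 = 3
        // instructions.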
        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
      } else {
        ++NumInstr;
      }

      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions.
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
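
// The sequence emitted by skipIfDead looks roughly like this (illustrative
// labels, not actual pass output):
//   s_cbranch_execnz BB_NEXT   ; some lanes still live: skip the kill path
// SKIP_BB:                     ; all lanes dead:
//   exp null off, off, off, off done vm   ; export to the NULL target...
//   s_endpgm                              ; ...and terminate the wavefront
// BB_NEXT: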

void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x".
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0);  // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

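    // A lane is killed when its bit in Op equals KillVal: for KillVal == -1
    // clear the lanes set in Op (EXEC &= ~Op); for KillVal == 0 keep only the
    // lanes set in Op (EXEC &= Op).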
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

MachineBasicBlock *SIInsertSkips::insertSkipBlock(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}
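
// Note: insertSkipBlock only creates and links the empty block; its caller
// (skipIfDead above) is responsible for populating it.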

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // A skip block has been inserted after the current block, so remove
          // this unconditional branch: the S_CBRANCH_EXECNZ added by
          // skipIfDead already jumps past the skip block when the exec mask
          // is non-zero, and removing the branch lets a zero exec mask fall
          // through into the two kill instructions.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else.
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}
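
// Context (an assumption from the surrounding codebase, not from this file):
// the pass is referenced through SIInsertSkipsPassID and, at the time of this
// snapshot, ran late in the AMDGPU codegen pipeline, shortly before
// instruction emission.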
