LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIInsertSkips.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 61 137 44.5 %
Date: 2018-10-14 09:39:32 Functions: 7 10 70.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
       11             : /// This pass inserts branches on the 0 exec mask over divergent branches
       12             : /// when it's expected that jumping over the untaken control flow will
      13             : /// be cheaper than having every workitem no-op through it.
      14             : //
      15             : //===----------------------------------------------------------------------===//
      16             : 
      17             : #include "AMDGPU.h"
      18             : #include "AMDGPUSubtarget.h"
      19             : #include "SIInstrInfo.h"
      20             : #include "SIMachineFunctionInfo.h"
      21             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      22             : #include "llvm/ADT/SmallVector.h"
      23             : #include "llvm/ADT/StringRef.h"
      24             : #include "llvm/CodeGen/MachineBasicBlock.h"
      25             : #include "llvm/CodeGen/MachineFunction.h"
      26             : #include "llvm/CodeGen/MachineFunctionPass.h"
      27             : #include "llvm/CodeGen/MachineInstr.h"
      28             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      29             : #include "llvm/CodeGen/MachineOperand.h"
      30             : #include "llvm/IR/CallingConv.h"
      31             : #include "llvm/IR/DebugLoc.h"
      32             : #include "llvm/MC/MCAsmInfo.h"
      33             : #include "llvm/Pass.h"
      34             : #include "llvm/Support/CommandLine.h"
      35             : #include "llvm/Target/TargetMachine.h"
      36             : #include <cassert>
      37             : #include <cstdint>
      38             : #include <iterator>
      39             : 
      40             : using namespace llvm;
      41             : 
      42             : #define DEBUG_TYPE "si-insert-skips"
      43             : 
// Command-line override for the skip threshold: the number of real (code
// emitting) instructions a divergent region may contain before it is worth
// inserting an S_CBRANCH_EXECZ to jump over it.  Read once per function in
// runOnMachineFunction into SIInsertSkips::SkipThreshold.
static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);
      48             : 
namespace {

/// Machine pass that inserts S_CBRANCH_EXECZ branches over divergent control
/// flow regions (when profitable per SkipThreshold, or required for
/// correctness) and lowers SI_KILL_*_TERMINATOR pseudo instructions into
/// EXEC-mask updates.
class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  // Instruction-count threshold; initialized from the amdgpu-skip-threshold
  // command-line flag at the start of runOnMachineFunction.
  unsigned SkipThreshold = 0;

  /// Return true if a branch over the blocks from \p From up to (not
  /// including) \p To should be inserted — either because the region is
  /// long enough, or because it contains instructions that must not execute
  /// with EXEC = 0.
  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  /// For AMDGPU_PS shaders: after a kill, insert a skip block containing a
  /// null export and S_ENDPGM that runs only when EXEC is zero.  Returns
  /// true if code was inserted.
  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  /// Lower a SI_KILL_*_TERMINATOR pseudo into real EXEC-clearing
  /// instructions (the pseudo itself is erased by the caller).
  void kill(MachineInstr &MI);

  /// Create a new basic block directly after \p MBB and add it as a
  /// successor; used to hold the code executed when EXEC is empty.
  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  /// Insert an S_CBRANCH_EXECZ after the SI_MASK_BRANCH \p MI when the
  /// region it covers should be skipped.  Returns true if a branch was
  /// inserted.
  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
      86             : 
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

// Exported so other parts of the backend can identify/schedule this pass by
// its ID.
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
      93             : 
      94             : static bool opcodeEmitsNoInsts(unsigned Opc) {
      95             :   switch (Opc) {
      96             :   case TargetOpcode::IMPLICIT_DEF:
      97             :   case TargetOpcode::KILL:
      98             :   case TargetOpcode::BUNDLE:
      99             :   case TargetOpcode::CFI_INSTRUCTION:
     100             :   case TargetOpcode::EH_LABEL:
     101             :   case TargetOpcode::GC_LABEL:
     102             :   case TargetOpcode::DBG_VALUE:
     103             :     return true;
     104             :   default:
     105             :     return false;
     106             :   }
     107             : }
     108             : 
/// Decide whether to insert a branch over the blocks in the layout range
/// [\p From, \p To): returns true when the region contains more than
/// SkipThreshold code-emitting instructions, or contains an instruction
/// that must not be executed with EXEC = 0.
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  // A block with no successors ends the shader; nothing to skip over.
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  // Walk blocks in layout order from From until To (or the end of the
  // function, whichever comes first).
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      // Meta-instructions produce no code; don't count them.
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      // e.g. memory operations; executing these with no active lanes is
      // unsafe, so the region must be branched over.
      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}
     147             : 
/// After a kill in a pixel shader, insert a skip block that exports to the
/// NULL target and terminates the wavefront when EXEC is zero; \p NextBB is
/// the fall-through target branched to when lanes remain live.  Returns
/// true if the skip block was inserted.
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  // Only worthwhile for pixel shaders, and only when the remaining code is
  // long/unsafe enough to justify the branch (shouldSkip).
  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}
     182             : 
     183           0 : void SIInsertSkips::kill(MachineInstr &MI) {
     184           0 :   MachineBasicBlock &MBB = *MI.getParent();
     185             :   DebugLoc DL = MI.getDebugLoc();
     186             : 
     187           0 :   switch (MI.getOpcode()) {
     188           0 :   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
     189             :     unsigned Opcode = 0;
     190             : 
     191             :     // The opcodes are inverted because the inline immediate has to be
     192             :     // the first operand, e.g. from "x < imm" to "imm > x"
     193           0 :     switch (MI.getOperand(2).getImm()) {
     194             :     case ISD::SETOEQ:
     195             :     case ISD::SETEQ:
     196             :       Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
     197             :       break;
     198           0 :     case ISD::SETOGT:
     199             :     case ISD::SETGT:
     200             :       Opcode = AMDGPU::V_CMPX_LT_F32_e64;
     201           0 :       break;
     202           0 :     case ISD::SETOGE:
     203             :     case ISD::SETGE:
     204             :       Opcode = AMDGPU::V_CMPX_LE_F32_e64;
     205           0 :       break;
     206           0 :     case ISD::SETOLT:
     207             :     case ISD::SETLT:
     208             :       Opcode = AMDGPU::V_CMPX_GT_F32_e64;
     209           0 :       break;
     210           0 :     case ISD::SETOLE:
     211             :     case ISD::SETLE:
     212             :       Opcode = AMDGPU::V_CMPX_GE_F32_e64;
     213           0 :       break;
     214           0 :     case ISD::SETONE:
     215             :     case ISD::SETNE:
     216             :       Opcode = AMDGPU::V_CMPX_LG_F32_e64;
     217           0 :       break;
     218           0 :     case ISD::SETO:
     219             :       Opcode = AMDGPU::V_CMPX_O_F32_e64;
     220           0 :       break;
     221           0 :     case ISD::SETUO:
     222             :       Opcode = AMDGPU::V_CMPX_U_F32_e64;
     223           0 :       break;
     224           0 :     case ISD::SETUEQ:
     225             :       Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
     226           0 :       break;
     227           0 :     case ISD::SETUGT:
     228             :       Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
     229           0 :       break;
     230           0 :     case ISD::SETUGE:
     231             :       Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
     232           0 :       break;
     233           0 :     case ISD::SETULT:
     234             :       Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
     235           0 :       break;
     236           0 :     case ISD::SETULE:
     237             :       Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
     238           0 :       break;
     239           0 :     case ISD::SETUNE:
     240             :       Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
     241           0 :       break;
     242           0 :     default:
     243           0 :       llvm_unreachable("invalid ISD:SET cond code");
     244             :     }
     245             : 
     246             :     assert(MI.getOperand(0).isReg());
     247             : 
     248           0 :     if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
     249             :                     MI.getOperand(0).getReg())) {
     250           0 :       Opcode = AMDGPU::getVOPe32(Opcode);
     251           0 :       BuildMI(MBB, &MI, DL, TII->get(Opcode))
     252           0 :           .add(MI.getOperand(1))
     253           0 :           .add(MI.getOperand(0));
     254             :     } else {
     255           0 :       BuildMI(MBB, &MI, DL, TII->get(Opcode))
     256           0 :           .addReg(AMDGPU::VCC, RegState::Define)
     257             :           .addImm(0)  // src0 modifiers
     258           0 :           .add(MI.getOperand(1))
     259             :           .addImm(0)  // src1 modifiers
     260           0 :           .add(MI.getOperand(0))
     261             :           .addImm(0);  // omod
     262             :     }
     263             :     break;
     264             :   }
     265           0 :   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     266           0 :     const MachineOperand &Op = MI.getOperand(0);
     267           0 :     int64_t KillVal = MI.getOperand(1).getImm();
     268             :     assert(KillVal == 0 || KillVal == -1);
     269             : 
     270             :     // Kill all threads if Op0 is an immediate and equal to the Kill value.
     271           0 :     if (Op.isImm()) {
     272           0 :       int64_t Imm = Op.getImm();
     273             :       assert(Imm == 0 || Imm == -1);
     274             : 
     275           0 :       if (Imm == KillVal)
     276           0 :         BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
     277             :           .addImm(0);
     278             :       break;
     279             :     }
     280             : 
     281           0 :     unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
     282           0 :     BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
     283           0 :         .addReg(AMDGPU::EXEC)
     284             :         .add(Op);
     285           0 :     break;
     286             :   }
     287           0 :   default:
     288           0 :     llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
     289             :   }
     290           0 : }
     291             : 
     292           0 : MachineBasicBlock *SIInsertSkips::insertSkipBlock(
     293             :   MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
     294           0 :   MachineFunction *MF = MBB.getParent();
     295             : 
     296           0 :   MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
     297             :   MachineFunction::iterator MBBI(MBB);
     298             :   ++MBBI;
     299             : 
     300             :   MF->insert(MBBI, SkipBB);
     301           0 :   MBB.addSuccessor(SkipBB);
     302             : 
     303           0 :   return SkipBB;
     304             : }
     305             : 
     306             : // Returns true if a branch over the block was inserted.
     307         522 : bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
     308             :                                    MachineBasicBlock &SrcMBB) {
     309         522 :   MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
     310             : 
     311         522 :   if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
     312             :     return false;
     313             : 
     314             :   const DebugLoc &DL = MI.getDebugLoc();
     315         119 :   MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
     316             : 
     317         238 :   BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
     318             :     .addMBB(DestBB);
     319             : 
     320         119 :   return true;
     321             : }
     322             : 
/// Main entry point: walk every block, inserting skips for SI_MASK_BRANCH,
/// lowering kill pseudos, cleaning up redundant S_BRANCHes, and rewriting
/// non-final SI_RETURN_TO_EPILOG into a jump to a trailing empty block.
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  // Captured before each iteration so blocks can be inserted/erased while
  // iterating; re-computed below when skipIfDead inserts a new block.
  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

    // Next is captured before any erase so iteration survives removing MI.
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        // Record the convergence point, then try to insert an EXECZ skip.
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // Remove the given unconditional branch when a skip block has been
          // inserted after the current one and let skip the two instructions
          // performing the kill if the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            // A block was inserted after BI; refresh the iterators that the
            // insertion may have invalidated.
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          // Inside divergent control flow; defer any skip to convergence.
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
          // the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}

Generated by: LCOV version 1.13