LLVM  8.0.0svn
SIInsertSkips.cpp
Go to the documentation of this file.
1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// This pass inserts branches on the 0 exec mask over divergent branches
12 /// when it's expected that jumping over the untaken control flow will
13 /// be cheaper than having every workitem no-op through it.
14 //
15 //===----------------------------------------------------------------------===//
16 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "si-insert-skips"
43 
45  "amdgpu-skip-threshold",
46  cl::desc("Number of instructions before jumping over divergent control flow"),
47  cl::init(12), cl::Hidden);
48 
49 namespace {
50 
51 class SIInsertSkips : public MachineFunctionPass {
52 private:
53  const SIRegisterInfo *TRI = nullptr;
54  const SIInstrInfo *TII = nullptr;
55  unsigned SkipThreshold = 0;
56 
57  bool shouldSkip(const MachineBasicBlock &From,
58  const MachineBasicBlock &To) const;
59 
60  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
61 
62  void kill(MachineInstr &MI);
63 
64  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
66 
67  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
68 
69 public:
70  static char ID;
71 
72  SIInsertSkips() : MachineFunctionPass(ID) {}
73 
74  bool runOnMachineFunction(MachineFunction &MF) override;
75 
76  StringRef getPassName() const override {
77  return "SI insert s_cbranch_execz instructions";
78  }
79 
80  void getAnalysisUsage(AnalysisUsage &AU) const override {
82  }
83 };
84 
85 } // end anonymous namespace
86 
// Pass identity; the address of ID (not its value) identifies the pass.
char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

// Exported handle so the target pass pipeline can reference this pass.
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
93 
94 static bool opcodeEmitsNoInsts(unsigned Opc) {
95  switch (Opc) {
96  case TargetOpcode::IMPLICIT_DEF:
97  case TargetOpcode::KILL:
98  case TargetOpcode::BUNDLE:
99  case TargetOpcode::CFI_INSTRUCTION:
101  case TargetOpcode::GC_LABEL:
102  case TargetOpcode::DBG_VALUE:
103  return true;
104  default:
105  return false;
106  }
107 }
108 
109 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
110  const MachineBasicBlock &To) const {
111  if (From.succ_empty())
112  return false;
113 
114  unsigned NumInstr = 0;
115  const MachineFunction *MF = From.getParent();
116 
117  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
118  MBBI != End && MBBI != ToI; ++MBBI) {
119  const MachineBasicBlock &MBB = *MBBI;
120 
121  for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
122  NumInstr < SkipThreshold && I != E; ++I) {
123  if (opcodeEmitsNoInsts(I->getOpcode()))
124  continue;
125 
126  // FIXME: Since this is required for correctness, this should be inserted
127  // during SILowerControlFlow.
128 
129  // When a uniform loop is inside non-uniform control flow, the branch
130  // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
131  // when EXEC = 0. We should skip the loop lest it becomes infinite.
132  if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
133  I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
134  return true;
135 
136  if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
137  return true;
138 
139  ++NumInstr;
140  if (NumInstr >= SkipThreshold)
141  return true;
142  }
143  }
144 
145  return false;
146 }
147 
148 bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
149  MachineBasicBlock &MBB = *MI.getParent();
150  MachineFunction *MF = MBB.getParent();
151 
153  !shouldSkip(MBB, MBB.getParent()->back()))
154  return false;
155 
156  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
157 
158  const DebugLoc &DL = MI.getDebugLoc();
159 
160  // If the exec mask is non-zero, skip the next two instructions
161  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
162  .addMBB(&NextBB);
163 
164  MachineBasicBlock::iterator Insert = SkipBB->begin();
165 
166  // Exec mask is zero: Export to NULL target...
167  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
168  .addImm(0x09) // V_008DFC_SQ_EXP_NULL
169  .addReg(AMDGPU::VGPR0, RegState::Undef)
170  .addReg(AMDGPU::VGPR0, RegState::Undef)
171  .addReg(AMDGPU::VGPR0, RegState::Undef)
172  .addReg(AMDGPU::VGPR0, RegState::Undef)
173  .addImm(1) // vm
174  .addImm(0) // compr
175  .addImm(0); // en
176 
177  // ... and terminate wavefront.
178  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
179 
180  return true;
181 }
182 
/// Lower a SI_KILL_*_TERMINATOR pseudo in place (the caller erases MI).
/// SI_KILL_F32_COND_IMM: emit a V_CMPX compare that writes EXEC directly,
/// disabling lanes failing the comparison. SI_KILL_I1: AND/ANDN2 the i1
/// operand into EXEC (or clear EXEC outright for a constant operand).
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    // Operand 2 holds the ISD condition code; operand 1 the immediate,
    // operand 0 the value being tested.
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    // Unordered comparisons map to the negated ordered V_CMPX forms
    // (e.g. SETUEQ -> NLG, "not less-or-greater").
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

    // A VGPR operand allows the shorter VOP2 encoding, which takes no
    // modifier operands; otherwise fall back to the full e64 form.
    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0);  // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    // Kill where the mask matches KillVal: ANDN2 clears lanes where Op is
    // set (KillVal == -1), AND clears lanes where Op is clear (KillVal == 0).
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}
291 
292 MachineBasicBlock *SIInsertSkips::insertSkipBlock(
294  MachineFunction *MF = MBB.getParent();
295 
297  MachineFunction::iterator MBBI(MBB);
298  ++MBBI;
299 
300  MF->insert(MBBI, SkipBB);
301  MBB.addSuccessor(SkipBB);
302 
303  return SkipBB;
304 }
305 
306 // Returns true if a branch over the block was inserted.
307 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
308  MachineBasicBlock &SrcMBB) {
309  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
310 
311  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
312  return false;
313 
314  const DebugLoc &DL = MI.getDebugLoc();
315  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
316 
317  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
318  .addMBB(DestBB);
319 
320  return true;
321 }
322 
323 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
324  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
325  TII = ST.getInstrInfo();
326  TRI = &TII->getRegisterInfo();
327  SkipThreshold = SkipThresholdFlag;
328 
329  bool HaveKill = false;
330  bool MadeChange = false;
331 
332  // Track depth of exec mask, divergent branches.
333  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
334 
336 
337  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
338 
339  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
340  BI != BE; BI = NextBB) {
341  NextBB = std::next(BI);
342  MachineBasicBlock &MBB = *BI;
343  bool HaveSkipBlock = false;
344 
345  if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
346  // Reached convergence point for last divergent branch.
347  ExecBranchStack.pop_back();
348  }
349 
350  if (HaveKill && ExecBranchStack.empty()) {
351  HaveKill = false;
352 
353  // TODO: Insert skip if exec is 0?
354  }
355 
357  for (I = MBB.begin(); I != MBB.end(); I = Next) {
358  Next = std::next(I);
359 
360  MachineInstr &MI = *I;
361 
362  switch (MI.getOpcode()) {
363  case AMDGPU::SI_MASK_BRANCH:
364  ExecBranchStack.push_back(MI.getOperand(0).getMBB());
365  MadeChange |= skipMaskBranch(MI, MBB);
366  break;
367 
368  case AMDGPU::S_BRANCH:
369  // Optimize out branches to the next block.
370  // FIXME: Shouldn't this be handled by BranchFolding?
371  if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
372  MI.eraseFromParent();
373  } else if (HaveSkipBlock) {
374  // Remove the given unconditional branch when a skip block has been
375  // inserted after the current one and let skip the two instructions
376  // performing the kill if the exec mask is non-zero.
377  MI.eraseFromParent();
378  }
379  break;
380 
381  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
382  case AMDGPU::SI_KILL_I1_TERMINATOR:
383  MadeChange = true;
384  kill(MI);
385 
386  if (ExecBranchStack.empty()) {
387  if (skipIfDead(MI, *NextBB)) {
388  HaveSkipBlock = true;
389  NextBB = std::next(BI);
390  BE = MF.end();
391  }
392  } else {
393  HaveKill = true;
394  }
395 
396  MI.eraseFromParent();
397  break;
398 
399  case AMDGPU::SI_RETURN_TO_EPILOG:
400  // FIXME: Should move somewhere else
402 
403  // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
404  // because external bytecode will be appended at the end.
405  if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
406  // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
407  // the end and jump there.
408  if (!EmptyMBBAtEnd) {
409  EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
410  MF.insert(MF.end(), EmptyMBBAtEnd);
411  }
412 
413  MBB.addSuccessor(EmptyMBBAtEnd);
414  BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
415  .addMBB(EmptyMBBAtEnd);
416  I->eraseFromParent();
417  }
418  break;
419 
420  default:
421  break;
422  }
423  }
424  }
425 
426  return MadeChange;
427 }
const MachineInstrBuilder & add(const MachineOperand &MO) const
AMDGPU specific subclass of TargetSubtarget.
MachineBasicBlock * getMBB() const
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:383
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned getReg() const
getReg - Returns the register number.
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
const SIInstrInfo * getInstrInfo() const override
unsigned const TargetRegisterInfo * TRI
A debug info location.
Definition: DebugLoc.h:34
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
#define DEBUG_TYPE
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:409
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
char & SIInsertSkipsPassID
Represent the analysis usage information of a pass.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:632
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:34
Iterator for intrusive lists based on ilist_node.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
BlockVerifier::State From
LLVM_READONLY int getVOPe32(uint16_t Opcode)
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:213
MachineOperand class - Representation of each machine instruction operand.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
int64_t getImm() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:254
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:64
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:56
#define I(x, y, z)
Definition: MD5.cpp:58
const MachineBasicBlock & back() const
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void insert(iterator MBBI, MachineBasicBlock *MBB)
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
static cl::opt< unsigned > SkipThresholdFlag("amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden)
static INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) char &llvm bool opcodeEmitsNoInsts(unsigned Opc)
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:414
const SIRegisterInfo * getRegisterInfo() const override