LLVM  7.0.0svn
SIInsertSkips.cpp
Go to the documentation of this file.
1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This pass inserts branches on the 0 exec mask over divergent
12 /// branches when it's expected that jumping over the untaken control flow will
13 /// be cheaper than having every workitem no-op through it.
14 //
15 //===----------------------------------------------------------------------===//
16 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-insert-skips"
42 
44  "amdgpu-skip-threshold",
45  cl::desc("Number of instructions before jumping over divergent control flow"),
46  cl::init(12), cl::Hidden);
47 
48 namespace {
49 
50 class SIInsertSkips : public MachineFunctionPass {
51 private:
52  const SIRegisterInfo *TRI = nullptr;
53  const SIInstrInfo *TII = nullptr;
54  unsigned SkipThreshold = 0;
55 
56  bool shouldSkip(const MachineBasicBlock &From,
57  const MachineBasicBlock &To) const;
58 
59  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
60 
61  void kill(MachineInstr &MI);
62 
63  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
65 
66  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
67 
68 public:
69  static char ID;
70 
71  SIInsertSkips() : MachineFunctionPass(ID) {}
72 
73  bool runOnMachineFunction(MachineFunction &MF) override;
74 
75  StringRef getPassName() const override {
76  return "SI insert s_cbranch_execz instructions";
77  }
78 
79  void getAnalysisUsage(AnalysisUsage &AU) const override {
81  }
82 };
83 
84 } // end anonymous namespace
85 
86 char SIInsertSkips::ID = 0;
87 
88 INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
89  "SI insert s_cbranch_execz instructions", false, false)
90 
91 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
92 
93 static bool opcodeEmitsNoInsts(unsigned Opc) {
94  switch (Opc) {
95  case TargetOpcode::IMPLICIT_DEF:
96  case TargetOpcode::KILL:
97  case TargetOpcode::BUNDLE:
98  case TargetOpcode::CFI_INSTRUCTION:
100  case TargetOpcode::GC_LABEL:
101  case TargetOpcode::DBG_VALUE:
102  return true;
103  default:
104  return false;
105  }
106 }
107 
108 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
109  const MachineBasicBlock &To) const {
110  if (From.succ_empty())
111  return false;
112 
113  unsigned NumInstr = 0;
114  const MachineFunction *MF = From.getParent();
115 
116  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
117  MBBI != End && MBBI != ToI; ++MBBI) {
118  const MachineBasicBlock &MBB = *MBBI;
119 
120  for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
121  NumInstr < SkipThreshold && I != E; ++I) {
122  if (opcodeEmitsNoInsts(I->getOpcode()))
123  continue;
124 
125  // FIXME: Since this is required for correctness, this should be inserted
126  // during SILowerControlFlow.
127 
128  // When a uniform loop is inside non-uniform control flow, the branch
129  // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
130  // when EXEC = 0. We should skip the loop lest it becomes infinite.
131  if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
132  I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
133  return true;
134 
135  // V_READFIRSTLANE/V_READLANE destination register may be used as operand
136  // by some SALU instruction. If exec mask is zero vector instruction
137  // defining the register that is used by the scalar one is not executed
138  // and scalar instruction will operate on undefined data. For
139  // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
140  if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
141  (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
142  return true;
143  }
144 
145  if (I->isInlineAsm()) {
146  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
147  const char *AsmStr = I->getOperand(0).getSymbolName();
148 
149  // inlineasm length estimate is number of bytes assuming the longest
150  // instruction.
151  uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
152  NumInstr += MaxAsmSize / MAI->getMaxInstLength();
153  } else {
154  ++NumInstr;
155  }
156 
157  if (NumInstr >= SkipThreshold)
158  return true;
159  }
160  }
161 
162  return false;
163 }
164 
165 bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
166  MachineBasicBlock &MBB = *MI.getParent();
167  MachineFunction *MF = MBB.getParent();
168 
170  !shouldSkip(MBB, MBB.getParent()->back()))
171  return false;
172 
173  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
174 
175  const DebugLoc &DL = MI.getDebugLoc();
176 
177  // If the exec mask is non-zero, skip the next two instructions
178  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
179  .addMBB(&NextBB);
180 
181  MachineBasicBlock::iterator Insert = SkipBB->begin();
182 
183  // Exec mask is zero: Export to NULL target...
184  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
185  .addImm(0x09) // V_008DFC_SQ_EXP_NULL
186  .addReg(AMDGPU::VGPR0, RegState::Undef)
187  .addReg(AMDGPU::VGPR0, RegState::Undef)
188  .addReg(AMDGPU::VGPR0, RegState::Undef)
189  .addReg(AMDGPU::VGPR0, RegState::Undef)
190  .addImm(1) // vm
191  .addImm(0) // compr
192  .addImm(0); // en
193 
194  // ... and terminate wavefront.
195  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
196 
197  return true;
198 }
199 
200 void SIInsertSkips::kill(MachineInstr &MI) {
201  MachineBasicBlock &MBB = *MI.getParent();
202  DebugLoc DL = MI.getDebugLoc();
203 
204  switch (MI.getOpcode()) {
205  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
206  unsigned Opcode = 0;
207 
208  // The opcodes are inverted because the inline immediate has to be
209  // the first operand, e.g. from "x < imm" to "imm > x"
210  switch (MI.getOperand(2).getImm()) {
211  case ISD::SETOEQ:
212  case ISD::SETEQ:
213  Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
214  break;
215  case ISD::SETOGT:
216  case ISD::SETGT:
217  Opcode = AMDGPU::V_CMPX_LT_F32_e64;
218  break;
219  case ISD::SETOGE:
220  case ISD::SETGE:
221  Opcode = AMDGPU::V_CMPX_LE_F32_e64;
222  break;
223  case ISD::SETOLT:
224  case ISD::SETLT:
225  Opcode = AMDGPU::V_CMPX_GT_F32_e64;
226  break;
227  case ISD::SETOLE:
228  case ISD::SETLE:
229  Opcode = AMDGPU::V_CMPX_GE_F32_e64;
230  break;
231  case ISD::SETONE:
232  case ISD::SETNE:
233  Opcode = AMDGPU::V_CMPX_LG_F32_e64;
234  break;
235  case ISD::SETO:
236  Opcode = AMDGPU::V_CMPX_O_F32_e64;
237  break;
238  case ISD::SETUO:
239  Opcode = AMDGPU::V_CMPX_U_F32_e64;
240  break;
241  case ISD::SETUEQ:
242  Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
243  break;
244  case ISD::SETUGT:
245  Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
246  break;
247  case ISD::SETUGE:
248  Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
249  break;
250  case ISD::SETULT:
251  Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
252  break;
253  case ISD::SETULE:
254  Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
255  break;
256  case ISD::SETUNE:
257  Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
258  break;
259  default:
260  llvm_unreachable("invalid ISD:SET cond code");
261  }
262 
263  assert(MI.getOperand(0).isReg());
264 
265  MachineInstr *NewMI;
266  if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
267  MI.getOperand(0).getReg())) {
268  Opcode = AMDGPU::getVOPe32(Opcode);
269  NewMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
270  .add(MI.getOperand(1))
271  .add(MI.getOperand(0));
272  } else {
273  NewMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
274  .addReg(AMDGPU::VCC, RegState::Define)
275  .addImm(0) // src0 modifiers
276  .add(MI.getOperand(1))
277  .addImm(0) // src1 modifiers
278  .add(MI.getOperand(0))
279  .addImm(0); // omod
280  }
281  // Clear isRenamable bit if new opcode requires it to be 0.
282  if (NewMI->hasExtraSrcRegAllocReq())
283  for (MachineOperand &NewMO : NewMI->uses())
284  if (NewMO.isReg() && NewMO.isUse())
285  NewMO.setIsRenamable(false);
286  break;
287  }
288  case AMDGPU::SI_KILL_I1_TERMINATOR: {
289  const MachineOperand &Op = MI.getOperand(0);
290  int64_t KillVal = MI.getOperand(1).getImm();
291  assert(KillVal == 0 || KillVal == -1);
292 
293  // Kill all threads if Op0 is an immediate and equal to the Kill value.
294  if (Op.isImm()) {
295  int64_t Imm = Op.getImm();
296  assert(Imm == 0 || Imm == -1);
297 
298  if (Imm == KillVal)
299  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
300  .addImm(0);
301  break;
302  }
303 
304  unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
305  BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
306  .addReg(AMDGPU::EXEC)
307  .add(Op);
308  break;
309  }
310  default:
311  llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
312  }
313 }
314 
315 MachineBasicBlock *SIInsertSkips::insertSkipBlock(
317  MachineFunction *MF = MBB.getParent();
318 
320  MachineFunction::iterator MBBI(MBB);
321  ++MBBI;
322 
323  MF->insert(MBBI, SkipBB);
324  MBB.addSuccessor(SkipBB);
325 
326  return SkipBB;
327 }
328 
329 // Returns true if a branch over the block was inserted.
330 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
331  MachineBasicBlock &SrcMBB) {
332  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
333 
334  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
335  return false;
336 
337  const DebugLoc &DL = MI.getDebugLoc();
338  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
339 
340  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
341  .addMBB(DestBB);
342 
343  return true;
344 }
345 
346 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
347  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
348  TII = ST.getInstrInfo();
349  TRI = &TII->getRegisterInfo();
350  SkipThreshold = SkipThresholdFlag;
351 
352  bool HaveKill = false;
353  bool MadeChange = false;
354 
355  // Track depth of exec mask, divergent branches.
356  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
357 
359 
360  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
361 
362  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
363  BI != BE; BI = NextBB) {
364  NextBB = std::next(BI);
365  MachineBasicBlock &MBB = *BI;
366  bool HaveSkipBlock = false;
367 
368  if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
369  // Reached convergence point for last divergent branch.
370  ExecBranchStack.pop_back();
371  }
372 
373  if (HaveKill && ExecBranchStack.empty()) {
374  HaveKill = false;
375 
376  // TODO: Insert skip if exec is 0?
377  }
378 
380  for (I = MBB.begin(); I != MBB.end(); I = Next) {
381  Next = std::next(I);
382 
383  MachineInstr &MI = *I;
384 
385  switch (MI.getOpcode()) {
386  case AMDGPU::SI_MASK_BRANCH:
387  ExecBranchStack.push_back(MI.getOperand(0).getMBB());
388  MadeChange |= skipMaskBranch(MI, MBB);
389  break;
390 
391  case AMDGPU::S_BRANCH:
392  // Optimize out branches to the next block.
393  // FIXME: Shouldn't this be handled by BranchFolding?
394  if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
395  MI.eraseFromParent();
396  } else if (HaveSkipBlock) {
397  // Remove the given unconditional branch when a skip block has been
398  // inserted after the current one and let skip the two instructions
399  // performing the kill if the exec mask is non-zero.
400  MI.eraseFromParent();
401  }
402  break;
403 
404  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
405  case AMDGPU::SI_KILL_I1_TERMINATOR:
406  MadeChange = true;
407  kill(MI);
408 
409  if (ExecBranchStack.empty()) {
410  if (skipIfDead(MI, *NextBB)) {
411  HaveSkipBlock = true;
412  NextBB = std::next(BI);
413  BE = MF.end();
414  }
415  } else {
416  HaveKill = true;
417  }
418 
419  MI.eraseFromParent();
420  break;
421 
422  case AMDGPU::SI_RETURN_TO_EPILOG:
423  // FIXME: Should move somewhere else
425 
426  // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
427  // because external bytecode will be appended at the end.
428  if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
429  // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
430  // the end and jump there.
431  if (!EmptyMBBAtEnd) {
432  EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
433  MF.insert(MF.end(), EmptyMBBAtEnd);
434  }
435 
436  MBB.addSuccessor(EmptyMBBAtEnd);
437  BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
438  .addMBB(EmptyMBBAtEnd);
439  I->eraseFromParent();
440  }
441  break;
442 
443  default:
444  break;
445  }
446  }
447  }
448 
449  return MadeChange;
450 }
const MachineInstrBuilder & add(const MachineOperand &MO) const
AMDGPU specific subclass of TargetSubtarget.
MachineBasicBlock * getMBB() const
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
Calling convention used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:271
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned getReg() const
getReg - Returns the register number.
const SIInstrInfo * getInstrInfo() const override
A debug info location.
Definition: DebugLoc.h:34
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
#define DEBUG_TYPE
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:293
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
This class is intended to be used as a base class for asm properties and features specific to the tar...
Definition: MCAsmInfo.h:56
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
char & SIInsertSkipsPassID
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
Represent the analysis usage information of a pass.
static const unsigned End
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:640
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:34
Iterator for intrusive lists based on ilist_node.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
const SIRegisterInfo * getRegisterInfo() const override
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:194
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:862
unsigned getInlineAsmLength(const char *Str, const MCAsmInfo &MAI) const override
Measure the specified inline asm to determine an approximation of its length.
int64_t getImm() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
unsigned getMaxInstLength() const
Definition: MCAsmInfo.h:462
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:142
Representation of each machine instruction.
Definition: MachineInstr.h:60
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:61
#define I(x, y, z)
Definition: MD5.cpp:58
const MachineBasicBlock & back() const
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void insert(iterator MBBI, MachineBasicBlock *MBB)
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
static cl::opt< unsigned > SkipThresholdFlag("amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden)
static INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) char &llvm bool opcodeEmitsNoInsts(unsigned Opc)
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:298