//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
    "amdgpu-skip-threshold-legacy",
    cl::desc("Number of instructions before jumping over divergent control flow"),
    cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;
  MachineDominatorTree *MDT = nullptr;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool dominatesAllReachable(MachineBasicBlock &MBB);
  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                  DebugLoc DL);

  bool kill(MachineInstr &MI);

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

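/// Return true if \p MI expands to no actual machine code, so that shouldSkip
/// can ignore it when estimating the cost of falling through with EXEC = 0.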
static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}

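/// Check whether the blocks from \p From up to (but not including) \p To
/// should be jumped over when EXEC = 0: returns true if they contain an
/// instruction that is unsafe or expensive to execute with no active lanes,
/// or if at least SkipThreshold real instructions would have to be stepped
/// through.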
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
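      // Illustrative shape of the hazard (assumed, not from real output):
      //   bb.loop:
      //     ...
      //     s_cbranch_vccnz bb.exit   ; exit branch, never taken when EXEC=0
      //     s_branch bb.loop          ; so the wave spins here forever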
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

/// Check whether \p MBB dominates all blocks that are reachable from it.
bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
  for (MachineBasicBlock *Other : depth_first(&MBB)) {
    if (!MDT->dominates(&MBB, Other))
      return false;
  }
  return true;
}

/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
/// iterator. Only applies to pixel shaders.
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, DebugLoc DL) {
  MachineFunction *MF = MBB.getParent();
  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);

  // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
  // terminator of a basic block. If this ever changes, we need to optionally
  // split MBB here.
  assert(I == MBB.end());
175 
176  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
177  // basic block that has no further successors (e.g., there was an
178  // `unreachable` there in IR). This can happen with original source of the
179  // form:
180  //
181  // if (uniform_condition) {
182  // write_to_memory();
183  // discard;
184  // }
185  //
186  // In this case, we write the "null_export; s_endpgm" skip code in the
187  // already-existing basic block.
188  auto NextBBI = std::next(MBB.getIterator());
189  bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
190  MachineBasicBlock *SkipBB;
191 
  if (NoSuccessor) {
    SkipBB = &MBB;
  } else {
    // Create a new basic block that will contain the "null export; s_endpgm"
    // and set up the branching to go around it.
    SkipBB = MF->CreateMachineBasicBlock();
    MF->insert(NextBBI, SkipBB);

    BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
    MBB.addSuccessor(SkipBB);

    MDT->addNewBlock(SkipBB, &MBB);
  }

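  // A pixel shader is expected to issue at least one export before ending;
  // exporting to the null target satisfies that without writing any outputs,
  // which is why the early exit is "null export; s_endpgm" rather than a
  // bare S_ENDPGM.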
  // Generate "null export; s_endpgm".
  BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addImm(1)  // vm
      .addImm(0)  // compr
      .addImm(0); // en
  BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
/// Return true unless the terminator is a no-op.
bool SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
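    // For example, SETOLT ("x < imm") selects V_CMPX_GT_F32, which evaluates
    // "imm > x" and writes the resulting lane mask to EXEC.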
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0) // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0) // src1 modifiers
          .add(MI.getOperand(0));

      I.addImm(0); // omod
    }
    return true;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);
317 
318  // Kill all threads if Op0 is an immediate and equal to the Kill value.
319  if (Op.isImm()) {
320  int64_t Imm = Op.getImm();
321  assert(Imm == 0 || Imm == -1);
322 
323  if (Imm == KillVal) {
324  BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
325  : AMDGPU::S_MOV_B64), Exec)
326  .addImm(0);
327  return true;
328  }
329  return false;
330  }
331 
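    // Otherwise clear the EXEC bit of each lane whose condition matches
    // KillVal: EXEC &= ~Op when killing lanes where Op is set (KillVal = -1),
    // and EXEC &= Op when killing lanes where Op is clear (KillVal = 0).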
    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    return true;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

// Returns true if a branch over the block was inserted.
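// For example (illustrative): given "SI_MASK_BRANCH %bb.3" terminating a
// block whose fall-through successor begins an expensive divergent region,
// this emits "S_CBRANCH_EXECZ %bb.3" right after the pseudo, so a wave with
// no active lanes branches over the region instead of no-op'ing through it.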
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
      .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  SkipThreshold = SkipThresholdFlag;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  SmallVector<MachineInstr *, 4> KillInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR: {
        MadeChange = true;
        bool CanKill = kill(MI);

        // Check if we can add an early "if exec=0 { end shader }".
        //
        // Note that we _always_ do this if it is correct, even if the kill
        // happens fairly late in the shader, because the null export should
        // generally still be cheaper than normal export(s).
        //
        // TODO: The dominatesAllReachable check is conservative: if the
        // dominance is only missing due to _uniform_ branches, we could
        // in fact insert the early-exit as well.
        if (CanKill &&
            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          // Mark the instruction for kill-if-dead insertion. We delay this
          // change because it modifies the CFG.
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;
      }

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();

          MDT->getBase().insertEdge(&MBB, EmptyMBBAtEnd);
        }
        break;

      default:
        break;
      }
    }
  }

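  // Structural (CFG-modifying) changes were deferred until after the walk:
  // insert the early-exit sequence after each recorded kill terminator, then
  // erase the pseudo itself.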
  for (MachineInstr *Kill : KillInstrs) {
    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
               Kill->getDebugLoc());
    Kill->eraseFromParent();
  }
  KillInstrs.clear();

  return MadeChange;
}