//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying those into the same return block as divergent
// branches would inhibit the use of scalar branching. StructurizeCFG also can't
// deal with the case where one branch goes to a return and another to an
// unreachable; in that case, replace the unreachable with a return.
//
//===----------------------------------------------------------------------===//

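// Illustrative sketch (editorial addition, not upstream code): for a kernel
// whose two returns are reached through a divergent branch,
//
//   bb:
//     br i1 %div, label %r0, label %r1
//   r0:
//     ret void
//   r1:
//     ret void
//
// the pass rewrites both returns into branches to one new block, roughly:
//
//   r0:
//     br label %UnifiedReturnBlock
//   r1:
//     br label %UnifiedReturnBlock
//   UnifiedReturnBlock:
//     ret void
//
// (The simplifyCFG cleanup run at the end may then fold %r0/%r1 away.)
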
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequireAndPreserveDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();

  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  if (RequireAndPreserveDomTree) {
    AU.addPreserved<DominatorTreeWrapperPass>();
    // FIXME: preserve PostDominatorTreeWrapperPass
  }

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}

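// For reference (deduced from the llvm.amdgcn.exp intrinsic signature rather
// than stated in this file): the full export takes operands
// (tgt, en, src0, src1, src2, src3, done, vm), so "done" is argument 6, while
// the compressed form llvm.amdgcn.exp.compr takes only two data operands,
// (tgt, en, src0, src1, done, vm), so "done" is argument 4. Those are the two
// indices removeDoneExport() clears above.
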
static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // Otherwise, we need to insert a new basic block into the function, add PHI
  // nodes (if the function returns values), and convert all of the return
  // instructions into unconditional branches.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(AMDGPU::Exp::ET_NULL),
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void... add a PHI node to the block...
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(ReturningBlocks.size());
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction
    // that is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
  }

  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  for (BasicBlock *BB : ReturningBlocks) {
    // Cleanup possible branch to unconditional branch to the return.
    simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
                SimplifyCFGOptions().bonusInstThreshold(2));
  }

  return NewRetBlock;
}
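
// Illustrative sketch (editorial addition, not upstream code): for a non-void
// function, the block built by unifyReturnBlockSet looks roughly like
//
//   UnifiedReturnBlock:
//     %UnifiedRetVal = phi float [ %a, %r0 ], [ %b, %r1 ]
//     ret float %UnifiedRetVal
//
// with every former return block rewritten to "br label %UnifiedReturnBlock".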

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

  // If there's only one exit, we don't need to do anything, unless this is a
  // pixel shader and that exit is an infinite loop, since we still have to
  // insert an export in that case.
  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over all of the blocks in a function, tracking all of the blocks that
  // return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  bool Changed = false;
  std::vector<DominatorTree::UpdateType> Updates;

  for (BasicBlock *BB : PDT.roots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
      else
        UniformlyReachedRetBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they could only have
        // exited through being killed, which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      } else { // Conditional branch.
        SmallVector<BasicBlock *, 2> Successors(succ_begin(BB), succ_end(BB));

        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);

        // 'Successors' become successors of TransitionBB instead of BB,
        // and TransitionBB becomes a single successor of BB.
        Updates.push_back({DominatorTree::Insert, BB, TransitionBB});
        for (BasicBlock *Successor : Successors) {
          Updates.push_back({DominatorTree::Insert, TransitionBB, Successor});
          Updates.push_back({DominatorTree::Delete, BB, Successor});
        }

        // Create a branch that will always branch to the transition block and
        // references DummyReturnBB.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      }
      Changed = true;
    }
  }
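
  // Illustrative sketch (editorial addition, not upstream code): the loop
  // above turns an infinite-loop backedge "br label %loop" into
  //
  //   br i1 true, label %loop, label %DummyReturnBlock
  //
  // so execution is unchanged, but the CFG now has an exit edge that the
  // post-dominator tree and StructurizeCFG can reason about. A conditional
  // terminator is instead moved into a new %TransitionBlock, and BB ends with
  // "br i1 true, label %TransitionBlock, label %DummyReturnBlock".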

  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      Updates.reserve(Updates.size() + UnreachableBlocks.size());
      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
        Updates.push_back({DominatorTree::Insert, BB, UnreachableBlock});
      }
      Changed = true;
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle the multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
          Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code
      // was really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
      Changed = true;
    }
  }

  // FIXME: add PDT here once simplifycfg is ready.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return Changed; // No blocks return

  if (ReturningBlocks.size() == 1 && !InsertExport)
    return Changed; // Already has a single return block

  const TargetTransformInfo &TTI
      = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Unify returning blocks. If we are going to insert the export it is also
  // necessary to include blocks that are uniformly reached, because in addition
  // to inserting the export the "done" bits on existing exports will be cleared
  // and we do not want to end up with the normal export in a non-unified,
  // uniformly reached block with the "done" bit cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
  }

  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}
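
// Usage note (editorial addition; the pipeline details are assumptions, not
// stated in this file): as a legacy pass registered under DEBUG_TYPE, it can
// be exercised in isolation with something like
//
//   opt -mtriple=amdgcn-- -amdgpu-unify-divergent-exit-nodes -S < in.ll
//
// In the AMDGPU codegen pipeline it runs before StructurizeCFG, the consumer
// that cannot handle multiple divergent exits (see the file header comment).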