Line data Source code
1 : //===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file implements a general divergence analysis for loop vectorization
11 : // and GPU programs. It determines which branches and values in a loop or GPU
12 : // program are divergent. It can help branch optimizations such as jump
13 : // threading and loop unswitching to make better decisions.
14 : //
15 : // GPU programs typically use the SIMD execution model, where multiple threads
16 : // in the same execution group have to execute in lock-step. Therefore, if the
17 : // code contains divergent branches (i.e., threads in a group do not agree on
18 : // which path of the branch to take), the group of threads has to execute all
19 : // the paths from that branch with different subsets of threads enabled until
20 : // they re-converge.
21 : //
22 : // Due to this execution model, some optimizations such as jump
23 : // threading and loop unswitching can interfere with thread re-convergence.
24 : // Therefore, an analysis that computes which branches in a GPU program are
25 : // divergent can help the compiler to selectively run these optimizations.
26 : //
27 : // This implementation is derived from the Vectorization Analysis of the
28 : // Region Vectorizer (RV). That implementation in turn is based on the approach
29 : // described in
30 : //
31 : // Improving Performance of OpenCL on CPUs
32 : // Ralf Karrenberg and Sebastian Hack
33 : // CC '12
34 : //
35 : // This DivergenceAnalysis implementation is generic in the sense that it does
36 : // not itself identify original sources of divergence.
37 : // Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
38 : // (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
39 : // (e.g., special variables that hold the thread ID or the iteration variable).
40 : //
41 : // The generic implementation propagates divergence to variables that are data
42 : // or sync dependent on a source of divergence.
43 : //
44 : // While data dependency is a well-known concept, the notion of sync dependency
45 : // is worth more explanation. Sync dependence characterizes the control flow
46 : // aspect of the propagation of branch divergence. For example,
47 : //
48 : // %cond = icmp slt i32 %tid, 10
49 : // br i1 %cond, label %then, label %else
50 : // then:
51 : // br label %merge
52 : // else:
53 : // br label %merge
54 : // merge:
55 : // %a = phi i32 [ 0, %then ], [ 1, %else ]
56 : //
57 : // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
58 : // because %tid is not on its use-def chains, %a is sync dependent on %tid
59 : // because the branch "br i1 %cond" depends on %tid and affects which value %a
60 : // is assigned to.
61 : //
62 : // The sync dependence detection (which branch induces divergence in which join
63 : // points) is implemented in the SyncDependenceAnalysis.
64 : //
65 : // The current DivergenceAnalysis implementation has the following limitations:
66 : // 1. intra-procedural. It conservatively considers the arguments of a
67 : // non-kernel-entry function and the return value of a function call as
68 : // divergent.
69 : // 2. memory as black box. It conservatively considers values loaded from
70 : // generic or local address as divergent. This can be improved by leveraging
71 : // pointer analysis and/or by modelling non-escaping memory objects in SSA
72 : // as done in RV.
73 : //
74 : //===----------------------------------------------------------------------===//
75 :
76 : #include "llvm/Analysis/DivergenceAnalysis.h"
77 : #include "llvm/Analysis/LoopInfo.h"
78 : #include "llvm/Analysis/Passes.h"
79 : #include "llvm/Analysis/PostDominators.h"
80 : #include "llvm/Analysis/TargetTransformInfo.h"
81 : #include "llvm/IR/Dominators.h"
82 : #include "llvm/IR/InstIterator.h"
83 : #include "llvm/IR/Instructions.h"
84 : #include "llvm/IR/IntrinsicInst.h"
85 : #include "llvm/IR/Value.h"
86 : #include "llvm/Support/Debug.h"
87 : #include "llvm/Support/raw_ostream.h"
88 : #include <vector>
89 :
90 : using namespace llvm;
91 :
92 : #define DEBUG_TYPE "divergence-analysis"
93 :
// class DivergenceAnalysis

/// Construct a divergence analysis for \p F (or, when \p RegionLoop is
/// non-null, restricted to that loop).
/// \param F            the function under analysis
/// \param RegionLoop   if non-null, the analysis is confined to this loop;
///                     otherwise the whole function is the region
/// \param DT           dominator tree of \p F
/// \param LI           loop info of \p F
/// \param SDA          sync dependence analysis used to find the join points
///                     of divergent branches
/// \param IsLCSSAForm  whether \p F is in LCSSA form; if so, loop live-outs
///                     are already made explicit by exit-block phi nodes and
///                     taintLoopLiveOuts can be skipped
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}
100 :
/// Record \p DivVal as divergent.
/// Only instructions and arguments can carry divergence; values previously
/// registered via addUniformOverride() must never be marked divergent.
void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) && "cannot be a divergent");
  DivergentValues.insert(&DivVal);
}
106 :
/// Register \p UniVal as always uniform. Such values are skipped by the
/// propagation in compute() (checked through isAlwaysUniform()).
void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}
110 :
111 11 : bool DivergenceAnalysis::updateTerminator(const TerminatorInst &Term) const {
112 11 : if (Term.getNumSuccessors() <= 1)
113 : return false;
114 : if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
115 : assert(BranchTerm->isConditional());
116 9 : return isDivergent(*BranchTerm->getCondition());
117 : }
118 : if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
119 1 : return isDivergent(*SwitchTerm->getCondition());
120 : }
121 0 : if (isa<InvokeInst>(Term)) {
122 : return false; // ignore abnormal executions through landingpad
123 : }
124 :
125 0 : llvm_unreachable("unexpected terminator");
126 : }
127 :
128 3 : bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
129 : // TODO function calls with side effects, etc
130 8 : for (const auto &Op : I.operands()) {
131 5 : if (isDivergent(*Op))
132 : return true;
133 : }
134 : return false;
135 : }
136 :
/// Whether \p Val is "temporally divergent" as observed from
/// \p ObservingBlock: \p Val may be uniform within its defining loop and yet
/// appear divergent outside it, when threads leave that loop through a
/// divergent exit in different iterations.
bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  // Non-instruction values (constants, arguments) are not loop-carried.
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  // NOTE(review): this walk assumes Inst is defined inside the region, so
  // the parent-loop chain reaches RegionLoop (or a loop containing
  // ObservingBlock) before running off the top — confirm callers uphold
  // this, otherwise Loop could become null and be dereferenced.
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}
153 :
154 12 : bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
155 : // joining divergent disjoint path in Phi parent block
156 12 : if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
157 : return true;
158 : }
159 :
160 : // An incoming value could be divergent by itself.
161 : // Otherwise, an incoming value could be uniform within the loop
162 : // that carries its definition but it may appear divergent
163 : // from outside the loop. This happens when divergent loop exits
164 : // drop definitions of that uniform value in different iterations.
165 : //
166 : // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
167 : // if (i % thread_id == 0) break; // divergent loop exit
168 : // }
169 : // int divI = i; // divI is divergent
170 1 : for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
171 : const auto *InVal = Phi.getIncomingValue(i);
172 2 : if (isDivergent(*Phi.getIncomingValue(i)) ||
173 1 : isTemporalDivergent(*Phi.getParent(), *InVal)) {
174 1 : return true;
175 : }
176 : }
177 : return false;
178 : }
179 :
180 13 : bool DivergenceAnalysis::inRegion(const Instruction &I) const {
181 13 : return I.getParent() && inRegion(*I.getParent());
182 : }
183 :
184 29 : bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
185 29 : return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
186 : }
187 :
// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
//
// This runs only when the function is NOT in LCSSA form: users of
// loop-carried values are then not confined to exit-block phis, so the walk
// below visits the whole dominance region of the loop (plus phi nodes at its
// fringes), starting from the loop's exit blocks.
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  // Seed the walk with the loop's exit blocks.
  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Otherwise potential users of loop-carried values could be anywhere in the
  // dominance region of DivLoop (including its fringes for phi nodes)
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  // Marking the header visited keeps the forward walk from re-entering the
  // divergent loop itself.
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    // Reducible control flow guarantees exit blocks lie outside the loop.
    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        // One operand defined inside the divergent loop makes I a user of a
        // loop-carried value; no need to inspect the remaining operands.
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}
253 :
254 13 : void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
255 25 : for (const auto &Phi : Block.phis()) {
256 12 : if (isDivergent(Phi))
257 : continue;
258 12 : Worklist.push_back(&Phi);
259 : }
260 13 : }
261 :
262 29 : void DivergenceAnalysis::pushUsers(const Value &V) {
263 42 : for (const auto *User : V.users()) {
264 13 : const auto *UserInst = dyn_cast<const Instruction>(User);
265 13 : if (!UserInst)
266 0 : continue;
267 :
268 13 : if (isDivergent(*UserInst))
269 : continue;
270 :
271 : // only compute divergent inside loop
272 13 : if (!inRegion(*UserInst))
273 : continue;
274 13 : Worklist.push_back(UserInst);
275 : }
276 29 : }
277 :
278 13 : bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
279 : const Loop *BranchLoop) {
280 : LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
281 :
282 : // ignore divergence outside the region
283 13 : if (!inRegion(JoinBlock)) {
284 : return false;
285 : }
286 :
287 : // push non-divergent phi nodes in JoinBlock to the worklist
288 13 : pushPHINodes(JoinBlock);
289 :
290 : // JoinBlock is a divergent loop exit
291 15 : if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
292 : return true;
293 : }
294 :
295 : // disjoint-paths divergent at JoinBlock
296 : markBlockJoinDivergent(JoinBlock);
297 11 : return false;
298 : }
299 :
/// Propagate divergence caused by the newly-divergent terminator \p Term:
/// mark \p Term divergent, taint all join points of its disjoint paths (as
/// computed by SyncDependenceAnalysis), and if a loop exit became divergent,
/// escalate to whole-loop divergence propagation.
void DivergenceAnalysis::propagateBranchDivergence(const TerminatorInst &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from Term within the
  // loop; also iterates over loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // Branch loop is a divergent loop due to the divergent branch in Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    // If BranchLoop was already recorded as divergent, its exits were
    // processed before; stop here to avoid redundant recursion.
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
325 :
/// Propagate the divergence of \p ExitingLoop (a loop with at least one
/// divergent exit): taint its live-out users (unless LCSSA form already
/// makes them explicit as exit-block phis) and taint the join points of the
/// disjoint paths emerging from its exits. Recurses outward into the parent
/// loop if one of the parent's exits becomes divergent in turn.
void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  // The surrounding loop (null at function scope) in which further
  // divergent exits would be detected.
  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from exits of
  // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // Branch loop is a divergent due to divergent loop exit in ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    // An already-divergent parent loop was handled before; avoid re-entry.
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
363 :
/// Run the fixed-point divergence propagation: seed the worklist with the
/// users of all initially-divergent values, then repeatedly pop an
/// instruction, decide whether it became divergent (via its terminator
/// condition, its phi inputs, or its operands), and push its users. Since
/// marking is monotone (values only ever become divergent), the loop
/// terminates.
void DivergenceAnalysis::compute() {
  // Seed: the sources of divergence were registered via markDivergent()
  // before compute() was called.
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();

    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;

    // Already-divergent instructions never need re-processing (monotone).
    bool WasDivergent = isDivergent(I);
    if (WasDivergent)
      continue;

    // propagate divergence caused by terminator
    if (isa<TerminatorInst>(I)) {
      auto &Term = cast<TerminatorInst>(I);
      if (updateTerminator(Term)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(Term);
        continue;
      }
    }

    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    const auto *Phi = dyn_cast<const PHINode>(&I);
    if (Phi) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }

    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}
408 :
409 26 : bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
410 26 : return UniformOverrides.find(&V) != UniformOverrides.end();
411 : }
412 :
413 93 : bool DivergenceAnalysis::isDivergent(const Value &V) const {
414 93 : return DivergentValues.find(&V) != DivergentValues.end();
415 : }
416 :
417 0 : void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
418 0 : if (DivergentValues.empty())
419 : return;
420 : // iterate instructions using instructions() to ensure a deterministic order.
421 0 : for (auto &I : instructions(F)) {
422 0 : if (isDivergent(I))
423 0 : OS << "DIVERGENT:" << I << '\n';
424 : }
425 : }
|