LCOV - code coverage report
Current view: top level - lib/Analysis - DivergenceAnalysis.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 87 90 96.7 %
Date: 2018-07-13 00:08:38 Functions: 12 13 92.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file implements divergence analysis which determines whether a branch
      11             : // in a GPU program is divergent.It can help branch optimizations such as jump
      12             : // threading and loop unswitching to make better decisions.
      13             : //
      14             : // GPU programs typically use the SIMD execution model, where multiple threads
      15             : // in the same execution group have to execute in lock-step. Therefore, if the
      16             : // code contains divergent branches (i.e., threads in a group do not agree on
      17             : // which path of the branch to take), the group of threads has to execute all
      18             : // the paths from that branch with different subsets of threads enabled until
      19             : // they converge at the immediately post-dominating BB of the paths.
      20             : //
      21             : // Due to this execution model, some optimizations such as jump
      22             : // threading and loop unswitching can be unfortunately harmful when performed on
      23             : // divergent branches. Therefore, an analysis that computes which branches in a
      24             : // GPU program are divergent can help the compiler to selectively run these
      25             : // optimizations.
      26             : //
      27             : // This file defines divergence analysis which computes a conservative but
      28             : // non-trivial approximation of all divergent branches in a GPU program. It
      29             : // partially implements the approach described in
      30             : //
      31             : //   Divergence Analysis
      32             : //   Sampaio, Souza, Collange, Pereira
      33             : //   TOPLAS '13
      34             : //
      35             : // The divergence analysis identifies the sources of divergence (e.g., special
      36             : // variables that hold the thread ID), and recursively marks variables that are
      37             : // data or sync dependent on a source of divergence as divergent.
      38             : //
      39             : // While data dependency is a well-known concept, the notion of sync dependency
      40             : // is worth more explanation. Sync dependence characterizes the control flow
      41             : // aspect of the propagation of branch divergence. For example,
      42             : //
      43             : //   %cond = icmp slt i32 %tid, 10
      44             : //   br i1 %cond, label %then, label %else
      45             : // then:
      46             : //   br label %merge
      47             : // else:
      48             : //   br label %merge
      49             : // merge:
      50             : //   %a = phi i32 [ 0, %then ], [ 1, %else ]
      51             : //
      52             : // Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
      53             : // because %tid is not on its use-def chains, %a is sync dependent on %tid
      54             : // because the branch "br i1 %cond" depends on %tid and affects which value %a
      55             : // is assigned to.
      56             : //
      57             : // The current implementation has the following limitations:
      58             : // 1. intra-procedural. It conservatively considers the arguments of a
      59             : //    non-kernel-entry function and the return value of a function call as
      60             : //    divergent.
      61             : // 2. memory as black box. It conservatively considers values loaded from
      62             : //    generic or local address as divergent. This can be improved by leveraging
      63             : //    pointer analysis.
      64             : //
      65             : //===----------------------------------------------------------------------===//
      66             : 
      67             : #include "llvm/Analysis/DivergenceAnalysis.h"
      68             : #include "llvm/Analysis/Passes.h"
      69             : #include "llvm/Analysis/PostDominators.h"
      70             : #include "llvm/Analysis/TargetTransformInfo.h"
      71             : #include "llvm/IR/Dominators.h"
      72             : #include "llvm/IR/InstIterator.h"
      73             : #include "llvm/IR/Instructions.h"
      74             : #include "llvm/IR/Value.h"
      75             : #include "llvm/Support/Debug.h"
      76             : #include "llvm/Support/raw_ostream.h"
      77             : #include <vector>
      78             : using namespace llvm;
      79             : 
      80             : namespace {
      81             : 
      82             : class DivergencePropagator {
      83             : public:
      84             :   DivergencePropagator(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
      85             :                        PostDominatorTree &PDT, DenseSet<const Value *> &DV)
      86      179294 :       : F(F), TTI(TTI), DT(DT), PDT(PDT), DV(DV) {}
      87             :   void populateWithSourcesOfDivergence();
      88             :   void propagate();
      89             : 
      90             : private:
      91             :   // A helper function that explores data dependents of V.
      92             :   void exploreDataDependency(Value *V);
      93             :   // A helper function that explores sync dependents of TI.
      94             :   void exploreSyncDependency(TerminatorInst *TI);
      95             :   // Computes the influence region from Start to End. This region includes all
      96             :   // basic blocks on any simple path from Start to End.
      97             :   void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
      98             :                               DenseSet<BasicBlock *> &InfluenceRegion);
      99             :   // Finds all users of I that are outside the influence region, and add these
     100             :   // users to Worklist.
     101             :   void findUsersOutsideInfluenceRegion(
     102             :       Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);
     103             : 
     104             :   Function &F;
     105             :   TargetTransformInfo &TTI;
     106             :   DominatorTree &DT;
     107             :   PostDominatorTree &PDT;
     108             :   std::vector<Value *> Worklist; // Stack for DFS.
     109             :   DenseSet<const Value *> &DV;   // Stores all divergent values.
     110             : };
     111             : 
     112       89647 : void DivergencePropagator::populateWithSourcesOfDivergence() {
     113       89647 :   Worklist.clear();
     114       89647 :   DV.clear();
     115     1209384 :   for (auto &I : instructions(F)) {
     116     1119737 :     if (TTI.isSourceOfDivergence(&I)) {
     117       79082 :       Worklist.push_back(&I);
     118       79082 :       DV.insert(&I);
     119             :     }
     120             :   }
     121      295008 :   for (auto &Arg : F.args()) {
     122      205361 :     if (TTI.isSourceOfDivergence(&Arg)) {
     123       61574 :       Worklist.push_back(&Arg);
     124       61574 :       DV.insert(&Arg);
     125             :     }
     126             :   }
     127       89647 : }
     128             : 
     129        2221 : void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
     130             :   // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
     131             :   // immediate post dominator are divergent. This rule handles if-then-else
     132             :   // patterns. For example,
     133             :   //
     134             :   // if (tid < 5)
     135             :   //   a1 = 1;
     136             :   // else
     137             :   //   a2 = 2;
     138             :   // a = phi(a1, a2); // sync dependent on (tid < 5)
     139        2221 :   BasicBlock *ThisBB = TI->getParent();
     140             : 
     141             :   // Unreachable blocks may not be in the dominator tree.
     142        2221 :   if (!DT.isReachableFromEntry(ThisBB))
     143         134 :     return;
     144             : 
     145             :   // If the function has no exit blocks or doesn't reach any exit blocks, the
     146             :   // post dominator may be null.
     147        2220 :   DomTreeNode *ThisNode = PDT.getNode(ThisBB);
     148        2220 :   if (!ThisNode)
     149             :     return;
     150             : 
     151        2220 :   BasicBlock *IPostDom = ThisNode->getIDom()->getBlock();
     152        2220 :   if (IPostDom == nullptr)
     153             :     return;
     154             : 
     155        5745 :   for (auto I = IPostDom->begin(); isa<PHINode>(I); ++I) {
     156             :     // A PHINode is uniform if it returns the same value no matter which path is
     157             :     // taken.
     158        5089 :     if (!cast<PHINode>(I)->hasConstantOrUndefValue() && DV.insert(&*I).second)
     159        2010 :       Worklist.push_back(&*I);
     160             :   }
     161             : 
     162             :   // Propagation rule 2: if a value defined in a loop is used outside, the user
     163             :   // is sync dependent on the condition of the loop exits that dominate the
     164             :   // user. For example,
     165             :   //
     166             :   // int i = 0;
     167             :   // do {
     168             :   //   i++;
     169             :   //   if (foo(i)) ... // uniform
     170             :   // } while (i < tid);
     171             :   // if (bar(i)) ...   // divergent
     172             :   //
     173             :   // A program may contain unstructured loops. Therefore, we cannot leverage
     174             :   // LoopInfo, which only recognizes natural loops.
     175             :   //
     176             :   // The algorithm used here handles both natural and unstructured loops.  Given
     177             :   // a branch TI, we first compute its influence region, the union of all simple
     178             :   // paths from TI to its immediate post dominator (IPostDom). Then, we search
     179             :   // for all the values defined in the influence region but used outside. All
     180             :   // these users are sync dependent on TI.
     181             :   DenseSet<BasicBlock *> InfluenceRegion;
     182        2087 :   computeInfluenceRegion(ThisBB, IPostDom, InfluenceRegion);
     183             :   // An insight that can speed up the search process is that all the in-region
     184             :   // values that are used outside must dominate TI. Therefore, instead of
     185             :   // searching every basic blocks in the influence region, we search all the
     186             :   // dominators of TI until it is outside the influence region.
     187             :   BasicBlock *InfluencedBB = ThisBB;
     188         541 :   while (InfluenceRegion.count(InfluencedBB)) {
     189       10934 :     for (auto &I : *InfluencedBB)
     190       10393 :       findUsersOutsideInfluenceRegion(I, InfluenceRegion);
     191        1082 :     DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
     192         541 :     if (IDomNode == nullptr)
     193             :       break;
     194         541 :     InfluencedBB = IDomNode->getBlock();
     195             :   }
     196             : }
     197             : 
     198       10393 : void DivergencePropagator::findUsersOutsideInfluenceRegion(
     199             :     Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
     200       26483 :   for (User *U : I.users()) {
     201             :     Instruction *UserInst = cast<Instruction>(U);
     202       16090 :     if (!InfluenceRegion.count(UserInst->getParent())) {
     203       10904 :       if (DV.insert(UserInst).second)
     204        7398 :         Worklist.push_back(UserInst);
     205             :     }
     206             :   }
     207       10393 : }
     208             : 
     209             : // A helper function for computeInfluenceRegion that adds successors of "ThisBB"
     210             : // to the influence region.
     211             : static void
     212        5747 : addSuccessorsToInfluenceRegion(BasicBlock *ThisBB, BasicBlock *End,
     213             :                                DenseSet<BasicBlock *> &InfluenceRegion,
     214             :                                std::vector<BasicBlock *> &InfluenceStack) {
     215       14830 :   for (BasicBlock *Succ : successors(ThisBB)) {
     216       13786 :     if (Succ != End && InfluenceRegion.insert(Succ).second)
     217        3660 :       InfluenceStack.push_back(Succ);
     218             :   }
     219        5747 : }
     220             : 
     221        2087 : void DivergencePropagator::computeInfluenceRegion(
     222             :     BasicBlock *Start, BasicBlock *End,
     223             :     DenseSet<BasicBlock *> &InfluenceRegion) {
     224             :   assert(PDT.properlyDominates(End, Start) &&
     225             :          "End does not properly dominate Start");
     226             : 
     227             :   // The influence region starts from the end of "Start" to the beginning of
     228             :   // "End". Therefore, "Start" should not be in the region unless "Start" is in
     229             :   // a loop that doesn't contain "End".
     230             :   std::vector<BasicBlock *> InfluenceStack;
     231        2087 :   addSuccessorsToInfluenceRegion(Start, End, InfluenceRegion, InfluenceStack);
     232        5747 :   while (!InfluenceStack.empty()) {
     233        3660 :     BasicBlock *BB = InfluenceStack.back();
     234             :     InfluenceStack.pop_back();
     235        3660 :     addSuccessorsToInfluenceRegion(BB, End, InfluenceRegion, InfluenceStack);
     236             :   }
     237        2087 : }
     238             : 
     239      254711 : void DivergencePropagator::exploreDataDependency(Value *V) {
     240             :   // Follow def-use chains of V.
     241      489393 :   for (User *U : V->users()) {
     242             :     Instruction *UserInst = cast<Instruction>(U);
     243      524336 :     if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second)
     244      359358 :       Worklist.push_back(UserInst);
     245             :   }
     246      254711 : }
     247             : 
     248       89647 : void DivergencePropagator::propagate() {
     249             :   // Traverse the dependency graph using DFS.
     250      599069 :   while (!Worklist.empty()) {
     251      254711 :     Value *V = Worklist.back();
     252             :     Worklist.pop_back();
     253             :     if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
     254             :       // Terminators with less than two successors won't introduce sync
     255             :       // dependency. Ignore them.
     256        8079 :       if (TI->getNumSuccessors() > 1)
     257        2221 :         exploreSyncDependency(TI);
     258             :     }
     259      254711 :     exploreDataDependency(V);
     260             :   }
     261       89647 : }
     262             : 
     263             : } /// end namespace anonymous
     264             : 
     265             : // Register this pass.
     266             : char DivergenceAnalysis::ID = 0;
     267       73266 : INITIALIZE_PASS_BEGIN(DivergenceAnalysis, "divergence", "Divergence Analysis",
     268             :                       false, true)
     269       73266 : INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
     270       73266 : INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
     271     1043054 : INITIALIZE_PASS_END(DivergenceAnalysis, "divergence", "Divergence Analysis",
     272             :                     false, true)
     273             : 
     274           0 : FunctionPass *llvm::createDivergenceAnalysisPass() {
     275           0 :   return new DivergenceAnalysis();
     276             : }
     277             : 
     278        9276 : void DivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
     279             :   AU.addRequired<DominatorTreeWrapperPass>();
     280             :   AU.addRequired<PostDominatorTreeWrapperPass>();
     281             :   AU.setPreservesAll();
     282        9276 : }
     283             : 
     284       92336 : bool DivergenceAnalysis::runOnFunction(Function &F) {
     285       92336 :   auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
     286       92336 :   if (TTIWP == nullptr)
     287             :     return false;
     288             : 
     289       92336 :   TargetTransformInfo &TTI = TTIWP->getTTI(F);
     290             :   // Fast path: if the target does not have branch divergence, we do not mark
     291             :   // any branch as divergent.
     292       92336 :   if (!TTI.hasBranchDivergence())
     293             :     return false;
     294             : 
     295             :   DivergentValues.clear();
     296       89647 :   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
     297             :   DivergencePropagator DP(F, TTI,
     298       89647 :                           getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
     299       89647 :                           PDT, DivergentValues);
     300       89647 :   DP.populateWithSourcesOfDivergence();
     301       89647 :   DP.propagate();
     302             :   return false;
     303             : }
     304             : 
     305          51 : void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
     306          51 :   if (DivergentValues.empty())
     307             :     return;
     308          50 :   const Value *FirstDivergentValue = *DivergentValues.begin();
     309             :   const Function *F;
     310             :   if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
     311          18 :     F = Arg->getParent();
     312             :   } else if (const Instruction *I =
     313             :                  dyn_cast<Instruction>(FirstDivergentValue)) {
     314          32 :     F = I->getParent()->getParent();
     315             :   } else {
     316           0 :     llvm_unreachable("Only arguments and instructions can be divergent");
     317             :   }
     318             : 
     319             :   // Dumps all divergent values in F, arguments and then instructions.
     320         163 :   for (auto &Arg : F->args()) {
     321         113 :     if (DivergentValues.count(&Arg))
     322         190 :       OS << "DIVERGENT:  " << Arg << "\n";
     323             :   }
     324             :   // Iterate instructions using instructions() to ensure a deterministic order.
     325         208 :   for (auto &I : instructions(F)) {
     326         208 :     if (DivergentValues.count(&I))
     327         302 :       OS << "DIVERGENT:" << I << "\n";
     328             :   }
     329             : }

Generated by: LCOV version 1.13