LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUCodeGenPrepare.cpp (source / functions)
Test: llvm-toolchain.info          Lines:     246 of 338   (72.8 %)
Date: 2018-10-20 13:21:21          Functions:  17 of  30   (56.7 %)
Legend: Lines: hit / not hit

          Line data    Source code
       1             : //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// This pass does misc. AMDGPU optimizations on IR before instruction
      12             : /// selection.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUSubtarget.h"
      18             : #include "AMDGPUTargetMachine.h"
      19             : #include "llvm/ADT/StringRef.h"
      20             : #include "llvm/Analysis/AssumptionCache.h"
      21             : #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
      22             : #include "llvm/Analysis/Loads.h"
      23             : #include "llvm/Analysis/ValueTracking.h"
      24             : #include "llvm/CodeGen/Passes.h"
      25             : #include "llvm/CodeGen/TargetPassConfig.h"
      26             : #include "llvm/IR/Attributes.h"
      27             : #include "llvm/IR/BasicBlock.h"
      28             : #include "llvm/IR/Constants.h"
      29             : #include "llvm/IR/DerivedTypes.h"
      30             : #include "llvm/IR/Function.h"
      31             : #include "llvm/IR/IRBuilder.h"
      32             : #include "llvm/IR/InstVisitor.h"
      33             : #include "llvm/IR/InstrTypes.h"
      34             : #include "llvm/IR/Instruction.h"
      35             : #include "llvm/IR/Instructions.h"
      36             : #include "llvm/IR/IntrinsicInst.h"
      37             : #include "llvm/IR/Intrinsics.h"
      38             : #include "llvm/IR/LLVMContext.h"
      39             : #include "llvm/IR/Operator.h"
      40             : #include "llvm/IR/Type.h"
      41             : #include "llvm/IR/Value.h"
      42             : #include "llvm/Pass.h"
      43             : #include "llvm/Support/Casting.h"
      44             : #include <cassert>
      45             : #include <iterator>
      46             : 
      47             : #define DEBUG_TYPE "amdgpu-codegenprepare"
      48             : 
      49             : using namespace llvm;
      50             : 
      51             : namespace {
      52             : 
      53             : static cl::opt<bool> WidenLoads(
      54             :   "amdgpu-codegenprepare-widen-constant-loads",
      55             :   cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
      56             :   cl::ReallyHidden,
      57             :   cl::init(true));
      58             : 
      59             : class AMDGPUCodeGenPrepare : public FunctionPass,
      60             :                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
      61             :   const GCNSubtarget *ST = nullptr;
      62             :   AssumptionCache *AC = nullptr;
      63             :   LegacyDivergenceAnalysis *DA = nullptr;
      64             :   Module *Mod = nullptr;
      65             :   bool HasUnsafeFPMath = false;
      66             : 
       67             :   /// \returns \p T's base element bit width: the bit width of \p T itself
       68             :   /// if it is an integer type, or the bit width of its element type if it
       69             :   /// is a vector type.
       70             :   ///
       71             :   /// \pre \p T must satisfy needsPromotionToI32().
      72             :   unsigned getBaseElementBitWidth(const Type *T) const;
      73             : 
      74             :   /// \returns Equivalent 32 bit integer type for given type \p T. For example,
      75             :   /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
      76             :   /// is returned.
      77             :   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
      78             : 
      79             :   /// \returns True if binary operation \p I is a signed binary operation, false
      80             :   /// otherwise.
      81             :   bool isSigned(const BinaryOperator &I) const;
      82             : 
      83             :   /// \returns True if the condition of 'select' operation \p I comes from a
      84             :   /// signed 'icmp' operation, false otherwise.
      85             :   bool isSigned(const SelectInst &I) const;
      86             : 
      87             :   /// \returns True if type \p T needs to be promoted to 32 bit integer type,
      88             :   /// false otherwise.
      89             :   bool needsPromotionToI32(const Type *T) const;
      90             : 
      91             :   /// Promotes uniform binary operation \p I to equivalent 32 bit binary
      92             :   /// operation.
      93             :   ///
      94             :   /// \details \p I's base element bit width must be greater than 1 and less
       95             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
      96             :   /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
      97             :   /// truncating the result of 32 bit binary operation back to \p I's original
      98             :   /// type. Division operation is not promoted.
      99             :   ///
     100             :   /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
     101             :   /// false otherwise.
     102             :   bool promoteUniformOpToI32(BinaryOperator &I) const;
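For a uniform i16 add, the promotion described above amounts to the following C++ analogue (an illustrative sketch, not part of this file; the add is zero-extended because isSigned() only treats ashr/sdiv/srem as signed, and the promoted add legitimately receives nuw/nsw):

    #include <cstdint>

    static uint16_t promotedAddI16(uint16_t a, uint16_t b) {
      uint32_t ext0 = a;            // CreateZExt(I.getOperand(0), I32Ty)
      uint32_t ext1 = b;            // CreateZExt(I.getOperand(1), I32Ty)
      uint32_t wide = ext0 + ext1;  // CreateBinOp(Add, ...) with nuw/nsw set
      return (uint16_t)wide;        // CreateTrunc back to the original type
    }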
     103             : 
     104             :   /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
     105             :   ///
     106             :   /// \details \p I's base element bit width must be greater than 1 and less
      107             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     108             :   /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
     109             :   ///
     110             :   /// \returns True.
     111             :   bool promoteUniformOpToI32(ICmpInst &I) const;
     112             : 
     113             :   /// Promotes uniform 'select' operation \p I to 32 bit 'select'
     114             :   /// operation.
     115             :   ///
     116             :   /// \details \p I's base element bit width must be greater than 1 and less
      117             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     118             :   /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
     119             :   /// result of 32 bit 'select' operation back to \p I's original type.
     120             :   ///
     121             :   /// \returns True.
     122             :   bool promoteUniformOpToI32(SelectInst &I) const;
     123             : 
     124             :   /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
     125             :   /// intrinsic.
     126             :   ///
     127             :   /// \details \p I's base element bit width must be greater than 1 and less
      128             :   /// than or equal to 16. Promotion is done by zero extending the operand to 32
     129             :   /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
     130             :   /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
     131             :   /// shift amount is 32 minus \p I's base element bit width), and truncating
     132             :   /// the result of the shift operation back to \p I's original type.
     133             :   ///
     134             :   /// \returns True.
     135             :   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
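A C++ analogue of the i16 bitreverse promotion (an illustrative sketch, not part of this file; bitreverse32 below stands in for the llvm.bitreverse.i32 intrinsic):

    #include <cstdint>

    static uint32_t bitreverse32(uint32_t x) {   // stand-in for llvm.bitreverse.i32
      uint32_t r = 0;
      for (int i = 0; i < 32; ++i)
        r |= ((x >> i) & 1u) << (31 - i);
      return r;
    }

    static uint16_t promotedBitreverseI16(uint16_t v) {
      uint32_t ext = v;                          // zext the operand to i32
      uint32_t rev = bitreverse32(ext);          // 32-bit bitreverse
      return (uint16_t)(rev >> (32 - 16));       // lshr by 32 - bit width, then trunc
    }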
     136             : 
     137             :   /// Expands 24 bit div or rem.
     138             :   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
     139             :                         Value *Num, Value *Den,
     140             :                         bool IsDiv, bool IsSigned) const;
     141             : 
     142             :   /// Expands 32 bit div or rem.
     143             :   Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
     144             :                         Value *Num, Value *Den) const;
     145             : 
      146             :   /// Check whether a scalar load can be widened.
      147             :   ///
      148             :   /// \details visitLoadInst widens uniform, sub-dword loads from constant
      149             :   /// memory to a full 32 bits and then truncates the result, so that a
      150             :   /// scalar load can be selected instead of a vector load.
      151             :   ///
      152             :   /// \returns True if \p I is simple, uniform, sub-dword, and 4-byte aligned.
     153             : 
     154             :   bool canWidenScalarExtLoad(LoadInst &I) const;
     155             : 
     156             : public:
     157             :   static char ID;
     158             : 
     159        3942 :   AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
     160             : 
     161             :   bool visitFDiv(BinaryOperator &I);
     162             : 
     163           0 :   bool visitInstruction(Instruction &I) { return false; }
     164             :   bool visitBinaryOperator(BinaryOperator &I);
     165             :   bool visitLoadInst(LoadInst &I);
     166             :   bool visitICmpInst(ICmpInst &I);
     167             :   bool visitSelectInst(SelectInst &I);
     168             : 
     169             :   bool visitIntrinsicInst(IntrinsicInst &I);
     170             :   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
     171             : 
     172             :   bool doInitialization(Module &M) override;
     173             :   bool runOnFunction(Function &F) override;
     174             : 
     175           0 :   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
     176             : 
     177        1955 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     178             :     AU.addRequired<AssumptionCacheTracker>();
     179             :     AU.addRequired<LegacyDivergenceAnalysis>();
     180             :     AU.setPreservesAll();
     181        1955 :  }
     182             : };
     183             : 
     184             : } // end anonymous namespace
     185             : 
     186           0 : unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
     187             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     188             : 
     189           0 :   if (T->isIntegerTy())
     190           0 :     return T->getIntegerBitWidth();
     191           0 :   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
     192             : }
     193             : 
     194           0 : Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
     195             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     196             : 
     197           0 :   if (T->isIntegerTy())
     198           0 :     return B.getInt32Ty();
     199           0 :   return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
     200             : }
     201             : 
     202           0 : bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
     203           0 :   return I.getOpcode() == Instruction::AShr ||
     204           0 :       I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
     205             : }
     206             : 
     207           0 : bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
     208             :   return isa<ICmpInst>(I.getOperand(0)) ?
     209           0 :       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
     210             : }
     211             : 
     212             : bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
     213             :   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
     214        3814 :   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
     215             :     return true;
     216             : 
     217             :   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
     218             :     // TODO: The set of packed operations is more limited, so may want to
     219             :     // promote some anyway.
     220         886 :     if (ST->hasVOP3PInsts())
     221             :       return false;
     222             : 
     223         526 :     return needsPromotionToI32(VT->getElementType());
     224             :   }
     225             : 
     226             :   return false;
     227             : }
     228             : 
     229             : // Return true if the op promoted to i32 should have nsw set.
     230         878 : static bool promotedOpIsNSW(const Instruction &I) {
     231         878 :   switch (I.getOpcode()) {
     232             :   case Instruction::Shl:
     233             :   case Instruction::Add:
     234             :   case Instruction::Sub:
     235             :     return true;
     236         298 :   case Instruction::Mul:
     237         298 :     return I.hasNoUnsignedWrap();
     238          68 :   default:
     239          68 :     return false;
     240             :   }
     241             : }
     242             : 
     243             : // Return true if the op promoted to i32 should have nuw set.
     244         878 : static bool promotedOpIsNUW(const Instruction &I) {
     245         878 :   switch (I.getOpcode()) {
     246             :   case Instruction::Shl:
     247             :   case Instruction::Add:
     248             :   case Instruction::Mul:
     249             :     return true;
     250          26 :   case Instruction::Sub:
     251          26 :     return I.hasNoUnsignedWrap();
     252          68 :   default:
     253          68 :     return false;
     254             :   }
     255             : }
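The unconditional answers above are justified because the flag-carrying ops are only ever promoted with zero-extended sub-dword operands; a worked example for the i16 add case (my own illustration, not from this file):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Worst case for a promoted i16 add: each zero-extended operand is at
      // most 0xFFFF, so the 32-bit sum is at most 0x1FFFE and can neither
      // wrap the unsigned range nor exceed INT32_MAX, justifying nuw and nsw.
      uint32_t a = 0xFFFFu, b = 0xFFFFu;
      assert(a + b == 0x1FFFEu);
      assert(a + b <= (uint32_t)INT32_MAX);
      return 0;
    }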
     256             : 
     257           0 : bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
     258           0 :   Type *Ty = I.getType();
     259           0 :   const DataLayout &DL = Mod->getDataLayout();
     260           0 :   int TySize = DL.getTypeSizeInBits(Ty);
     261           0 :   unsigned Align = I.getAlignment() ?
     262           0 :                    I.getAlignment() : DL.getABITypeAlignment(Ty);
     263             : 
     264           0 :   return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
     265             : }
     266             : 
     267           0 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
     268             :   assert(needsPromotionToI32(I.getType()) &&
     269             :          "I does not need promotion to i32");
     270             : 
     271           0 :   if (I.getOpcode() == Instruction::SDiv ||
     272           0 :       I.getOpcode() == Instruction::UDiv ||
     273           0 :       I.getOpcode() == Instruction::SRem ||
     274             :       I.getOpcode() == Instruction::URem)
     275           0 :     return false;
     276             : 
     277           0 :   IRBuilder<> Builder(&I);
     278           0 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     279             : 
     280           0 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     281             :   Value *ExtOp0 = nullptr;
     282             :   Value *ExtOp1 = nullptr;
     283             :   Value *ExtRes = nullptr;
     284             :   Value *TruncRes = nullptr;
     285             : 
     286             :   if (isSigned(I)) {
     287           0 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     288           0 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     289             :   } else {
     290           0 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     291           0 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     292             :   }
     293             : 
     294           0 :   ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
     295             :   if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
     296           0 :     if (promotedOpIsNSW(cast<Instruction>(I)))
     297           0 :       Inst->setHasNoSignedWrap();
     298             : 
     299           0 :     if (promotedOpIsNUW(cast<Instruction>(I)))
     300           0 :       Inst->setHasNoUnsignedWrap();
     301             : 
     302             :     if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
     303           0 :       Inst->setIsExact(ExactOp->isExact());
     304             :   }
     305             : 
     306           0 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     307             : 
     308           0 :   I.replaceAllUsesWith(TruncRes);
     309           0 :   I.eraseFromParent();
     310             : 
     311             :   return true;
     312             : }
     313             : 
     314           0 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
     315             :   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
     316             :          "I does not need promotion to i32");
     317             : 
     318           0 :   IRBuilder<> Builder(&I);
     319           0 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     320             : 
     321           0 :   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
     322             :   Value *ExtOp0 = nullptr;
     323             :   Value *ExtOp1 = nullptr;
     324             :   Value *NewICmp  = nullptr;
     325             : 
     326           0 :   if (I.isSigned()) {
     327           0 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     328           0 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     329             :   } else {
     330           0 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     331           0 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     332             :   }
     333           0 :   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
     334             : 
     335           0 :   I.replaceAllUsesWith(NewICmp);
     336           0 :   I.eraseFromParent();
     337             : 
     338           0 :   return true;
     339             : }
     340             : 
     341           0 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
     342             :   assert(needsPromotionToI32(I.getType()) &&
     343             :          "I does not need promotion to i32");
     344             : 
     345           0 :   IRBuilder<> Builder(&I);
     346           0 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     347             : 
     348           0 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     349             :   Value *ExtOp1 = nullptr;
     350             :   Value *ExtOp2 = nullptr;
     351             :   Value *ExtRes = nullptr;
     352             :   Value *TruncRes = nullptr;
     353             : 
     354           0 :   if (isSigned(I)) {
     355           0 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     356           0 :     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
     357             :   } else {
     358           0 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     359           0 :     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
     360             :   }
     361           0 :   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
     362           0 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     363             : 
     364           0 :   I.replaceAllUsesWith(TruncRes);
     365           0 :   I.eraseFromParent();
     366             : 
     367           0 :   return true;
     368             : }
     369             : 
     370           0 : bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
     371             :     IntrinsicInst &I) const {
     372             :   assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
     373             :          "I must be bitreverse intrinsic");
     374             :   assert(needsPromotionToI32(I.getType()) &&
     375             :          "I does not need promotion to i32");
     376             : 
     377           0 :   IRBuilder<> Builder(&I);
     378           0 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     379             : 
     380           0 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     381             :   Function *I32 =
     382           0 :       Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
     383           0 :   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
     384           0 :   Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
     385             :   Value *LShrOp =
     386           0 :       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
     387             :   Value *TruncRes =
     388           0 :       Builder.CreateTrunc(LShrOp, I.getType());
     389             : 
     390           0 :   I.replaceAllUsesWith(TruncRes);
     391           0 :   I.eraseFromParent();
     392             : 
     393           0 :   return true;
     394             : }
     395             : 
     396          74 : static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
     397             :   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
     398             :   if (!CNum)
     399             :     return HasDenormals;
     400             : 
     401          59 :   if (UnsafeDiv)
     402             :     return true;
     403             : 
     404          59 :   bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
     405             : 
     406             :   // Reciprocal f32 is handled separately without denormals.
     407          59 :   return HasDenormals ^ IsOne;
     408             : }
     409             : 
     410             : // Insert an intrinsic for fast fdiv for safe math situations where we can
     411             : // reduce precision. Leave fdiv for situations where the generic node is
     412             : // expected to be optimized.
     413         300 : bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     414         300 :   Type *Ty = FDiv.getType();
     415             : 
     416         300 :   if (!Ty->getScalarType()->isFloatTy())
     417             :     return false;
     418             : 
     419         211 :   MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
     420          69 :   if (!FPMath)
     421         142 :     return false;
     422             : 
     423             :   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
     424          69 :   float ULP = FPOp->getFPAccuracy();
     425          69 :   if (ULP < 2.5f)
     426             :     return false;
     427             : 
     428             :   FastMathFlags FMF = FPOp->getFastMathFlags();
     429          61 :   bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
     430             :                                       FMF.allowReciprocal();
     431             : 
      432             :   // With UnsafeDiv, the node will be optimized to just rcp and mul.
     433             :   if (UnsafeDiv)
     434             :     return false;
     435             : 
     436          74 :   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
     437             :   Builder.setFastMathFlags(FMF);
     438          37 :   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
     439             : 
     440          37 :   Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
     441             : 
     442             :   Value *Num = FDiv.getOperand(0);
     443             :   Value *Den = FDiv.getOperand(1);
     444             : 
     445             :   Value *NewFDiv = nullptr;
     446             : 
     447          37 :   bool HasDenormals = ST->hasFP32Denormals();
     448             :   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     449          13 :     NewFDiv = UndefValue::get(VT);
     450             : 
     451             :     // FIXME: Doesn't do the right thing for cases where the vector is partially
     452             :     // constant. This works when the scalarizer pass is run first.
     453          63 :     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
     454         100 :       Value *NumEltI = Builder.CreateExtractElement(Num, I);
     455          50 :       Value *DenEltI = Builder.CreateExtractElement(Den, I);
     456             :       Value *NewElt;
     457             : 
     458          50 :       if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
     459          24 :         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
     460             :       } else {
     461          26 :         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
     462             :       }
     463             : 
     464          50 :       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     465             :     }
     466             :   } else {
     467          24 :     if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
     468          11 :       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
     469             :   }
     470             : 
     471          24 :   if (NewFDiv) {
     472          24 :     FDiv.replaceAllUsesWith(NewFDiv);
     473          24 :     NewFDiv->takeName(&FDiv);
     474          24 :     FDiv.eraseFromParent();
     475             :   }
     476             : 
     477          37 :   return !!NewFDiv;
     478             : }
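The early-outs above condense to the following predicate (a sketch of my reading of visitFDiv, not an exported API; shouldKeepFDivF32 then filters individual operands):

    static bool wantsFDivFast(bool scalarTypeIsF32, bool hasFPMathMD,
                              float allowedULP, bool unsafeDiv) {
      // Only fdivs whose scalar type is f32, carrying !fpmath metadata that
      // permits at least 2.5 ULP, in an otherwise safe-math context, become
      // candidates for amdgcn.fdiv.fast.
      return scalarTypeIsF32 && hasFPMathMD && allowedULP >= 2.5f && !unsafeDiv;
    }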
     479             : 
     480       19715 : static bool hasUnsafeFPMath(const Function &F) {
     481       19715 :   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
     482       19715 :   return Attr.getValueAsString() == "true";
     483             : }
     484             : 
     485         513 : static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
     486             :                                           Value *LHS, Value *RHS) {
     487         513 :   Type *I32Ty = Builder.getInt32Ty();
     488         513 :   Type *I64Ty = Builder.getInt64Ty();
     489             : 
     490         513 :   Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
     491         513 :   Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
     492         513 :   Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
     493         513 :   Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
     494         513 :   Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
     495         513 :   Hi = Builder.CreateTrunc(Hi, I32Ty);
     496         513 :   return std::make_pair(Lo, Hi);
     497             : }
     498             : 
     499             : static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
     500         342 :   return getMul64(Builder, LHS, RHS).second;
     501             : }
     502             : 
      503             : // The significand of a single-precision float is wide enough to accurately
      504             : // represent up to a 24-bit signed integer.
     505         281 : Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
     506             :                                             BinaryOperator &I,
     507             :                                             Value *Num, Value *Den,
     508             :                                             bool IsDiv, bool IsSigned) const {
     509             :   assert(Num->getType()->isIntegerTy(32));
     510             : 
     511         281 :   const DataLayout &DL = Mod->getDataLayout();
     512         281 :   unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
     513         281 :   if (LHSSignBits < 9)
     514             :     return nullptr;
     515             : 
     516         120 :   unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
     517         120 :   if (RHSSignBits < 9)
     518             :     return nullptr;
     519             : 
     520             : 
     521         110 :   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
     522         110 :   unsigned DivBits = 32 - SignBits;
     523         110 :   if (IsSigned)
     524          58 :     ++DivBits;
     525             : 
     526         110 :   Type *Ty = Num->getType();
     527         110 :   Type *I32Ty = Builder.getInt32Ty();
     528         110 :   Type *F32Ty = Builder.getFloatTy();
     529         110 :   ConstantInt *One = Builder.getInt32(1);
     530             :   Value *JQ = One;
     531             : 
     532         110 :   if (IsSigned) {
     533             :     // char|short jq = ia ^ ib;
     534          58 :     JQ = Builder.CreateXor(Num, Den);
     535             : 
     536             :     // jq = jq >> (bitsize - 2)
     537          58 :     JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
     538             : 
     539             :     // jq = jq | 0x1
     540          58 :     JQ = Builder.CreateOr(JQ, One);
     541             :   }
     542             : 
     543             :   // int ia = (int)LHS;
     544             :   Value *IA = Num;
     545             : 
      546             :   // int ib = (int)RHS;
     547             :   Value *IB = Den;
     548             : 
     549             :   // float fa = (float)ia;
     550         110 :   Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
     551         110 :                        : Builder.CreateUIToFP(IA, F32Ty);
     552             : 
     553             :   // float fb = (float)ib;
     554         110 :   Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
     555         110 :                        : Builder.CreateUIToFP(IB,F32Ty);
     556             : 
     557         110 :   Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
     558         110 :   Value *FQM = Builder.CreateFMul(FA, RCP);
     559             : 
     560             :   // fq = trunc(fqm);
     561         110 :   CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
     562         110 :   FQ->copyFastMathFlags(Builder.getFastMathFlags());
     563             : 
     564             :   // float fqneg = -fq;
     565         110 :   Value *FQNeg = Builder.CreateFNeg(FQ);
     566             : 
     567             :   // float fr = mad(fqneg, fb, fa);
     568         220 :   Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
     569         110 :                                       {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
     570             : 
     571             :   // int iq = (int)fq;
     572         110 :   Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
     573         110 :                        : Builder.CreateFPToUI(FQ, I32Ty);
     574             : 
     575             :   // fr = fabs(fr);
     576         110 :   FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
     577             : 
     578             :   // fb = fabs(fb);
     579         110 :   FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
     580             : 
     581             :   // int cv = fr >= fb;
     582         110 :   Value *CV = Builder.CreateFCmpOGE(FR, FB);
     583             : 
     584             :   // jq = (cv ? jq : 0);
     585         110 :   JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
     586             : 
     587             :   // dst = iq + jq;
     588         110 :   Value *Div = Builder.CreateAdd(IQ, JQ);
     589             : 
     590             :   Value *Res = Div;
     591         110 :   if (!IsDiv) {
     592             :     // Rem needs compensation, it's easier to recompute it
     593          42 :     Value *Rem = Builder.CreateMul(Div, Den);
     594          42 :     Res = Builder.CreateSub(Num, Rem);
     595             :   }
     596             : 
      597             :   // Truncate to the number of bits this divide really needs.
     598         110 :   if (IsSigned) {
     599         116 :     Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
     600          58 :     Res = Builder.CreateSExt(Res, Ty);
     601             :   } else {
     602          52 :     ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
     603          52 :     Res = Builder.CreateAnd(Res, TruncMask);
     604             :   }
     605             : 
     606             :   return Res;
     607             : }
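A standalone C++ sketch of the unsigned case of this expansion (illustrative only; it assumes both inputs fit in 24 bits, matching the sign-bit checks above, and divU24 is a hypothetical name):

    #include <cmath>
    #include <cstdint>

    static uint32_t divU24(uint32_t num, uint32_t den) {
      float fa = (float)num;
      float fb = (float)den;
      float fq = std::truncf(fa * (1.0f / fb));      // fq = trunc(fa * rcp(fb))
      uint32_t iq = (uint32_t)fq;                    // int iq = (int)fq
      float fr = std::fabsf(std::fmaf(-fq, fb, fa)); // fr = |mad(-fq, fb, fa)|
      // If the recomputed remainder reached the divisor, the truncated
      // quotient was one too small (jq is 1 for unsigned divides).
      return iq + (fr >= std::fabsf(fb) ? 1u : 0u);
    }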
     608             : 
     609         367 : Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
     610             :                                             BinaryOperator &I,
     611             :                                             Value *Num, Value *Den) const {
     612             :   Instruction::BinaryOps Opc = I.getOpcode();
     613             :   assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
     614             :          Opc == Instruction::SRem || Opc == Instruction::SDiv);
     615             : 
     616             :   FastMathFlags FMF;
     617             :   FMF.setFast();
     618             :   Builder.setFastMathFlags(FMF);
     619             : 
     620         367 :   if (isa<Constant>(Den))
     621             :     return nullptr; // Keep it for optimization
     622             : 
     623         281 :   bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
     624         281 :   bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
     625             : 
     626         281 :   Type *Ty = Num->getType();
     627         281 :   Type *I32Ty = Builder.getInt32Ty();
     628         281 :   Type *F32Ty = Builder.getFloatTy();
     629             : 
     630         281 :   if (Ty->getScalarSizeInBits() < 32) {
     631          96 :     if (IsSigned) {
     632          48 :       Num = Builder.CreateSExt(Num, I32Ty);
     633          48 :       Den = Builder.CreateSExt(Den, I32Ty);
     634             :     } else {
     635          48 :       Num = Builder.CreateZExt(Num, I32Ty);
     636          48 :       Den = Builder.CreateZExt(Den, I32Ty);
     637             :     }
     638             :   }
     639             : 
     640         281 :   if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
     641         110 :     Res = Builder.CreateTrunc(Res, Ty);
     642         110 :     return Res;
     643             :   }
     644             : 
     645         171 :   ConstantInt *Zero = Builder.getInt32(0);
     646         171 :   ConstantInt *One = Builder.getInt32(1);
     647         171 :   ConstantInt *MinusOne = Builder.getInt32(~0);
     648             : 
     649             :   Value *Sign = nullptr;
     650         171 :   if (IsSigned) {
     651          68 :     ConstantInt *K31 = Builder.getInt32(31);
     652          68 :     Value *LHSign = Builder.CreateAShr(Num, K31);
     653          68 :     Value *RHSign = Builder.CreateAShr(Den, K31);
     654             :     // Remainder sign is the same as LHS
     655          68 :     Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
     656             : 
     657          68 :     Num = Builder.CreateAdd(Num, LHSign);
     658          68 :     Den = Builder.CreateAdd(Den, RHSign);
     659             : 
     660          68 :     Num = Builder.CreateXor(Num, LHSign);
     661          68 :     Den = Builder.CreateXor(Den, RHSign);
     662             :   }
     663             : 
     664             :   // RCP =  URECIP(Den) = 2^32 / Den + e
     665             :   // e is rounding error.
     666         171 :   Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
     667         171 :   Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
     668         171 :   Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
     669         171 :   Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
     670         171 :   Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
     671             : 
      672             :   // RCP_LO, RCP_HI = mul(RCP, Den)
     673             :   Value *RCP_LO, *RCP_HI;
     674         171 :   std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
     675             : 
     676             :   // NEG_RCP_LO = -RCP_LO
     677         171 :   Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
     678             : 
     679             :   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
     680         171 :   Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
     681         171 :   Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
     682             : 
     683             :   // Calculate the rounding error from the URECIP instruction
     684             :   // E = mulhu(ABS_RCP_LO, RCP)
     685             :   Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
     686             : 
     687             :   // RCP_A_E = RCP + E
     688         171 :   Value *RCP_A_E = Builder.CreateAdd(RCP, E);
     689             : 
     690             :   // RCP_S_E = RCP - E
     691         171 :   Value *RCP_S_E = Builder.CreateSub(RCP, E);
     692             : 
     693             :   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
     694         171 :   Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
     695             : 
     696             :   // Quotient = mulhu(Tmp0, Num)
     697             :   Value *Quotient = getMulHu(Builder, Tmp0, Num);
     698             : 
     699             :   // Num_S_Remainder = Quotient * Den
     700         171 :   Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
     701             : 
     702             :   // Remainder = Num - Num_S_Remainder
     703         171 :   Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
     704             : 
     705             :   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
     706         171 :   Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
     707         171 :   Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
     708             : 
     709             :   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
     710         171 :   Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
     711         171 :   Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
     712             :                                                   MinusOne, Zero);
     713             : 
     714             :   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
     715         171 :   Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
     716         171 :   Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
     717             : 
     718             :   Value *Res;
     719         171 :   if (IsDiv) {
     720             :     // Quotient_A_One = Quotient + 1
     721         100 :     Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
     722             : 
     723             :     // Quotient_S_One = Quotient - 1
     724         100 :     Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
     725             : 
     726             :     // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
     727         100 :     Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
     728             : 
     729             :     // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
     730         100 :     Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
     731             :   } else {
     732             :     // Remainder_S_Den = Remainder - Den
     733          71 :     Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
     734             : 
     735             :     // Remainder_A_Den = Remainder + Den
     736          71 :     Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
     737             : 
     738             :     // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
     739          71 :     Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
     740             : 
     741             :     // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
     742          71 :     Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
     743             :   }
     744             : 
     745         171 :   if (IsSigned) {
     746          68 :     Res = Builder.CreateXor(Res, Sign);
     747          68 :     Res = Builder.CreateSub(Res, Sign);
     748             :   }
     749             : 
     750         171 :   Res = Builder.CreateTrunc(Res, Ty);
     751             : 
     752         171 :   return Res;
     753             : }
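A direct C++ transliteration of the unsigned-divide path above (an illustrative sketch with hypothetical names; it assumes den >= 2, since the float-to-integer conversion below would be undefined in C++ for den of 0 or 1, while the IR emitted above leaves those cases to the target's conversion semantics):

    #include <cstdint>

    static uint32_t mulhu32(uint32_t a, uint32_t b) {        // getMulHu
      return (uint32_t)(((uint64_t)a * b) >> 32);
    }

    static uint32_t expandedUDiv32(uint32_t num, uint32_t den) {
      uint32_t rcp = (uint32_t)((1.0f / (float)den) * 4294967296.0f);
      uint64_t prod = (uint64_t)rcp * den;                   // getMul64(RCP, Den)
      uint32_t rcpLo = (uint32_t)prod, rcpHi = (uint32_t)(prod >> 32);
      uint32_t absRcpLo = (rcpHi == 0) ? 0u - rcpLo : rcpLo;
      uint32_t e = mulhu32(absRcpLo, rcp);                   // reciprocal error
      uint32_t tmp0 = (rcpHi == 0) ? rcp + e : rcp - e;      // refined 2^32/den
      uint32_t q = mulhu32(tmp0, num);                       // quotient estimate
      uint32_t numSRem = q * den;
      uint32_t rem = num - numSRem;
      bool remGEDen  = rem >= den;                           // estimate too low?
      bool remGEZero = num >= numSRem;                       // estimate too high?
      uint32_t div = (remGEDen && remGEZero) ? q + 1 : q;
      return remGEZero ? div : q - 1;
    }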
     754             : 
     755       13104 : bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
     756       14160 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     757       15033 :       DA->isUniform(&I) && promoteUniformOpToI32(I))
     758             :     return true;
     759             : 
     760             :   bool Changed = false;
     761             :   Instruction::BinaryOps Opc = I.getOpcode();
     762       12222 :   Type *Ty = I.getType();
     763             :   Value *NewDiv = nullptr;
     764       12222 :   if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
     765       12075 :        Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
     766         286 :       Ty->getScalarSizeInBits() <= 32) {
     767             :     Value *Num = I.getOperand(0);
     768             :     Value *Den = I.getOperand(1);
     769         233 :     IRBuilder<> Builder(&I);
     770         233 :     Builder.SetCurrentDebugLocation(I.getDebugLoc());
     771             : 
     772             :     if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     773          60 :       NewDiv = UndefValue::get(VT);
     774             : 
     775         254 :       for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
     776         388 :         Value *NumEltN = Builder.CreateExtractElement(Num, N);
     777         194 :         Value *DenEltN = Builder.CreateExtractElement(Den, N);
     778         194 :         Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
     779         194 :         if (!NewElt)
     780          54 :           NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
     781         194 :         NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
     782             :       }
     783             :     } else {
     784         173 :       NewDiv = expandDivRem32(Builder, I, Num, Den);
     785             :     }
     786             : 
     787         233 :     if (NewDiv) {
     788         201 :       I.replaceAllUsesWith(NewDiv);
     789         201 :       I.eraseFromParent();
     790             :       Changed = true;
     791             :     }
     792             :   }
     793             : 
     794             :   return Changed;
     795             : }
     796             : 
     797       15508 : bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
     798       15508 :   if (!WidenLoads)
     799             :     return false;
     800             : 
     801       14113 :   if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
     802       16971 :        I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
     803        1487 :       canWidenScalarExtLoad(I)) {
     804          69 :     IRBuilder<> Builder(&I);
     805          69 :     Builder.SetCurrentDebugLocation(I.getDebugLoc());
     806             : 
     807          69 :     Type *I32Ty = Builder.getInt32Ty();
     808          69 :     Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
     809          69 :     Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
     810          69 :     LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
     811          69 :     WidenLoad->copyMetadata(I);
     812             : 
     813             :     // If we have range metadata, we need to convert the type, and not make
     814             :     // assumptions about the high bits.
     815           7 :     if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
     816             :       ConstantInt *Lower =
     817             :         mdconst::extract<ConstantInt>(Range->getOperand(0));
     818             : 
     819           6 :       if (Lower->getValue().isNullValue()) {
     820           1 :         WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
     821             :       } else {
     822             :         Metadata *LowAndHigh[] = {
     823           5 :           ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
     824             :           // Don't make assumptions about the high bits.
     825           5 :           ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
     826           5 :         };
     827             : 
     828           5 :         WidenLoad->setMetadata(LLVMContext::MD_range,
     829           5 :                                MDNode::get(Mod->getContext(), LowAndHigh));
     830             :       }
     831             :     }
     832             : 
     833          69 :     int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
     834          69 :     Type *IntNTy = Builder.getIntNTy(TySize);
     835          69 :     Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
     836          69 :     Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
     837          69 :     I.replaceAllUsesWith(ValOrig);
     838          69 :     I.eraseFromParent();
     839             :     return true;
     840             :   }
     841             : 
     842             :   return false;
     843             : }
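A little-endian C++ analogue of the widening (illustrative only; AMDGPU targets are little-endian, so the low bits of the widened word are the originally addressed bytes):

    #include <cstdint>
    #include <cstring>

    static uint16_t widenedLoadU16(const void *fourByteAlignedPtr) {
      uint32_t wide;
      std::memcpy(&wide, fourByteAlignedPtr, sizeof(wide));  // the widened 32-bit load
      return (uint16_t)wide;                                 // trunc back to the i16 type
    }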
     844             : 
     845           0 : bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
     846             :   bool Changed = false;
     847             : 
     848           0 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
     849           0 :       DA->isUniform(&I))
     850           0 :     Changed |= promoteUniformOpToI32(I);
     851             : 
     852           0 :   return Changed;
     853             : }
     854             : 
     855           0 : bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
     856             :   bool Changed = false;
     857             : 
     858           0 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     859           0 :       DA->isUniform(&I))
     860           0 :     Changed |= promoteUniformOpToI32(I);
     861             : 
     862           0 :   return Changed;
     863             : }
     864             : 
     865             : bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
     866       12868 :   switch (I.getIntrinsicID()) {
     867          41 :   case Intrinsic::bitreverse:
     868          41 :     return visitBitreverseIntrinsicInst(I);
     869             :   default:
     870             :     return false;
     871             :   }
     872             : }
     873             : 
     874          41 : bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
     875             :   bool Changed = false;
     876             : 
     877          67 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     878          10 :       DA->isUniform(&I))
     879           8 :     Changed |= promoteUniformBitreverseToI32(I);
     880             : 
     881          41 :   return Changed;
     882             : }
     883             : 
     884        1954 : bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
     885        1954 :   Mod = &M;
     886        1954 :   return false;
     887             : }
     888             : 
     889       19727 : bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
     890       19727 :   if (skipFunction(F))
     891             :     return false;
     892             : 
     893       19723 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     894       19723 :   if (!TPC)
     895             :     return false;
     896             : 
     897       19715 :   const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
     898       19715 :   ST = &TM.getSubtarget<GCNSubtarget>(F);
     899       19715 :   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     900       19715 :   DA = &getAnalysis<LegacyDivergenceAnalysis>();
     901       19715 :   HasUnsafeFPMath = hasUnsafeFPMath(F);
     902             : 
     903             :   bool MadeChange = false;
     904             : 
     905       41696 :   for (BasicBlock &BB : F) {
     906             :     BasicBlock::iterator Next;
     907      145789 :     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
     908             :       Next = std::next(I);
     909      123808 :       MadeChange |= visit(*I);
     910             :     }
     911             :   }
     912             : 
     913             :   return MadeChange;
     914             : }
     915             : 
     916       85105 : INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
     917             :                       "AMDGPU IR optimizations", false, false)
     918       85105 : INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
     919       85105 : INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
     920      199024 : INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
     921             :                     false, false)
     922             : 
     923             : char AMDGPUCodeGenPrepare::ID = 0;
     924             : 
     925        1964 : FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
     926        1964 :   return new AMDGPUCodeGenPrepare();
     927             : }

Generated by: LCOV version 1.13