LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUCodeGenPrepare.cpp (source / functions)
Test:         llvm-toolchain.info
Date:         2018-02-21 17:27:13
Coverage:     Lines: 178 of 180 hit (98.9 %)    Functions: 24 of 26 hit (92.3 %)

          Line data    Source code
       1             : //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// This pass does misc. AMDGPU optimizations on IR before instruction
      12             : /// selection.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUSubtarget.h"
      18             : #include "AMDGPUTargetMachine.h"
      19             : #include "llvm/ADT/StringRef.h"
      20             : #include "llvm/Analysis/DivergenceAnalysis.h"
      21             : #include "llvm/Analysis/Loads.h"
      22             : #include "llvm/CodeGen/Passes.h"
      23             : #include "llvm/CodeGen/TargetPassConfig.h"
      24             : #include "llvm/IR/Attributes.h"
      25             : #include "llvm/IR/BasicBlock.h"
      26             : #include "llvm/IR/Constants.h"
      27             : #include "llvm/IR/DerivedTypes.h"
      28             : #include "llvm/IR/Function.h"
      29             : #include "llvm/IR/IRBuilder.h"
      30             : #include "llvm/IR/InstVisitor.h"
      31             : #include "llvm/IR/InstrTypes.h"
      32             : #include "llvm/IR/Instruction.h"
      33             : #include "llvm/IR/Instructions.h"
      34             : #include "llvm/IR/IntrinsicInst.h"
      35             : #include "llvm/IR/Intrinsics.h"
      36             : #include "llvm/IR/LLVMContext.h"
      37             : #include "llvm/IR/Operator.h"
      38             : #include "llvm/IR/Type.h"
      39             : #include "llvm/IR/Value.h"
      40             : #include "llvm/Pass.h"
      41             : #include "llvm/Support/Casting.h"
      42             : #include <cassert>
      43             : #include <iterator>
      44             : 
      45             : #define DEBUG_TYPE "amdgpu-codegenprepare"
      46             : 
      47             : using namespace llvm;
      48             : 
      49             : namespace {
      50             : 
      51        3366 : class AMDGPUCodeGenPrepare : public FunctionPass,
      52             :                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
      53             :   const SISubtarget *ST = nullptr;
      54             :   DivergenceAnalysis *DA = nullptr;
      55             :   Module *Mod = nullptr;
      56             :   bool HasUnsafeFPMath = false;
      57             :   AMDGPUAS AMDGPUASI;
      58             : 
       59             :   /// \brief Returns the base element bit width of type \p T.
       60             :   ///
       61             :   /// \details For a scalar integer type this is its bit width; for a vector
       62             :   /// type it is the bit width of its element type.
       63             :   /// \returns \p T's base element bit width.
      64             :   unsigned getBaseElementBitWidth(const Type *T) const;
      65             : 
      66             :   /// \returns Equivalent 32 bit integer type for given type \p T. For example,
      67             :   /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
      68             :   /// is returned.
      69             :   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
      70             : 
      71             :   /// \returns True if binary operation \p I is a signed binary operation, false
      72             :   /// otherwise.
      73             :   bool isSigned(const BinaryOperator &I) const;
      74             : 
      75             :   /// \returns True if the condition of 'select' operation \p I comes from a
      76             :   /// signed 'icmp' operation, false otherwise.
      77             :   bool isSigned(const SelectInst &I) const;
      78             : 
      79             :   /// \returns True if type \p T needs to be promoted to 32 bit integer type,
      80             :   /// false otherwise.
      81             :   bool needsPromotionToI32(const Type *T) const;
      82             : 
      83             :   /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
      84             :   /// operation.
      85             :   ///
      86             :   /// \details \p I's base element bit width must be greater than 1 and less
       87             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
      88             :   /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
      89             :   /// truncating the result of 32 bit binary operation back to \p I's original
      90             :   /// type. Division operation is not promoted.
      91             :   ///
      92             :   /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
      93             :   /// false otherwise.
      94             :   bool promoteUniformOpToI32(BinaryOperator &I) const;
      95             : 
      96             :   /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
      97             :   ///
      98             :   /// \details \p I's base element bit width must be greater than 1 and less
       99             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     100             :   /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
     101             :   ///
     102             :   /// \returns True.
     103             :   bool promoteUniformOpToI32(ICmpInst &I) const;
     104             : 
     105             :   /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
     106             :   /// operation.
     107             :   ///
     108             :   /// \details \p I's base element bit width must be greater than 1 and less
      109             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     110             :   /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
     111             :   /// result of 32 bit 'select' operation back to \p I's original type.
     112             :   ///
     113             :   /// \returns True.
     114             :   bool promoteUniformOpToI32(SelectInst &I) const;
     115             : 
     116             :   /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
     117             :   /// intrinsic.
     118             :   ///
     119             :   /// \details \p I's base element bit width must be greater than 1 and less
      120             :   /// than or equal to 16. Promotion is done by zero extending the operand to 32
     121             :   /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
     122             :   /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
     123             :   /// shift amount is 32 minus \p I's base element bit width), and truncating
     124             :   /// the result of the shift operation back to \p I's original type.
     125             :   ///
     126             :   /// \returns True.
     127             :   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
      128             :   /// \brief Checks whether a scalar load can be widened to 32 bits.
      129             :   ///
      130             :   /// \details Uniform loads of small types from constant memory can be widened
      131             :   /// to a full 32-bit load whose result is then truncated, which allows a
      132             :   /// scalar load to be selected instead of a vector load.
      133             :   ///
      134             :   /// \returns True if load \p I can be widened, false otherwise.
     135             : 
     136             :   bool canWidenScalarExtLoad(LoadInst &I) const;
     137             : 
     138             : public:
     139             :   static char ID;
     140             : 
     141        3382 :   AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
     142             : 
     143             :   bool visitFDiv(BinaryOperator &I);
     144             : 
     145             :   bool visitInstruction(Instruction &I) { return false; }
     146             :   bool visitBinaryOperator(BinaryOperator &I);
     147             :   bool visitLoadInst(LoadInst &I);
     148             :   bool visitICmpInst(ICmpInst &I);
     149             :   bool visitSelectInst(SelectInst &I);
     150             : 
     151             :   bool visitIntrinsicInst(IntrinsicInst &I);
     152             :   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
     153             : 
     154             :   bool doInitialization(Module &M) override;
     155             :   bool runOnFunction(Function &F) override;
     156             : 
     157           0 :   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
     158             : 
     159        1681 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     160             :     AU.addRequired<DivergenceAnalysis>();
     161             :     AU.setPreservesAll();
      162        1681 :   }
     163             : };
     164             : 
     165             : } // end anonymous namespace
     166             : 
     167             : unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
     168             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     169             : 
     170           8 :   if (T->isIntegerTy())
     171             :     return T->getIntegerBitWidth();
     172           2 :   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
     173             : }
     174             : 
     175         465 : Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
     176             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     177             : 
     178         465 :   if (T->isIntegerTy())
     179         269 :     return B.getInt32Ty();
     180         392 :   return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
     181             : }
     182             : 
     183             : bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
     184         195 :   return I.getOpcode() == Instruction::AShr ||
     185         416 :       I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
     186             : }
     187             : 
     188         125 : bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
     189             :   return isa<ICmpInst>(I.getOperand(0)) ?
     190         125 :       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
     191             : }
     192             : 
     193             : bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
     194             :   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
     195        3986 :   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
     196             :     return true;
     197             : 
     198             :   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
     199             :     // TODO: The set of packed operations is more limited, so may want to
     200             :     // promote some anyway.
     201         943 :     if (ST->hasVOP3PInsts())
     202             :       return false;
     203             : 
     204         596 :     return needsPromotionToI32(VT->getElementType());
     205             :   }
     206             : 
     207             :   return false;
     208             : }
     209             : 
     210             : // Return true if the op promoted to i32 should have nsw set.
     211         217 : static bool promotedOpIsNSW(const Instruction &I) {
     212         217 :   switch (I.getOpcode()) {
     213             :   case Instruction::Shl:
     214             :   case Instruction::Add:
     215             :   case Instruction::Sub:
     216             :     return true;
     217          46 :   case Instruction::Mul:
     218          46 :     return I.hasNoUnsignedWrap();
     219          75 :   default:
     220          75 :     return false;
     221             :   }
     222             : }
     223             : 
     224             : // Return true if the op promoted to i32 should have nuw set.
     225         217 : static bool promotedOpIsNUW(const Instruction &I) {
     226         217 :   switch (I.getOpcode()) {
     227             :   case Instruction::Shl:
     228             :   case Instruction::Add:
     229             :   case Instruction::Mul:
     230             :     return true;
     231          24 :   case Instruction::Sub:
     232          24 :     return I.hasNoUnsignedWrap();
     233          75 :   default:
     234          75 :     return false;
     235             :   }
     236             : }
     237             : 
     238        1263 : bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
     239        1263 :   Type *Ty = I.getType();
     240        1263 :   const DataLayout &DL = Mod->getDataLayout();
     241        1263 :   int TySize = DL.getTypeSizeInBits(Ty);
     242        1263 :   unsigned Align = I.getAlignment() ?
     243             :                    I.getAlignment() : DL.getABITypeAlignment(Ty);
     244             : 
     245        2397 :   return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
     246             : }
     247             : 
     248         231 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
     249             :   assert(needsPromotionToI32(I.getType()) &&
     250             :          "I does not need promotion to i32");
     251             : 
     252         231 :   if (I.getOpcode() == Instruction::SDiv ||
     253             :       I.getOpcode() == Instruction::UDiv)
     254             :     return false;
     255             : 
     256         221 :   IRBuilder<> Builder(&I);
     257         221 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     258             : 
     259         221 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     260             :   Value *ExtOp0 = nullptr;
     261             :   Value *ExtOp1 = nullptr;
     262             :   Value *ExtRes = nullptr;
     263             :   Value *TruncRes = nullptr;
     264             : 
     265             :   if (isSigned(I)) {
     266          32 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     267          32 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     268             :   } else {
     269         189 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     270         189 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     271             :   }
     272             : 
     273         221 :   ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
     274             :   if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
     275         217 :     if (promotedOpIsNSW(cast<Instruction>(I)))
     276         104 :       Inst->setHasNoSignedWrap();
     277             : 
     278         217 :     if (promotedOpIsNUW(cast<Instruction>(I)))
     279         126 :       Inst->setHasNoUnsignedWrap();
     280             : 
     281             :     if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
     282          36 :       Inst->setIsExact(ExactOp->isExact());
     283             :   }
     284             : 
     285         442 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     286             : 
     287         221 :   I.replaceAllUsesWith(TruncRes);
     288         221 :   I.eraseFromParent();
     289             : 
     290             :   return true;
     291             : }
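
An illustrative before/after sketch (not part of the covered source; value names
are hypothetical): a uniform i16 add on a subtarget with 16-bit instructions is
widened with zero-extends, and because two zero-extended i16 values cannot
overflow an i32 add, both nuw and nsw are set on the promoted operation (see
promotedOpIsNSW/promotedOpIsNUW above):

  ; before
  %r = add i16 %a, %b

  ; after AMDGPUCodeGenPrepare
  %ext0  = zext i16 %a to i32
  %ext1  = zext i16 %b to i32
  %add32 = add nuw nsw i32 %ext0, %ext1
  %r     = trunc i32 %add32 to i16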
     292             : 
     293         111 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
     294             :   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
     295             :          "I does not need promotion to i32");
     296             : 
     297         111 :   IRBuilder<> Builder(&I);
     298         111 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     299             : 
     300         111 :   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
     301             :   Value *ExtOp0 = nullptr;
     302             :   Value *ExtOp1 = nullptr;
      303             :   Value *NewICmp = nullptr;
     304             : 
     305         111 :   if (I.isSigned()) {
     306          53 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     307          53 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     308             :   } else {
     309          58 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     310          58 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     311             :   }
     312         111 :   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
     313             : 
     314         111 :   I.replaceAllUsesWith(NewICmp);
     315         111 :   I.eraseFromParent();
     316             : 
     317         111 :   return true;
     318             : }
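
An illustrative sketch (hypothetical names, not part of the covered source): a
uniform i16 comparison with a signed predicate is rewritten using sign-extends;
the i1 result keeps its type, so no truncation is needed:

  ; before
  %c = icmp slt i16 %a, %b

  ; after
  %ext0 = sext i16 %a to i32
  %ext1 = sext i16 %b to i32
  %c    = icmp slt i32 %ext0, %ext1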
     319             : 
     320         125 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
     321             :   assert(needsPromotionToI32(I.getType()) &&
     322             :          "I does not need promotion to i32");
     323             : 
     324         125 :   IRBuilder<> Builder(&I);
     325         125 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     326             : 
     327         125 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     328             :   Value *ExtOp1 = nullptr;
     329             :   Value *ExtOp2 = nullptr;
     330             :   Value *ExtRes = nullptr;
     331             :   Value *TruncRes = nullptr;
     332             : 
     333         125 :   if (isSigned(I)) {
     334          57 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     335          57 :     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
     336             :   } else {
     337          68 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     338          68 :     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
     339             :   }
     340         125 :   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
     341         250 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     342             : 
     343         125 :   I.replaceAllUsesWith(TruncRes);
     344         125 :   I.eraseFromParent();
     345             : 
     346         125 :   return true;
     347             : }
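
An illustrative sketch (hypothetical names): when the select condition comes
from an unsigned icmp, the i16 operands are zero-extended, the select is done
in i32, and the result is truncated back to the original type:

  ; before (%cond is produced by an unsigned compare of i32 values)
  %cond = icmp ult i32 %x, %y
  %s    = select i1 %cond, i16 %a, i16 %b

  ; after
  %ext1  = zext i16 %a to i32
  %ext2  = zext i16 %b to i32
  %sel32 = select i1 %cond, i32 %ext1, i32 %ext2
  %s     = trunc i32 %sel32 to i16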
     348             : 
     349           8 : bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
     350             :     IntrinsicInst &I) const {
     351             :   assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
     352             :          "I must be bitreverse intrinsic");
     353             :   assert(needsPromotionToI32(I.getType()) &&
     354             :          "I does not need promotion to i32");
     355             : 
     356           8 :   IRBuilder<> Builder(&I);
     357           8 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     358             : 
     359           8 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     360             :   Function *I32 =
     361          16 :       Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
     362          16 :   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
     363          16 :   Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
     364             :   Value *LShrOp =
     365          16 :       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
     366             :   Value *TruncRes =
     367          16 :       Builder.CreateTrunc(LShrOp, I.getType());
     368             : 
     369           8 :   I.replaceAllUsesWith(TruncRes);
     370           8 :   I.eraseFromParent();
     371             : 
     372           8 :   return true;
     373             : }
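
An illustrative sketch (hypothetical names): a uniform i16 bitreverse is
performed in 32 bits; the zero-extended input occupies the low 16 bits, so the
reversed bits land in the high half and are shifted back down by 32 - 16 before
truncating:

  ; before
  %r = call i16 @llvm.bitreverse.i16(i16 %x)

  ; after
  %ext    = zext i16 %x to i32
  %brev32 = call i32 @llvm.bitreverse.i32(i32 %ext)
  %shr    = lshr i32 %brev32, 16
  %r      = trunc i32 %shr to i16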
     374             : 
     375             : static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
     376             :   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
     377             :   if (!CNum)
     378             :     return false;
     379             : 
     380             :   // Reciprocal f32 is handled separately without denormals.
     381           3 :   return UnsafeDiv || CNum->isExactlyValue(+1.0);
     382             : }
     383             : 
     384             : // Insert an intrinsic for fast fdiv for safe math situations where we can
     385             : // reduce precision. Leave fdiv for situations where the generic node is
     386             : // expected to be optimized.
     387         233 : bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     388         233 :   Type *Ty = FDiv.getType();
     389             : 
     390         233 :   if (!Ty->getScalarType()->isFloatTy())
     391             :     return false;
     392             : 
     393         156 :   MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
     394          47 :   if (!FPMath)
     395             :     return false;
     396             : 
     397             :   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
     398          47 :   float ULP = FPOp->getFPAccuracy();
     399          47 :   if (ULP < 2.5f)
     400             :     return false;
     401             : 
     402             :   FastMathFlags FMF = FPOp->getFastMathFlags();
     403          65 :   bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
     404             :                                       FMF.allowReciprocal();
     405             : 
      406             :   // With UnsafeDiv, the node will be optimized to just rcp and mul.
     407          39 :   if (ST->hasFP32Denormals() || UnsafeDiv)
     408             :     return false;
     409             : 
     410          20 :   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
     411             :   Builder.setFastMathFlags(FMF);
     412          10 :   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
     413             : 
     414          10 :   Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
     415             : 
     416             :   Value *Num = FDiv.getOperand(0);
     417             :   Value *Den = FDiv.getOperand(1);
     418             : 
     419             :   Value *NewFDiv = nullptr;
     420             : 
     421             :   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     422           1 :     NewFDiv = UndefValue::get(VT);
     423             : 
     424             :     // FIXME: Doesn't do the right thing for cases where the vector is partially
     425             :     // constant. This works when the scalarizer pass is run first.
     426           3 :     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
     427           4 :       Value *NumEltI = Builder.CreateExtractElement(Num, I);
     428           2 :       Value *DenEltI = Builder.CreateExtractElement(Den, I);
     429             :       Value *NewElt;
     430             : 
     431             :       if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
     432           0 :         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
     433             :       } else {
     434           4 :         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
     435             :       }
     436             : 
     437           2 :       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     438             :     }
     439             :   } else {
     440             :     if (!shouldKeepFDivF32(Num, UnsafeDiv))
     441          12 :       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
     442             :   }
     443             : 
     444           7 :   if (NewFDiv) {
     445           7 :     FDiv.replaceAllUsesWith(NewFDiv);
     446           7 :     NewFDiv->takeName(&FDiv);
     447           7 :     FDiv.eraseFromParent();
     448             :   }
     449             : 
     450             :   return true;
     451             : }
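
An illustrative sketch (hypothetical names): a float fdiv carrying !fpmath
metadata that allows at least 2.5 ULP of error, on a subtarget without FP32
denormals and without unsafe-math flags, is replaced by the reduced-precision
intrinsic (a numerator of exactly 1.0 is instead kept as an fdiv and handled as
a reciprocal later):

  ; before
  %d = fdiv float %x, %y, !fpmath !0
  ...
  !0 = !{float 2.500000e+00}

  ; after
  %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)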
     452             : 
     453       16719 : static bool hasUnsafeFPMath(const Function &F) {
     454       16719 :   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
     455       33438 :   return Attr.getValueAsString() == "true";
     456             : }
     457             : 
     458       10048 : bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
     459             :   bool Changed = false;
     460             : 
     461       14853 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     462         330 :       DA->isUniform(&I))
     463         231 :     Changed |= promoteUniformOpToI32(I);
     464             : 
     465       10048 :   return Changed;
     466             : }
     467             : 
      468       13010 : bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
     469       11859 :   if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
     470       14273 :        I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
     471        1263 :       canWidenScalarExtLoad(I)) {
     472          62 :     IRBuilder<> Builder(&I);
     473          62 :     Builder.SetCurrentDebugLocation(I.getDebugLoc());
     474             : 
     475          62 :     Type *I32Ty = Builder.getInt32Ty();
     476          62 :     Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
      477          62 :     Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
     478          62 :     Value *WidenLoad = Builder.CreateLoad(BitCast);
     479             : 
     480          62 :     int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
     481          62 :     Type *IntNTy = Builder.getIntNTy(TySize);
     482          62 :     Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
     483         124 :     Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
     484          62 :     I.replaceAllUsesWith(ValOrig);
     485          62 :     I.eraseFromParent();
     486             :     return true;
     487             :   }
     488             : 
     489             :   return false;
     490             : }
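
An illustrative sketch (hypothetical names; addrspace(2) is assumed here to be
the target's constant address space, per AMDGPUASI.CONSTANT_ADDRESS): a uniform,
sufficiently aligned sub-32-bit load from constant memory is widened to a full
32-bit load and truncated, so a scalar load can be selected instead of a vector
load:

  ; before
  %v = load i16, i16 addrspace(2)* %p, align 4

  ; after
  %cast = bitcast i16 addrspace(2)* %p to i32 addrspace(2)*
  %wide = load i32, i32 addrspace(2)* %cast
  %v    = trunc i32 %wide to i16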
     491             : 
     492        2370 : bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
     493             :   bool Changed = false;
     494             : 
     495        4752 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
     496         194 :       DA->isUniform(&I))
     497         111 :     Changed |= promoteUniformOpToI32(I);
     498             : 
     499        2370 :   return Changed;
     500             : }
     501             : 
     502        1660 : bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
     503             :   bool Changed = false;
     504             : 
     505        2596 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     506         188 :       DA->isUniform(&I))
     507         125 :     Changed |= promoteUniformOpToI32(I);
     508             : 
     509        1660 :   return Changed;
     510             : }
     511             : 
     512       10952 : bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
     513       10952 :   switch (I.getIntrinsicID()) {
     514          41 :   case Intrinsic::bitreverse:
     515          41 :     return visitBitreverseIntrinsicInst(I);
     516             :   default:
     517             :     return false;
     518             :   }
     519             : }
     520             : 
     521          41 : bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
     522             :   bool Changed = false;
     523             : 
     524          67 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     525          10 :       DA->isUniform(&I))
     526           8 :     Changed |= promoteUniformBitreverseToI32(I);
     527             : 
     528          41 :   return Changed;
     529             : }
     530             : 
     531        1681 : bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
     532        1681 :   Mod = &M;
     533        1681 :   return false;
     534             : }
     535             : 
     536       16730 : bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
     537       16730 :   if (skipFunction(F))
     538             :     return false;
     539             : 
     540       16727 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     541       16727 :   if (!TPC)
     542             :     return false;
     543             : 
     544       16719 :   const TargetMachine &TM = TPC->getTM<TargetMachine>();
     545       16719 :   ST = &TM.getSubtarget<SISubtarget>(F);
     546       16719 :   DA = &getAnalysis<DivergenceAnalysis>();
     547       16719 :   HasUnsafeFPMath = hasUnsafeFPMath(F);
     548             : 
     549             :   bool MadeChange = false;
     550             : 
     551       35548 :   for (BasicBlock &BB : F) {
     552             :     BasicBlock::iterator Next;
     553      228995 :     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
     554             :       Next = std::next(I);
     555      105083 :       MadeChange |= visit(*I);
     556             :     }
     557             :   }
     558             : 
     559             :   return MadeChange;
     560             : }
     561             : 
     562       75295 : INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
     563             :                       "AMDGPU IR optimizations", false, false)
     564       75295 : INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
     565      354886 : INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
     566             :                     false, false)
     567             : 
     568             : char AMDGPUCodeGenPrepare::ID = 0;
     569             : 
     570        1686 : FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
     571        3372 :   return new AMDGPUCodeGenPrepare();
     572             : }

Generated by: LCOV version 1.13