LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUCodeGenPrepare.cpp (source / functions)
Test: llvm-toolchain.info           Lines:     220 / 222   (99.1 %)
Date: 2017-09-14 15:23:50           Functions:  24 /  26   (92.3 %)

          Line data    Source code
       1             : //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// This pass does misc. AMDGPU optimizations on IR before instruction
      12             : /// selection.
      13             : //
      14             : //===----------------------------------------------------------------------===//
      15             : 
      16             : #include "AMDGPU.h"
      17             : #include "AMDGPUSubtarget.h"
      18             : #include "AMDGPUTargetMachine.h"
      19             : #include "llvm/ADT/StringRef.h"
      20             : #include "llvm/Analysis/DivergenceAnalysis.h"
      21             : #include "llvm/Analysis/Loads.h"
      22             : #include "llvm/CodeGen/Passes.h"
      23             : #include "llvm/CodeGen/TargetPassConfig.h"
      24             : #include "llvm/IR/Attributes.h"
      25             : #include "llvm/IR/BasicBlock.h"
      26             : #include "llvm/IR/Constants.h"
      27             : #include "llvm/IR/DerivedTypes.h"
      28             : #include "llvm/IR/Function.h"
      29             : #include "llvm/IR/IRBuilder.h"
      30             : #include "llvm/IR/InstVisitor.h"
      31             : #include "llvm/IR/InstrTypes.h"
      32             : #include "llvm/IR/Instruction.h"
      33             : #include "llvm/IR/Instructions.h"
      34             : #include "llvm/IR/IntrinsicInst.h"
      35             : #include "llvm/IR/Intrinsics.h"
      36             : #include "llvm/IR/LLVMContext.h"
      37             : #include "llvm/IR/Operator.h"
      38             : #include "llvm/IR/Type.h"
      39             : #include "llvm/IR/Value.h"
      40             : #include "llvm/Pass.h"
      41             : #include "llvm/Support/Casting.h"
      42             : #include <cassert>
      43             : #include <iterator>
      44             : 
      45             : #define DEBUG_TYPE "amdgpu-codegenprepare"
      46             : 
      47             : using namespace llvm;
      48             : 
      49             : namespace {
      50             : 
      51        2930 : class AMDGPUCodeGenPrepare : public FunctionPass,
      52             :                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
      53             :   const SISubtarget *ST = nullptr;
      54             :   DivergenceAnalysis *DA = nullptr;
      55             :   Module *Mod = nullptr;
      56             :   bool HasUnsafeFPMath = false;
      57             :   AMDGPUAS AMDGPUASI;
      58             : 
       59             :   /// \brief Returns the base element bit width of type \p T: the type's own
       60             :   /// bit width for a scalar integer, or the element type's bit width for a
       61             :   /// vector of integers.
       62             :   ///
       63             :   /// \returns \p T's base element bit width.
       64             :   unsigned getBaseElementBitWidth(const Type *T) const;
      65             : 
      66             :   /// \returns Equivalent 32 bit integer type for given type \p T. For example,
      67             :   /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
      68             :   /// is returned.
      69             :   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
      70             : 
      71             :   /// \returns True if binary operation \p I is a signed binary operation, false
      72             :   /// otherwise.
      73             :   bool isSigned(const BinaryOperator &I) const;
      74             : 
      75             :   /// \returns True if the condition of 'select' operation \p I comes from a
      76             :   /// signed 'icmp' operation, false otherwise.
      77             :   bool isSigned(const SelectInst &I) const;
      78             : 
      79             :   /// \returns True if type \p T needs to be promoted to 32 bit integer type,
      80             :   /// false otherwise.
      81             :   bool needsPromotionToI32(const Type *T) const;
      82             : 
      83             :   /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
      84             :   /// operation.
      85             :   ///
      86             :   /// \details \p I's base element bit width must be greater than 1 and less
       87             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
      88             :   /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
      89             :   /// truncating the result of 32 bit binary operation back to \p I's original
      90             :   /// type. Division operation is not promoted.
      91             :   ///
      92             :   /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
      93             :   /// false otherwise.
      94             :   bool promoteUniformOpToI32(BinaryOperator &I) const;
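                     : 
                     :   // Illustrative sketch (not from the pass itself; value names are made up):
                     :   // a uniform 16-bit add such as
                     :   //   %r = add i16 %a, %b
                     :   // is rewritten as
                     :   //   %a.ext = zext i16 %a to i32
                     :   //   %b.ext = zext i16 %b to i32
                     :   //   %r.ext = add nuw nsw i32 %a.ext, %b.ext
                     :   //   %r     = trunc i32 %r.ext to i16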
      95             : 
      96             :   /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
      97             :   ///
      98             :   /// \details \p I's base element bit width must be greater than 1 and less
       99             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     100             :   /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
     101             :   ///
     102             :   /// \returns True.
     103             :   bool promoteUniformOpToI32(ICmpInst &I) const;
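                     : 
                     :   // Illustrative sketch (made-up value names): a uniform signed i16 compare
                     :   //   %c = icmp slt i16 %a, %b
                     :   // becomes
                     :   //   %a.ext = sext i16 %a to i32
                     :   //   %b.ext = sext i16 %b to i32
                     :   //   %c     = icmp slt i32 %a.ext, %b.ext
                     :   // The i1 result is unchanged, so no truncation is needed.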
     104             : 
     105             :   /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
     106             :   /// operation.
     107             :   ///
     108             :   /// \details \p I's base element bit width must be greater than 1 and less
      109             :   /// than or equal to 16. Promotion is done by sign or zero extending operands to
     110             :   /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
     111             :   /// result of 32 bit 'select' operation back to \p I's original type.
     112             :   ///
     113             :   /// \returns True.
     114             :   bool promoteUniformOpToI32(SelectInst &I) const;
     115             : 
     116             :   /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
     117             :   /// intrinsic.
     118             :   ///
     119             :   /// \details \p I's base element bit width must be greater than 1 and less
      120             :   /// than or equal to 16. Promotion is done by zero extending the operand to 32
     121             :   /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
     122             :   /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
     123             :   /// shift amount is 32 minus \p I's base element bit width), and truncating
     124             :   /// the result of the shift operation back to \p I's original type.
     125             :   ///
     126             :   /// \returns True.
     127             :   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
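                     : 
                     :   // Illustrative sketch (made-up value names): a uniform i16 bitreverse
                     :   //   %r = call i16 @llvm.bitreverse.i16(i16 %x)
                     :   // becomes
                     :   //   %x.ext = zext i16 %x to i32
                     :   //   %r.ext = call i32 @llvm.bitreverse.i32(i32 %x.ext)
                     :   //   %r.shr = lshr i32 %r.ext, 16
                     :   //   %r     = trunc i32 %r.shr to i16
                     : 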
      128             :   /// \brief Check whether a scalar extending load can be widened.
      129             :   ///
      130             :   /// \details Returns true for a simple, uniform load of a type narrower
      131             :   /// than 32 bits with at least 4-byte alignment. visitLoadInst widens such
      132             :   /// loads from constant memory to a full 32-bit load followed by a truncate,
      133             :   /// so a scalar load can be selected instead of a vector load.
      134             :   ///
      135             :   /// \returns True if the load can be widened, false otherwise.
      136             :   bool canWidenScalarExtLoad(LoadInst &I) const;
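                     : 
                     :   // Illustrative sketch (made-up value names; the constant address space is
                     :   // assumed to be addrspace(2) here): a qualifying i8 load
                     :   //   %v = load i8, i8 addrspace(2)* %p, align 4
                     :   // is widened in visitLoadInst to
                     :   //   %p.cast = bitcast i8 addrspace(2)* %p to i32 addrspace(2)*
                     :   //   %v.wide = load i32, i32 addrspace(2)* %p.cast
                     :   //   %v      = trunc i32 %v.wide to i8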
     137             : 
     138             : public:
     139             :   static char ID;
     140             : 
     141        2946 :   AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
     142             : 
     143             :   bool visitFDiv(BinaryOperator &I);
     144             : 
     145             :   bool visitInstruction(Instruction &I) { return false; }
     146             :   bool visitBinaryOperator(BinaryOperator &I);
     147             :   bool visitLoadInst(LoadInst &I);
     148             :   bool visitICmpInst(ICmpInst &I);
     149             :   bool visitSelectInst(SelectInst &I);
     150             : 
     151             :   bool visitIntrinsicInst(IntrinsicInst &I);
     152             :   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
     153             : 
     154             :   bool doInitialization(Module &M) override;
     155             :   bool runOnFunction(Function &F) override;
     156             : 
     157           0 :   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
     158             : 
     159        1465 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
     160        1465 :     AU.addRequired<DivergenceAnalysis>();
     161        1465 :     AU.setPreservesAll();
      162        1465 :   }
     163             : };
     164             : 
     165             : } // end anonymous namespace
     166             : 
     167             : unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
     168             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     169             : 
     170           8 :   if (T->isIntegerTy())
     171           6 :     return T->getIntegerBitWidth();
     172           4 :   return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
     173             : }
     174             : 
     175         450 : Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
     176             :   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
     177             : 
     178         450 :   if (T->isIntegerTy())
     179         255 :     return B.getInt32Ty();
     180         390 :   return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
     181             : }
     182             : 
     183             : bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
     184         410 :   return I.getOpcode() == Instruction::AShr ||
     185         602 :       I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
     186             : }
     187             : 
     188         119 : bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
     189         236 :   return isa<ICmpInst>(I.getOperand(0)) ?
     190         353 :       cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
     191             : }
     192             : 
     193             : bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
     194        3515 :   const IntegerType *IntTy = dyn_cast<IntegerType>(T);
     195        6977 :   if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
     196             :     return true;
     197             : 
     198         861 :   if (const VectorType *VT = dyn_cast<VectorType>(T)) {
     199             :     // TODO: The set of packed operations is more limited, so may want to
     200             :     // promote some anyway.
     201         861 :     if (ST->hasVOP3PInsts())
     202             :       return false;
     203             : 
     204         572 :     return needsPromotionToI32(VT->getElementType());
     205             :   }
     206             : 
     207             :   return false;
     208             : }
     209             : 
     210             : // Return true if the op promoted to i32 should have nsw set.
     211         214 : static bool promotedOpIsNSW(const Instruction &I) {
     212         214 :   switch (I.getOpcode()) {
     213             :   case Instruction::Shl:
     214             :   case Instruction::Add:
     215             :   case Instruction::Sub:
     216             :     return true;
     217          46 :   case Instruction::Mul:
     218          46 :     return I.hasNoUnsignedWrap();
     219          72 :   default:
     220          72 :     return false;
     221             :   }
     222             : }
     223             : 
     224             : // Return true if the op promoted to i32 should have nuw set.
     225         214 : static bool promotedOpIsNUW(const Instruction &I) {
     226         214 :   switch (I.getOpcode()) {
     227             :   case Instruction::Shl:
     228             :   case Instruction::Add:
     229             :   case Instruction::Mul:
     230             :     return true;
     231          24 :   case Instruction::Sub:
     232          24 :     return I.hasNoUnsignedWrap();
     233          72 :   default:
     234          72 :     return false;
     235             :   }
     236             : }
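                     : 
                     : // Rationale sketch (not from the source): the promoted operands come from at
                     : // most 16 bits, so e.g. two zero-extended i16 values sum to at most 0x1FFFE,
                     : // which cannot wrap in 32 bits. The cases above that can still wrap (e.g. a
                     : // 32-bit product of two 16-bit values) only get the flag when the original
                     : // narrow operation already guaranteed no unsigned wrap.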
     237             : 
     238        1120 : bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
     239        1120 :   Type *Ty = I.getType();
     240        1120 :   const DataLayout &DL = Mod->getDataLayout();
     241        1120 :   int TySize = DL.getTypeSizeInBits(Ty);
     242        1120 :   unsigned Align = I.getAlignment() ?
     243        1120 :                    I.getAlignment() : DL.getABITypeAlignment(Ty);
     244             : 
     245        1045 :   return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
     246             : }
     247             : 
     248         227 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
     249             :   assert(needsPromotionToI32(I.getType()) &&
     250             :          "I does not need promotion to i32");
     251             : 
     252         451 :   if (I.getOpcode() == Instruction::SDiv ||
     253         224 :       I.getOpcode() == Instruction::UDiv)
     254             :     return false;
     255             : 
     256         218 :   IRBuilder<> Builder(&I);
     257         872 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     258             : 
     259         218 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     260         218 :   Value *ExtOp0 = nullptr;
     261         218 :   Value *ExtOp1 = nullptr;
     262         218 :   Value *ExtRes = nullptr;
     263         218 :   Value *TruncRes = nullptr;
     264             : 
     265         404 :   if (isSigned(I)) {
     266          96 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     267          96 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     268             :   } else {
     269         558 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     270         558 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     271             :   }
     272             : 
     273         436 :   ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
     274         214 :   if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
     275         214 :     if (promotedOpIsNSW(cast<Instruction>(I)))
     276         104 :       Inst->setHasNoSignedWrap();
     277             : 
     278         214 :     if (promotedOpIsNUW(cast<Instruction>(I)))
     279         126 :       Inst->setHasNoUnsignedWrap();
     280             : 
     281          36 :     if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
     282          36 :       Inst->setIsExact(ExactOp->isExact());
     283             :   }
     284             : 
     285         436 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     286             : 
     287         218 :   I.replaceAllUsesWith(TruncRes);
     288         218 :   I.eraseFromParent();
     289             : 
     290         218 :   return true;
     291             : }
     292             : 
     293         105 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
     294             :   assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
     295             :          "I does not need promotion to i32");
     296             : 
     297         210 :   IRBuilder<> Builder(&I);
     298         420 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     299             : 
     300         210 :   Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
     301         105 :   Value *ExtOp0 = nullptr;
     302         105 :   Value *ExtOp1 = nullptr;
     303         105 :   Value *NewICmp  = nullptr;
     304             : 
     305         210 :   if (I.isSigned()) {
     306         159 :     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
     307         159 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     308             :   } else {
     309         156 :     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     310         156 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     311             :   }
     312         210 :   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
     313             : 
     314         105 :   I.replaceAllUsesWith(NewICmp);
     315         105 :   I.eraseFromParent();
     316             : 
     317         210 :   return true;
     318             : }
     319             : 
     320         119 : bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
     321             :   assert(needsPromotionToI32(I.getType()) &&
     322             :          "I does not need promotion to i32");
     323             : 
     324         238 :   IRBuilder<> Builder(&I);
     325         476 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     326             : 
     327         119 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     328         119 :   Value *ExtOp1 = nullptr;
     329         119 :   Value *ExtOp2 = nullptr;
     330         119 :   Value *ExtRes = nullptr;
     331         119 :   Value *TruncRes = nullptr;
     332             : 
     333         119 :   if (isSigned(I)) {
     334         171 :     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
     335         171 :     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
     336             :   } else {
     337         186 :     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
     338         186 :     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
     339             :   }
     340         238 :   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
     341         238 :   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
     342             : 
     343         119 :   I.replaceAllUsesWith(TruncRes);
     344         119 :   I.eraseFromParent();
     345             : 
     346         238 :   return true;
     347             : }
     348             : 
     349           8 : bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
     350             :     IntrinsicInst &I) const {
     351             :   assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
     352             :          "I must be bitreverse intrinsic");
     353             :   assert(needsPromotionToI32(I.getType()) &&
     354             :          "I does not need promotion to i32");
     355             : 
     356          16 :   IRBuilder<> Builder(&I);
     357          32 :   Builder.SetCurrentDebugLocation(I.getDebugLoc());
     358             : 
     359           8 :   Type *I32Ty = getI32Ty(Builder, I.getType());
     360             :   Function *I32 =
     361          16 :       Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
     362          24 :   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
     363          24 :   Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
     364             :   Value *LShrOp =
     365          16 :       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
     366             :   Value *TruncRes =
     367          16 :       Builder.CreateTrunc(LShrOp, I.getType());
     368             : 
     369           8 :   I.replaceAllUsesWith(TruncRes);
     370           8 :   I.eraseFromParent();
     371             : 
     372          16 :   return true;
     373             : }
     374             : 
     375             : static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
     376           3 :   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
     377             :   if (!CNum)
     378             :     return false;
     379             : 
     380             :   // Reciprocal f32 is handled separately without denormals.
     381           3 :   return UnsafeDiv || CNum->isExactlyValue(+1.0);
     382             : }
     383             : 
     384             : // Insert an intrinsic for fast fdiv for safe math situations where we can
     385             : // reduce precision. Leave fdiv for situations where the generic node is
     386             : // expected to be optimized.
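                     : //
                     : // Illustrative sketch (made-up value names): with no f32 denormal support and
                     : // an !fpmath hint allowing at least 2.5 ulp of error,
                     : //   %q = fdiv float %a, %b, !fpmath !0      ; !0 = !{float 2.500000e+00}
                     : // is replaced with
                     : //   %q = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)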
     387         231 : bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
     388         231 :   Type *Ty = FDiv.getType();
     389             : 
     390         462 :   if (!Ty->getScalarType()->isFloatTy())
     391             :     return false;
     392             : 
     393         201 :   MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
     394          47 :   if (!FPMath)
     395             :     return false;
     396             : 
     397          47 :   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
     398          47 :   float ULP = FPOp->getFPAccuracy();
     399          47 :   if (ULP < 2.5f)
     400             :     return false;
     401             : 
     402          39 :   FastMathFlags FMF = FPOp->getFastMathFlags();
     403         102 :   bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
     404          65 :                                       FMF.allowReciprocal();
     405             : 
     406             :   // With UnsafeDiv node will be optimized to just rcp and mul.
     407          39 :   if (ST->hasFP32Denormals() || UnsafeDiv)
     408             :     return false;
     409             : 
     410          30 :   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
     411          10 :   Builder.setFastMathFlags(FMF);
     412          40 :   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
     413             : 
     414          10 :   Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
     415             : 
     416          10 :   Value *Num = FDiv.getOperand(0);
     417          10 :   Value *Den = FDiv.getOperand(1);
     418             : 
     419          10 :   Value *NewFDiv = nullptr;
     420             : 
     421           1 :   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     422           1 :     NewFDiv = UndefValue::get(VT);
     423             : 
     424             :     // FIXME: Doesn't do the right thing for cases where the vector is partially
     425             :     // constant. This works when the scalarizer pass is run first.
     426           3 :     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
     427           4 :       Value *NumEltI = Builder.CreateExtractElement(Num, I);
     428           4 :       Value *DenEltI = Builder.CreateExtractElement(Den, I);
     429             :       Value *NewElt;
     430             : 
     431           2 :       if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
     432           0 :         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
     433             :       } else {
     434           6 :         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
     435             :       }
     436             : 
     437           4 :       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     438             :     }
     439             :   } else {
     440           9 :     if (!shouldKeepFDivF32(Num, UnsafeDiv))
     441          18 :       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
     442             :   }
     443             : 
     444           7 :   if (NewFDiv) {
     445           7 :     FDiv.replaceAllUsesWith(NewFDiv);
     446           7 :     NewFDiv->takeName(&FDiv);
     447           7 :     FDiv.eraseFromParent();
     448             :   }
     449             : 
      450          10 :   return !!NewFDiv;
     451             : }
     452             : 
     453       14978 : static bool hasUnsafeFPMath(const Function &F) {
     454       29956 :   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
     455       29956 :   return Attr.getValueAsString() == "true";
     456             : }
     457             : 
     458        8981 : bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
     459        8981 :   bool Changed = false;
     460             : 
     461       13132 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     462         636 :       DA->isUniform(&I))
     463         227 :     Changed |= promoteUniformOpToI32(I);
     464             : 
     465        8981 :   return Changed;
     466             : }
     467             : 
     468       11780 : bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst  &I) {
     469       12900 :   if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
     470        1120 :       canWidenScalarExtLoad(I)) {
     471         100 :     IRBuilder<> Builder(&I);
     472         200 :     Builder.SetCurrentDebugLocation(I.getDebugLoc());
     473             : 
     474         100 :     Type *I32Ty = Builder.getInt32Ty();
     475          50 :     Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
      476         150 :     Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
     477          50 :     Value *WidenLoad = Builder.CreateLoad(BitCast);
     478             : 
     479          50 :     int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
     480         100 :     Type *IntNTy = Builder.getIntNTy(TySize);
     481         100 :     Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
     482         100 :     Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
     483          50 :     I.replaceAllUsesWith(ValOrig);
     484          50 :     I.eraseFromParent();
     485          50 :     return true;
     486             :   }
     487             : 
     488             :   return false;
     489             : }
     490             : 
     491        2284 : bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
     492        2284 :   bool Changed = false;
     493             : 
     494        4568 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
     495         376 :       DA->isUniform(&I))
     496         105 :     Changed |= promoteUniformOpToI32(I);
     497             : 
     498        2284 :   return Changed;
     499             : }
     500             : 
     501        1594 : bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
     502        1594 :   bool Changed = false;
     503             : 
     504        2493 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     505         364 :       DA->isUniform(&I))
     506         119 :     Changed |= promoteUniformOpToI32(I);
     507             : 
     508        1594 :   return Changed;
     509             : }
     510             : 
     511        9780 : bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
     512        9780 :   switch (I.getIntrinsicID()) {
     513          38 :   case Intrinsic::bitreverse:
     514          38 :     return visitBitreverseIntrinsicInst(I);
     515             :   default:
     516             :     return false;
     517             :   }
     518             : }
     519             : 
     520          38 : bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
     521          38 :   bool Changed = false;
     522             : 
     523          62 :   if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
     524          16 :       DA->isUniform(&I))
     525           8 :     Changed |= promoteUniformBitreverseToI32(I);
     526             : 
     527          38 :   return Changed;
     528             : }
     529             : 
     530        1465 : bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
     531        1465 :   Mod = &M;
     532        1465 :   return false;
     533             : }
     534             : 
     535       14988 : bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
     536       14988 :   if (skipFunction(F))
     537             :     return false;
     538             : 
     539       14986 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     540       14986 :   if (!TPC)
     541             :     return false;
     542             : 
     543       14978 :   const TargetMachine &TM = TPC->getTM<TargetMachine>();
     544       14978 :   ST = &TM.getSubtarget<SISubtarget>(F);
     545       14978 :   DA = &getAnalysis<DivergenceAnalysis>();
     546       14978 :   HasUnsafeFPMath = hasUnsafeFPMath(F);
     547             : 
     548       14978 :   bool MadeChange = false;
     549             : 
     550       61916 :   for (BasicBlock &BB : F) {
     551       16982 :     BasicBlock::iterator Next;
     552      128681 :     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
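                     :       // Record the next instruction up front: the visitors below may replace
                     :       // and erase *I, which would otherwise invalidate the iterator.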
     553       94717 :       Next = std::next(I);
     554       94717 :       MadeChange |= visit(*I);
     555             :     }
     556             :   }
     557             : 
     558             :   return MadeChange;
     559             : }
     560             : 
     561       53042 : INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
     562             :                       "AMDGPU IR optimizations", false, false)
     563       53042 : INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
     564      312538 : INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
     565             :                     false, false)
     566             : 
     567             : char AMDGPUCodeGenPrepare::ID = 0;
     568             : 
     569        1468 : FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
     570        2936 :   return new AMDGPUCodeGenPrepare();
     571             : }

Generated by: LCOV version 1.13