//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does miscellaneous AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));
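
// Usage note: as a hidden llvm option, the widening below can still be toggled
// from any tool that parses LLVM command-line flags, e.g.
//   llc -amdgpu-codegenprepare-widen-constant-loads=0 ...
// (illustrative invocation; any driver that forwards llvm options works).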

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
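  /// For example, i16 needs promotion while i32 does not, and <2 x i16> needs
  /// promotion only when the subtarget lacks packed 16-bit (VOP3P)
  /// instructions.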
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
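  ///
  /// For illustration (example IR, value names are hypothetical), a uniform
  ///   %r = add i16 %a, %b
  /// is rewritten to
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add nuw nsw i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16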
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
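  ///
  /// For illustration, a uniform 'icmp slt i16 %a, %b' becomes
  /// 'icmp slt i32 %a32, %b32' on sign-extended operands; the i1 result
  /// needs no truncation.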
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
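  ///
  /// For illustration, a uniform 'select i1 %c, i16 %x, i16 %y' becomes the
  /// truncation of 'select i1 %c, i32 %x32, i32 %y32', where the operands are
  /// sign extended when %c comes from a signed 'icmp' and zero extended
  /// otherwise.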
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
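  ///
  /// For illustration (example IR, value names are hypothetical),
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  /// is rewritten to
  ///   %a32 = zext i16 %a to i32
  ///   %r32 = call i32 @llvm.bitreverse.i32(i32 %a32)
  ///   %s   = lshr i32 %r32, 16
  ///   %r   = trunc i32 %s to i16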
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widens a uniform, small-type load from constant memory to a
  /// full 32 bits and truncates the result, so that a scalar load can be
  /// selected instead of a vector load.
  ///
  /// \returns True if load \p I is simple, uniform, smaller than 32 bits, and
  /// aligned to at least 4 bytes, i.e. if it is legal to widen.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

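// Note on the wrap flags chosen below: the promoted operands are values
// zero- or sign-extended from at most 16 bits, so most of the 32 bit results
// cannot wrap (for shl, assuming shift amounts that were valid for the
// original narrow type). The exceptions, mul for nsw and sub for nuw, can
// wrap and therefore defer to the original instruction's nuw flag.
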
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
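//
// For illustration (example IR, metadata id and value names are hypothetical):
// given
//   !0 = !{float 2.500000e+00}
//   %d = fdiv float %x, %y, !fpmath !0
// the division may be replaced with
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)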
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                                      FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
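    // For illustration (example IR; the constant address space is shown as 4
    // and value names are hypothetical), an i8 load such as
    //   %v = load i8, i8 addrspace(4)* %ptr, align 4
    // is rewritten to a full 32 bit load followed by a truncate:
    //   %c = bitcast i8 addrspace(4)* %ptr to i32 addrspace(4)*
    //   %w = load i32, i32 addrspace(4)* %c, align 4
    //   %v = trunc i32 %w to i8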
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
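      // Record the successor before visiting: a visitor may erase or replace
      // *I, which would invalidate the current iterator.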
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}